List of usage examples for org.jsoup.nodes.Element.attr
public String attr(String attributeKey)
From source file:io.seldon.importer.articles.AttributesImporterUtils.java
/**
 * Splits the comma-separated {@code content} attribute of a tags element into
 * individual tag strings.
 *
 * <p>Every part is trimmed and lower-cased. Lower-casing uses
 * {@link java.util.Locale#ROOT} so the result does not vary with the JVM's
 * default locale.
 *
 * @param tagsElement element whose {@code content} attribute holds the tags
 * @return a new, mutable list of the trimmed, lower-cased parts in source order
 */
public static List<String> getTagsPartsFromSingleElement(Element tagsElement) {
    String tagsRaw = tagsElement.attr("content");
    // String.split never returns null, so no null fallback is needed.
    String[] parts = tagsRaw.split(",");
    List<String> tagsParts = new ArrayList<String>(parts.length);
    for (String part : parts) {
        tagsParts.add(part.trim().toLowerCase(java.util.Locale.ROOT));
    }
    return tagsParts;
}
From source file:FILER.java
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file { Text = ""; String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is the url org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8"); Importants[0] = doc.title(); //get the title of the file //Text=Text+" "+doc.title(); String tag = "h"; String All_Headers = ""; Elements Header;/*from w w w .ja v a2 s. c o m*/ for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file { tag = "h" + String.valueOf(i); Header = doc.select(tag); if (Header.size() > 0) { Header = doc.getElementsByTag(tag); String pConcatenated = ""; for (Element x : Header) { pConcatenated += x.text() + " "; } All_Headers = All_Headers + pConcatenated; } else break; } Importants[1] = All_Headers; Text = Text + " " + doc.text(); //get the text of the document Elements img = doc.getElementsByTag("img"); //get the text with img tag for (Element element : img) { if (element.attr("alt") != null && !(element.attr("alt").equals(""))) { Text = Text + " " + element.attr("alt"); Importants[2] = Importants[2] + " " + element.attr("alt"); } } return Importants; }
From source file:Main.java
/**
 * Fetches the login page for a provider, then posts the credentials to the same
 * URL and returns the resulting page.
 *
 * <p>The first request supplies the first hidden input of the page (its name and
 * value are echoed back in the POST) and the {@code PHPSESSID} cookie.
 *
 * @param strProvider value substituted into {@code POST_URL}
 * @param strUsername sent as {@code login[username]}
 * @param strPassword sent as {@code login[password]}
 * @return the parsed document returned by the POST, after following redirects
 * @throws IOException if a request fails, or if the login page contains no
 *         hidden input to echo back
 */
public static Document getPage(String strProvider, String strUsername, String strPassword) throws IOException {
    String strLoginUrl = String.format(POST_URL, strProvider);
    Connection.Response resResponse = Jsoup.connect(strLoginUrl).execute();

    Element eleHidden = resResponse.parse().select("input[type=hidden]").first();
    if (eleHidden == null) {
        // first() returns null on an empty selection; fail with context instead of an NPE.
        throw new IOException("No hidden input found on login page: " + strLoginUrl);
    }

    return Jsoup.connect(strLoginUrl)
            .data(eleHidden.attr("name"), eleHidden.attr("value"),
                    "login[username]", strUsername,
                    "page_referrer", "login",
                    "login[password]", strPassword)
            .method(Method.POST)
            .followRedirects(true)
            .cookie("PHPSESSID", resResponse.cookie("PHPSESSID"))
            .execute()
            .parse();
}
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.tipicality.DbpediaCsvDownload.java
/**
 * Recursively walks the children of {@code e} and downloads every link labelled
 * "csv" (case-insensitive) found inside a {@code <small>} child.
 *
 * <p>{@code <ul>} children are descended into through their {@code <li>} items.
 * Each file is stored under {@code DBpediaOntology.DBPEDIA_CSV_FOLDER} using the
 * last path segment of the link as its name. A failed download is reported with
 * a stack trace and does not stop the remaining downloads.
 *
 * @param e element whose children are scanned
 * @throws MalformedURLException declared for callers; failures of an individual
 *         download are caught and printed inside this method
 * @throws IOException declared for callers; see above
 */
private static void download(Element e) throws MalformedURLException, IOException {
    for (Element c : e.children()) {
        String tagName = c.tag().getName();
        if (tagName.equals("small")) {
            for (Element c1 : c.children()) {
                if (c1.tag().getName().equals("a") && c1.text().equalsIgnoreCase("csv")) {
                    String href = c1.attr("href");
                    System.out.println("Downloading " + href);
                    try {
                        URL remoteFile = new URL(href);
                        String[] s = href.split("\\/");
                        String target = DBpediaOntology.DBPEDIA_CSV_FOLDER + s[s.length - 1];
                        // try-with-resources closes both the remote stream and the
                        // output file, even when the transfer fails.
                        try (ReadableByteChannel rbc = Channels.newChannel(remoteFile.openStream());
                                FileOutputStream fos = new FileOutputStream(target)) {
                            fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
                        }
                    } catch (Exception ex) {
                        // Best effort: report and continue with the next link.
                        ex.printStackTrace();
                    }
                }
            }
        } else if (tagName.equals("ul")) {
            for (Element c1 : c.children()) {
                if (c1.tagName().equals("li")) {
                    download(c1);
                }
            }
        }
    }
}
From source file:com.vaadin.sass.testcases.scss.W3ConformanceTests.java
/**
 * Fetches an index page and gathers the links whose {@code href} matches a
 * regular expression.
 *
 * @param url         address of the index page; also the base for relative links
 * @param regexp      pattern inserted into the selector {@code a[href~=...]}
 * @param maxTests    maximum number of links to return
 * @param excludeUrls resolved links that must be left out of the result
 * @return the accepted links in document order, without duplicates
 * @throws Exception if the page cannot be fetched or a link is not a valid URI
 */
protected static Collection<URI> scrapeIndexForTests(String url, String regexp, int maxTests,
        Collection<URI> excludeUrls) throws Exception {
    URI indexUri = new URI(url);
    Document indexPage = Jsoup.connect(url).timeout(10000).get();
    LinkedHashSet<URI> found = new LinkedHashSet<URI>();

    for (Element anchor : indexPage.select(String.format("a[href~=%s]", regexp))) {
        URI link = new URI(anchor.attr("href"));
        URI resolved = link.isAbsolute() ? link : indexUri.resolve(link);

        // Stop as soon as the quota is filled.
        if (found.size() >= maxTests) {
            break;
        }
        if (excludeUrls.contains(resolved)) {
            continue;
        }
        found.add(resolved);
    }
    return found;
}
From source file:com.nuance.expertassistant.ContentCrawler.java
public static ArrayList<String> listURLs(String StartUrl, int depth) { System.out.println(" Current Depth is : [" + depth + "]"); // System.out.println(" PARENT URL is : [" + StartUrl + "]"); // System.out.println(" URL CRAWL Pattern is : [" + URLCrawlPattern + // "]");/*ww w. j a v a 2 s.c o m*/ final ArrayList<String> tempURLs = new ArrayList<String>(); try { final Document doc = Jsoup.connect(StartUrl).timeout(0).get(); final Elements links = doc.select("a"); for (final Element link : links) { final String absLink = link.attr("abs:href"); if (!visitedURLs.contains(absLink) && absLink.contains(URLCrawlPattern)) { visitedURLs.add(absLink); if (visitedURLs.size() > PageLimit) { ContentExtractor.endDocument(); System.out.println(" Max URL Limit Reached - [Stopping ....] "); System.out.println(" [Stopped] "); exit(0); } tempURLs.add(absLink); System.out.println(" URLs Extracted So Far : [" + visitedURLs.size() + "]"); System.out.println(" Extracting Content From : [" + absLink + "]"); ContentExtractor.extract(absLink); } } } catch (final Exception e) { e.printStackTrace(); } return tempURLs; }
From source file:io.seldon.importer.articles.AttributesImporterUtils.java
public static Set<String> getTags(Document articleDoc, String tagsCssSelector, String title) { Set<String> tagSet = new HashSet<String>(); if (StringUtils.isNotBlank(tagsCssSelector)) { Elements tagsElements = articleDoc.select(tagsCssSelector); Element tagsElement = tagsElements.first(); List<String> tagsParts; if ((tagsElement != null) && (tagsElement.attr("content") != null) && (StringUtils.isNotBlank(tagsElement.attr("content")))) { tagsParts = AttributesImporterUtils.getTagsPartsFromSingleElement(tagsElement); } else {// ww w .j a v a 2 s . co m tagsParts = AttributesImporterUtils.getTagsPartsFromMultipleElement(tagsElements); } List<String> extraTagsParts = AttributesImporterUtils.createExtraTagsPartsFromTitle(title, tagsParts); tagSet.addAll(tagsParts); tagSet.addAll(extraTagsParts); } return tagSet; }
From source file:org.keycloak.testsuite.util.saml.LoginBuilder.java
/**
 * Turns the first form of a login page into the HTTP request that submits it
 * with the given user's credentials.
 *
 * <p>The input with id {@code username} receives the user name, the input with
 * id {@code password} receives the password, and every other input keeps its
 * current value. A form whose method is "post" (case-insensitive) becomes a
 * POST with a UTF-8 url-encoded body; any other form becomes a GET with the
 * fields as query parameters. Only the first form is used, because both
 * outcomes return from inside the loop.
 *
 * @param user      user whose name and password are filled in
 * @param loginPage HTML of the login page
 * @return the request that submits the form
 * @throws IllegalArgumentException if the page contains no form
 */
public static HttpUriRequest handleLoginPage(UserRepresentation user, String loginPage) {
    final String login = user.getUsername();
    final String secret = getPasswordOf(user);
    final org.jsoup.nodes.Document parsedPage = Jsoup.parse(loginPage);
    final List<NameValuePair> formFields = new LinkedList<>();

    for (Element form : parsedPage.getElementsByTag("form")) {
        final String method = form.attr("method");
        final String action = form.attr("action");

        for (Element input : form.getElementsByTag("input")) {
            final String fieldName = input.attr("name");
            final String fieldValue;
            if (Objects.equals(input.id(), "username")) {
                fieldValue = login;
            } else if (Objects.equals(input.id(), "password")) {
                fieldValue = secret;
            } else {
                fieldValue = input.val();
            }
            formFields.add(new BasicNameValuePair(fieldName, fieldValue));
        }

        if (!"post".equalsIgnoreCase(method)) {
            UriBuilder query = UriBuilder.fromPath(action);
            for (NameValuePair field : formFields) {
                query.queryParam(field.getName(), field.getValue());
            }
            return new HttpGet(query.build());
        }

        HttpPost post = new HttpPost(action);
        try {
            post.setEntity(new UrlEncodedFormEntity(formFields, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        return post;
    }

    throw new IllegalArgumentException("Invalid login form: " + loginPage);
}
From source file:com.slidespeech.server.service.TextToSpeechService.java
private static String createXML4Cereproc(String fileName, String speakernotes) throws IOException { List<String> voices = new ArrayList<String>(); try {// w ww . j a v a2 s .c om Document doc = Jsoup.parse(speakernotes, ""); doc.outputSettings().prettyPrint(false); Elements voiceNodes = doc.select("voice"); for (Element voiceNode : voiceNodes) { String lang = (voiceNode.hasAttr("xml:lang") && !voiceNode.attr("xml:lang").equals("")) ? voiceNode.attr("xml:lang") : "en"; String gender = (voiceNode.hasAttr("gender") && !voiceNode.attr("gender").equals("")) ? voiceNode.attr("gender") : "female"; String voiceName = (voiceNode.hasAttr("name") && !voiceNode.attr("name").equals("")) ? voiceNode.attr("name") : ""; //voice name not set by user -> choose one depending on language and gender if (voiceName.equals("")) { voiceName = "isabella";//default //if(lang.equalsIgnoreCase("en") && gender.equalsIgnoreCase("female")) voiceName = "isabella"; if (lang.equalsIgnoreCase("en") && gender.equalsIgnoreCase("male")) voiceName = "william"; if (lang.equalsIgnoreCase("de")) voiceName = "alex"; voiceNode.attr("name", voiceName); } if (!voices.contains(voiceName)) { voices.add(voiceName); } } BufferedWriter out = new BufferedWriter(new FileWriter(fileName)); out.write(doc.select("body").first().html()); //out.write(doc.select("body").first().html()); out.close(); for (int i = 0; i < voices.size(); i++) { if (voices.get(i).equals("william")) voices.set(i, "/opt/cereproc/cerevoice_william_3.0.5_22k.voice"); if (voices.get(i).equals("isabella")) voices.set(i, "/opt/cereproc/cerevoice_isabella_3.0.3_22k.voice"); if (voices.get(i).equals("alex")) voices.set(i, "/opt/cereproc/cerevoice_alex_3.0.0_beta_22k.voice"); } } catch (Exception e) { //Fallback if ssml parsing fails Writer out = new OutputStreamWriter(new FileOutputStream(fileName)); try { out.write(speakernotes); } finally { out.close(); } voices.add("ssml parsing failed"); } return StringUtils.join(voices, ","); }
From source file:com.ignorelist.kassandra.steam.scraper.HtmlTagLoader.java
/**
 * Builds a URI from the {@code src} attribute of the first element, after
 * applying every substitution in {@code URL_REPLACE}.
 *
 * @param elements elements to inspect; only the first one is used
 * @return the rewritten URI, or {@code null} when there is no element, the
 *         element has no (or an empty) {@code src}, or the rewritten value is
 *         not a valid URI
 */
private static URI getSrcUri(Elements elements) {
    final Element first = Iterables.getFirst(elements, null);
    if (null == first) {
        return null;
    }

    String srcString = first.attr("src");
    // jsoup reports a missing attribute as "" rather than null, so the empty
    // string must be treated as "no src" for this guard to take effect.
    if (null == srcString || srcString.isEmpty()) {
        return null;
    }

    try {
        for (Map.Entry<String, String> replacement : URL_REPLACE.entrySet()) {
            srcString = srcString.replace(replacement.getKey(), replacement.getValue());
        }
        return new URI(srcString);
    } catch (URISyntaxException ex) {
        Logger.getLogger(HtmlTagLoader.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}