List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:com.cognifide.aet.job.common.comparators.w3chtml5.WarningNodeToW3cHtml5IssueFunction.java
@Override public W3cHtml5Issue apply(Node child) { if (!(child instanceof Element)) { return null; }//from www .jav a2 s .c o m Element element = (Element) child; W3cHtml5IssueType issueType = W3cHtml5IssueType .valueOf(StringUtils.removeStart(element.attr("class"), "msg_").toUpperCase()); String message = element.getElementsByAttributeValue("class", "msg").html(); String additionalInfo = element.child(1).html(); return new W3cHtml5Issue(0, 0, message, StringUtils.EMPTY, StringUtils.EMPTY, StringUtils.EMPTY, additionalInfo, issueType); }
From source file:it.polito.tellmefirst.web.rest.apimanager.ImageManager.java
/**
 * Scrapes the full-size image URL from a DBpedia/Wikimedia image description page.
 *
 * @param pageURL URL of the image description page
 * @return the image URL with an "http:" scheme prepended (the page uses a
 *         protocol-relative src), or an empty string if scraping fails
 */
public String scrapeDBpediaImageFromPage(String pageURL) {
    LOG.debug("[scrapeDBpediaImageFromPage] - BEGIN");
    String result = "";
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("div.fullImageLink").select("img").first();
        // first() returns null when the page has no full-image link; the
        // original code let that NPE fall into the broad catch below.
        if (image != null) {
            result = "http:" + image.attr("src");
        } else {
            LOG.error("[scrapeDBpediaImageFromPage] - no image found at " + pageURL);
        }
    } catch (Exception e) {
        LOG.error("[scrapeDBpediaImageFromPage] - EXCEPTION: ", e);
    }
    LOG.debug("[scrapeDBpediaImageFromPage] - END");
    return result;
}
From source file:org.brunocvcunha.taskerbox.impl.crawler.SniptAction.java
@Override public void action(final Document entry) { log.debug("Validating " + entry.title()); for (Element el : entry.select(".grid-block").select("a")) { final String id = el.attr("href").replace("http://snipt.org/", ""); final String title = id + " - " + el.text(); if (canAct(id)) { addAct(id);//from w w w . j a v a 2 s . c o m spreadAction(id, title); serializeAlreadyAct(); sleep(FETCH_INTERVAL); } } }
From source file:net.sf.jabref.logic.fetcher.DoiResolution.java
@Override public Optional<URL> findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); Optional<URL> pdfLink = Optional.empty(); Optional<DOI> doi = DOI.build(entry.getField("doi")); if (doi.isPresent()) { String sciLink = doi.get().getURLAsASCIIString(); // follow all redirects and scan for a single pdf link if (!sciLink.isEmpty()) { try { Connection connection = Jsoup.connect(sciLink); connection.followRedirects(true); connection.ignoreHttpErrors(true); // some publishers are quite slow (default is 3s) connection.timeout(5000); Document html = connection.get(); // scan for PDF Elements elements = html.body().select("[href]"); List<Optional<URL>> links = new ArrayList<>(); for (Element element : elements) { String href = element.attr("abs:href"); // Only check if pdf is included in the link // See https://github.com/lehner/LocalCopy for scrape ideas if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) { links.add(Optional.of(new URL(href))); }/* w w w .jav a 2 s . c o m*/ } // return if only one link was found (high accuracy) if (links.size() == 1) { LOGGER.info("Fulltext PDF found @ " + sciLink); pdfLink = links.get(0); } } catch (IOException e) { LOGGER.warn("DoiResolution fetcher failed: ", e); } } } return pdfLink; }
From source file:net.sf.jabref.logic.fulltext.DoiResolution.java
@Override public Optional<URL> findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); Optional<URL> pdfLink = Optional.empty(); Optional<DOI> doi = entry.getFieldOptional(FieldName.DOI).flatMap(DOI::build); if (doi.isPresent()) { String sciLink = doi.get().getURIAsASCIIString(); // follow all redirects and scan for a single pdf link if (!sciLink.isEmpty()) { try { Connection connection = Jsoup.connect(sciLink); connection.followRedirects(true); connection.ignoreHttpErrors(true); // some publishers are quite slow (default is 3s) connection.timeout(5000); Document html = connection.get(); // scan for PDF Elements elements = html.body().select("[href]"); List<Optional<URL>> links = new ArrayList<>(); for (Element element : elements) { String href = element.attr("abs:href"); // Only check if pdf is included in the link // See https://github.com/lehner/LocalCopy for scrape ideas if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) { links.add(Optional.of(new URL(href))); }/* ww w. j a va 2s. c om*/ } // return if only one link was found (high accuracy) if (links.size() == 1) { LOGGER.info("Fulltext PDF found @ " + sciLink); pdfLink = links.get(0); } } catch (IOException e) { LOGGER.warn("DoiResolution fetcher failed: ", e); } } } return pdfLink; }
From source file:hello.Scraper.java
@Transformer(inputChannel = "channel3", outputChannel = "channel4") public DumpEntry convert(Element payload) throws ParseException { String dateStr = payload.ownText().substring(0, 19); DateFormat format = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss"); format.setTimeZone(TimeZone.getTimeZone("GMT")); Date timestamp = format.parse(dateStr); Elements list = payload.select("a"); String id;//from ww w. j a v a 2 s.c o m String ref; if (list.size() > 0) { Element a = list.get(0); id = a.ownText(); ref = a.attr("href"); } else { id = "private data"; ref = null; } Element span = payload.select("span").get(0); String status = span.ownText(); return new DumpEntry(timestamp, id, ref, status); }
From source file:org.brunocvcunha.taskerbox.impl.crawler.PastebinAction.java
@Override public void action(final Document entry) { log.debug("Validating " + entry.title()); for (Element el : entry.select(".maintable").select("a")) { final String id = el.attr("href").substring(1); if (id.startsWith("archive")) { continue; }//from w w w.ja v a 2 s. com final String title = id + " - " + el.text(); if (canAct(id)) { addAct(id); spreadAction(id, title); serializeAlreadyAct(); sleep(FETCH_INTERVAL); } } }
From source file:HttpCilentExample.HttpCilentExample.java
public List<NameValuePair> getFormParams(String html, String username, String password) throws UnsupportedEncodingException { System.out.println("Extracting form's data..."); Document doc = Jsoup.parse(html); // Google form id Element loginform = doc.getElementById("gaia_loginform"); Elements inputElements = loginform.getElementsByTag("input"); List<NameValuePair> paramList = new ArrayList<NameValuePair>(); for (Element inputElement : inputElements) { String key = inputElement.attr("name"); String value = inputElement.attr("value"); if (key.equals("Email")) value = username;/*from w ww .ja v a2s . c o m*/ else if (key.equals("Passwd")) value = password; paramList.add(new BasicNameValuePair(key, value)); } return paramList; }
From source file:it.polito.tellmefirst.web.rest.apimanager.ImageManager.java
/**
 * Scrapes the pixel dimensions of the full-size image on a DBpedia/Wikimedia
 * image description page.
 *
 * @param pageURL URL of the image description page
 * @return {width, height}, or {0, 0} when the page has no image or scraping fails
 */
public int[] scrapeImageSizeFromPage(String pageURL) {
    LOG.debug("[scrapeImageSizeFromPage] - BEGIN");
    int[] result = { 0, 0 };
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("div.fullImageLink").select("img").first();
        // first() returns null when no image is present; the original code let
        // the resulting NPE fall into the broad catch below.
        if (image != null) {
            // parseInt avoids needless Integer boxing; a missing or non-numeric
            // attribute still raises NumberFormatException, caught below.
            result[0] = Integer.parseInt(image.attr("width"));
            result[1] = Integer.parseInt(image.attr("height"));
        } else {
            LOG.error("[scrapeImageSizeFromPage] - no image found at " + pageURL);
        }
    } catch (Exception e) {
        LOG.error("[scrapeImageSizeFromPage] - EXCEPTION: ", e);
    }
    LOG.debug("[scrapeImageSizeFromPage] - END");
    return result;
}
From source file:com.johan.vertretungsplan.parser.UntisMonitorParser.java
/**
 * Fetches a monitor page, collects its parsed document, and — when requested —
 * follows {@code <meta http-equiv="refresh">} redirects recursively until the
 * chain loops back to the start URL.
 *
 * @param url       page to fetch
 * @param encoding  charset used to decode the response
 * @param following whether to follow meta-refresh redirects
 * @param docs      accumulator for every document in the redirect chain
 * @param startUrl  first URL of the chain; recursion stops when reached again
 * @throws IOException if fetching a page fails
 */
private void loadUrl(String url, String encoding, boolean following, List<Document> docs,
        String startUrl) throws IOException {
    // Strips spaces from the raw HTML (presumably to normalize Untis' padded
    // output before parsing) — TODO confirm the exact character intended here.
    String html = httpGet(url, encoding).replace(" ", "");
    Document doc = Jsoup.parse(html);
    docs.add(doc);
    // Hoist the selector so the refresh meta tag is queried only once.
    Elements refreshMetas = doc.select("meta[http-equiv=refresh]");
    if (following && refreshMetas.size() > 0) {
        Element meta = refreshMetas.first();
        String attr = meta.attr("content").toLowerCase();
        int urlPos = attr.indexOf("url=");
        // Bug fix: a refresh header without a "url=" target refreshes the
        // current page; indexOf() then returns -1 and the original code built
        // a garbage redirect URL from substring(3). Only recurse when a
        // target actually exists.
        if (urlPos >= 0) {
            String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1)
                    + attr.substring(urlPos + 4);
            if (!redirectUrl.equals(startUrl)) {
                loadUrl(redirectUrl, encoding, true, docs, startUrl);
            }
        }
    }
}