List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:com.mycompany.crawlertest.GrabPage.java
private void processLinks(Elements links) { for (Element link : links) { String href = link.attr("href"); if (StringUtils.isBlank(href) || href.startsWith("#")) { continue; }/*from w w w. j av a 2 s .co m*/ try { URL nextUrl = new URL(url, href); Uttils.URLS.add(href); urlList.add(nextUrl); } catch (MalformedURLException e) { // ignore bad urls } } }
From source file:com.subgraph.vega.internal.analysis.urls.HtmlUrlExtractor.java
private List<URI> extractURIs(Document document, String query, String attribute) { final ArrayList<URI> uris = new ArrayList<URI>(); for (Element e : document.select(query)) { String link = e.attr(attribute); URI uri = createURI(link); if (uri != null) uris.add(uri);//from w w w . j a v a 2 s . com } return uris; }
From source file:com.soulgalore.crawler.core.impl.AhrefPageURLParser.java
private Set<CrawlerURL> fetch(String query, String attributeKey, Document doc, String url) { final Set<CrawlerURL> urls = new HashSet<CrawlerURL>(); final Elements elements = doc.select(query); for (Element src : elements) { if (src.attr(attributeKey).isEmpty()) continue; // don't fetch mailto links if (src.attr(attributeKey).startsWith(MAIL_TO)) continue; else if (IFRAME.equals(src.tag().getName())) urls.add(new CrawlerURL(src.attr(attributeKey), url)); else//www .j a va 2 s . c o m urls.add(new CrawlerURL(src.attr(attributeKey), url)); } return urls; }
From source file:io.seldon.importer.articles.dynamicextractors.AllElementsAttrValueDynamicExtractor.java
@Override public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception { String attrib_value = null;//from www. ja v a2 s . c o m if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 2)) { String cssSelector = attributeDetail.extractor_args.get(0); String attributeName = attributeDetail.extractor_args.get(1); Elements elements = articleDoc.select(cssSelector); if (StringUtils.isNotBlank(cssSelector)) { if (elements != null) { StringBuilder sb = new StringBuilder(); boolean isFirstInList = true; for (Element e : elements) { String eText = e.attr(attributeName); eText = StringUtils.strip(eText); eText = eText.toLowerCase(); if (StringUtils.isBlank(eText)) continue; if (isFirstInList) { isFirstInList = false; } else { sb.append(","); } sb.append(eText); } attrib_value = sb.toString(); } } } return attrib_value; }
From source file:it.polito.tellmefirst.apimanager.ImageManager.java
public String scrapeImageFromPage(String pageURL) { LOG.debug("[scrapeImageFromPage] - BEGIN url=" + pageURL); long startTime = System.currentTimeMillis(); String result = DEFAULT_IMAGE; try {/* w ww. j a v a 2 s. c o m*/ Document doc = Jsoup.connect(pageURL).get(); Element image = doc.select("img").first(); result = image.attr("src"); } catch (Exception e) { LOG.error("[scrapeImageFromPage] - EXCEPTION: ", e); } long endTime = System.currentTimeMillis(); long duration = (endTime - startTime) / 1000; //no prod LOG.debug("########### [scrapeImageFromPage] took " + duration + " seconds. ###########"); LOG.debug("[scrapeImageFromPage] - END"); return result; }
From source file:it.polito.tellmefirst.web.rest.apimanager.ImageManager.java
/**
 * Fetches {@code pageURL} and returns the {@code src} attribute of the first {@code img}
 * found inside a {@code div.fullImageLink} element.
 *
 * Any exception raised while fetching or parsing is logged and the default image is returned.
 *
 * @param pageURL address of the page to fetch
 * @return the {@code src} value of the matched image, or {@code Enhancer.DEFAULT_IMAGE} when
 *         no such image exists or the page could not be processed
 */
public String scrapeImageFromPage(String pageURL) {
    LOG.debug("[scrapeImageFromPage] - BEGIN");
    String result = Enhancer.DEFAULT_IMAGE;
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("div.fullImageLink").select("img").first();
        // first() yields null when nothing matched; handle it explicitly rather than
        // relying on a NullPointerException being caught and logged as an error below.
        if (image != null) {
            result = image.attr("src");
        } else {
            LOG.debug("[scrapeImageFromPage] - no matching img element found, using default image");
        }
    } catch (Exception e) {
        LOG.error("[scrapeImageFromPage] - EXCEPTION: ", e);
    }
    LOG.debug("[scrapeImageFromPage] - END");
    return result;
}
From source file:it.polito.tellmefirst.apimanager.ImageManager.java
public String scrapeDBpediaImageFromPage(String pageURL) { LOG.debug("[scrapeDBpediaImageFromPage] - BEGIN url=" + pageURL); long startTime = System.currentTimeMillis(); String result = ""; try {/*from w ww .j a v a2s. c o m*/ Document doc = Jsoup.connect(pageURL).timeout(10 * 1000).get(); Element image = doc.select("img").first(); result = "http:" + image.attr("src"); } catch (Exception e) { LOG.error("[scrapeDBpediaImageFromPage] - EXCEPTION: ", e); } long endTime = System.currentTimeMillis(); long duration = (endTime - startTime) / 1000; //no prod LOG.debug("########### [scrapeDBpediaImageFromPage] took " + duration + " seconds. ###########"); LOG.debug("[scrapeDBpediaImageFromPage] - END"); return result; }
From source file:org.jasig.portlet.proxy.search.AnchorSearchStrategy.java
@Override public List<SearchResult> search(SearchRequest searchQuery, EventRequest request, Document document) { List<SearchResult> results = new ArrayList<SearchResult>(); final String[] whitelistRegexes = request.getPreferences().getValues("anchorWhitelistRegex", new String[] {}); String searchTerms = searchQuery.getSearchTerms().toLowerCase(); Elements links = document.select("a[href]"); for (Element link : links) { String linkUrl = link.attr("abs:href"); for (String searchTerm : searchTerms.split(" ")) { if (link.text().toLowerCase().contains(searchTerm)) { log.debug("found a match, term: [" + searchTerm + "], anchor URL: [" + linkUrl + "], anchor text: [" + link.text() + "]"); SearchResult result = new SearchResult(); result.setTitle(link.text()); result.setSummary(link.text()); PortletUrl pUrl = new PortletUrl(); pUrl.setPortletMode(PortletMode.VIEW.toString()); pUrl.setType(PortletUrlType.RENDER); pUrl.setWindowState(WindowState.MAXIMIZED.toString()); PortletUrlParameter param = new PortletUrlParameter(); param.setName("proxy.url"); param.getValue().add(linkUrl); pUrl.getParam().add(param); new SearchUtil().updateUrls(linkUrl, request, whitelistRegexes); result.setPortletUrl(pUrl); results.add(result);//from w w w . j a va 2s .c o m } } } return results; }
From source file:org.brunocvcunha.taskerbox.impl.crawler.SlexyAction.java
@Override public void action(final Document entry) { log.debug("Validating " + entry.title()); for (Element el : entry.select(".main").select("a")) { final String id = el.attr("href").replace("/view/", ""); final String title = id; if (canAct(id)) { addAct(id);//w ww. j a v a 2 s . c om spreadAction(id, title); serializeAlreadyAct(); sleep(FETCH_INTERVAL); } } }
From source file:it.polito.tellmefirst.apimanager.ImageManager.java
public int[] scrapeImageSizeFromPage(String pageURL) { LOG.debug("[scrapeImageSizeFromPage] - BEGIN"); int[] result = { 0, 0 }; try {// w w w .j a v a 2 s . c om Document doc = Jsoup.connect(pageURL).timeout(10 * 1000).get(); Element image = doc.select("img").first(); result[0] = Integer.valueOf(image.attr("width")); result[1] = Integer.valueOf(image.attr("height")); } catch (Exception e) { LOG.error("[scrapeImageSizeFromPage] - EXCEPTION: ", e); } LOG.debug("[scrapeImageSizeFromPage] - END"); return result; }