Example usage for org.jsoup.nodes Element attr

List of usage examples for org.jsoup.nodes Element attr

Introduction

This page collects usage examples for the org.jsoup.nodes.Element method attr.

Prototype

public String attr(String attributeKey) 

Source Link

Document

Get an attribute's value by its key.

Usage

From source file:com.mycompany.crawlertest.GrabPage.java

/**
 * Walks the given link elements and queues each usable {@code href}.
 *
 * <p>Links whose {@code href} is blank or starts with {@code #} are skipped. For every
 * other link the {@code href} is resolved against {@code url} (declared outside this
 * method); the raw {@code href} string is added to {@code Uttils.URLS} and the resolved
 * {@link URL} to {@code urlList}.
 *
 * @param links the elements whose {@code href} attributes are inspected
 */
private void processLinks(Elements links) {
    for (Element anchor : links) {
        String target = anchor.attr("href");
        boolean followable = !StringUtils.isBlank(target) && !target.startsWith("#");
        if (followable) {
            try {
                URL resolved = new URL(url, target);
                Uttils.URLS.add(target);
                urlList.add(resolved);
            } catch (MalformedURLException ignored) {
                // Best effort: an href that cannot be resolved into a URL is simply not queued.
            }
        }
    }
}

From source file:com.subgraph.vega.internal.analysis.urls.HtmlUrlExtractor.java

/**
 * Collects URIs from one attribute of every element matched by a selector.
 *
 * <p>Each attribute value is passed to {@code createURI} (defined elsewhere); values for
 * which it returns {@code null} are left out of the result.
 *
 * @param document  the parsed document to search
 * @param query     the selector passed to {@code document.select}
 * @param attribute the attribute whose value is read from each matched element
 * @return the non-null URIs, in the order the elements were matched
 */
private List<URI> extractURIs(Document document, String query, String attribute) {
    final List<URI> found = new ArrayList<URI>();
    for (Element match : document.select(query)) {
        final URI parsed = createURI(match.attr(attribute));
        if (parsed == null) {
            continue;
        }
        found.add(parsed);
    }
    return found;
}

From source file:com.soulgalore.crawler.core.impl.AhrefPageURLParser.java

/**
 * Builds the set of crawlable URLs found in one attribute of the elements matched by a
 * selector.
 *
 * <p>Elements whose attribute value is empty, or starts with {@code MAIL_TO}, are skipped.
 * Every remaining value is wrapped in a {@code CrawlerURL} together with {@code url}.
 * All element types are treated the same way: the original iframe-specific branch added
 * exactly the same {@code CrawlerURL} as the default branch, so it has been folded away.
 *
 * @param query        the selector passed to {@code doc.select}
 * @param attributeKey the attribute to read from each matched element
 * @param doc          the parsed document to search
 * @param url          passed as the second argument to every {@code CrawlerURL} created
 * @return the collected URLs; empty when nothing qualifies
 */
private Set<CrawlerURL> fetch(String query, String attributeKey, Document doc, String url) {

    final Set<CrawlerURL> urls = new HashSet<CrawlerURL>();

    final Elements elements = doc.select(query);

    for (Element src : elements) {

        // Read the attribute once instead of once per check.
        final String value = src.attr(attributeKey);

        // don't fetch empty values or mailto links
        if (value.isEmpty() || value.startsWith(MAIL_TO)) {
            continue;
        }

        urls.add(new CrawlerURL(value, url));

    }

    return urls;

}

From source file:io.seldon.importer.articles.dynamicextractors.AllElementsAttrValueDynamicExtractor.java

/**
 * Extracts one attribute from every element matched by a CSS selector and joins the
 * values into a comma-separated string.
 *
 * <p>The selector is {@code extractor_args.get(0)} and the attribute name is
 * {@code extractor_args.get(1)}. Each value is stripped and lower-cased; blank values are
 * dropped.
 *
 * @param attributeDetail supplies {@code extractor_args}
 * @param url             not used by this extractor
 * @param articleDoc      the parsed document to query
 * @return the joined values (an empty string when no element yields a value), or
 *         {@code null} when fewer than two extractor args are given or the selector is blank
 * @throws Exception declared by the overridden method
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {
    if ((attributeDetail.extractor_args == null) || (attributeDetail.extractor_args.size() < 2)) {
        return null;
    }

    String cssSelector = attributeDetail.extractor_args.get(0);
    String attributeName = attributeDetail.extractor_args.get(1);

    // Validate the selector BEFORE querying: previously select() ran first, so a blank
    // selector was handed to jsoup and the blank check below it could never take effect.
    if (StringUtils.isBlank(cssSelector)) {
        return null;
    }

    Elements elements = articleDoc.select(cssSelector);
    if (elements == null) {
        return null;
    }

    StringBuilder sb = new StringBuilder();
    for (Element e : elements) {
        String eText = e.attr(attributeName);
        eText = StringUtils.strip(eText);
        eText = eText.toLowerCase();
        if (StringUtils.isBlank(eText)) {
            continue;
        }
        // Only non-blank values are appended, so a non-empty builder means a value precedes.
        if (sb.length() > 0) {
            sb.append(",");
        }
        sb.append(eText);
    }

    return sb.toString();
}

From source file:it.polito.tellmefirst.apimanager.ImageManager.java

/**
 * Fetches a page and returns the {@code src} attribute of its first {@code <img>} element.
 *
 * <p>Returns {@code DEFAULT_IMAGE} when the page contains no {@code <img>} or when fetching
 * or parsing fails (the failure is logged). The elapsed time is logged in whole seconds.
 *
 * @param pageURL the URL of the page to fetch
 * @return the first image's {@code src} value, or {@code DEFAULT_IMAGE}
 */
public String scrapeImageFromPage(String pageURL) {

    LOG.debug("[scrapeImageFromPage] - BEGIN url=" + pageURL);
    long startTime = System.currentTimeMillis();
    String result = DEFAULT_IMAGE;
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("img").first();
        // first() yields null when nothing matched; check explicitly instead of relying on
        // a NullPointerException being caught and logged as an error below.
        if (image != null) {
            result = image.attr("src");
        } else {
            LOG.debug("[scrapeImageFromPage] - no img element found, keeping default image");
        }
    } catch (Exception e) {
        LOG.error("[scrapeImageFromPage] - EXCEPTION: ", e);
    }
    long endTime = System.currentTimeMillis();
    long duration = (endTime - startTime) / 1000;
    //no prod
    LOG.debug("########### [scrapeImageFromPage] took " + duration + " seconds. ###########");
    LOG.debug("[scrapeImageFromPage] - END");
    return result;
}

From source file:it.polito.tellmefirst.web.rest.apimanager.ImageManager.java

/**
 * Fetches a page and returns the {@code src} attribute of the first {@code <img>} found
 * under {@code div.fullImageLink}.
 *
 * <p>Returns {@code Enhancer.DEFAULT_IMAGE} when no such image exists or when fetching or
 * parsing fails (the failure is logged).
 *
 * @param pageURL the URL of the page to fetch
 * @return the image's {@code src} value, or {@code Enhancer.DEFAULT_IMAGE}
 */
public String scrapeImageFromPage(String pageURL) {
    LOG.debug("[scrapeImageFromPage] - BEGIN");
    String result = Enhancer.DEFAULT_IMAGE;
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("div.fullImageLink").select("img").first();
        // first() yields null when nothing matched; check explicitly instead of relying on
        // a NullPointerException being caught and logged as an error below.
        if (image != null) {
            result = image.attr("src");
        } else {
            LOG.debug("[scrapeImageFromPage] - no img element found, keeping default image");
        }
    } catch (Exception e) {
        LOG.error("[scrapeImageFromPage] - EXCEPTION: ", e);
    }
    LOG.debug("[scrapeImageFromPage] - END");
    return result;
}

From source file:it.polito.tellmefirst.apimanager.ImageManager.java

/**
 * Fetches a page (10 second timeout) and returns an absolute URL for its first
 * {@code <img>} element.
 *
 * <p>A protocol-relative {@code src} (starting with {@code //}) gets {@code http:}
 * prepended. Any other non-empty {@code src} is resolved through jsoup's {@code absUrl},
 * so an already-absolute value is no longer corrupted by an extra {@code http:} prefix.
 * Returns an empty string when there is no image, when {@code src} is empty, or when
 * fetching or parsing fails (the failure is logged). The elapsed time is logged in whole
 * seconds.
 *
 * @param pageURL the URL of the page to fetch
 * @return the image URL, or an empty string
 */
public String scrapeDBpediaImageFromPage(String pageURL) {
    LOG.debug("[scrapeDBpediaImageFromPage] - BEGIN url=" + pageURL);
    long startTime = System.currentTimeMillis();
    String result = "";
    try {
        Document doc = Jsoup.connect(pageURL).timeout(10 * 1000).get();
        Element image = doc.select("img").first();
        // first() yields null when nothing matched; avoid NPE-driven control flow.
        if (image != null) {
            String src = image.attr("src");
            if (src.startsWith("//")) {
                // Protocol-relative reference: supply the scheme, as before.
                result = "http:" + src;
            } else if (!src.isEmpty()) {
                // Absolute or path-relative reference: let jsoup resolve it against the page.
                result = image.absUrl("src");
            }
        }
    } catch (Exception e) {
        LOG.error("[scrapeDBpediaImageFromPage] - EXCEPTION: ", e);
    }
    long endTime = System.currentTimeMillis();
    long duration = (endTime - startTime) / 1000;
    //no prod
    LOG.debug("########### [scrapeDBpediaImageFromPage] took " + duration + " seconds. ###########");
    LOG.debug("[scrapeDBpediaImageFromPage] - END");

    return result;
}

From source file:org.jasig.portlet.proxy.search.AnchorSearchStrategy.java

/**
 * Searches the anchors of a document for the request's search terms.
 *
 * <p>The search terms are lower-cased and split on single spaces. For every
 * {@code a[href]} element whose lower-cased text contains a term, a {@code SearchResult}
 * is created with the anchor text as title and summary and a maximized, view-mode render
 * URL carrying the anchor's absolute href in the {@code proxy.url} parameter. A link
 * produces one result per matching term. Empty terms (from an empty query or consecutive
 * spaces) are skipped, because every string contains the empty string and such a term
 * would otherwise match every link.
 *
 * @param searchQuery supplies the search terms
 * @param request     supplies the {@code anchorWhitelistRegex} preference and is passed to
 *                    {@code SearchUtil.updateUrls}
 * @param document    the parsed document whose anchors are searched
 * @return the matches, in document order; empty when nothing matches
 */
@Override
public List<SearchResult> search(SearchRequest searchQuery, EventRequest request, Document document) {
    List<SearchResult> results = new ArrayList<SearchResult>();
    final String[] whitelistRegexes = request.getPreferences().getValues("anchorWhitelistRegex",
            new String[] {});
    // Split once up front; the term list does not change per link.
    final String[] searchTerms = searchQuery.getSearchTerms().toLowerCase().split(" ");

    Elements links = document.select("a[href]");
    for (Element link : links) {
        String linkUrl = link.attr("abs:href");
        // Compute the anchor text once per link rather than once per term.
        final String linkText = link.text();
        final String linkTextLower = linkText.toLowerCase();
        for (String searchTerm : searchTerms) {
            if (searchTerm.isEmpty()) {
                continue;
            }
            if (linkTextLower.contains(searchTerm)) {
                log.debug("found a match, term: [" + searchTerm + "], anchor URL: [" + linkUrl
                        + "], anchor text: [" + linkText + "]");
                SearchResult result = new SearchResult();
                result.setTitle(linkText);
                result.setSummary(linkText);

                PortletUrl pUrl = new PortletUrl();
                pUrl.setPortletMode(PortletMode.VIEW.toString());
                pUrl.setType(PortletUrlType.RENDER);
                pUrl.setWindowState(WindowState.MAXIMIZED.toString());
                PortletUrlParameter param = new PortletUrlParameter();
                param.setName("proxy.url");
                param.getValue().add(linkUrl);
                pUrl.getParam().add(param);

                new SearchUtil().updateUrls(linkUrl, request, whitelistRegexes);

                result.setPortletUrl(pUrl);
                results.add(result);
            }
        }
    }
    return results;
}

From source file:org.brunocvcunha.taskerbox.impl.crawler.SlexyAction.java

/**
 * Processes every anchor found under {@code .main} in the given document.
 *
 * <p>The id of an anchor is its {@code href} with {@code /view/} removed. For each id
 * accepted by {@code canAct}, the id is recorded via {@code addAct}, passed to
 * {@code spreadAction} as both id and title, the acted-on state is serialized, and the
 * method sleeps for {@code FETCH_INTERVAL} before moving on.
 *
 * @param entry the parsed document to process
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());

    for (Element anchor : entry.select(".main").select("a")) {
        final String id = anchor.attr("href").replace("/view/", "");
        if (!canAct(id)) {
            continue;
        }

        addAct(id);
        // The id doubles as the title.
        spreadAction(id, id);
        serializeAlreadyAct();
        sleep(FETCH_INTERVAL);
    }
}

From source file:it.polito.tellmefirst.apimanager.ImageManager.java

/**
 * Fetches a page (10 second timeout) and reads the {@code width} and {@code height}
 * attributes of its first {@code <img>} element.
 *
 * <p>Both attributes must parse as integers for either to be reported. If there is no
 * image, either attribute is missing or non-numeric, or fetching fails (the failure is
 * logged), the result is {@code {0, 0}}.
 *
 * @param pageURL the URL of the page to fetch
 * @return a two-element array {@code {width, height}}, or {@code {0, 0}}
 */
public int[] scrapeImageSizeFromPage(String pageURL) {
    LOG.debug("[scrapeImageSizeFromPage] - BEGIN");
    int[] result = { 0, 0 };
    try {
        Document doc = Jsoup.connect(pageURL).timeout(10 * 1000).get();
        Element image = doc.select("img").first();
        // first() yields null when nothing matched; avoid NPE-driven control flow.
        if (image != null) {
            // Parse both values before storing either, so a bad height cannot leave a
            // half-filled result behind.
            int width = Integer.parseInt(image.attr("width"));
            int height = Integer.parseInt(image.attr("height"));
            result[0] = width;
            result[1] = height;
        }
    } catch (Exception e) {
        LOG.error("[scrapeImageSizeFromPage] - EXCEPTION: ", e);
    }
    LOG.debug("[scrapeImageSizeFromPage] - END");
    return result;
}