List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:com.cognifide.aet.job.common.comparators.w3chtml5.WarningNodeToW3cHtml5IssueFunction.java
@Override public W3cHtml5Issue apply(Node child) { if (!(child instanceof Element)) { return null; }//from www .jav a2 s .c o m Element element = (Element) child; W3cHtml5IssueType issueType = W3cHtml5IssueType .valueOf(StringUtils.removeStart(element.attr("class"), "msg_").toUpperCase()); String message = element.getElementsByAttributeValue("class", "msg").html(); String additionalInfo = element.child(1).html(); return new W3cHtml5Issue(0, 0, message, StringUtils.EMPTY, StringUtils.EMPTY, StringUtils.EMPTY, additionalInfo, issueType); }
From source file:it.polito.tellmefirst.web.rest.apimanager.ImageManager.java
/**
 * Scrapes the full-size image URL from a DBpedia/Wikimedia image description page.
 *
 * @param pageURL URL of the image description page
 * @return the image URL with an "http:" scheme prepended (the page uses a
 *         protocol-relative src), or an empty string if scraping fails
 */
public String scrapeDBpediaImageFromPage(String pageURL) {
    LOG.debug("[scrapeDBpediaImageFromPage] - BEGIN");
    String result = "";
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("div.fullImageLink").select("img").first();
        // first() returns null when the page has no full-image link; the
        // original code let that NPE fall into the broad catch below.
        if (image != null) {
            result = "http:" + image.attr("src");
        } else {
            LOG.error("[scrapeDBpediaImageFromPage] - no image found at " + pageURL);
        }
    } catch (Exception e) {
        LOG.error("[scrapeDBpediaImageFromPage] - EXCEPTION: ", e);
    }
    LOG.debug("[scrapeDBpediaImageFromPage] - END");
    return result;
}
From source file:org.brunocvcunha.taskerbox.impl.crawler.SniptAction.java
@Override public void action(final Document entry) { log.debug("Validating " + entry.title()); for (Element el : entry.select(".grid-block").select("a")) { final String id = el.attr("href").replace("http://snipt.org/", ""); final String title = id + " - " + el.text(); if (canAct(id)) { addAct(id);//from w w w . j a v a 2 s . c o m spreadAction(id, title); serializeAlreadyAct(); sleep(FETCH_INTERVAL); } } }
From source file:net.sf.jabref.logic.fetcher.DoiResolution.java
@Override public Optional<URL> findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); Optional<URL> pdfLink = Optional.empty(); Optional<DOI> doi = DOI.build(entry.getField("doi")); if (doi.isPresent()) { String sciLink = doi.get().getURLAsASCIIString(); // follow all redirects and scan for a single pdf link if (!sciLink.isEmpty()) { try { Connection connection = Jsoup.connect(sciLink); connection.followRedirects(true); connection.ignoreHttpErrors(true); // some publishers are quite slow (default is 3s) connection.timeout(5000); Document html = connection.get(); // scan for PDF Elements elements = html.body().select("[href]"); List<Optional<URL>> links = new ArrayList<>(); for (Element element : elements) { String href = element.attr("abs:href"); // Only check if pdf is included in the link // See https://github.com/lehner/LocalCopy for scrape ideas if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) { links.add(Optional.of(new URL(href))); }/* w w w .jav a 2 s . c o m*/ } // return if only one link was found (high accuracy) if (links.size() == 1) { LOGGER.info("Fulltext PDF found @ " + sciLink); pdfLink = links.get(0); } } catch (IOException e) { LOGGER.warn("DoiResolution fetcher failed: ", e); } } } return pdfLink; }
From source file:net.sf.jabref.logic.fulltext.DoiResolution.java
@Override public Optional<URL> findFullText(BibEntry entry) throws IOException { Objects.requireNonNull(entry); Optional<URL> pdfLink = Optional.empty(); Optional<DOI> doi = entry.getFieldOptional(FieldName.DOI).flatMap(DOI::build); if (doi.isPresent()) { String sciLink = doi.get().getURIAsASCIIString(); // follow all redirects and scan for a single pdf link if (!sciLink.isEmpty()) { try { Connection connection = Jsoup.connect(sciLink); connection.followRedirects(true); connection.ignoreHttpErrors(true); // some publishers are quite slow (default is 3s) connection.timeout(5000); Document html = connection.get(); // scan for PDF Elements elements = html.body().select("[href]"); List<Optional<URL>> links = new ArrayList<>(); for (Element element : elements) { String href = element.attr("abs:href"); // Only check if pdf is included in the link // See https://github.com/lehner/LocalCopy for scrape ideas if (href.contains("pdf") && MimeTypeDetector.isPdfContentType(href)) { links.add(Optional.of(new URL(href))); }/* ww w. j a va 2s. c om*/ } // return if only one link was found (high accuracy) if (links.size() == 1) { LOGGER.info("Fulltext PDF found @ " + sciLink); pdfLink = links.get(0); } } catch (IOException e) { LOGGER.warn("DoiResolution fetcher failed: ", e); } } } return pdfLink; }
From source file:hello.Scraper.java
@Transformer(inputChannel = "channel3", outputChannel = "channel4") public DumpEntry convert(Element payload) throws ParseException { String dateStr = payload.ownText().substring(0, 19); DateFormat format = new SimpleDateFormat("yyyy-MM-dd hh:mm:ss"); format.setTimeZone(TimeZone.getTimeZone("GMT")); Date timestamp = format.parse(dateStr); Elements list = payload.select("a"); String id;//from ww w. j a v a 2 s.c o m String ref; if (list.size() > 0) { Element a = list.get(0); id = a.ownText(); ref = a.attr("href"); } else { id = "private data"; ref = null; } Element span = payload.select("span").get(0); String status = span.ownText(); return new DumpEntry(timestamp, id, ref, status); }
From source file:org.brunocvcunha.taskerbox.impl.crawler.PastebinAction.java
@Override public void action(final Document entry) { log.debug("Validating " + entry.title()); for (Element el : entry.select(".maintable").select("a")) { final String id = el.attr("href").substring(1); if (id.startsWith("archive")) { continue; }//from w w w.ja v a 2 s. com final String title = id + " - " + el.text(); if (canAct(id)) { addAct(id); spreadAction(id, title); serializeAlreadyAct(); sleep(FETCH_INTERVAL); } } }
From source file:HttpCilentExample.HttpCilentExample.java
public List<NameValuePair> getFormParams(String html, String username, String password) throws UnsupportedEncodingException { System.out.println("Extracting form's data..."); Document doc = Jsoup.parse(html); // Google form id Element loginform = doc.getElementById("gaia_loginform"); Elements inputElements = loginform.getElementsByTag("input"); List<NameValuePair> paramList = new ArrayList<NameValuePair>(); for (Element inputElement : inputElements) { String key = inputElement.attr("name"); String value = inputElement.attr("value"); if (key.equals("Email")) value = username;/*from w ww .ja v a2s . c o m*/ else if (key.equals("Passwd")) value = password; paramList.add(new BasicNameValuePair(key, value)); } return paramList; }
From source file:it.polito.tellmefirst.web.rest.apimanager.ImageManager.java
/**
 * Scrapes the pixel dimensions of the full-size image on a DBpedia/Wikimedia
 * image description page.
 *
 * @param pageURL URL of the image description page
 * @return {width, height}, or {0, 0} when the page has no image or scraping fails
 */
public int[] scrapeImageSizeFromPage(String pageURL) {
    LOG.debug("[scrapeImageSizeFromPage] - BEGIN");
    int[] result = { 0, 0 };
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("div.fullImageLink").select("img").first();
        // first() returns null when no image is present; the original code let
        // the resulting NPE fall into the broad catch below.
        if (image != null) {
            // parseInt avoids needless Integer boxing; a missing or non-numeric
            // attribute still raises NumberFormatException, caught below.
            result[0] = Integer.parseInt(image.attr("width"));
            result[1] = Integer.parseInt(image.attr("height"));
        } else {
            LOG.error("[scrapeImageSizeFromPage] - no image found at " + pageURL);
        }
    } catch (Exception e) {
        LOG.error("[scrapeImageSizeFromPage] - EXCEPTION: ", e);
    }
    LOG.debug("[scrapeImageSizeFromPage] - END");
    return result;
}
From source file:com.johan.vertretungsplan.parser.UntisMonitorParser.java
/**
 * Fetches a monitor page, collects its parsed document, and — when requested —
 * follows {@code <meta http-equiv="refresh">} redirects recursively until the
 * chain loops back to the start URL.
 *
 * @param url       page to fetch
 * @param encoding  charset used to decode the response
 * @param following whether to follow meta-refresh redirects
 * @param docs      accumulator for every document in the redirect chain
 * @param startUrl  first URL of the chain; recursion stops when reached again
 * @throws IOException if fetching a page fails
 */
private void loadUrl(String url, String encoding, boolean following, List<Document> docs,
        String startUrl) throws IOException {
    // Strips spaces from the raw HTML (presumably to normalize Untis' padded
    // output before parsing) — TODO confirm the exact character intended here.
    String html = httpGet(url, encoding).replace(" ", "");
    Document doc = Jsoup.parse(html);
    docs.add(doc);
    // Hoist the selector so the refresh meta tag is queried only once.
    Elements refreshMetas = doc.select("meta[http-equiv=refresh]");
    if (following && refreshMetas.size() > 0) {
        Element meta = refreshMetas.first();
        String attr = meta.attr("content").toLowerCase();
        int urlPos = attr.indexOf("url=");
        // Bug fix: a refresh header without a "url=" target refreshes the
        // current page; indexOf() then returns -1 and the original code built
        // a garbage redirect URL from substring(3). Only recurse when a
        // target actually exists.
        if (urlPos >= 0) {
            String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1)
                    + attr.substring(urlPos + 4);
            if (!redirectUrl.equals(startUrl)) {
                loadUrl(redirectUrl, encoding, true, docs, startUrl);
            }
        }
    }
}