Example usage for org.jsoup.nodes Element attr

List of usage examples for org.jsoup.nodes.Element.attr(String)

Introduction

On this page you can find usage examples for the org.jsoup.nodes.Element.attr(String) method.

Prototype

public String attr(String attributeKey) 

Source Link

Document

Get an attribute's value by its key.

Usage

From source file:org.aliuge.crawler.jobconf.FetchConfig.java

/**
 * ???/*from  w w  w  .jav  a  2 s .  c  o m*/
 * 
 * @param confFile
 * @return
 */
@SuppressWarnings("unchecked")
public FetchConfig loadConfig(Document confDoc) throws ConfigurationException {
    try {
        Document doc = confDoc;
        super.setJobName(doc.select("job").attr("name"));
        super.setIndexName(doc.select("job").attr("indexName"));
        Elements e = doc.select("fetch");
        this.type = e.select("type").text();
        this.agent = e.select("agent").text();
        String temp = e.select("threadNum").text();
        if (StringUtils.isNotBlank(temp)) {
            this.threadNum = Integer.parseInt(temp);
        }

        temp = e.select("delayBetweenRequests").text();
        if (StringUtils.isNotBlank(temp)) {
            this.delayBetweenRequests = Integer.parseInt(temp);
        }

        temp = e.select("maxDepthOfCrawling").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxDepthOfCrawling = Integer.parseInt(temp);
        }

        temp = e.select("fetchBinaryContent").text();
        if (StringUtils.isNotBlank(temp)) {
            this.fetchBinaryContent = Boolean.parseBoolean(temp);
        }

        if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) {
            this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text());
        }

        temp = e.select("fileSuffix").text();
        if (StringUtils.isNotBlank(temp)) {
            this.fileSuffix = temp;
        }

        temp = e.select("maxDownloadSizePerPage").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxDownloadSizePerPage = Integer.parseInt(temp);
        }

        temp = e.select("https").text();
        if (StringUtils.isNotBlank(temp)) {
            this.https = Boolean.parseBoolean(temp);
        }

        temp = e.select("onlyDomain").text();
        if (StringUtils.isNotBlank(temp)) {
            this.onlyDomain = Boolean.parseBoolean(temp);
        }

        temp = e.select("socketTimeoutMilliseconds").text();
        if (StringUtils.isNotBlank(temp)) {
            this.socketTimeoutMilliseconds = Integer.parseInt(temp);
        }

        temp = e.select("connectionTimeout").text();
        if (StringUtils.isNotBlank(temp)) {
            this.connectionTimeout = Integer.parseInt(temp);
        }

        temp = e.select("maxTotalConnections").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxTotalConnections = Integer.parseInt(temp);
        }

        temp = e.select("maxConnectionsPerHost").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text());
        }

        temp = e.select("maxConnectionsPerHost").text();
        if (StringUtils.isNotBlank(temp)) {
            this.maxConnectionsPerHost = Integer.parseInt(temp);
        }

        temp = e.select("proxy").text();
        if (StringUtils.isNotBlank(temp)) {
            Properties p = PropertyConfigurationHelper.getProperties(temp);
            this.proxyIps = Lists.newLinkedList();
            for (Object o : p.keySet()) {
                proxyIps.add((String) p.get(o));
            }

        }

        // seed
        Elements seeds = doc.select("fetch seeds seed");
        for (Element element : seeds) {
            // WebURL url = new WebURL();
            String url = element.text();
            if (StringUtils.isBlank(url)) {
                continue;
            }
            url = url.trim();
            String area = element.attr("area");
            this.seeds.add(url);

            WebURL areaUrl = new WebURL(area, url);

            try {
                PendingManager.getPendingArea(super.getJobName()).addElement(areaUrl);
            } catch (QueueException e1) {
                log.error("", e1);
                e1.printStackTrace();
            }
            // BloomfilterHelper.getInstance().add(url.getURL());

        }

        /*
         * ??Url
         */
        Elements fetchUrlFilters = doc.select("fetchUrlFilters filter");
        for (Element element : fetchUrlFilters) {
            String tmp = element.text();
            if (StringUtils.isNoneBlank(tmp))
                this.fetchUrlFilters.add(element.text());
        }
        /*
         * ?????Url
         */
        Elements extractUrlfilters = doc.select("extractUrlfilters filter");
        for (Element element : extractUrlfilters) {
            String tmp = element.text();
            String tmp_rep = element.attr("replace");
            if (StringUtils.isNoneBlank(tmp))
                this.extractUrlfilters.add(new KeyValue(tmp, tmp_rep));
        }
    } catch (NumberFormatException e) {
        throw new ConfigurationException("?" + e.getMessage());
    }
    // super.setFetchConfig(this);
    return this;
}

From source file:org.angellist.angellistmobile.FeedJSONAdapter.java

/**
 * Renders a feed description as plain text in which the text of every user or
 * startup link is wrapped in an anchor that uses the app's custom URI scheme.
 *
 * <p>Links are recognised by their {@code data-type} attribute ({@code "User"} or
 * {@code "Startup"}); the anchor target is built from the {@code data-id} attribute.
 * Links with any other type, or with no visible text, are left alone.
 *
 * @param descriptionHtml the description as HTML
 * @return the tag-stripped description with user/startup anchors re-inserted
 */
public String replaceUserAndStartupLinks(String descriptionHtml) {
    // Start from the description with all tags removed.
    String description = Html.fromHtml(descriptionHtml).toString();

    Document doc = Jsoup.parse(descriptionHtml);
    for (Element link : doc.select("a[href]")) {
        String type = link.attr("data-type");
        String scheme;
        if ("User".equals(type)) {
            scheme = "org.angellist.angellistmobile.user://";
        } else if ("Startup".equals(type)) {
            scheme = "org.angellist.angellistmobile.startup://";
        } else {
            continue;
        }

        String text = link.text();
        if (text.isEmpty()) {
            // String.replace with an empty target inserts the replacement between
            // every character, which would corrupt the whole description.
            continue;
        }

        description = description.replace(text,
                "<a href=\"" + scheme + link.attr("data-id") + "\">" + text + "</a>");
    }

    return description;
}

From source file:org.apache.jmeter.protocol.http.proxy.FormCharSetFinder.java

/**
 * Records the charset to use for each form on the page that has a non-empty action URL.
 *
 * <p>The page encoding is the default. When the form's {@code accept-charset}
 * value splits into at least one entry, the first entry (trimmed) is used instead;
 * if that entry is empty, nothing is recorded for the form.
 *
 * @param html the html to parse for form encodings
 * @param formEncodings the map (action URL to charset) that entries are added to
 * @param pageEncoding the encoding used for the whole page
 * @throws HTMLParseException when parsing the <code>html</code> fails
 */
public void addFormActionsAndCharSet(String html, Map<String, String> formEncodings, String pageEncoding)
        throws HTMLParseException {
    if (log.isDebugEnabled()) {
        log.debug("Parsing html of: " + html);
    }

    for (Element form : Jsoup.parse(html).select("form")) {
        final String action = form.attr("action");
        if (StringUtils.isEmpty(action)) {
            continue;
        }

        // Default to the encoding of the page the form lives on.
        String charSet = pageEncoding;
        final String accepted = form.attr("accept-charset");
        if (accepted != null) {
            final String[] candidates = JOrphanUtils.split(accepted, ",");
            // Only the first of possibly many charsets is considered.
            if (candidates.length > 0) {
                final String first = candidates[0].trim();
                charSet = first.length() == 0 ? null : first;
            }
        }

        if (charSet == null) {
            continue;
        }
        synchronized (formEncodings) {
            formEncodings.put(action, charSet);
        }
    }
}

From source file:org.apache.marmotta.ldclient.provider.html.mapping.CssLiteralAttrMapper.java

/**
 * Maps the configured attribute of {@code elem} to at most one literal.
 *
 * <p>Returns an empty list when the cleaned attribute value is blank. Otherwise
 * the literal is language-tagged if a language is set, typed with the XSD
 * datatype if one is set, and plain in all other cases.
 */
@Override
public List<Value> map(String resourceUri, Element elem, ValueFactory factory) {
    final String text = cleanValue(elem.attr(attr));
    if (StringUtils.isBlank(text)) {
        return Collections.emptyList();
    }

    final Value literal;
    if (language != null) {
        literal = factory.createLiteral(text, language.toString());
    } else if (datatype != null) {
        literal = factory.createLiteral(text, factory.createURI(Namespaces.NS_XSD + datatype));
    } else {
        literal = factory.createLiteral(text);
    }
    return Collections.singletonList(literal);
}

From source file:org.apache.maven.wagon.shared.http.HtmlFileListParser.java

/**
 * Reads an HTML listing from the given stream, parses it, and returns the
 * distinct acceptable link targets found in it.
 *
 * @param baseurl the URL the listing was fetched from; used as the base for links
 * @param stream the input stream supplying the raw HTML.
 * @return the file list, without duplicates and in no particular order.
 * @throws TransferFailedException if the base URL is not a valid URI or the HTML cannot be read.
 */
public static List<String> parseFileList(String baseurl, InputStream stream) throws TransferFailedException {
    try {
        final URI base = new URI(baseurl);
        // Reading into a string first makes debugging easier. UTF-8 is assumed,
        // which may not hold for every server.
        final String html = IOUtils.toString(stream, "utf-8");

        final Set<String> unique = new HashSet<String>();
        for (Element anchor : Jsoup.parse(html, baseurl).select("a[href]")) {
            // abs:href loses directories, so the raw href is resolved in cleanLink instead.
            final String href = anchor.attr("href");
            if (href == null) {
                continue;
            }
            final String cleaned = cleanLink(base, href);
            if (isAcceptableLink(cleaned)) {
                unique.add(cleaned);
            }
        }

        return new ArrayList<String>(unique);
    } catch (URISyntaxException e) {
        throw new TransferFailedException("Unable to parse as base URI: " + baseurl, e);
    } catch (IOException e) {
        throw new TransferFailedException("I/O error reading HTML listing of artifacts: " + e.getMessage(), e);
    }
}

From source file:org.apache.nifi.GetHTMLElement.java

/**
 * Builds the output value for an element: the selected part of the element,
 * wrapped in the optional prepend and append values.
 *
 * @param prependValue text placed before the extracted value; null or empty means none
 * @param outputType which part of the element to extract; unrecognised types fall back to the element's HTML
 * @param appendValue text placed after the extracted value; null or empty means none
 * @param ele the parsed HTML element
 * @param attrKey attribute name, used only for the attribute output type
 * @return value from the parsed HTML element
 */
private String extractElementValue(String prependValue, final String outputType, String appendValue,
        final Element ele, final String attrKey) {
    final String prefix = StringUtils.isEmpty(prependValue) ? "" : prependValue;
    final String suffix = StringUtils.isEmpty(appendValue) ? "" : appendValue;

    final String extracted;
    switch (outputType) {
    case ELEMENT_TEXT:
        extracted = ele.text();
        break;
    case ELEMENT_DATA:
        extracted = ele.data();
        break;
    case ELEMENT_ATTRIBUTE:
        extracted = ele.attr(attrKey);
        break;
    case ELEMENT_HTML:
    default:
        extracted = ele.html();
        break;
    }
    return prefix + extracted + suffix;
}

From source file:org.apache.nifi.TestModifyHTMLElement.java

/**
 * Runs the processor in attribute mode against Weather.html and checks that the
 * selected element's {@code href} carries the modified value in the output.
 */
@Test
public void testModifyAttribute() throws Exception {
    final String expectedHref = "http://localhost/newlink";
    final String selector = "#" + GDR_ID;

    testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, selector);
    testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_ATTRIBUTE);
    testRunner.setProperty(ModifyHTMLElement.ATTRIBUTE_KEY, "href");
    testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, expectedHref);

    testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
    testRunner.run();

    testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0);

    final List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS);
    assertTrue(successFiles.size() == 1);

    // The flow file holds the whole HTML document, so parse it again and pick
    // out just the element under test.
    final String html = new String(testRunner.getContentAsByteArray(successFiles.get(0)));
    final Element modified = Jsoup.parse(html).select(selector).get(0);

    assertTrue(StringUtils.equals(expectedHref, modified.attr("href")));
}

From source file:org.apache.nutch.protocol.httpclient.HttpFormAuthentication.java

/**
 * Collects the parameters to post for the configured login form.
 *
 * <p>The form is looked up by id first and by {@code name} second. Every
 * {@code input} of the form contributes its name/value pair unless its name is
 * present in the configured login post data or removed-fields collection; the
 * configured login post data is then appended.
 *
 * @param pageContent HTML of the page containing the login form
 * @return the form parameters followed by the configured login post data
 * @throws IllegalArgumentException if no form matches the configured id or name
 */
private List<NameValuePair> getLoginFormParams(String pageContent) throws UnsupportedEncodingException {
    final List<NameValuePair> formParams = new ArrayList<NameValuePair>();
    final Document page = Jsoup.parse(pageContent);
    final String formId = authConfigurer.getLoginFormId();

    Element form = page.getElementById(formId);
    if (form == null) {
        LOG.debug("No form element found with 'id' = {}, trying 'name'.", formId);
        form = page.select("form[name=" + formId + "]").first();
    }
    if (form == null) {
        LOG.debug("No form element found with 'name' = {}", formId);
        throw new IllegalArgumentException("No form exists: " + formId);
    }

    // Keep only inputs that are neither overridden by loginPostData nor listed
    // in removedFormFields.
    for (Element input : form.getElementsByTag("input")) {
        final String name = input.attr("name");
        final String value = input.attr("value");
        if (!authConfigurer.getLoginPostData().containsKey(name)
                && !authConfigurer.getRemovedFormFields().contains(name)) {
            formParams.add(new NameValuePair(name, value));
        }
    }

    // Append the configured login post data.
    for (Entry<String, String> postEntry : authConfigurer.getLoginPostData().entrySet()) {
        formParams.add(new NameValuePair(postEntry.getKey(), postEntry.getValue()));
    }
    return formParams;
}

From source file:org.apache.nutch.protocol.httpclient.proxy.HttpFormAuthentication.java

/**
 * Collects the parameters to post for the configured login form.
 *
 * <p>The form is looked up by id first and by {@code name} second. Every
 * {@code input} of the form contributes its name/value pair unless its name is
 * present in the configured login post data or removed-fields collection; the
 * configured login post data is then appended.
 *
 * @param pageContent HTML of the page containing the login form
 * @return the form parameters followed by the configured login post data
 * @throws IllegalArgumentException if no form matches the configured id or name
 */
private List<NameValuePair> getLoginFormParams(String pageContent) throws UnsupportedEncodingException {
    final List<NameValuePair> formParams = new ArrayList<NameValuePair>();
    final Document page = Jsoup.parse(pageContent);
    final String formId = authConfigurer.getLoginFormId();

    Element form = page.getElementById(formId);
    if (form == null) {
        LOGGER.debug("No form element found with 'id' = {}, trying 'name'.", formId);
        form = page.select("form[name=" + formId + "]").first();
    }
    if (form == null) {
        LOGGER.debug("No form element found with 'name' = {}", formId);
        throw new IllegalArgumentException("No form exists: " + formId);
    }

    // Keep only inputs that are neither overridden by loginPostData nor listed
    // in removedFormFields.
    for (Element input : form.getElementsByTag("input")) {
        final String name = input.attr("name");
        final String value = input.attr("value");
        if (!authConfigurer.getLoginPostData().containsKey(name)
                && !authConfigurer.getRemovedFormFields().contains(name)) {
            formParams.add(new NameValuePair(name, value));
        }
    }

    // Append the configured login post data.
    for (Entry<String, String> postEntry : authConfigurer.getLoginPostData().entrySet()) {
        formParams.add(new NameValuePair(postEntry.getKey(), postEntry.getValue()));
    }
    return formParams;
}

From source file:org.apdplat.extractor.html.HtmlExtractor.java

/**
 * ??/*from   w  w w.  j av  a 2  s  . c  o  m*/
 * @param url html?
 * @param encoding ??
 * @param in html??
 * @return 
 */
public List<ExtractResult> extract(String url, InputStream in, String encoding) {
    List<ExtractResult> extractResults = new ArrayList<>();
    if (!Charset.isSupported(encoding)) {
        LOGGER.error("???" + encoding + " URL" + url);
        return extractResults;
    }
    //?URL???
    List<HtmlTemplate> htmlTemplates = extractRegular.getHtmlTemplate(url);
    if (htmlTemplates.isEmpty()) {
        return extractResults;
    }
    try {
        byte[] content = readAll(in);
        Document doc = Jsoup.parse(new ByteArrayInputStream(content), encoding, url);
        Elements metas = doc.select("meta");
        String keywords = "";
        String description = "";
        for (Element meta : metas) {
            String name = meta.attr("name");
            if ("keywords".equals(name)) {
                keywords = meta.attr("content");
            }
            if ("description".equals(name)) {
                description = meta.attr("content");
            }
        }
        Set<String> tableNames = new HashSet<>();
        for (HtmlTemplate htmlTemplate : htmlTemplates) {
            if (tableNames.contains(htmlTemplate.getTableName())) {
                LOGGER.debug(
                        "?tableName????UrlPattern?"
                                + htmlTemplate.getUrlPattern().getUrlPattern());
                LOGGER.debug(htmlTemplates.toString());
            }
            tableNames.add(htmlTemplate.getTableName());
            try {
                //???
                ExtractResult extractResult = extractHtmlTemplate(url, htmlTemplate, doc);
                extractResult.setContent(content);
                extractResult.setEncoding(encoding);
                extractResult.setKeywords(keywords);
                extractResult.setDescription(description);
                extractResults.add(extractResult);
            } catch (Exception e) {
                LOGGER.error("???" + htmlTemplate.getTemplateName(), e);
            }
        }
    } catch (Exception e) {
        LOGGER.error("?: " + url, e);
    }
    return extractResults;
}