List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:org.aliuge.crawler.jobconf.FetchConfig.java
/** * ???/*from w w w .jav a 2 s . c o m*/ * * @param confFile * @return */ @SuppressWarnings("unchecked") public FetchConfig loadConfig(Document confDoc) throws ConfigurationException { try { Document doc = confDoc; super.setJobName(doc.select("job").attr("name")); super.setIndexName(doc.select("job").attr("indexName")); Elements e = doc.select("fetch"); this.type = e.select("type").text(); this.agent = e.select("agent").text(); String temp = e.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } temp = e.select("delayBetweenRequests").text(); if (StringUtils.isNotBlank(temp)) { this.delayBetweenRequests = Integer.parseInt(temp); } temp = e.select("maxDepthOfCrawling").text(); if (StringUtils.isNotBlank(temp)) { this.maxDepthOfCrawling = Integer.parseInt(temp); } temp = e.select("fetchBinaryContent").text(); if (StringUtils.isNotBlank(temp)) { this.fetchBinaryContent = Boolean.parseBoolean(temp); } if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) { this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text()); } temp = e.select("fileSuffix").text(); if (StringUtils.isNotBlank(temp)) { this.fileSuffix = temp; } temp = e.select("maxDownloadSizePerPage").text(); if (StringUtils.isNotBlank(temp)) { this.maxDownloadSizePerPage = Integer.parseInt(temp); } temp = e.select("https").text(); if (StringUtils.isNotBlank(temp)) { this.https = Boolean.parseBoolean(temp); } temp = e.select("onlyDomain").text(); if (StringUtils.isNotBlank(temp)) { this.onlyDomain = Boolean.parseBoolean(temp); } temp = e.select("socketTimeoutMilliseconds").text(); if (StringUtils.isNotBlank(temp)) { this.socketTimeoutMilliseconds = Integer.parseInt(temp); } temp = e.select("connectionTimeout").text(); if (StringUtils.isNotBlank(temp)) { this.connectionTimeout = Integer.parseInt(temp); } temp = e.select("maxTotalConnections").text(); if (StringUtils.isNotBlank(temp)) { 
this.maxTotalConnections = Integer.parseInt(temp); } temp = e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text()); } temp = e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(temp); } temp = e.select("proxy").text(); if (StringUtils.isNotBlank(temp)) { Properties p = PropertyConfigurationHelper.getProperties(temp); this.proxyIps = Lists.newLinkedList(); for (Object o : p.keySet()) { proxyIps.add((String) p.get(o)); } } // seed Elements seeds = doc.select("fetch seeds seed"); for (Element element : seeds) { // WebURL url = new WebURL(); String url = element.text(); if (StringUtils.isBlank(url)) { continue; } url = url.trim(); String area = element.attr("area"); this.seeds.add(url); WebURL areaUrl = new WebURL(area, url); try { PendingManager.getPendingArea(super.getJobName()).addElement(areaUrl); } catch (QueueException e1) { log.error("", e1); e1.printStackTrace(); } // BloomfilterHelper.getInstance().add(url.getURL()); } /* * ??Url */ Elements fetchUrlFilters = doc.select("fetchUrlFilters filter"); for (Element element : fetchUrlFilters) { String tmp = element.text(); if (StringUtils.isNoneBlank(tmp)) this.fetchUrlFilters.add(element.text()); } /* * ?????Url */ Elements extractUrlfilters = doc.select("extractUrlfilters filter"); for (Element element : extractUrlfilters) { String tmp = element.text(); String tmp_rep = element.attr("replace"); if (StringUtils.isNoneBlank(tmp)) this.extractUrlfilters.add(new KeyValue(tmp, tmp_rep)); } } catch (NumberFormatException e) { throw new ConfigurationException("?" + e.getMessage()); } // super.setFetchConfig(this); return this; }
From source file:org.angellist.angellistmobile.FeedJSONAdapter.java
public String replaceUserAndStartupLinks(String descriptionHtml) { // get the description and remove all tags String description = Html.fromHtml(descriptionHtml).toString(); // for each link, replace it with a link Document doc = Jsoup.parse(descriptionHtml); Elements links = doc.select("a[href]"); for (Element link : links) { if ("User".equals(link.attr("data-type"))) { description = description.replace(link.text(), "<a href=\"org.angellist.angellistmobile.user://" + link.attr("data-id") + "\">" + link.text() + "</a>"); } else if ("Startup".equals(link.attr("data-type"))) { description = description.replace(link.text(), "<a href=\"org.angellist.angellistmobile.startup://" + link.attr("data-id") + "\">" + link.text() + "</a>"); }/* w w w. j a va 2s.c o m*/ } return description; }
From source file:org.apache.jmeter.protocol.http.proxy.FormCharSetFinder.java
/** * Add form action urls and their corresponding encodings for all forms on the page * * @param html the html to parse for form encodings * @param formEncodings the Map where form encodings should be added * @param pageEncoding the encoding used for the whole page * @throws HTMLParseException when parsing the <code>html</code> fails *//*from w ww . ja v a 2s . c o m*/ public void addFormActionsAndCharSet(String html, Map<String, String> formEncodings, String pageEncoding) throws HTMLParseException { if (log.isDebugEnabled()) { log.debug("Parsing html of: " + html); } Document document = Jsoup.parse(html); Elements forms = document.select("form"); for (Element element : forms) { String action = element.attr("action"); if (!(StringUtils.isEmpty(action))) { // We use the page encoding where the form resides, as the // default encoding for the form String formCharSet = pageEncoding; String acceptCharSet = element.attr("accept-charset"); // Check if we found an accept-charset attribute on the form if (acceptCharSet != null) { String[] charSets = JOrphanUtils.split(acceptCharSet, ","); // Just use the first one of the possible many charsets if (charSets.length > 0) { formCharSet = charSets[0].trim(); if (formCharSet.length() == 0) { formCharSet = null; } } } if (formCharSet != null) { synchronized (formEncodings) { formEncodings.put(action, formCharSet); } } } } }
From source file:org.apache.marmotta.ldclient.provider.html.mapping.CssLiteralAttrMapper.java
/**
 * Maps the configured attribute of an element to at most one literal value.
 *
 * <p>The attribute value is passed through {@code cleanValue}; a blank result yields an empty
 * list. Otherwise a single literal is created: language-tagged when {@code language} is set,
 * else typed with the XSD datatype when {@code datatype} is set, else a plain literal.
 */
@Override
public List<Value> map(String resourceUri, Element elem, ValueFactory factory) {
    final String value = cleanValue(elem.attr(attr));
    if (StringUtils.isBlank(value)) {
        return Collections.emptyList();
    }

    final Value literal;
    if (language != null) {
        literal = factory.createLiteral(value, language.toString());
    } else if (datatype != null) {
        literal = factory.createLiteral(value, factory.createURI(Namespaces.NS_XSD + datatype));
    } else {
        literal = factory.createLiteral(value);
    }
    return Collections.singletonList(literal);
}
From source file:org.apache.maven.wagon.shared.http.HtmlFileListParser.java
/** * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list. * * @param stream the input stream.//from w w w . j a va 2 s. co m * @return the file list. * @throws TransferFailedException if there was a problem fetching the raw html. */ public static List<String> parseFileList(String baseurl, InputStream stream) throws TransferFailedException { try { URI baseURI = new URI(baseurl); // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe // assumption. String content = IOUtils.toString(stream, "utf-8"); Document doc = Jsoup.parse(content, baseurl); Elements links = doc.select("a[href]"); Set<String> results = new HashSet<String>(); for (Element link : links) { /* * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink */ String target = link.attr("href"); if (target != null) { String clean = cleanLink(baseURI, target); if (isAcceptableLink(clean)) { results.add(clean); } } } return new ArrayList<String>(results); } catch (URISyntaxException e) { throw new TransferFailedException("Unable to parse as base URI: " + baseurl, e); } catch (IOException e) { throw new TransferFailedException("I/O error reading HTML listing of artifacts: " + e.getMessage(), e); } }
From source file:org.apache.nifi.GetHTMLElement.java
/**
 * Extracts a value from the element according to the requested output type and wraps it in
 * the optional prefix and suffix.
 *
 * @param prependValue text placed before the extracted value; null or empty means none
 * @param outputType one of the {@code ELEMENT_*} output types; any other value is treated
 *        like {@code ELEMENT_HTML}
 * @param appendValue text placed after the extracted value; null or empty means none
 * @param ele the element to read from
 * @param attrKey the attribute to read when the output type is {@code ELEMENT_ATTRIBUTE}
 * @return value from the parsed HTML element
 */
private String extractElementValue(String prependValue, final String outputType, String appendValue,
        final Element ele, final String attrKey) {
    final String prefix = StringUtils.isEmpty(prependValue) ? "" : prependValue;
    final String suffix = StringUtils.isEmpty(appendValue) ? "" : appendValue;

    final String extracted;
    switch (outputType) {
        case ELEMENT_TEXT:
            extracted = ele.text();
            break;
        case ELEMENT_DATA:
            extracted = ele.data();
            break;
        case ELEMENT_ATTRIBUTE:
            extracted = ele.attr(attrKey);
            break;
        case ELEMENT_HTML:
        default:
            extracted = ele.html();
            break;
    }
    return prefix + extracted + suffix;
}
From source file:org.apache.nifi.TestModifyHTMLElement.java
@Test public void testModifyAttribute() throws Exception { final String MOD_VALUE = "http://localhost/newlink"; testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID); testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_ATTRIBUTE); testRunner.setProperty(ModifyHTMLElement.ATTRIBUTE_KEY, "href"); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run();/* www . j a v a 2 s . co m*/ testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); assertTrue(ffs.size() == 1); String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. Document doc = Jsoup.parse(data); Elements eles = doc.select("#" + GDR_ID); Element ele = eles.get(0); assertTrue(StringUtils.equals(MOD_VALUE, ele.attr("href"))); }
From source file:org.apache.nutch.protocol.httpclient.HttpFormAuthentication.java
private List<NameValuePair> getLoginFormParams(String pageContent) throws UnsupportedEncodingException { List<NameValuePair> params = new ArrayList<NameValuePair>(); Document doc = Jsoup.parse(pageContent); Element loginform = doc.getElementById(authConfigurer.getLoginFormId()); if (loginform == null) { LOG.debug("No form element found with 'id' = {}, trying 'name'.", authConfigurer.getLoginFormId()); loginform = doc.select("form[name=" + authConfigurer.getLoginFormId() + "]").first(); if (loginform == null) { LOG.debug("No form element found with 'name' = {}", authConfigurer.getLoginFormId()); throw new IllegalArgumentException("No form exists: " + authConfigurer.getLoginFormId()); }/* ww w .ja v a 2 s.c o m*/ } Elements inputElements = loginform.getElementsByTag("input"); // skip fields in removedFormFields or loginPostData for (Element inputElement : inputElements) { String key = inputElement.attr("name"); String value = inputElement.attr("value"); if (authConfigurer.getLoginPostData().containsKey(key) || authConfigurer.getRemovedFormFields().contains(key)) { // value = loginPostData.get(key); continue; } params.add(new NameValuePair(key, value)); } // add key and value in loginPostData for (Entry<String, String> entry : authConfigurer.getLoginPostData().entrySet()) { params.add(new NameValuePair(entry.getKey(), entry.getValue())); } return params; }
From source file:org.apache.nutch.protocol.httpclient.proxy.HttpFormAuthentication.java
private List<NameValuePair> getLoginFormParams(String pageContent) throws UnsupportedEncodingException { List<NameValuePair> params = new ArrayList<NameValuePair>(); Document doc = Jsoup.parse(pageContent); Element loginform = doc.getElementById(authConfigurer.getLoginFormId()); if (loginform == null) { LOGGER.debug("No form element found with 'id' = {}, trying 'name'.", authConfigurer.getLoginFormId()); loginform = doc.select("form[name=" + authConfigurer.getLoginFormId() + "]").first(); if (loginform == null) { LOGGER.debug("No form element found with 'name' = {}", authConfigurer.getLoginFormId()); throw new IllegalArgumentException("No form exists: " + authConfigurer.getLoginFormId()); }//from w ww .jav a2 s.com } Elements inputElements = loginform.getElementsByTag("input"); // skip fields in removedFormFields or loginPostData for (Element inputElement : inputElements) { String key = inputElement.attr("name"); String value = inputElement.attr("value"); if (authConfigurer.getLoginPostData().containsKey(key) || authConfigurer.getRemovedFormFields().contains(key)) { // value = loginPostData.get(key); continue; } params.add(new NameValuePair(key, value)); } // add key and value in loginPostData for (Entry<String, String> entry : authConfigurer.getLoginPostData().entrySet()) { params.add(new NameValuePair(entry.getKey(), entry.getValue())); } return params; }
From source file:org.apdplat.extractor.html.HtmlExtractor.java
/** * ??/*from w w w. j av a 2 s . c o m*/ * @param url html? * @param encoding ?? * @param in html?? * @return */ public List<ExtractResult> extract(String url, InputStream in, String encoding) { List<ExtractResult> extractResults = new ArrayList<>(); if (!Charset.isSupported(encoding)) { LOGGER.error("???" + encoding + " URL" + url); return extractResults; } //?URL??? List<HtmlTemplate> htmlTemplates = extractRegular.getHtmlTemplate(url); if (htmlTemplates.isEmpty()) { return extractResults; } try { byte[] content = readAll(in); Document doc = Jsoup.parse(new ByteArrayInputStream(content), encoding, url); Elements metas = doc.select("meta"); String keywords = ""; String description = ""; for (Element meta : metas) { String name = meta.attr("name"); if ("keywords".equals(name)) { keywords = meta.attr("content"); } if ("description".equals(name)) { description = meta.attr("content"); } } Set<String> tableNames = new HashSet<>(); for (HtmlTemplate htmlTemplate : htmlTemplates) { if (tableNames.contains(htmlTemplate.getTableName())) { LOGGER.debug( "?tableName????UrlPattern?" + htmlTemplate.getUrlPattern().getUrlPattern()); LOGGER.debug(htmlTemplates.toString()); } tableNames.add(htmlTemplate.getTableName()); try { //??? ExtractResult extractResult = extractHtmlTemplate(url, htmlTemplate, doc); extractResult.setContent(content); extractResult.setEncoding(encoding); extractResult.setKeywords(keywords); extractResult.setDescription(description); extractResults.add(extractResult); } catch (Exception e) { LOGGER.error("???" + htmlTemplate.getTemplateName(), e); } } } catch (Exception e) { LOGGER.error("?: " + url, e); } return extractResults; }