List of usage examples for the org.jsoup.nodes.Element text() method
public String text()
From source file:net.trustie.model.OpenHubProject_Model.java
private void handleNutShell1(String nutshell) { Elements eles = getAElements(nutshell); Element ele = null; ele = eles.get(0);/*from w w w. j a v a2 s . co m*/ this.mostWrittenIn = StringHandler.removeHeader(ele.text(), OpenHubProject_Model.mostWrittenHeader).trim(); if (eles.size() > 1) { ele = eles.get(1); // System.out.println(ele.text()); this.commentsPercentage = StringHandler .removeTail(ele.text(), OpenHubProject_Model.commentsPercentageTail).trim(); // System.out.println(this.commentsPercentage); this.commentsPercentage = StringHandler.removeIndefiniteArticles(this.commentsPercentage).trim(); // System.out.println(this.commentsPercentage); } }
From source file:net.trustie.model.OpenHubProject_Model.java
private void handleNutShell2(String nutshell) { Elements eles = getAElements(nutshell); Element ele = null; if (eles.size() > 0) { // ele = eles.get(0); // this.codebaseStatus = ele.text(); // ele = eles.get(1); // this.teamScale = ele.text(); // ele = eles.get(2); // this.commitStatus = ele.text(); if (eles.size() == 1) { ele = eles.get(0);/* w ww . j av a 2s . c o m*/ this.codebaseStatus = ele.text(); } else if (eles.size() == 2) { ele = eles.get(0); this.codebaseStatus = ele.text(); ele = eles.get(1); this.teamScale = ele.text(); } else { ele = eles.get(0); this.codebaseStatus = ele.text(); ele = eles.get(1); this.teamScale = ele.text(); ele = eles.get(2); this.commitStatus = ele.text(); } } }
From source file:net.trustie.model.OpenHubProject_Model.java
private void handleNutShell3(String nutshell, Date date) { Elements eles = getAElements(nutshell); Element ele = null; if (eles.size() > 0) { if (eles.size() == 1) { ele = eles.get(0);/*from w w w. j av a 2 s . c om*/ this.estimateEffortTime = StringHandler .removeTail(ele.text(), OpenHubProject_Model.estimateEffortTimeTail).trim(); } else if (eles.size() == 2) { ele = eles.get(0); this.estimateEffortTime = StringHandler .removeTail(ele.text(), OpenHubProject_Model.estimateEffortTimeTail).trim(); ele = eles.get(1); String firstCommitAt = StringHandler .removeHeader(ele.text(), OpenHubProject_Model.firstCommitTimeHeader).trim(); ansTmp = handleDateAt(firstCommitAt).toString(); this.firstCommitTime = handleDateAt(firstCommitAt); } else { ele = eles.get(0); this.estimateEffortTime = StringHandler .removeTail(ele.text(), OpenHubProject_Model.estimateEffortTimeTail).trim(); ele = eles.get(1); String firstCommitAt = StringHandler .removeHeader(ele.text(), OpenHubProject_Model.firstCommitTimeHeader).trim(); ansTmp = handleDateAt(firstCommitAt).toString(); this.firstCommitTime = handleDateAt(firstCommitAt); ele = eles.get(2); String lastCommitAt = StringHandler .removeHeader(ele.text(), OpenHubProject_Model.lastCommitTimeHeader).trim(); // System.out.println(lastCommitAt); lastCommitAt = StringHandler.removePreposition(lastCommitAt); // SimpleDateFormat simpleDateFormat = new // SimpleDateFormat("yyy-MM-dd HH:mm:ss"); //SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyy-MM-dd HH:mm:ss"); this.lastCommitTime = DateHandler.stringToDate(DateHandler.formatAllTypeDate(lastCommitAt, date));//handleDateBefore(lastCommitAt); } } }
From source file:nl.sidn.pcap.ip.GoogleResolverCheck.java
@Override protected void init() { String url = Settings.getInstance().getSetting(Settings.RESOLVER_LIST_GOOGLE); LOGGER.info("Load Google resolver addresses from url: " + url); Document doc = null;// w ww . java 2 s .c o m try { doc = Jsoup.connect(url).get(); } catch (Exception e) { LOGGER.error("Problem while getting Google resolvers url: " + url); return; } Elements tags = doc.getElementsByTag("pre"); if (tags.size() == 2) { Element resolvers = tags.get(0); //Element resolver = codes.get(0); String[] ips = StringUtils.split(resolvers.text(), '\n'); for (String ip : ips) { String[] parts = StringUtils.split(ip, ' '); if (parts.length == 2) { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Add Google resolver IP range: " + parts[0]); } try { bit_subnets.add(Subnet.createInstance(parts[0])); subnets.add(parts[0]); } catch (UnknownHostException e) { LOGGER.error("Problem while adding Google resolver IP range: " + parts[0] + e); } } } if (subnets.size() == 0) { LOGGER.error("No Google resolvers found at url: " + url); } } else { LOGGER.error("No Google resolvers found at url: " + url); } }
From source file:org.aliuge.crawler.extractor.selector.AbstractElementCssSelector.java
/** * ????/*from www .j a va 2 s .c o m*/ * @param elements * @return */ protected String getExtractText(Elements elements) { if (elements.size() == 0) return null; String temp = ""; if (attr.equalsIgnoreCase("tostring")) { return temp = elements.toString(); } else { if (index == -1 && StringUtils.isNotBlank(this.regex)) { for (Element e : elements) { Element element = e; if (element.select(this.regex).size() > 0) { return temp = e.text(); } } return temp; } else { if (index > -1 && index < elements.size()) { return elements.get(index).text(); } } return elements.first().text(); } /*if(attr.equals("tostring")){ if(index==0 || index>elements.size()) temp = elements.first().toString(); else temp = elements.get(index).toString(); }else{ if(index==0 || index>elements.size()) temp = elements.first().text(); else temp = elements.get(index).text(); } if(null!=pattern){ Matcher m = pattern.matcher(temp); if(m.find()){ temp = m.group(1); } }*/ //return temp; }
From source file:org.aliuge.crawler.jobconf.ExtractConfig.java
/** * ????/* www. j a v a 2 s.co m*/ * @param doc * @return * @throws ConfigurationException */ public ExtractConfig loadConfig(Document doc) { Elements extractElement = doc.select("extract"); super.setJobName(doc.select("job").attr("name")); super.setIndexName(doc.select("job").attr("indexName")); String temp = extractElement.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } Elements templateElement = extractElement.select("extract").select("template"); Iterator<Element> it = templateElement.iterator(); while (it.hasNext()) { Element template = it.next(); ExtractTemplate extractTemplate = new ExtractTemplate(); // ?Url???? Elements urlPatternElement = template.select("url"); List<Pattern> patterns = Lists.newArrayList(); for (Element urlElement : urlPatternElement) { patterns.add(Pattern.compile(urlElement.text())); } extractTemplate.setUrlPattern(patterns); extractTemplate.setName(template.attr("name")); // ??? Elements selectElement = template.select("elements").first().children(); for (Element element : selectElement) { if ("element".equals(element.tagName())) { AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element); extractTemplate.addCssSelector(selector); } else if ("if".equals(element.tagName())) { IFConditions ifConditions = IFConditions.create(element); extractTemplate.addConditions(ifConditions); } } super.setExtractConfig(this); this.templates.add(extractTemplate); } //super.setExtractConfig(this); return this; }
From source file:org.aliuge.crawler.jobconf.FetchConfig.java
/** * ???//from w w w . ja v a 2 s . c o m * * @param confFile * @return */ @SuppressWarnings("unchecked") public FetchConfig loadConfig(Document confDoc) throws ConfigurationException { try { Document doc = confDoc; super.setJobName(doc.select("job").attr("name")); super.setIndexName(doc.select("job").attr("indexName")); Elements e = doc.select("fetch"); this.type = e.select("type").text(); this.agent = e.select("agent").text(); String temp = e.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } temp = e.select("delayBetweenRequests").text(); if (StringUtils.isNotBlank(temp)) { this.delayBetweenRequests = Integer.parseInt(temp); } temp = e.select("maxDepthOfCrawling").text(); if (StringUtils.isNotBlank(temp)) { this.maxDepthOfCrawling = Integer.parseInt(temp); } temp = e.select("fetchBinaryContent").text(); if (StringUtils.isNotBlank(temp)) { this.fetchBinaryContent = Boolean.parseBoolean(temp); } if (StringUtils.isNotBlank(e.select("maxOutgoingLinksToFollow").text())) { this.maxOutgoingLinksToFollow = Integer.parseInt(e.select("maxOutgoingLinksToFollow").text()); } temp = e.select("fileSuffix").text(); if (StringUtils.isNotBlank(temp)) { this.fileSuffix = temp; } temp = e.select("maxDownloadSizePerPage").text(); if (StringUtils.isNotBlank(temp)) { this.maxDownloadSizePerPage = Integer.parseInt(temp); } temp = e.select("https").text(); if (StringUtils.isNotBlank(temp)) { this.https = Boolean.parseBoolean(temp); } temp = e.select("onlyDomain").text(); if (StringUtils.isNotBlank(temp)) { this.onlyDomain = Boolean.parseBoolean(temp); } temp = e.select("socketTimeoutMilliseconds").text(); if (StringUtils.isNotBlank(temp)) { this.socketTimeoutMilliseconds = Integer.parseInt(temp); } temp = e.select("connectionTimeout").text(); if (StringUtils.isNotBlank(temp)) { this.connectionTimeout = Integer.parseInt(temp); } temp = e.select("maxTotalConnections").text(); if (StringUtils.isNotBlank(temp)) { 
this.maxTotalConnections = Integer.parseInt(temp); } temp = e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(e.select("maxConnectionsPerHost").text()); } temp = e.select("maxConnectionsPerHost").text(); if (StringUtils.isNotBlank(temp)) { this.maxConnectionsPerHost = Integer.parseInt(temp); } temp = e.select("proxy").text(); if (StringUtils.isNotBlank(temp)) { Properties p = PropertyConfigurationHelper.getProperties(temp); this.proxyIps = Lists.newLinkedList(); for (Object o : p.keySet()) { proxyIps.add((String) p.get(o)); } } // seed Elements seeds = doc.select("fetch seeds seed"); for (Element element : seeds) { // WebURL url = new WebURL(); String url = element.text(); if (StringUtils.isBlank(url)) { continue; } url = url.trim(); String area = element.attr("area"); this.seeds.add(url); WebURL areaUrl = new WebURL(area, url); try { PendingManager.getPendingArea(super.getJobName()).addElement(areaUrl); } catch (QueueException e1) { log.error("", e1); e1.printStackTrace(); } // BloomfilterHelper.getInstance().add(url.getURL()); } /* * ??Url */ Elements fetchUrlFilters = doc.select("fetchUrlFilters filter"); for (Element element : fetchUrlFilters) { String tmp = element.text(); if (StringUtils.isNoneBlank(tmp)) this.fetchUrlFilters.add(element.text()); } /* * ?????Url */ Elements extractUrlfilters = doc.select("extractUrlfilters filter"); for (Element element : extractUrlfilters) { String tmp = element.text(); String tmp_rep = element.attr("replace"); if (StringUtils.isNoneBlank(tmp)) this.extractUrlfilters.add(new KeyValue(tmp, tmp_rep)); } } catch (NumberFormatException e) { throw new ConfigurationException("?" + e.getMessage()); } // super.setFetchConfig(this); return this; }
From source file:org.angellist.angellistmobile.FeedJSONAdapter.java
public String replaceUserAndStartupLinks(String descriptionHtml) { // get the description and remove all tags String description = Html.fromHtml(descriptionHtml).toString(); // for each link, replace it with a link Document doc = Jsoup.parse(descriptionHtml); Elements links = doc.select("a[href]"); for (Element link : links) { if ("User".equals(link.attr("data-type"))) { description = description.replace(link.text(), "<a href=\"org.angellist.angellistmobile.user://" + link.attr("data-id") + "\">" + link.text() + "</a>"); } else if ("Startup".equals(link.attr("data-type"))) { description = description.replace(link.text(), "<a href=\"org.angellist.angellistmobile.startup://" + link.attr("data-id") + "\">" + link.text() + "</a>"); }// w w w . j a v a 2s.co m } return description; }
From source file:org.apache.marmotta.ldclient.provider.html.mapping.CssTextLiteralMapper.java
/**
 * Maps the text of an element to at most one literal value.
 *
 * <p>The text is passed through {@code cleanValue}; a blank result yields an
 * empty list. Otherwise a single literal is created: language-tagged when
 * {@code language} is set, else typed with {@code Namespaces.NS_XSD + datatype}
 * when {@code datatype} is set, else plain.
 *
 * @param resourceUri URI of the resource being mapped (not used here)
 * @param elem        element whose text is mapped
 * @param factory     factory used to create the literal and the datatype URI
 * @return an empty list or a singleton list holding the literal
 */
@Override
public List<Value> map(String resourceUri, Element elem, ValueFactory factory) {
    final String value = cleanValue(elem.text());
    if (StringUtils.isBlank(value)) {
        return Collections.emptyList();
    }

    final Value literal;
    if (language != null) {
        literal = factory.createLiteral(value, language.toString());
    } else if (datatype != null) {
        literal = factory.createLiteral(value, factory.createURI(Namespaces.NS_XSD + datatype));
    } else {
        literal = factory.createLiteral(value);
    }
    return Collections.singletonList(literal);
}
From source file:org.apache.nifi.GetHTMLElement.java
/**
 * Extracts a value from the parsed HTML element according to the requested
 * output type and surrounds it with the optional prefix and suffix.
 *
 * @param prependValue text placed before the extracted value; null or empty means none
 * @param outputType   one of {@code ELEMENT_HTML}, {@code ELEMENT_TEXT},
 *                     {@code ELEMENT_DATA} or {@code ELEMENT_ATTRIBUTE}; any other
 *                     value is treated like {@code ELEMENT_HTML}
 * @param appendValue  text placed after the extracted value; null or empty means none
 * @param ele          element to read from
 * @param attrKey      attribute name, read only for {@code ELEMENT_ATTRIBUTE}
 * @return prefix + extracted value + suffix
 */
private String extractElementValue(String prependValue, final String outputType, String appendValue,
        final Element ele, final String attrKey) {
    final String prefix = StringUtils.isEmpty(prependValue) ? "" : prependValue;
    final String suffix = StringUtils.isEmpty(appendValue) ? "" : appendValue;

    final String extracted;
    switch (outputType) {
        case ELEMENT_TEXT:
            extracted = ele.text();
            break;
        case ELEMENT_DATA:
            extracted = ele.data();
            break;
        case ELEMENT_ATTRIBUTE:
            extracted = ele.attr(attrKey);
            break;
        case ELEMENT_HTML:
        default:
            // Unknown output types fall back to the element's inner HTML.
            extracted = ele.html();
            break;
    }
    return prefix + extracted + suffix;
}