List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:org.apache.nifi.TestModifyHTMLElement.java
@Test public void testModifyText() throws Exception { final String MOD_VALUE = "Newly modified value to replace " + ATL_WEATHER_TEXT; testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + ATL_ID); testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run();//from w w w. j a v a 2 s .c om testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); assertTrue(ffs.size() == 1); String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. Document doc = Jsoup.parse(data); Elements eles = doc.select("#" + ATL_ID); Element ele = eles.get(0); assertTrue(StringUtils.equals(MOD_VALUE, ele.text())); }
From source file:org.apache.nifi.TestModifyHTMLElement.java
@Test public void testModifyHTMLWithExpressionLanguage() throws Exception { final String MOD_VALUE = "Newly modified value to replace " + ATL_WEATHER_TEXT; testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + ATL_ID); testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, "${\" " + MOD_VALUE + " \":trim()}"); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run();// www. ja v a2 s .c o m testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); assertTrue(ffs.size() == 1); String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. Document doc = Jsoup.parse(data); Elements eles = doc.select("#" + ATL_ID); Element ele = eles.get(0); assertNotNull(ele.text()); }
From source file:org.apache.nifi.TestModifyHTMLElement.java
@Test public void testModifyValueContainsHTMLCharacters() throws Exception { final String MOD_VALUE = "Text that contains > and < characters"; testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, "#" + GDR_ID); testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML); testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, MOD_VALUE); testRunner.enqueue(new File("src/test/resources/Weather.html").toPath()); testRunner.run();/*from w w w . j a v a2s . c om*/ testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0); testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1); testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0); List<MockFlowFile> ffs = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS); assertTrue(ffs.size() == 1); String data = new String(testRunner.getContentAsByteArray(ffs.get(0))); //Contents will be the entire HTML doc. So lets use Jsoup again just the grab the element we want. Document doc = Jsoup.parse(data); Elements eles = doc.select("#" + GDR_ID); Element ele = eles.get(0); assertTrue(StringUtils.equals(MOD_VALUE, ele.text())); assertTrue(StringUtils.equals(MOD_VALUE.replace(">", ">").replace("<", "<"), ele.html())); }
From source file:org.apdplat.extractor.html.ExtractFunctionExecutor.java
/** * ?CSS deleteChild(div.ep-source)/*from w ww . java 2s .c o m*/ * ??CSS?CSS??? * * @param text CSS?? * @param doc * @param cssPath CSS * @param parseExpression ? * @return ??? */ public static String executeDeleteChild(String text, Document doc, CssPath cssPath, String parseExpression) { LOGGER.debug("deleteChild??" + text); String parameter = parseExpression.replace("deleteChild(", ""); parameter = parameter.substring(0, parameter.length() - 1); Elements elements = doc.select(cssPath.getCssPath() + " " + parameter); for (Element element : elements) { String t = element.text(); if (StringUtils.isNotBlank(t)) { LOGGER.debug("deleteChild?" + t); text = text.replace(t, ""); } } LOGGER.debug("deleteChild??" + text); return text; }
From source file:org.apdplat.extractor.html.HtmlExtractor.java
/** * ????//w w w.ja v a 2 s . c o m * @param url html? * @param htmlTemplate html?? * @param doc jsoup * @return ? */ private ExtractResult extractHtmlTemplate(String url, HtmlTemplate htmlTemplate, Document doc) { //??? ExtractResult extractResult = new ExtractResult(); extractResult.setUrl(url); extractResult.setTableName(htmlTemplate.getTableName()); List<CssPath> cssPaths = htmlTemplate.getCssPaths(); //??CSS??????? //??CSS??? for (CssPath cssPath : cssPaths) { // ??CSS PATH Elements elements = doc.select(cssPath.getCssPath()); // CSS?? for (Element element : elements) { String text = null; if (StringUtils.isBlank(cssPath.getAttr())) { //??? text = element.text(); } else { //??? text = element.attr(cssPath.getAttr()); } if (StringUtils.isNotBlank(text)) { // ???? if (cssPath.hasExtractFunction()) { //CSS??? for (ExtractFunction pf : cssPath.getExtractFunctions()) { text = ExtractFunctionExecutor.execute(text, doc, cssPath, pf.getExtractExpression()); if (text != null) { ExtractResultItem extractResultItem = new ExtractResultItem(); extractResultItem.setField(pf.getFieldName()); extractResultItem.setValue(text); extractResult.addExtractResultItem(extractResultItem); } else { ExtractFailLog extractFailLog = new ExtractFailLog(); extractFailLog.setUrl(url); extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern()); extractFailLog.setTemplateName(htmlTemplate.getTemplateName()); extractFailLog.setCssPath(cssPath.getCssPath()); extractFailLog.setExtractExpression(pf.getExtractExpression()); extractFailLog.setTableName(htmlTemplate.getTableName()); extractFailLog.setFieldName(pf.getFieldName()); extractFailLog.setFieldDescription(pf.getFieldDescription()); extractResult.addExtractFailLog(extractFailLog); //?????? //? //??? return extractResult; } } } else { //CSS? 
ExtractResultItem extractResultItem = new ExtractResultItem(); extractResultItem.setField(cssPath.getFieldName()); extractResultItem.setValue(text); extractResult.addExtractResultItem(extractResultItem); } } else { //?????? ExtractFailLog extractFailLog = new ExtractFailLog(); extractFailLog.setUrl(url); extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern()); extractFailLog.setTemplateName(htmlTemplate.getTemplateName()); extractFailLog.setCssPath(cssPath.getCssPath()); extractFailLog.setExtractExpression(""); extractFailLog.setTableName(htmlTemplate.getTableName()); extractFailLog.setFieldName(cssPath.getFieldName()); extractFailLog.setFieldDescription(cssPath.getFieldDescription()); extractResult.addExtractFailLog(extractFailLog); //?????? //? //??? return extractResult; } } } return extractResult; }
From source file:org.apdplat.search.util.baidu.JsoupBaiduInfoUtil.java
/** * @author JONE/*ww w.jav a 2 s . c o m*/ * @return String * @time 2013-11-11 * @description ??13,100 */ public String getResultsCountText() { if (null == document) { return ""; } LOG.debug("total cssQuery: " + cssQuery); Element totalElement = document.select(cssQuery).first(); String totalText = totalElement.text(); LOG.info("?" + totalText); return totalText; }
From source file:org.apdplat.superword.extract.DefinitionExtractor.java
/** * ???/*w w w .ja v a 2s.c o m*/ * @param html * @return */ public static Word parseWord(String html, String word) { LOGGER.info("???" + word); Word w = new Word(word, ""); try { for (Element element : Jsoup.parse(html).select(COLLINS_DEFINITION_CSS_PATH)) { String definition = element.text().trim(); if (StringUtils.isNotBlank(definition)) { w.addDefinition(definition); LOGGER.debug("?:" + definition); } } } catch (Exception e) { LOGGER.error("?", e); } return w; }
From source file:org.apdplat.superword.extract.HyphenExtractor.java
/**
 * Collects those definitions from a dictionary page that mention the word itself.
 *
 * <p>Every element matching {@code COLLINS_DEFINITION_CSS_PATH} is considered. Its trimmed
 * text is added to the returned word as a definition when it is not blank and contains
 * {@code word}, compared case-insensitively. Any exception raised while parsing is logged
 * and the definitions gathered up to that point are returned.
 *
 * @param html the HTML of the dictionary page
 * @param word the word the page belongs to
 * @return a {@code Word} for {@code word} carrying the matching definitions
 */
public static Word parseWord(String html, String word) {
    LOGGER.info("???" + word);
    Word w = new Word(word, "");
    try {
        for (Element element : Jsoup.parse(html).select(COLLINS_DEFINITION_CSS_PATH)) {
            String definition = element.text().trim();
            // Fold case with Locale.ROOT so the comparison does not depend on the default
            // locale (e.g. the Turkish dotted/dotless i mapping).
            if (StringUtils.isNotBlank(definition)
                    && definition.toLowerCase(java.util.Locale.ROOT)
                            .contains(word.toLowerCase(java.util.Locale.ROOT))) {
                w.addDefinition(definition);
                LOGGER.debug("?:" + definition);
            }
        }
    } catch (Exception e) {
        LOGGER.error("?", e);
    }
    return w;
}
From source file:org.apdplat.superword.extract.PartOfSpeechExtractor.java
/** * ??/*from ww w . j ava2s . co m*/ * @param html * @return */ public static Word parseWord(String html, String word) { LOGGER.info("???" + word); Word w = new Word(word, ""); try { for (Element element : Jsoup.parse(html).select(PART_OF_SPEECH_CSS_PATH)) { String partOfSpeech = element.text(); LOGGER.debug("??:" + partOfSpeech); if (StringUtils.isNotBlank(partOfSpeech) && !partOfSpeech.contains("See also")) { partOfSpeech = partOfSpeech.replace(";", "") //??? .replace("COMB in ADJ and N-COUNT", "COMB-in-ADJ-and-N-COUNT") .replace("COMB in ADJ and N", "COMB-in-ADJ-and-N").replace("COMB in ADJ", "COMB-in-ADJ") .replace("COMB in ADJ-GRADED", "COMB-in-ADJ-GRADED") .replace("COMB in N-COUNT", "COMB-in-N-COUNT") .replace("COMB in COLOUR", "COMB-in-COLOUR").replace("COMB in N", "COMB-in-N") .replace("COMB in N-UNCOUNT", "COMB-in-N-UNCOUNT") .replace("COMB in QUANT", "COMB-in-QUANT").replace("COMB in VERB", "COMB-in-VERB"); String[] attrs = partOfSpeech.split("\\s+"); for (String attr : attrs) { if (attr.length() < 1) { LOGGER.debug("?:" + attr); continue; } //?? if (attr.contains("PHR")) { LOGGER.debug(":" + attr); continue; } attr = attr.replace(",", ""); char c = attr.charAt(0); if (c >= 'A' && c <= 'Z') { if ("VERB".equals(attr)) { attr = "V"; } if ("VERB-ERG".equals(attr)) { attr = "V-ERG"; } w.addPartOfSpeech(attr); LOGGER.debug("??:" + attr); } } } } } catch (Exception e) { LOGGER.error("??", e); } return w; }
From source file:org.apdplat.superword.extract.PhraseExtractor.java
/** * ??/* w w w .j a va2s .c o m*/ * @param html * @return */ public static Set<String> parsePhrase(String html, String word) { Set<String> phrases = new HashSet<>(); LOGGER.info("???" + word); if (Character.isUpperCase(word.charAt(0))) { LOGGER.info("???"); return phrases; } try { o: for (Element element : Jsoup.parse(html).select(PHRASE_CSS_PATH)) { String phrase = element.text().trim(); if (StringUtils.isNotBlank(phrase)) { if (phrase.length() >= 50) { LOGGER.debug(":" + phrase); break o; } String[] attrs = phrase.split("\\s+"); if (attrs == null || attrs.length < 2) { LOGGER.debug(":" + phrase); break o; } for (String attr : attrs) { for (char c : attr.toCharArray()) { if (!(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')) { LOGGER.debug(":" + phrase); break o; } } } phrases.add(phrase); LOGGER.debug("?:" + phrase); } } } catch (Exception e) { LOGGER.error("?", e); } return phrases; }