Example usage for org.jsoup.nodes Element text

List of usage examples for org.jsoup.nodes Element text

Introduction

This page collects example usages of the org.jsoup.nodes.Element.text() method.

Prototype

public String text() 

Source Link

Documentation

Gets the combined text of this element and all its children.

Usage

From source file:org.apache.nifi.TestModifyHTMLElement.java

/**
 * Configures the processor to replace the text of the element with id {@code ATL_ID},
 * runs it against Weather.html and checks that the rewritten document carries the new text.
 */
@Test
public void testModifyText() throws Exception {
    final String expectedText = "Newly modified value to replace " + ATL_WEATHER_TEXT;
    final String selector = "#" + ATL_ID;

    testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, selector);
    testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT);
    testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, expectedText);

    testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
    testRunner.run();

    // Exactly one modified flow file and one original; nothing invalid or unmatched.
    testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0);

    List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS);
    assertTrue(successFiles.size() == 1);
    String html = new String(testRunner.getContentAsByteArray(successFiles.get(0)));

    // The flow file holds the whole HTML document, so re-parse it and pick out the target element.
    Element target = Jsoup.parse(html).select(selector).get(0);

    assertTrue(StringUtils.equals(expectedText, target.text()));
}

From source file:org.apache.nifi.TestModifyHTMLElement.java

/**
 * Same scenario as the plain text-modification test, but the replacement value is supplied
 * as an expression ({@code ${" ... ":trim()}}) instead of a literal string.
 */
@Test
public void testModifyHTMLWithExpressionLanguage() throws Exception {
    final String replacement = "Newly modified value to replace " + ATL_WEATHER_TEXT;
    final String selector = "#" + ATL_ID;

    testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, selector);
    testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_TEXT);
    // Pad the value with spaces inside the expression and let trim() strip them again.
    testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, "${\" " + replacement + " \":trim()}");

    testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
    testRunner.run();

    // Exactly one modified flow file and one original; nothing invalid or unmatched.
    testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0);

    List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS);
    assertTrue(successFiles.size() == 1);
    String html = new String(testRunner.getContentAsByteArray(successFiles.get(0)));

    // The flow file holds the whole HTML document, so re-parse it and pick out the target element.
    Element target = Jsoup.parse(html).select(selector).get(0);

    // NOTE(review): this only checks for non-null; consider comparing against the expected
    // replacement text once the evaluated expression output has been confirmed.
    assertNotNull(target.text());
}

From source file:org.apache.nifi.TestModifyHTMLElement.java

/**
 * Replaces the inner HTML of the element with id {@code GDR_ID} with a value containing
 * {@code >} and {@code <}, then checks both the plain text and the escaped HTML of the result.
 */
@Test
public void testModifyValueContainsHTMLCharacters() throws Exception {
    final String rawValue = "Text that contains > and < characters";
    final String selector = "#" + GDR_ID;

    testRunner.setProperty(ModifyHTMLElement.CSS_SELECTOR, selector);
    testRunner.setProperty(ModifyHTMLElement.OUTPUT_TYPE, ModifyHTMLElement.ELEMENT_HTML);
    testRunner.setProperty(ModifyHTMLElement.MODIFIED_VALUE, rawValue);

    testRunner.enqueue(new File("src/test/resources/Weather.html").toPath());
    testRunner.run();

    // Exactly one modified flow file and one original; nothing invalid or unmatched.
    testRunner.assertTransferCount(ModifyHTMLElement.REL_SUCCESS, 1);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_INVALID_HTML, 0);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_ORIGINAL, 1);
    testRunner.assertTransferCount(ModifyHTMLElement.REL_NOT_FOUND, 0);

    List<MockFlowFile> successFiles = testRunner.getFlowFilesForRelationship(ModifyHTMLElement.REL_SUCCESS);
    assertTrue(successFiles.size() == 1);
    String html = new String(testRunner.getContentAsByteArray(successFiles.get(0)));

    // The flow file holds the whole HTML document, so re-parse it and pick out the target element.
    Element target = Jsoup.parse(html).select(selector).get(0);

    // text() is compared with the raw value; html() with the entity-escaped form.
    String escapedValue = rawValue.replace(">", "&gt;").replace("<", "&lt;");
    assertTrue(StringUtils.equals(rawValue, target.text()));
    assertTrue(StringUtils.equals(escapedValue, target.html()));
}

From source file:org.apdplat.extractor.html.ExtractFunctionExecutor.java

/**
 * ?CSS deleteChild(div.ep-source)/*from   w ww .  java 2s  .c o m*/
 * ??CSS?CSS???
 *
 * @param text CSS??
 * @param doc 
 * @param cssPath CSS
 * @param parseExpression ?
 * @return ???
 */
public static String executeDeleteChild(String text, Document doc, CssPath cssPath, String parseExpression) {
    LOGGER.debug("deleteChild??" + text);
    String parameter = parseExpression.replace("deleteChild(", "");
    parameter = parameter.substring(0, parameter.length() - 1);
    Elements elements = doc.select(cssPath.getCssPath() + " " + parameter);
    for (Element element : elements) {
        String t = element.text();
        if (StringUtils.isNotBlank(t)) {
            LOGGER.debug("deleteChild?" + t);
            text = text.replace(t, "");
        }
    }
    LOGGER.debug("deleteChild??" + text);
    return text;
}

From source file:org.apdplat.extractor.html.HtmlExtractor.java

/**
 * Extracts the fields described by an HTML template from a parsed page.
 *
 * <p>For each CSS path of the template, every matching element contributes either its
 * text or, when the CSS path names an attribute, that attribute's value. If the CSS path
 * has extract functions, they are applied one after another (each receives the previous
 * function's output) and every result is stored under the function's field name;
 * otherwise the value is stored under the CSS path's field name. The first blank value or
 * {@code null} function result is recorded as a failure log and ends the extraction
 * immediately, returning whatever has been collected so far.
 *
 * @param url address of the page; copied into the result and into failure logs
 * @param htmlTemplate template providing the table name, CSS paths and log metadata
 * @param doc parsed jsoup document to extract from
 * @return the extraction result, holding the collected items and at most one failure log
 */
private ExtractResult extractHtmlTemplate(String url, HtmlTemplate htmlTemplate, Document doc) {
    ExtractResult result = new ExtractResult();
    result.setUrl(url);
    result.setTableName(htmlTemplate.getTableName());

    for (CssPath cssPath : htmlTemplate.getCssPaths()) {
        for (Element element : doc.select(cssPath.getCssPath())) {
            // Use the element text unless the CSS path asks for a specific attribute.
            String text = StringUtils.isBlank(cssPath.getAttr())
                    ? element.text()
                    : element.attr(cssPath.getAttr());

            if (!StringUtils.isNotBlank(text)) {
                // Nothing usable was found: record the failure and stop.
                ExtractFailLog failLog = new ExtractFailLog();
                failLog.setUrl(url);
                failLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern());
                failLog.setTemplateName(htmlTemplate.getTemplateName());
                failLog.setCssPath(cssPath.getCssPath());
                failLog.setExtractExpression("");
                failLog.setTableName(htmlTemplate.getTableName());
                failLog.setFieldName(cssPath.getFieldName());
                failLog.setFieldDescription(cssPath.getFieldDescription());
                result.addExtractFailLog(failLog);
                return result;
            }

            if (!cssPath.hasExtractFunction()) {
                // No post-processing: store the value under the CSS path's own field.
                ExtractResultItem item = new ExtractResultItem();
                item.setField(cssPath.getFieldName());
                item.setValue(text);
                result.addExtractResultItem(item);
                continue;
            }

            for (ExtractFunction function : cssPath.getExtractFunctions()) {
                text = ExtractFunctionExecutor.execute(text, doc, cssPath, function.getExtractExpression());
                if (text == null) {
                    // The function produced nothing: record the failure and stop.
                    ExtractFailLog failLog = new ExtractFailLog();
                    failLog.setUrl(url);
                    failLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern());
                    failLog.setTemplateName(htmlTemplate.getTemplateName());
                    failLog.setCssPath(cssPath.getCssPath());
                    failLog.setExtractExpression(function.getExtractExpression());
                    failLog.setTableName(htmlTemplate.getTableName());
                    failLog.setFieldName(function.getFieldName());
                    failLog.setFieldDescription(function.getFieldDescription());
                    result.addExtractFailLog(failLog);
                    return result;
                }
                ExtractResultItem item = new ExtractResultItem();
                item.setField(function.getFieldName());
                item.setValue(text);
                result.addExtractResultItem(item);
            }
        }
    }
    return result;
}

From source file:org.apdplat.search.util.baidu.JsoupBaiduInfoUtil.java

/**
 * @author JONE/*ww w.jav  a 2 s . c o m*/
 * @return String
 * @time 2013-11-11
 * @description ??13,100
 */
public String getResultsCountText() {
    if (null == document) {
        return "";
    }
    LOG.debug("total cssQuery: " + cssQuery);
    Element totalElement = document.select(cssQuery).first();
    String totalText = totalElement.text();
    LOG.info("?" + totalText);
    return totalText;
}

From source file:org.apdplat.superword.extract.DefinitionExtractor.java

/**
 * ???/*w w  w  .ja  v a  2s.c  o m*/
 * @param html
 * @return
 */
public static Word parseWord(String html, String word) {
    LOGGER.info("???" + word);
    Word w = new Word(word, "");
    try {
        for (Element element : Jsoup.parse(html).select(COLLINS_DEFINITION_CSS_PATH)) {
            String definition = element.text().trim();
            if (StringUtils.isNotBlank(definition)) {
                w.addDefinition(definition);
                LOGGER.debug("?:" + definition);
            }
        }
    } catch (Exception e) {
        LOGGER.error("?", e);
    }
    return w;
}

From source file:org.apdplat.superword.extract.HyphenExtractor.java

/**
 * Builds a {@link Word} for {@code word} and attaches those definitions from the HTML
 * that mention the word itself.
 *
 * <p>Candidates are the trimmed texts of the elements selected by
 * {@code COLLINS_DEFINITION_CSS_PATH}; a candidate is kept when it is not blank and
 * contains {@code word}, compared case-insensitively via {@code toLowerCase()}.
 * Exceptions raised while parsing are logged and do not propagate.
 *
 * @param html HTML of the page to scan
 * @param word the word to look for inside each definition
 * @return the word, carrying the matching definitions (possibly none)
 */
public static Word parseWord(String html, String word) {
    LOGGER.info("???" + word);
    Word result = new Word(word, "");
    try {
        for (Element node : Jsoup.parse(html).select(COLLINS_DEFINITION_CSS_PATH)) {
            String candidate = node.text().trim();
            if (StringUtils.isBlank(candidate)) {
                continue;
            }
            if (!candidate.toLowerCase().contains(word.toLowerCase())) {
                continue;
            }
            result.addDefinition(candidate);
            LOGGER.debug("?:" + candidate);
        }
    } catch (Exception ex) {
        LOGGER.error("?", ex);
    }
    return result;
}

From source file:org.apdplat.superword.extract.PartOfSpeechExtractor.java

/**
 * ??/*from   ww  w  .  j ava2s .  co  m*/
 * @param html
 * @return
 */
public static Word parseWord(String html, String word) {
    LOGGER.info("???" + word);
    Word w = new Word(word, "");
    try {
        for (Element element : Jsoup.parse(html).select(PART_OF_SPEECH_CSS_PATH)) {
            String partOfSpeech = element.text();
            LOGGER.debug("??:" + partOfSpeech);
            if (StringUtils.isNotBlank(partOfSpeech) && !partOfSpeech.contains("See also")) {
                partOfSpeech = partOfSpeech.replace(";", "")
                        //???
                        .replace("COMB in ADJ and N-COUNT", "COMB-in-ADJ-and-N-COUNT")
                        .replace("COMB in ADJ and N", "COMB-in-ADJ-and-N").replace("COMB in ADJ", "COMB-in-ADJ")
                        .replace("COMB in ADJ-GRADED", "COMB-in-ADJ-GRADED")
                        .replace("COMB in N-COUNT", "COMB-in-N-COUNT")
                        .replace("COMB in COLOUR", "COMB-in-COLOUR").replace("COMB in N", "COMB-in-N")
                        .replace("COMB in N-UNCOUNT", "COMB-in-N-UNCOUNT")
                        .replace("COMB in QUANT", "COMB-in-QUANT").replace("COMB in VERB", "COMB-in-VERB");
                String[] attrs = partOfSpeech.split("\\s+");
                for (String attr : attrs) {
                    if (attr.length() < 1) {
                        LOGGER.debug("?:" + attr);
                        continue;
                    }
                    //??
                    if (attr.contains("PHR")) {
                        LOGGER.debug(":" + attr);
                        continue;
                    }
                    attr = attr.replace(",", "");
                    char c = attr.charAt(0);
                    if (c >= 'A' && c <= 'Z') {
                        if ("VERB".equals(attr)) {
                            attr = "V";
                        }
                        if ("VERB-ERG".equals(attr)) {
                            attr = "V-ERG";
                        }
                        w.addPartOfSpeech(attr);
                        LOGGER.debug("??:" + attr);
                    }
                }
            }
        }
    } catch (Exception e) {
        LOGGER.error("??", e);
    }
    return w;
}

From source file:org.apdplat.superword.extract.PhraseExtractor.java

/**
 * ??/* w w w  .j a va2s .c o m*/
 * @param html
 * @return
 */
public static Set<String> parsePhrase(String html, String word) {
    Set<String> phrases = new HashSet<>();
    LOGGER.info("???" + word);
    if (Character.isUpperCase(word.charAt(0))) {
        LOGGER.info("???");
        return phrases;
    }
    try {
        o: for (Element element : Jsoup.parse(html).select(PHRASE_CSS_PATH)) {
            String phrase = element.text().trim();
            if (StringUtils.isNotBlank(phrase)) {
                if (phrase.length() >= 50) {
                    LOGGER.debug(":" + phrase);
                    break o;
                }
                String[] attrs = phrase.split("\\s+");
                if (attrs == null || attrs.length < 2) {
                    LOGGER.debug(":" + phrase);
                    break o;
                }
                for (String attr : attrs) {
                    for (char c : attr.toCharArray()) {
                        if (!(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z')) {
                            LOGGER.debug(":" + phrase);
                            break o;
                        }
                    }
                }
                phrases.add(phrase);
                LOGGER.debug("?:" + phrase);
            }
        }
    } catch (Exception e) {
        LOGGER.error("?", e);
    }
    return phrases;
}