Example usage for org.jsoup.nodes Element text

List of usage examples for org.jsoup.nodes Element text

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element.text() method.

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java

/**
 * Do not describe the mechanism behind following a link.
 * <p>
 * The forbidden phrases are read from the message key {@code rpd8s1.forbiddenLinkTexts} and split on
 * commas (the pieces are used as-is, without trimming). Every element reachable through
 * {@code getAllElements()} of each link element is inspected, and for each forbidden phrase the result
 * of a case-insensitive containment check on that element's text is reported through
 * {@code assertFalse} under the key {@code rpd8s1.link}.
 */
public void validateRpd8s1() {
    String[] bannedPhrases = messages.getString("rpd8s1.forbiddenLinkTexts").split(",");

    for (Element anchor : getElements(ELEM_LINK)) {
        for (Element node : anchor.getAllElements()) {
            // Elements without any text cannot contain a forbidden phrase.
            if (!node.hasText()) {
                continue;
            }

            String nodeText = node.text();
            for (String phrase : bannedPhrases) {
                boolean mentionsMechanism = StringUtils.containsIgnoreCase(nodeText, phrase);
                assertFalse(Type.ERROR, "rpd8s1.link", mentionsMechanism);
            }
        }
    }
}

From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java

/**
 * Links to e-mail addresses: the e-mail address to which the message is addressed must be visible in the
 * link text.
 * <p>
 * For every link whose href starts with {@code MAILTO}, the address is taken as the part of the href
 * after {@code MAILTO} and before {@code QUERY_STRING_SEPARATOR} (when present). Whether the link text
 * contains that address is reported through {@code assertTrue} under the key {@code rpd8s16.email}.
 */
public void validateRpd8s16() {
    for (Element anchor : getElements(ELEM_LINK)) {
        String target = getAttributeValue(anchor, ATTR_HREF);
        if (target == null || !target.startsWith(MAILTO)) {
            continue;
        }

        String afterScheme = StringUtils.substringAfter(target, MAILTO);
        // Drop any query part so that only the address itself is compared.
        String address = afterScheme.contains(QUERY_STRING_SEPARATOR)
                ? StringUtils.substringBefore(afterScheme, QUERY_STRING_SEPARATOR)
                : afterScheme;

        boolean addressVisible = anchor.text().contains(address);
        assertTrue(Type.ERROR, "rpd8s16.email", addressVisible);
    }
}

From source file:Project.FILER.java

/**
 * Parses an HTML file and returns its "important" strings.
 * <p>
 * As a side effect the static field {@code Text} is reset and then filled with a space, the full
 * document text, and every non-empty {@code alt} attribute of the {@code img} elements (each preceded
 * by a space).
 *
 * @param f the HTML file to parse; it is read as UTF-8
 * @return a three-element array: index 0 is the document title, index 1 is the text of all
 *         {@code h1}..{@code h6} elements (grouped by level, each followed by a space), index 2 is the
 *         non-empty image {@code alt} texts (each preceded by a space)
 * @throws IOException if the file cannot be read or parsed
 */
public static String[] Dealing_Files(File f) throws IOException {
    Text = "";
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");

    String[] Importants = { "", "", "" };
    Importants[0] = doc.title();

    // HTML only defines h1..h6. Every level is visited: stopping at the first level that is absent
    // would drop all headers of a page that, for example, starts at h2.
    StringBuilder allHeaders = new StringBuilder();
    for (int level = 1; level <= 6; level++) {
        for (Element header : doc.getElementsByTag("h" + level)) {
            allHeaders.append(header.text()).append(' ');
        }
    }
    Importants[1] = allHeaders.toString();

    StringBuilder fullText = new StringBuilder(" ").append(doc.text());
    StringBuilder altTexts = new StringBuilder();
    for (Element image : doc.getElementsByTag("img")) {
        // attr() yields an empty string for a missing attribute, so no null check is needed.
        String alt = image.attr("alt");
        if (!alt.isEmpty()) {
            fullText.append(' ').append(alt);
            altTexts.append(' ').append(alt);
        }
    }
    Importants[2] = altTexts.toString();
    Text = fullText.toString();

    return Importants;
}

From source file:ru.redcraft.pinterest4j.core.api.PinAPI.java

/**
 * Fetches the comments of a pin.
 * <p>
 * The pin URL is requested, the {@code footer} field of the JSON response is parsed as HTML, and one
 * comment is built per {@code div.comment} element: the id comes from the {@code data} attribute of the
 * first {@code DeleteComment} element, the author from the first link inside the first
 * {@code CommenterMeta} element, and the text from what is left of {@code CommenterMeta} once its links
 * are removed.
 *
 * @param pin the pin whose comments are requested
 * @return the extracted comments, in document order
 * @throws PinterestRuntimeException if the response cannot be read as JSON
 */
public List<Comment> getComments(Pin pin) {
    LOG.debug("Getting comments for pin = " + pin);

    String axajResponse = null;
    Document doc;
    try {
        axajResponse = new APIRequestBuilder(pin.getURL())
                .setErrorMessage(PIN_API_ERROR)
                .build()
                .getResponse()
                .getEntity(String.class);
        String footerHtml = new JSONObject(axajResponse).getString("footer");
        doc = Jsoup.parse(footerHtml);
    } catch (JSONException e) {
        throw new PinterestRuntimeException(PIN_API_ERROR + axajResponse, e);
    }

    List<Comment> comments = new ArrayList<Comment>();
    for (Element commentNode : doc.select("div.comment")) {
        String rawId = commentNode.getElementsByClass("DeleteComment").first().attr("data");
        long commentId = Long.parseLong(rawId);

        Element meta = commentNode.getElementsByClass("CommenterMeta").first();
        String profilePath = meta.getElementsByTag("a").first().attr("href");
        User author = new LazyUser(profilePath.replace("/", ""), getApiManager());

        // Strip the links so that only the comment body remains in the meta element's text.
        meta.getElementsByTag("a").remove();
        comments.add(new CommentImpl(commentId, meta.text(), author, pin));
    }

    LOG.debug("Comments extracted: " + comments);
    return comments;
}

From source file:solarrecorder.SolarRecorder.java

/**
 * Downloads the page at {@code http://envoy} and records the "Number of Microinverters Online" row of
 * the "System Statistics" table in {@code envoyData}.
 * <p>
 * For every {@code h2} whose text is "System Statistics", the first table under the heading's parent is
 * scanned row by row; the first cell is taken as the name and the last cell as the value. Missing
 * tables or bodies, and rows without {@code td} cells (for example header rows), are skipped.
 *
 * @throws IOException if the page cannot be fetched
 */
private void getSysData() throws IOException {
    org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy").get();

    for (Element h2 : doc.getElementsByTag("h2")) {
        if (!"System Statistics".equals(h2.text())) {
            continue;
        }

        // first() returns null on an empty selection, so each step is checked before it is used.
        Element table = h2.parent().getElementsByTag("table").first();
        if (table == null) {
            continue;
        }
        Element body = table.getElementsByTag("tbody").first();
        if (body == null) {
            continue;
        }

        for (Element tr : body.getElementsByTag("tr")) {
            Elements cells = tr.getElementsByTag("td");
            if (cells.isEmpty()) {
                continue;
            }

            String name = cells.first().text();
            String value = cells.last().text();
            if (name.equals("Number of Microinverters Online")) {
                envoyData.add(new EnvoyData(name, value));
            }
        }
    }
}

From source file:synapticloop.documentr.generator.Generator.java

/**
 * Render the table of contents.  This will also render links to the headers, 
 * and back to top links - if the options are enabled.  The first thing that 
 * we do is to remove any of the code fence blocks, we then convert the 
 * markdown to HTML to extract the headers to generate the table of context.
 * Then we go through the actual markdown and add in the links (if 
 * applicable), finally we put in the table of contents and re-insert the 
 * code fence blocks./*from   w w  w .  j a v  a 2 s .  co  m*/
 * 
 * @param rendered The previously rendered string
 * 
 * @return the rendered content, with the table of contents inserted
 */
private String renderTableOfContents(String rendered) {

    // the first thing we are going to do is to remove code fences...
    String renderedClean = removeCodeFenceBlocks(rendered);

    int numHeader = 0;

    // here we are going to render the markdown to HTML and then get all of the
    // header items to build the table of contents.
    StringBuilder headerStringBuilder = new StringBuilder("\n\n");

    PegDownProcessor pegDownProcessor = new PegDownProcessor();

    String markdownToHtml = pegDownProcessor.markdownToHtml(renderedClean);

    numHeader = 0;
    Document document = Jsoup.parse(markdownToHtml);
    Elements headings = document.select("h1, h2, h3, h4, h5, h6");
    for (Element heading : headings) {
        int valueOf = Integer.parseInt(heading.nodeName().substring(1));
        if (valueOf <= tocLevel) {
            if (hasTocLinks) {
                headerStringBuilder.append(SPACING_LOOKUP.get(valueOf) + "[" + heading.text()
                        + "](#documentr_heading_" + numHeader + ")\n");
            } else {
                headerStringBuilder.append(SPACING_LOOKUP.get(valueOf) + heading.text() + "\n");
            }
        }
        numHeader++;
    }

    headerStringBuilder.append("\n\n");

    // Now we have the header all set up

    numHeader = 0;
    // go through and parse the markdown, get all of the headers 
    char[] charArray = renderedClean.toCharArray();
    RootNode rootNode = pegDownProcessor.parseMarkdown(charArray);
    List<Node> children = rootNode.getChildren();

    for (Node node : children) {
        if (node instanceof HeaderNode) {
            HeaderNode headerNode = (HeaderNode) node;
            int level = headerNode.getLevel();
            if (level <= tocLevel) {
                HEADER_LOOKUP.put(new StartEndBean(headerNode.getStartIndex(), headerNode.getEndIndex()),
                        numHeader);
            }
            numHeader++;
        }
    }

    if (hasTocLinks) {
        Iterator<StartEndBean> iterator = HEADER_LOOKUP.keySet().iterator();
        int start = 0;
        StringBuilder renderedStringBuilder = new StringBuilder();

        while (iterator.hasNext()) {
            StartEndBean startEndBean = (StartEndBean) iterator.next();
            int headerStart = startEndBean.getStart();
            int headerEnd = startEndBean.getEnd();
            Integer headerNum = HEADER_LOOKUP.get(startEndBean);
            renderedStringBuilder.append(Arrays.copyOfRange(charArray, start, headerStart));
            renderedStringBuilder.append("\n\n<a name=\"documentr_heading_" + headerNum + "\"></a>\n\n");

            if (hasTocBackToTop) {
                renderedStringBuilder.append(Arrays.copyOfRange(charArray, headerStart, headerEnd - 1));
                renderedStringBuilder.append(tocBackToTop);
                start = headerEnd - 1;
            } else {
                start = headerStart;
            }
        }

        renderedStringBuilder.append(Arrays.copyOfRange(charArray, start, charArray.length));
        renderedClean = renderedStringBuilder.toString();
    }

    renderedClean = renderedClean.replace(DOCUMENTR_TABLE_OF_CONTENTS, headerStringBuilder.toString());

    // last but not least, we need to put back in the code fences
    Iterator<Integer> codeFenceBlocksIterator = codeFenceBlocks.keySet().iterator();
    while (codeFenceBlocksIterator.hasNext()) {
        Integer integer = (Integer) codeFenceBlocksIterator.next();
        renderedClean = renderedClean.replace(
                String.format("%s%d%s", DOCUMENTR_CODE_FENCE_PREFIX, integer, DOCUMENTR_DELIMETER),
                codeFenceBlocks.get(integer).toString());
    }

    return renderedClean;
}

From source file:uk.bl.wa.parsers.HtmlFeatureParser.java

/**
 * /*from w w  w .  j  av a 2s . c  o m*/
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final long start = System.nanoTime();
    // Pick up the URL:
    String url = metadata.get(Metadata.RESOURCE_NAME_KEY);

    // Parse it using JSoup
    Document doc = null;
    try {
        doc = Jsoup.parse(stream, null, url, parser);
    } catch (java.nio.charset.IllegalCharsetNameException e) {
        log.warn("Jsoup parse had to assume UTF-8: " + e);
        doc = Jsoup.parse(stream, "UTF-8", url);
    } catch (Exception e) {
        log.error("Jsoup parse failed: " + e);
    } finally {
        if (doc == null)
            return;
    }
    Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#jsoupparse", start);

    final long nonJsoupStart = System.nanoTime();
    // Record the number of errors found:
    if (parser.getErrors() != null)
        metadata.set(NUM_PARSE_ERRORS, parser.getErrors().size());

    // Get the links (no image links):
    Set<String> links = this.extractLinks(doc);
    if (links != null && links.size() > 0) {
        metadata.set(LINK_LIST, links.toArray(new String[links.size()]));
    }

    //get the image links
    if (extractImageLinks) {
        Set<String> imageLinks = this.extractImageLinks(doc);
        if (imageLinks != null && imageLinks.size() > 0) {
            metadata.set(IMAGE_LINKS, imageLinks.toArray(new String[imageLinks.size()]));
        }
    }

    // Get the publication date, from BBC pages:
    for (Element meta : doc.select("meta[name=OriginalPublicationDate]")) {
        metadata.set(ORIGINAL_PUB_DATE, meta.attr("content"));
        //log.debug(ORIGINAL_PUB_DATE + ": " + meta.attr("content"));
    }

    // Grab the first paragraph with text, and extract the text:
    for (Element p : doc.select("p")) {
        String pt = p.text();
        if (pt != null) {
            pt = pt.trim();
            if (pt.length() > 0) {
                metadata.set(FIRST_PARAGRAPH, p.text());
                //log.debug(FIRST_PARAGRAPH + ": " +p.text() );
                break;
            }
        }
    }

    // Grab the list of distinct elements used in the page:
    Set<String> de = new HashSet<String>();
    for (Element e : doc.select("*")) {
        // ELEMENT_NAME matching to weed out the worst false positives caused by JavaScript
        // This handles cases such as '<script> if (3<a) console.log('something');' where 'a' would have been
        // seen as a tag, but does not handle cases such as '<script> if ( 3<a ) console.log('something');' where
        // the a is still seen as a tag because it is followed by a space
        if (!"#root".equals(e.tag().getName()) && ELEMENT_NAME.matcher(e.tag().getName()).matches()) {
            de.add(StringUtils.left(e.tag().getName().toLowerCase(Locale.ENGLISH), 100));
        }
    }
    // For some elements, dig deeper and record attributes too:
    for (Element e : doc.select("link")) {
        de.add("link/@rel=" + e.attr("rel"));
    }
    // Store them:
    metadata.set(DISTINCT_ELEMENTS, de.toArray(new String[] {}));

    // Licence field, following:
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/links.html#link-type-license
    for (Element a : doc.select("a[rel=license]")) {
        metadata.add(Metadata.LICENSE_URL, a.attr("href"));
    }
    for (Element a : doc.select("link[rel=license]")) {
        metadata.add(Metadata.LICENSE_URL, a.attr("href"));
    }
    for (Element a : doc.select("area[rel=license]")) {
        metadata.add(Metadata.LICENSE_URL, a.attr("href"));
    }
    Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#featureextract", nonJsoupStart);
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Lists every {@code .ncx} file (searching up to six levels deep) below
 * {@code ~/Documents/Tolstoy/openDiaries}, prints the titles found in each {@code docTitle} element and
 * collects the content URIs of {@code navPoint} children whose label matches the filter pattern. The
 * collected URIs are printed at the end in sorted order.
 */
@Test
public void listAllUzipedFiles() {
    Path diariesRoot = FileSystems.getDefault()
            .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries");

    List<Path> ncxFiles = new ArrayList<>();
    int searchDepth = 6;

    try (Stream<Path> found = Files.find(diariesRoot, searchDepth,
            (path, attr) -> String.valueOf(path).endsWith(".ncx"))) {
        found.forEach(ncxFiles::add);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + ncxFiles.size());

    // A TreeSet keeps the URIs unique and sorted for the final listing.
    Set<String> uriList = new TreeSet<>();

    try {
        for (Path ncxFile : ncxFiles) {
            Path parent = ncxFile.getParent();

            System.out.println("---------------------------------------------");
            String parentDir = parent.toString();
            System.out.println(parentDir);

            // use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(ncxFile.toFile(), "UTF-8");

            for (Element docTitle : doc.getElementsByTag("docTitle")) {
                for (Element titlePart : docTitle.children()) {
                    System.out.println("Title: " + titlePart.text());
                }
            }

            for (Element navPoint : doc.getElementsByTag("navPoint")) {
                for (Element navChild : navPoint.children()) {
                    String label = navChild.text();
                    if (!StringUtils.isNotEmpty(label)) {
                        continue;
                    }

                    // NOTE(review): the "?" characters in the two patterns below look like
                    // non-ASCII text lost to an encoding problem - confirm the intended patterns;
                    // as written they do not appear to be valid regular expressions.
                    if (label.matches("?")) {
                        System.out.println("------------------");
                    }

                    String url = navChild.getElementsByTag("content").attr("src");
                    boolean wanted = label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url);
                    if (wanted) {
                        // the fragment part ("#...") is dropped so that each file is listed once
                        uriList.add(parentDir + File.separator + url.replaceAll("#.*", ""));
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());
    uriList.forEach(System.out::println);
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Lists every {@code .opf} file (searching up to six levels deep) below
 * {@code ~/Documents/Tolstoy/unzipLetters} and prints the text of each {@code dc:title} element found
 * in it. Nothing is added to the URI set in this method, so the final listing is empty.
 */
@Test
public void listAllUzipedFilesContent() {
    Path lettersRoot = FileSystems.getDefault()
            .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters");

    List<Path> opfFiles = new ArrayList<>();
    int searchDepth = 6;

    try (Stream<Path> found = Files.find(lettersRoot, searchDepth,
            (path, attr) -> String.valueOf(path).endsWith(".opf"))) {
        found.forEach(opfFiles::add);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + opfFiles.size());

    Set<String> uriList = new TreeSet<>();

    try {
        for (Path opfFile : opfFiles) {
            Path parent = opfFile.getParent();

            System.out.println("---------------------------------------------");
            System.out.println(parent.toString());

            // use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(opfFile.toFile(), "UTF-8");

            for (Element titleElement : doc.getElementsByTag("dc:title")) {
                System.out.println(titleElement.text());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());
    uriList.forEach(System.out::println);
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Collects a {@code DocumentPointer} for the diary entries referenced by every {@code .ncx} file
 * (searching up to six levels deep) below the hard-coded diary volume directory, then prints the
 * collected pointers.
 * <p>
 * Within one file, collecting starts at the first {@code navPoint} whose label matches the start
 * pattern (that label also becomes the title of the pointers) and stops at a {@code navPoint} that
 * satisfies the end check.
 */
@Test
public void getURIForAllDiaries() {

    Set<DocumentPointer> uriList = new HashSet<>();

    String letterDirectory = System.getProperty("user.home")
            + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49";
    Path diaryRoot = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> ncxFiles = new ArrayList<>();
    int searchDepth = 6;

    try (Stream<Path> found = Files.find(diaryRoot, searchDepth,
            (path, attr) -> String.valueOf(path).endsWith(".ncx"))) {
        found.forEach(ncxFiles::add);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + ncxFiles.size());

    try {
        for (Path ncxFile : ncxFiles) {
            Path parent = ncxFile.getParent();

            // use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(ncxFile.toFile(), "UTF-8");

            // The last child of the last docTitle element provides the default title.
            String title = "";
            for (Element docTitle : doc.getElementsByTag("docTitle")) {
                for (Element titlePart : docTitle.children()) {
                    title = titlePart.text();
                }
            }

            boolean collecting = false;
            boolean awaitingStart = true;

            for (Element navPoint : doc.getElementsByTag("navPoint")) {
                // get nav label and content
                Element labelElement = navPoint.select("navLabel").first();
                Element contentElement = navPoint.select("content").first();

                String navLabel = (labelElement == null)
                        ? ""
                        : labelElement.text().replaceAll("\\*", "").trim();
                String srs = (contentElement == null) ? "" : contentElement.attr("src");

                // NOTE(review): the "?" characters in the literals below look like non-ASCII text
                // lost to an encoding problem - confirm the intended values. Also note that this
                // first check passes the label as the pattern and the literal as the input.
                if ("??".matches(navLabel)) {
                    collecting = false;
                }

                boolean startsSection = StringUtils.isNotEmpty(navLabel)
                        && navLabel.matches("??.*|?? ?.*") && awaitingStart;
                if (startsSection) {
                    awaitingStart = false;
                    collecting = true;
                    title = navLabel;
                }

                if (collecting) {
                    // the fragment part ("#...") is dropped so that the pointer refers to the file
                    String uri = parent.toString() + File.separator + srs.replaceAll("#.*", "");
                    uriList.add(new DocumentPointer(uri, title));
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (DocumentPointer pointer : uriList) {
        System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
    }
}