Example usage for org.jsoup.nodes Element children

List of usage examples for org.jsoup.nodes Element children

Introduction

This page collects usage examples for the children() method of org.jsoup.nodes.Element.

Prototype

public Elements children() 

Source Link

Documentation

Get this element's child elements.

Usage

From source file:poe.trade.assist.SearchForm.java

/**
 * Rewrites a downloaded page so that it carries a fresh {@code <head>}, an un-hidden search form
 * and no leading search-results block.
 *
 * @param html the raw HTML of the page
 * @return the serialized, cleaned-up document
 */
private String removeAllExceptSearchForm(String html) {
    final String assetBase = htmlDirectory();
    final Document page = Jsoup.parse(html);

    // Empty the <head>, then add back only the entries this view needs.
    final Element pageHead = page.head();
    for (Element existing : pageHead.children()) {
        existing.remove();
    }
    pageHead.appendElement("meta").attr("charset", "utf-8");
    pageHead.appendElement("meta").attr("name", "viewport").attr("content", "width=device-width");
    pageHead.appendElement("title").text("poe.trade.assist");
    pageHead.appendElement("script").attr("type", "text/javascript").attr("src", assetBase + "packed.js");
    pageHead.appendElement("link").attr("rel", "stylesheet").attr("href", assetBase + "packed_dark.css");

    // Show the search form by clearing its inline style.
    final Element searchForm = page.getElementById("search-form");
    if (searchForm != null) {
        searchForm.attr("style", "");
    }

    // Point the form at the absolute search URL.
    final Element search = page.getElementById("search");
    if (search != null) {
        search.attr("action", "http://poe.trade/search");
    }

    // Drop the first block of search results, if the page has one.
    final Elements resultBlocks = page.getElementsByClass("search-results-block");
    if (!resultBlocks.isEmpty()) {
        resultBlocks.get(0).remove();
    }

    return page.toString();
}

From source file:poe.trade.assist.SearchView.java

/**
 * Replaces the {@code <head>} of the given page with a fixed set of entries and appends the
 * {@code assist.js} script to the end of the body.
 *
 * @param html the raw HTML of the page
 * @return the serialized document after the rewrite
 */
private String addHeadElements(String html) {
    final String assetBase = htmlDirectory();
    final Document page = Jsoup.parse(html);
    final Element pageHead = page.head();

    // Discard whatever the page shipped in its <head>.
    for (Element existing : pageHead.children()) {
        existing.remove();
    }

    final Element charsetMeta = pageHead.appendElement("meta");
    charsetMeta.attr("charset", "utf-8");

    final Element viewportMeta = pageHead.appendElement("meta");
    viewportMeta.attr("name", "viewport");
    viewportMeta.attr("content", "width=device-width");

    pageHead.appendElement("title").text("poe.trade.assist");

    final Element packedScript = pageHead.appendElement("script");
    packedScript.attr("type", "text/javascript");
    packedScript.attr("src", assetBase + "packed.js");

    final Element darkStyles = pageHead.appendElement("link");
    darkStyles.attr("rel", "stylesheet");
    darkStyles.attr("href", assetBase + "packed_dark.css");

    // The assist script goes last in <body>.
    final Element assistScript = page.body().appendElement("script");
    assistScript.attr("type", "text/javascript");
    assistScript.attr("src", assetBase + "assist.js");

    return page.toString();
}

From source file:Search.DataManipulation.DataParser.java

/**
 * Collects label/value pairs from the "details" section of a parsed page.
 *
 * <p>Among the elements with class {@code details-section-contents}, the one whose first child
 * has class {@code meta-info} supplies the entries; if several qualify, the last one wins. For
 * each entry the own text of its first child is the key and the own text of its last child is
 * the value. The labels "Permissions", "Report" and "Developer" are skipped.
 *
 * @param dom the parsed document to read from
 * @return the collected metadata; empty when the document has no matching details section
 */
public Map<String, String> getMetaData(Document dom) {
    Map<String, String> metaData = new HashMap<>();

    Elements details = null;
    for (Element section : dom.getElementsByClass("details-section-contents")) {
        // first() is null for a section without child elements.
        Element firstChild = section.children().first();
        if (firstChild != null && firstChild.hasClass("meta-info")) {
            details = section.children();
        }
    }

    // No matching section: report "no metadata" instead of failing on a null dereference.
    if (details == null) {
        return metaData;
    }

    for (Element detailElement : details) {
        Elements parts = detailElement.children();
        if (parts.isEmpty()) {
            // An entry without child elements carries neither a label nor a value.
            continue;
        }

        String area = parts.first().ownText();
        String value = parts.last().ownText();

        if (!(area.equals("Permissions") || area.equals("Report") || area.equals("Developer"))) {
            metaData.put(area, value);
        }
    }
    return metaData;
}

From source file:uk.co.certait.htmlexporter.writer.AbstractTableCellWriter.java

/**
 * Returns the actual text of the innermost child element for this cell.
 * /*from  ww  w .j  av  a  2s  .  c om*/
 * @param element
 * 
 * @return The text to be output for this Cell.
 */
public String getElementText(Element element) {
    String text = element.ownText();

    for (Element child : element.children()) {
        text = child.ownText();
    }

    return text;
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Sandbox test that scans {@code ~/Documents/Tolstoy/openDiaries} for {@code .ncx} files and
 * prints the content files they reference.
 *
 * <p>Every {@code .ncx} file found (at most six directory levels deep) is parsed with jsoup. The
 * text of each child of a {@code docTitle} element is printed, and for each child of a
 * {@code navPoint} element whose label matches the pattern below, the {@code src} of its
 * {@code content} element (minus any {@code #...} suffix) is resolved against the file's parent
 * directory and collected. The collected paths are printed in sorted order.
 *
 * <p>NOTE(review): the {@code ?} characters inside the regex literals look like non-ASCII
 * characters that were lost to an encoding problem. They are left untouched here because the
 * intended text cannot be recovered from this file — TODO restore from the original source.
 */
@Test
public void listAllUzipedFiles() {
    // Root directory of the unpacked diaries.

    Path pathToLetters = FileSystems.getDefault()
            .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries");

    List<Path> results = new ArrayList<>();

    // Search at most six directory levels below the root.
    int maxDepth = 6;

    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {

        stream.forEach(results::add);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    // TreeSet: duplicates collapse and the final listing comes out sorted.
    Set<String> uriList = new TreeSet<>();

    try {

        for (Path res : results) {
            Path parent = res.getParent();

            System.out.println("---------------------------------------------");
            System.out.println(parent.toString());
            // Use jsoup to list all files that contain something useful.
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            for (Element element : doc.getElementsByTag("docTitle")) {

                for (Element child : element.children()) {

                    System.out.println("Title: " + child.text());
                }
            }

            for (Element element : doc.getElementsByTag("navPoint")) {

                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): a bare "?" is not a valid regular expression, so this
                        // call is expected to throw PatternSyntaxException (caught by the outer
                        // catch below) — restore the intended literal.
                        if (label.matches("?")) {
                            System.out.println("------------------");
                        }

                        String url = child.getElementsByTag("content").attr("src");

                        // NOTE(review): "[?--?]" was presumably a character range of non-ASCII
                        // letters; as written it may not compile as a pattern — confirm.
                        if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {

                            // Drop everything from '#' onwards so only the file path remains.
                            uriList.add(parent.toString() + File.separator + url.replaceAll("#.*", ""));

                        } else {
                            // Labels that do not match are ignored.
                        }

                    }
                }
            }

        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (String uri : uriList) {
        System.out.println(uri);
    }

}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Sandbox test that builds a set of {@code DocumentPointer}s for one diary volume and prints them.
 *
 * <p>Every {@code .ncx} file under the hard-coded volume directory (at most six levels deep) is
 * parsed with jsoup. The title starts as the text of the last child of the last {@code docTitle}
 * element. The {@code navPoint} elements are then walked in document order: once a label matches
 * the "start" pattern (first match per file only) the title is replaced by that label and every
 * following nav point, including the matching one, is recorded until a label satisfies the
 * "end" check.
 *
 * <p>NOTE(review): the {@code ?} characters inside the string literals look like non-ASCII
 * characters that were lost to an encoding problem. They are left untouched here because the
 * intended text cannot be recovered from this file — TODO restore from the original source.
 */
@Test
public void getURIForAllDiaries() {

    Set<DocumentPointer> uriList = new HashSet<>();

    String letterDirectory = System.getProperty("user.home")
            + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49";

    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> results = new ArrayList<>();

    // Search at most six directory levels below the volume directory.
    int maxDepth = 6;

    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {

        stream.forEach(results::add);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    try {

        for (Path res : results) {
            Path parent = res.getParent();

            // Use jsoup to list all files that contain something useful.
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            for (Element element : doc.getElementsByTag("docTitle")) {

                for (Element child : element.children()) {
                    // Each child overwrites the previous value; the last one wins.
                    title = child.text();
                }
            }

            // True while nav points are being recorded.
            boolean startPrinting = false;

            // True until the first "start" label of this file has been seen.
            boolean newFile = true;

            for (Element element : doc.getElementsByTag("navPoint")) {

                // Get nav label and content.

                Element navLabelElement = element.select("navLabel").first();
                Element srsElement = element.select("content").first();

                String navLabel = "";
                String srs = "";

                if (navLabelElement != null) {
                    // Strip asterisks and surrounding whitespace from the label.
                    navLabel = navLabelElement.text().replaceAll("\\*", "").trim();
                }

                if (srsElement != null) {
                    srs = srsElement.attr("src");
                }

                // NOTE(review): here the nav label is the regex and the literal is the input,
                // the reverse of the check further down — confirm this is intended.
                if ("??".matches(navLabel))

                {
                    startPrinting = false;
                }

                // NOTE(review): a pattern that begins with "?" is not a valid regular
                // expression, so matches() is expected to throw PatternSyntaxException for any
                // non-empty label (caught by the outer catch) — restore the intended literal.
                if (StringUtils.isNotEmpty(navLabel)
                        && navLabel.matches("??.*|?? ?.*") && newFile) {
                    newFile = false;
                    startPrinting = true;
                    title = navLabel;
                }

                if (startPrinting) {
                    // Drop everything from '#' onwards so only the file path remains.
                    DocumentPointer documentPointer = new DocumentPointer(
                            parent.toString() + File.separator + srs.replaceAll("#.*", ""), title);

                    uriList.add(documentPointer);
                }

            }

        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (DocumentPointer pointer : uriList) {
        System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
    }
}

From source file:us.colloquy.sandbox.TestExtractor.java

/**
 * Sandbox test that parses a sample letters file with jsoup and prints every extracted
 * {@code Letter} as pretty-printed JSON.
 *
 * <p>Each element with class {@code section} becomes one letter. Its children are dispatched on
 * their CSS class: {@code center} carries the addressee (and possibly date/place), {@code Data}
 * carries date and place, {@code petit} / {@code Textpetit_otstup} are notes, and anything else
 * is appended to the letter's content.
 */
@Test
public void useJsoup() {

    String userHome = System.getProperty("user.home");

    System.out.println(userHome);

    // The jsoup API allows extracting all elements of the letters in a file.
    File sourceFile = new File("samples/pisma-1904/OEBPS/Text/single_doc.html");

    try {
        Document parsed = Jsoup.parse(sourceFile, "UTF-8");

        // Our model contains only a subset of fields.
        List<Letter> collected = new ArrayList<>();

        String lastSeenYear = "";

        for (Element section : parsed.getElementsByClass("section")) {
            Letter current = new Letter();
            StringBuilder body = new StringBuilder();

            for (Element node : section.children()) {

                for (Attribute attribute : node.attributes()) {
                    System.out.println(attribute.getKey() + " " + attribute.getValue());
                }

                String cssClass = node.className();

                if ("center".equalsIgnoreCase(cssClass)) {
                    // Prefer the <strong> text as addressee; fall back to the whole text.
                    String addressee = node.getElementsByTag("strong").text();
                    if (StringUtils.isEmpty(addressee)) {
                        addressee = node.text();
                    }

                    // For now each fragment is handed over as-is; name recognition happens in the parser.
                    for (String fragment : addressee.split("(\\s\\s)|(,)")) {
                        RussianDate.parseToWhom(current, fragment);
                    }

                    // Whatever follows the addressee may hold date and place; a later "Data" child overrides it.
                    String remainder = node.text().replace(addressee, "");
                    if (StringUtils.isNotEmpty(remainder)) {
                        RussianDate.parseDateAndPlace(current, remainder, lastSeenYear);
                    }

                } else if ("Data".equalsIgnoreCase(cssClass)) {
                    // Most often date and place are enclosed in an <em> tag.
                    String dateAndPlace = node.getElementsByTag("em").text();

                    if (StringUtils.isNotEmpty(dateAndPlace)) {
                        RussianDate.parseDateAndPlace(current, dateAndPlace, lastSeenYear);

                        if (current.getDate() != null) {
                            // Remember the year for the next call to the date parser.
                            lastSeenYear = String.valueOf(current.getDate().toInstant()
                                    .atZone(ZoneId.systemDefault()).toLocalDate().getYear());
                        }
                    }

                } else if ("petit".equalsIgnoreCase(cssClass)
                        || "Textpetit_otstup".equalsIgnoreCase(cssClass)) {
                    current.getNotes().add(node.text());

                } else {
                    // Turn every <sup> marker into plain "[n]" text.
                    for (Element marker : node.getElementsByTag("sup")) {
                        String markerText = marker.text();
                        marker.replaceWith(new TextNode("[" + markerText + "]", null));
                    }

                    for (Element descendant : node.getAllElements()) {
                        if ("sup".equalsIgnoreCase(descendant.tagName())) {
                            body.append(" [").append(descendant.text()).append("] ");
                        } else {
                            body.append(descendant.text());
                        }
                    }

                    body.append("\n");
                }
            }

            current.setContent(body.toString());
            collected.add(current);
        }

        ObjectWriter prettyWriter = new com.fasterxml.jackson.databind.ObjectMapper().writer()
                .withDefaultPrettyPrinter();

        for (Letter each : collected) {
            System.out.println(prettyWriter.writeValueAsString(each));
        }

    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:us.colloquy.util.DiaryParser.java

@Test
    /**
     * Sandbox test that splits one diary XHTML file into {@code DiaryEntry} objects and prints them.
     *
     * <p>Every child of a {@code section} element is treated as a continuation of the current
     * entry unless a date can be parsed from it, in which case the current entry is closed and a
     * new one is started.
     */
    public void useJsoup() {
        File xhtml = new File(System.getProperty("user.home")
                + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml");

        String lastYear = "";
        String sourceLabel = "pointer";
        List<DiaryEntry> entries = new ArrayList<>();

        try {
            Document parsed = Jsoup.parse(xhtml, "UTF-8");

            for (Element section : parsed.getElementsByClass("section")) {
                DiaryEntry open = null;
                StringBuilder text = new StringBuilder();

                for (Element node : section.children()) {
                    // Scratch entry: only used to find out whether this node carries a date.
                    DiaryEntry probe = new DiaryEntry();

                    // The date parser is consulted in two cases: the node has an <em> element,
                    // or its text matches the year pattern.
                    Element emphasis = node.select("em").first();

                    if (emphasis != null) {
                        lastYear = parseDateAndPlace(lastYear, probe, node);
                    } else if (StringUtils.isNotEmpty(node.text())) {
                        Matcher yearMatch = yearPattern.matcher(node.text());

                        if (yearMatch.find()) {
                            // Reduce the node's text to the captured group before parsing.
                            node.text(yearMatch.group(1));
                            lastYear = parseDateAndPlace(lastYear, probe, node);
                        }
                    }

                    if (probe.getDate() != null) {
                        // A date marks the beginning of a new entry.
                        System.out.println("Found date: " + probe.getDate());

                        if (open != null) {
                            // Close the entry collected so far.
                            open.setEntry(text.toString());
                            entries.add(open);
                        }

                        open = new DiaryEntry();
                        open.setSource(sourceLabel);
                        open.setDate(probe.getDate());
                        open.setPlace(probe.getPlace());

                        text = new StringBuilder();
                    }

                    String nodeText = node.text();
                    if (StringUtils.isNotEmpty(nodeText) && nodeText.length() > 8) {
                        text.append(nodeText).append("\n");
                    }
                }

                // Whatever is still pending belongs to the last open entry.
                if (open != null && StringUtils.isNotEmpty(text.toString())) {
                    open.setEntry(text.toString());
                    entries.add(open);
                }
            }

        } catch (IOException e) {
            e.printStackTrace();
        }

        for (DiaryEntry entry : entries) {
            System.out.println(entry.toString());
        }
    }

From source file:us.colloquy.util.EpubExtractor.java

/**
 * Adds a {@code DocumentPointer} to {@code uriList} for every matching nav point found in the
 * {@code .ncx} files under {@code letterDirectory}.
 *
 * <p>Each {@code .ncx} file (at most six directory levels deep) is parsed with jsoup. The title
 * used for its pointers is the text of the last child of the last {@code docTitle} element. For
 * every child of a {@code navPoint} element that has a non-empty label, the {@code src} of its
 * {@code content} element (minus any {@code #...} suffix) is resolved against the file's parent
 * directory and recorded when the label matches one of the patterns below.
 *
 * <p>I/O and parsing failures are printed to standard error and not propagated.
 *
 * <p>NOTE(review): the {@code ?} characters inside the string literals look like non-ASCII
 * characters that were lost to an encoding problem. They are left untouched here because the
 * intended text cannot be recovered from this file — TODO restore from the original source.
 *
 * @param uriList         the set that receives the pointers; modified in place
 * @param letterDirectory the directory to search for {@code .ncx} files
 * @param useOnlyNumber   when {@code true}, a label that merely contains a 1-3 digit number is
 *                        also accepted
 */
public static void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory,
        boolean useOnlyNumber) {

    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> results = new ArrayList<>();

    // Search at most six directory levels below the given directory.
    int maxDepth = 6;

    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth,
            (path, attr) -> String.valueOf(path).endsWith(".ncx"))) {
        stream.forEach(results::add);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    try {

        for (Path res : results) {
            Path parent = res.getParent();

            // Use jsoup to list all files that contain something useful.
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            for (Element element : doc.getElementsByTag("docTitle")) {

                for (Element child : element.children()) {
                    // Each child overwrites the previous value; the last one wins.
                    title = child.text();
                }
            }

            for (Element element : doc.getElementsByTag("avantitul")) {

                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): this pattern is presumably a mangled non-ASCII phrase;
                        // as written it only matches labels that start with spaces — confirm.
                        if (label.matches(
                                "  ? ? .*")) {
                            System.out.println("------------------   " + label);
                        }
                    }
                }

            }

            for (Element element : doc.getElementsByTag("navPoint")) {

                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): a bare "?" is not a valid regular expression, so this
                        // call is expected to throw PatternSyntaxException (caught by the outer
                        // catch below) — restore the intended literal.
                        if (label.matches("?")) {
                            System.out.println("------------------ " + "?" + " -------------------");

                        } else if (label.contains(" ?")) {
                            // Stop looking at the remaining children of this nav point.
                            break;
                        }

                        String url = child.getElementsByTag("content").attr("src");

                        // NOTE(review): "[?--?A-Za-z]" was presumably a range of non-ASCII
                        // letters plus Latin letters; as written it may not compile — confirm.
                        if (label.matches(".*\\d{1,3}.*[?--?A-Za-z]+.*") && StringUtils.isNotEmpty(url)) {
                            // Drop everything from '#' onwards so only the file path remains.
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                            uriList.add(documentPointer);

                        } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url)
                                && useOnlyNumber) {
                            // Number-only labels are accepted when the caller asked for them.
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                            uriList.add(documentPointer);

                        } else {
                            // Labels that do not match are ignored.
                        }

                    }
                }
            }

        }
    } catch (Exception e) {
        e.printStackTrace();
    }

}