List of usage examples for org.jsoup.nodes.Element.children()
public Elements children()
From source file:poe.trade.assist.SearchForm.java
private String removeAllExceptSearchForm(String html) { String htmlDirectory = htmlDirectory(); Document doc = Jsoup.parse(html); // Remove stuff outside of id="main" // doc.body().children().stream().filter(e -> !"main".equalsIgnoreCase(e.id())).forEach(e -> e.remove()); Element head = doc.head(); // Replace everthing in the <head> head.children().stream().forEach(e -> e.remove()); head.appendElement("meta").attr("charset", "utf-8"); head.appendElement("meta").attr("name", "viewport").attr("content", "width=device-width"); head.appendElement("title").text("poe.trade.assist"); head.appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "packed.js"); head.appendElement("link").attr("rel", "stylesheet").attr("href", htmlDirectory + "packed_dark.css"); // Show search form Optional.ofNullable(doc.getElementById("search-form")).ifPresent(e -> e.attr("style", "")); Optional.ofNullable(doc.getElementById("search")) .ifPresent(e -> e.attr("action", "http://poe.trade/search")); // // w ww . j a v a 2 s . 
c o m // Element mainElement = doc.getElementById("main"); // Element topDivContainer = mainElement.child(0); // // // Remove everthing that is not id="content" or h2 // topDivContainer.children().stream() // .filter(e -> !"content".equalsIgnoreCase(e.id())) // .filter(e -> !e.tag().getName().equalsIgnoreCase("h2")) // .forEach(e -> e.remove()); // // // Clean up stuff inside id="content" // // // Remove "Show search form", "search/import" // Optional<Element> searchFormElem = doc.getElementsByTag("a").stream().filter(e -> e.hasClass("button") && e.hasClass("secondary") && e.hasClass("expand")).findFirst(); // searchFormElem.ifPresent(e -> e.remove()); // // Optional<Element> searchOrImportDiv = doc.getElementsByTag("div").stream().filter(e -> e.hasClass("row") && e.hasClass("form-choose-action")).findFirst(); // searchOrImportDiv.ifPresent(e -> e.remove()); // // // Remove search results Elements searchResultBlocks = doc.getElementsByClass("search-results-block"); if (searchResultBlocks.size() > 0) { searchResultBlocks.get(0).remove(); } // append assist as the last element in body // doc.body().appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "assist.js"); String cleanHtml = doc.toString(); return cleanHtml; }
From source file:poe.trade.assist.SearchView.java
private String addHeadElements(String html) { String htmlDirectory = htmlDirectory(); Document doc = Jsoup.parse(html); Element head = doc.head(); // Replace everthing in the <head> head.children().stream().forEach(e -> e.remove()); head.appendElement("meta").attr("charset", "utf-8"); head.appendElement("meta").attr("name", "viewport").attr("content", "width=device-width"); head.appendElement("title").text("poe.trade.assist"); head.appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "packed.js"); head.appendElement("link").attr("rel", "stylesheet").attr("href", htmlDirectory + "packed_dark.css"); doc.body().appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "assist.js"); String cleanHTML = doc.toString(); // try { // FileUtils.writeStringToFile(new File("test"), cleanHTML); // } catch (IOException e1) { // // TODO Auto-generated catch block // e1.printStackTrace(); // }/*from w w w . j a va2 s .c o m*/ return cleanHTML; }
From source file:Search.DataManipulation.DataParser.java
/**
 * Collects label/value pairs from the "meta-info" rows of a details section.
 *
 * <p>Every element with class {@code details-section-contents} is inspected; a section
 * qualifies when its first child element has class {@code meta-info}. If several sections
 * qualify, the last one is used. For each child of the chosen section, the own text of its
 * first child element is the key and the own text of its last child element is the value.
 * Entries keyed {@code Permissions}, {@code Report} or {@code Developer} are skipped.
 *
 * <p>Sections and rows without child elements are ignored, and an empty map is returned when
 * no section qualifies (previously this relied on an {@code assert}, which is a no-op unless
 * assertions are enabled, and then failed with a {@code NullPointerException}).
 *
 * @param dom the parsed document to read from
 * @return a mutable map of the collected pairs; never {@code null}, possibly empty
 */
public Map<String, String> getMetaData(Document dom) {
    Map<String, String> metaData = new HashMap<>();

    Elements details = null;
    for (Element section : dom.getElementsByClass("details-section-contents")) {
        // first() yields null for a section without child elements.
        Element firstChild = section.children().first();
        if (firstChild != null && firstChild.hasClass("meta-info")) {
            details = section.children();
        }
    }
    if (details == null) {
        return metaData;
    }

    for (Element detailElement : details) {
        Elements parts = detailElement.children();
        if (parts.isEmpty()) {
            // Nothing to read a label or value from.
            continue;
        }
        String area = parts.first().ownText();
        String value = parts.last().ownText();
        if (!(area.equals("Permissions") || area.equals("Report") || area.equals("Developer"))) {
            metaData.put(area, value);
        }
    }
    return metaData;
}
From source file:uk.co.certait.htmlexporter.writer.AbstractTableCellWriter.java
/** * Returns the actual text of the innermost child element for this cell. * /*from ww w .j av a 2s . c om*/ * @param element * * @return The text to be output for this Cell. */ public String getElementText(Element element) { String text = element.ownText(); for (Element child : element.children()) { text = child.ownText(); } return text; }
From source file:us.colloquy.sandbox.FileProcessor.java
/**
 * Exploratory test: finds every {@code .ncx} file below
 * {@code ~/Documents/Tolstoy/openDiaries} (up to 6 levels deep), prints the titles and
 * collects, in sorted order, the paths of the content files referenced by matching
 * {@code navPoint} entries, then prints them.
 *
 * <p>Both I/O failures and parsing failures are only printed via {@code printStackTrace()};
 * the test makes no assertions.
 *
 * <p>NOTE(review): the {@code ?} characters inside the regex literals below look like
 * non-ASCII characters lost to an encoding problem. As written, {@code "?"} is not a valid
 * regular expression - confirm the patterns against the original source.
 */
@Test
public void listAllUzipedFiles() {
    ///Documents/Tolstoy/diaries
    //System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters"
    Path pathToLetters = FileSystems.getDefault()
            .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries");

    List<Path> results = new ArrayList<>();

    int maxDepth = 6;

    // Files.find returns a stream that must be closed, hence try-with-resources.
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {
        stream.forEach(results::add);

        // String joined = stream
        //         .sorted()
        //         .map(String::valueOf)
        //         .collect(Collectors.joining("; "));
        //
        // System.out.println("\nFound: " + joined);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    // TreeSet: duplicates are dropped and the final listing is sorted.
    Set<String> uriList = new TreeSet<>();

    try {
        for (Path res : results) {
            Path parent = res.getParent();

            System.out.println("---------------------------------------------");
            System.out.println(parent.toString());

            // use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            for (Element element : doc.getElementsByTag("docTitle")) {
                //Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    System.out.println("Title: " + child.text());
                }
            }

            for (Element element : doc.getElementsByTag("navPoint")) {
                //Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        if (label.matches("?")) {
                            System.out.println("------------------");
                        }

                        String url = child.getElementsByTag("content").attr("src");

                        if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
                            // Drop the "#fragment" part so each content file is listed once.
                            uriList.add(parent.toString() + File.separator + url.replaceAll("#.*", ""));
                            // System.out.println("nav point: " + label + " src " + parent.toString()
                            //         + System.lineSeparator() + url.replaceAll("#.*",""));
                        } else {
                            // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (String uri : uriList) {
        //parse and
        System.out.println(uri);
    }
}
From source file:us.colloquy.sandbox.FileProcessor.java
@Test public void getURIForAllDiaries() { Set<DocumentPointer> uriList = new HashSet<>(); //String letterDirectory = System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries"; ////from ww w .j ava 2s. c om String letterDirectory = System.getProperty("user.home") + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49"; Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) { stream.forEach(results::add); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } // System.out.println("========================== " + res.toString() + " =========================="); boolean startPrinting = false; boolean newFile = true; for (Element element : doc.getElementsByTag("navPoint")) { //get nav label and content Element navLabelElement = element.select("navLabel").first(); Element srsElement = element.select("content").first(); String navLabel = ""; String srs = ""; if (navLabelElement != null) { navLabel = navLabelElement.text().replaceAll("\\*", "").trim(); } if (srsElement != null) { srs = srsElement.attr("src"); } if ("??".matches(navLabel)) { startPrinting = false; // System.out.println("----------------- end of file pointer ---------------"); } if 
(StringUtils.isNotEmpty(navLabel) && navLabel.matches("??.*|?? ?.*") && newFile) { newFile = false; startPrinting = true; title = navLabel; } if (startPrinting) { // System.out.println("----------------- file pointer ---------------"); // System.out.println(navLabel + "\t" + srs); DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + srs.replaceAll("#.*", ""), title); uriList.add(documentPointer); } // for (Element child : element.children()) // { // String label = child.text(); // // if (StringUtils.isNotEmpty(label)) // { // if (label.matches("??\\s\\d{4}.*")) // { // System.out.println("------------------"); // } // // String url = child.getElementsByTag("content").attr("src"); // // if (label.matches(".*\\d{1,3}.*[?--?]+.*") && // StringUtils.isNotEmpty(url)) // { // DocumentPointer letterPointer = new DocumentPointer(parent.toString() // + File.separator + url.replaceAll("#.*", ""), title); // // uriList.add(letterPointer); //// System.out.println("nav point: " + label + " src " + parent.toString() //// + System.lineSeparator() + url.replaceAll("#.*","")); // // // } else if (label.matches(".*\\d{1,3}.*") && // StringUtils.isNotEmpty(url) && useOnlyNumber) // { // DocumentPointer letterPointer = new DocumentPointer(parent.toString() // + File.separator + url.replaceAll("#.*", ""), title); // // uriList.add(letterPointer); //// System.out.println("nav point: " + label + " src " + parent.toString() //// + System.lineSeparator() + url.replaceAll("#.*","")); // // // } else // { // // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); // } // // // } // } } // System.out.println("========================== END OF FILE =========================="); } } catch (Exception e) { e.printStackTrace(); } System.out.println("Size: " + uriList.size()); for (DocumentPointer pointer : uriList) { //parse and System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); } }
From source file:us.colloquy.sandbox.TestExtractor.java
/**
 * Exploratory test: parses {@code samples/pisma-1904/OEBPS/Text/single_doc.html}, builds one
 * {@code Letter} per element with class {@code section}, and prints each letter as
 * pretty-printed JSON.
 *
 * <p>Each child of a section is dispatched on its class name: {@code center} (addressee, and
 * possibly date/place), {@code Data} (date/place), {@code petit} / {@code Textpetit_otstup}
 * (notes); anything else is appended to the letter content.
 *
 * <p>An {@code IOException} is only printed via {@code printStackTrace()}; the test makes no
 * assertions.
 */
@Test
public void useJsoup() {
    String homeDir = System.getProperty("user.home");

    System.out.println(homeDir);

    //JSOUP API allows to extract all elements of letters in files

    // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml");
    File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html");

    try {
        Document doc = Jsoup.parse(input, "UTF-8");

        List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields

        // Year of the most recently parsed date; passed to the date parser for later letters.
        String previousYear = "";

        for (Element element : doc.getElementsByClass("section")) {
            Letter letter = new Letter();

            StringBuilder content = new StringBuilder();

            for (Element child : element.children()) {
                for (Attribute att : child.attributes()) {
                    System.out.println(att.getKey() + " " + att.getValue());
                }

                if ("center".equalsIgnoreCase(child.className())) {
                    // Prefer the text of <strong> elements; fall back to the whole text.
                    String toWhom = child.getElementsByTag("strong").text();

                    if (StringUtils.isEmpty(toWhom)) {
                        toWhom = child.text();
                        // System.out.println(toWhom);
                    }

                    // Split on a double whitespace or a comma.
                    String[] toWhomArray = toWhom.split("(\\s\\s)|(,)");

                    for (String to : toWhomArray) {
                        RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content
                    }

                    //check if there is anything else here and find date and place - it will be replaced if exists below
                    String entireText = child.text();

                    String tail = entireText.replace(toWhom, "");

                    if (StringUtils.isNotEmpty(tail)) {
                        RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present
                    }

                    // System.out.println("two whom\t " + child.getElementsByTag("strong").text() );
                } else if ("Data".equalsIgnoreCase(child.className())) {
                    if (child.getElementsByTag("em") != null
                            && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) {
                        RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(), previousYear); //most often date and place are enclosed in em tag

                        if (letter.getDate() != null) {
                            // Remember the year for the letters that follow.
                            LocalDate localDate = letter.getDate().toInstant().atZone(ZoneId.systemDefault())
                                    .toLocalDate();
                            int year = localDate.getYear();
                            previousYear = year + "";
                        }
                    }

                    // System.out.println("when and where\t " + child.getElementsByTag("em").text());
                } else if ("petit".equalsIgnoreCase(child.className())
                        || "Textpetit_otstup".equalsIgnoreCase(child.className())) {
                    letter.getNotes().add(child.text());
                } else {
                    //System.out.println(child.text() );

                    // Replace every <sup> with a text node holding its text in square brackets.
                    Elements elements = child.getElementsByTag("sup");

                    for (Element e : elements) {
                        String value = e.text();

                        e.replaceWith(new TextNode("[" + value + "]", null));
                    }

                    // NOTE(review): the <sup> elements were replaced just above, so the "sup"
                    // branch below presumably never matches - confirm.
                    for (Element el : child.getAllElements()) {
                        // System.out.println(el.tagName());
                        if ("sup".equalsIgnoreCase(el.tagName())) {
                            content.append(" [" + el.text() + "] ");
                        } else {
                            content.append(el.text());
                        }
                    }

                    content.append("\n");
                }

                // System.out.println(child.tag() + "\n" );
                // System.out.println(child.outerHtml() + "\n" + child.text());
            }

            letter.setContent(content.toString());

            letters.add(letter);
        }

        ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter();

        for (Letter letter : letters) {
            // if (letter.getDate() == null)
            // {
            // if (StringUtils.isNotEmpty(person.getLastName()))
            // {
            String json = ow.writeValueAsString(letter);

            System.out.println(json);
            // }
            //}
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:us.colloquy.util.DiaryParser.java
/**
 * Exploratory test: parses one hard-coded diary XHTML file, splits each element with class
 * {@code section} into {@code DiaryEntry} objects and prints them.
 *
 * <p>Within a section the children are read in order. A child for which
 * {@code parseDateAndPlace} yields a date starts a new entry; the text accumulated so far is
 * stored on the previous entry first. Child text longer than 8 characters is appended to the
 * current accumulator.
 *
 * <p>An {@code IOException} is only printed via {@code printStackTrace()}; the test makes no
 * assertions.
 */
@Test
public void useJsoup() {
    //File input = new File(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries/dnevnik_1893(2)/OEBPS/Text/0001_1006_2001.xhtml");
    // File input = new File(System.getProperty("user.home") + "/IdeaProjects/ElasticTest/temp/dnevnik_1862(1)/OEBPS/Text/0001_1006_2001.xhtml");
    File input = new File(System.getProperty("user.home")
            + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49/OEBPS/Text/0001_1011_2005.xhtml");

    String previousYear = "";

    String sourse = "pointer";

    List<DiaryEntry> diaryEntrys = new ArrayList<>();

    try {
        Document doc = Jsoup.parse(input, "UTF-8");

        for (Element element : doc.getElementsByClass("section")) {
            // Entry currently being filled; null until the first date is found in this section.
            DiaryEntry diaryEntry = null;

            StringBuilder contentBuilder = new StringBuilder();

            for (Element child : element.children()) {
                // for (Attribute att : child.attributes())
                // {
                //     System.out.println(att.getKey() + " " + att.getValue());
                // }

                //we need to assume that each element is a continuation unless the entry is a date that starts a new entry
                //the problem is to distinguish between an entry that contains date and place vs date within an entry
                //lets try to see if element is a date
                DiaryEntry diaryEntryToCollectDate = new DiaryEntry();

                //we send it in two cases when text matches year or when text has em element
                Element em = child.select("em").first();

                if (em == null && StringUtils.isNotEmpty(child.text())) {
                    Matcher m = yearPattern.matcher(child.text());

                    if (m.find()) {
                        // Overwrites the child's text with capture group 1 before parsing;
                        // later reads of child.text() in this iteration see the new text.
                        child.text(m.group(1));

                        previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);
                    }
                }

                if (em != null) {
                    previousYear = parseDateAndPlace(previousYear, diaryEntryToCollectDate, child);
                }

                if (diaryEntryToCollectDate.getDate() != null) //this is the beginning of a new entry
                {
                    System.out.println("Found date: " + diaryEntryToCollectDate.getDate());

                    //create new DiaryEntry
                    if (diaryEntry != null) {
                        diaryEntry.setEntry(contentBuilder.toString()); //add consecutive entries here
                        diaryEntrys.add(diaryEntry);
                    }

                    diaryEntry = new DiaryEntry();

                    diaryEntry.setSource(sourse);
                    diaryEntry.setDate(diaryEntryToCollectDate.getDate());
                    diaryEntry.setPlace(diaryEntryToCollectDate.getPlace());

                    contentBuilder = new StringBuilder();
                }

                // Text of 8 characters or fewer is not appended.
                if (StringUtils.isNotEmpty(child.text()) && child.text().length() > 8) {
                    contentBuilder.append(child.text() + "\n");
                }

                // System.out.println(child.tag() + "\n");
                // System.out.println(child.outerHtml() + "\n" + child.text());
            }

            //whatever we still have, add here:
            if (StringUtils.isNotEmpty(contentBuilder.toString()) && diaryEntry != null) {
                diaryEntry.setEntry(contentBuilder.toString());
                diaryEntrys.add(diaryEntry);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    for (DiaryEntry diaryEntry : diaryEntrys) {
        System.out.println(diaryEntry.toString());
    }
}
From source file:us.colloquy.util.EpubExtractor.java
public static void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) { Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> String.valueOf(path).endsWith(".ncx"))) { stream.forEach(results::add);/* w ww . j a v a 2 s. c o m*/ // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } for (Element element : doc.getElementsByTag("avantitul")) { for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches( " ? ? .*")) { System.out.println("------------------ " + label); } } } } for (Element element : doc.getElementsByTag("navPoint")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches("?")) { System.out.println("------------------ " + "?" 
+ " -------------------"); } else if (label.contains(" ?")) { break; } String url = child.getElementsByTag("content").attr("src"); if (label.matches(".*\\d{1,3}.*[?--?A-Za-z]+.*") && StringUtils.isNotEmpty(url)) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else { // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); } } } } } } catch (Exception e) { e.printStackTrace(); } // System.out.println("Size: " + uriList.size()); // for (DocumentPointer pointer : uriList) // { // //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); // } }