List of usage examples for the org.jsoup.nodes.Element method getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java
/** * Avoid automatic redirection during interaction with forms. *///from w w w .j av a 2 s . c om public void validateRpd13s4() { for (Element form : getElements(ELEM_FORM)) { boolean hasSubmit = false; boolean hasDynamicSelect = false; for (Element input : form.getElementsByTag(ELEM_INPUT)) { String type = input.attr(ATTR_TYPE); if ("submit".equals(type) || "image".equals(type)) { hasSubmit = true; break; } } assertTrue(Type.ERROR, "rpd13s4.submit", hasSubmit); for (Element select : form.getElementsByTag("select")) { if (select.hasAttr("onchange")) { hasDynamicSelect = true; break; } } if (hasDynamicSelect) { addError(Type.WARNING, -1, -1, "rpd13s4.select"); } } }
From source file:ru.redcraft.pinterest4j.core.api.PinAPI.java
/**
 * Loads the comments displayed for the given pin.
 * <p>
 * The pin URL is requested, the response body is read as JSON and its "footer" string is parsed
 * as HTML. Each {@code div.comment} element yields one comment: the id comes from the "data"
 * attribute of the first "DeleteComment" element, the author from the href of the first link
 * inside "CommenterMeta" (slashes removed), and the text from what is left of "CommenterMeta"
 * once its links have been removed.
 *
 * @param pin the pin whose comments are requested
 * @return the extracted comments, in document order
 * @throws PinterestRuntimeException if the response cannot be read as JSON with a "footer" entry
 */
public List<Comment> getComments(Pin pin) {
    LOG.debug("Getting comments for pin = " + pin);
    String ajaxResponse = null;
    Document footer = null;
    try {
        ajaxResponse = new APIRequestBuilder(pin.getURL())
                .setErrorMessage(PIN_API_ERROR)
                .build()
                .getResponse()
                .getEntity(String.class);
        String footerHtml = new JSONObject(ajaxResponse).getString("footer");
        footer = Jsoup.parse(footerHtml);
    } catch (JSONException e) {
        throw new PinterestRuntimeException(PIN_API_ERROR + ajaxResponse, e);
    }

    List<Comment> comments = new ArrayList<Comment>();
    for (Element commentNode : footer.select("div.comment")) {
        Element deleteControl = commentNode.getElementsByClass("DeleteComment").first();
        long id = Long.parseLong(deleteControl.attr("data"));

        Element meta = commentNode.getElementsByClass("CommenterMeta").first();
        String userName = meta.getElementsByTag("a").first().attr("href").replace("/", "");
        User user = new LazyUser(userName, getApiManager());

        // Drop the links so that only the comment body remains in the text.
        meta.getElementsByTag("a").remove();
        comments.add(new CommentImpl(id, meta.text(), user, pin));
    }
    LOG.debug("Comments extracted: " + comments);
    return comments;
}
From source file:Search.DataManipulation.DataParser.java
public String getPrice(Document dom) { Elements priceClass = dom.select("button.price"); Element priceClass1 = priceClass.first(); Elements priceClass2 = priceClass1.getElementsByTag("span"); String price = priceClass2.last().ownText(); if (price.equalsIgnoreCase("install")) { price = "Free"; } else {//from www. j a v a 2s .co m String[] split = StringUtils.split(price); price = split[0]; } return price; }
From source file:Search.DataManipulation.DataParser.java
public String getThumbnails(Document dom) throws IOException { Elements thumbnailsClass = dom.getElementsByClass("thumbnails"); Elements thumbnails = thumbnailsClass.first().children(); List<String> imageArray = new ArrayList<String>(); for (Element images : thumbnails) { String imageTagUrl = images.getElementsByTag("img").first().attr("src"); byte[] imageByte = dataHandler.imageDownloader(imageTagUrl); if (imageByte.length == 0) { continue; }// w w w . j ava 2 s .c om String imageTag = Base64.getEncoder().encodeToString(imageByte); imageArray.add(imageTag); } return JSONValue.toJSONString(imageArray); }
From source file:solarrecorder.SolarRecorder.java
/**
 * Reads the production page at http://envoy/production and records the "Currently" and "Today"
 * rows in {@code envoyData}.
 * <p>
 * The table is taken to be the element that follows the first {@code h1}; only rows of its
 * first {@code tbody} that have exactly two {@code td} cells are considered.
 *
 * @throws IOException if the page cannot be fetched
 */
private void getProdData() throws IOException {
    org.jsoup.nodes.Document page = Jsoup.connect("http://envoy/production").get();
    Element heading = page.getElementsByTag("h1").first();
    Element table = heading.nextElementSibling();
    Element body = table.getElementsByTag("tbody").first();

    for (Element row : body.getElementsByTag("tr")) {
        Elements cells = row.getElementsByTag("td");
        if (cells.size() != 2) {
            continue;
        }
        String name = cells.first().text();
        String value = cells.last().text();
        if ("Currently".equals(name) || "Today".equals(name)) {
            envoyData.add(new EnvoyData(name, value));
        }
    }
}
From source file:solarrecorder.SolarRecorder.java
private void getSysData() throws IOException { org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy").get(); Elements allh2 = doc.getElementsByTag("h2"); for (Element h2 : allh2) { if (h2.text().equals("System Statistics")) { Elements tables = h2.parent().getElementsByTag("table"); Elements alltr = tables.first().getElementsByTag("tbody").first().getElementsByTag("tr"); for (Element tr : alltr) { Elements alltd = tr.getElementsByTag("td"); String name = alltd.first().text(); String value = alltd.last().text(); if (name.equals("Number of Microinverters Online")) { envoyData.add(new EnvoyData(name, value)); }//from w ww. ja v a 2s .c o m } } } }
From source file:us.colloquy.sandbox.FileProcessor.java
/**
 * Sandbox test that lists the documents referenced by the .ncx files found under
 * {@code ~/Documents/Tolstoy/openDiaries} (searched up to six levels deep).
 * <p>
 * Each .ncx file is parsed with jsoup; the children of every docTitle element are printed, and
 * for every non-empty navPoint child label the "src" of its content element is either collected
 * (fragment stripped, prefixed with the .ncx file's directory) or printed. The collected paths
 * are printed in sorted order at the end. Nothing is asserted.
 * <p>
 * NOTE(review): the regex literals {@code "?"} and {@code "[?--?]"} below look like non-ASCII
 * (presumably Cyrillic) text that was lost to a character-encoding problem. As written,
 * {@code "?"} is a dangling quantifier and {@code ?--} is a reversed character range, so
 * {@code String.matches} is expected to throw PatternSyntaxException on the first non-empty
 * label; the surrounding catch would then print the stack trace and end the scan. Restore the
 * intended patterns from the original source before relying on this code.
 */
@Test
public void listAllUzipedFiles() {
    // Other locations used previously:
    //   /Documents/Tolstoy/diaries
    //   System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters"
    Path pathToLetters = FileSystems.getDefault()
            .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries");
    List<Path> results = new ArrayList<>();
    int maxDepth = 6;
    // Collect every path ending in ".ncx"; the stream is closed by try-with-resources.
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {
        stream.forEach(results::add);
        // String joined = stream
        //         .sorted()
        //         .map(String::valueOf)
        //         .collect(Collectors.joining("; "));
        //
        // System.out.println("\nFound: " + joined);
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.out.println("files: " + results.size());
    // TreeSet keeps the collected paths unique and sorted.
    Set<String> uriList = new TreeSet<>();
    try {
        for (Path res : results) {
            Path parent = res.getParent();
            System.out.println("---------------------------------------------");
            System.out.println(parent.toString());
            // use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");
            for (Element element : doc.getElementsByTag("docTitle")) {
                // Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    System.out.println("Title: " + child.text());
                }
            }
            for (Element element : doc.getElementsByTag("navPoint")) {
                // Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    String label = child.text();
                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): "?" is not a valid regex; literal appears mis-encoded.
                        if (label.matches("?")) {
                            System.out.println("------------------");
                        }
                        String url = child.getElementsByTag("content").attr("src");
                        // NOTE(review): "[?--?]" appears mis-encoded as well (reversed range).
                        if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
                            // Strip the "#fragment" so each file is listed once.
                            uriList.add(parent.toString() + File.separator + url.replaceAll("#.*", ""));
                            // System.out.println("nav point: " + label + " src " + parent.toString()
                            //         + System.lineSeparator() + url.replaceAll("#.*",""));
                        } else {
                            // Labels that do not qualify are only printed.
                            System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    System.out.println("Size: " + uriList.size());
    for (String uri : uriList) {
        // parse and
        System.out.println(uri);
    }
}
From source file:us.colloquy.sandbox.TestExtractor.java
/**
 * Sandbox test that parses {@code samples/pisma-1904/OEBPS/Text/single_doc.html} with jsoup,
 * builds one {@code Letter} per element with class "section" and prints each letter as
 * pretty-printed JSON. Nothing is asserted.
 * <p>
 * Children of a section are dispatched on their class name:
 * <ul>
 *   <li>"center" - addressee(s), plus an optional trailing date/place;</li>
 *   <li>"Data" - date and place taken from the em element;</li>
 *   <li>"petit" / "Textpetit_otstup" - added to the letter's notes;</li>
 *   <li>anything else - appended to the letter content.</li>
 * </ul>
 * The year of the most recently parsed date is carried forward in {@code previousYear} and
 * passed to the date parser for the following letters.
 */
@Test
public void useJsoup() {
    String homeDir = System.getProperty("user.home");
    System.out.println(homeDir);
    // JSOUP API allows to extract all elements of letters in files
    // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml");
    File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html");
    try {
        Document doc = Jsoup.parse(input, "UTF-8");
        List<Letter> letters = new ArrayList<>(); // our model contains only a subset of fields
        String previousYear = "";
        for (Element element : doc.getElementsByClass("section")) {
            Letter letter = new Letter();
            StringBuilder content = new StringBuilder();
            for (Element child : element.children()) {
                // Debug output: dump every attribute of the child.
                for (Attribute att : child.attributes()) {
                    System.out.println(att.getKey() + " " + att.getValue());
                }
                if ("center".equalsIgnoreCase(child.className())) {
                    // Prefer the text of the strong element; fall back to the whole text.
                    String toWhom = child.getElementsByTag("strong").text();
                    if (StringUtils.isEmpty(toWhom)) {
                        toWhom = child.text();
                        // System.out.println(toWhom);
                    }
                    // Addressees are separated by a comma or by two whitespace characters.
                    String[] toWhomArray = toWhom.split("(\\s\\s)|(,)");
                    for (String to : toWhomArray) {
                        RussianDate.parseToWhom(letter, to); // here we need to recognize a russian name and store that but for now we store the content
                    }
                    // check if there is anything else here and find date and place - it will be replaced if exists below
                    String entireText = child.text();
                    String tail = entireText.replace(toWhom, "");
                    if (StringUtils.isNotEmpty(tail)) {
                        RussianDate.parseDateAndPlace(letter, tail, previousYear); // a parser that figures out date and place if they are present
                    }
                    // System.out.println("two whom\t " + child.getElementsByTag("strong").text() );
                } else if ("Data".equalsIgnoreCase(child.className())) {
                    // NOTE(review): getElementsByTag is documented to return a (possibly empty)
                    // Elements, so the null check looks redundant - confirm.
                    if (child.getElementsByTag("em") != null && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) {
                        RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(), previousYear); // most often date and place are enclosed in em tag
                        if (letter.getDate() != null) {
                            // Remember the year (system default time zone) for the next letters.
                            LocalDate localDate = letter.getDate().toInstant().atZone(ZoneId.systemDefault())
                                    .toLocalDate();
                            int year = localDate.getYear();
                            previousYear = year + "";
                        }
                    }
                    // System.out.println("when and where\t " + child.getElementsByTag("em").text());
                } else if ("petit".equalsIgnoreCase(child.className())
                        || "Textpetit_otstup".equalsIgnoreCase(child.className())) {
                    letter.getNotes().add(child.text());
                } else {
                    // System.out.println(child.text() );
                    // Turn every sup element into a bracketed text node, e.g. "[1]".
                    Elements elements = child.getElementsByTag("sup");
                    for (Element e : elements) {
                        String value = e.text();
                        e.replaceWith(new TextNode("[" + value + "]", null));
                    }
                    // NOTE(review): the sup elements were just replaced above, so the "sup"
                    // branch below presumably never runs. Also, getAllElements() appears to
                    // include child itself plus its descendants, so nested text may be
                    // appended more than once - confirm the intended output.
                    for (Element el : child.getAllElements()) {
                        // System.out.println(el.tagName());
                        if ("sup".equalsIgnoreCase(el.tagName())) {
                            content.append(" [" + el.text() + "] ");
                        } else {
                            content.append(el.text());
                        }
                    }
                    content.append("\n");
                }
                // System.out.println(child.tag() + "\n" );
                // System.out.println(child.outerHtml() + "\n" + child.text());
            }
            letter.setContent(content.toString());
            letters.add(letter);
        }
        ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter();
        for (Letter letter : letters) {
            // if (letter.getDate() == null)
            // {
            //     if (StringUtils.isNotEmpty(person.getLastName()))
            //     {
            String json = ow.writeValueAsString(letter);
            System.out.println(json);
            //     }
            // }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:us.colloquy.util.DiaryParser.java
/**
 * Replaces each {@code sup} element found through {@code child.getElementsByTag("sup")} with a
 * plain text node holding the element's text wrapped in square brackets.
 *
 * @param child the element whose {@code sup} elements are rewritten in place
 */
private static void replaceSupTag(Element child) {
    for (Element sup : child.getElementsByTag("sup")) {
        TextNode bracketed = new TextNode("[" + sup.text() + "]", null);
        sup.replaceWith(bracketed);
    }
}
From source file:us.colloquy.util.EpubExtractor.java
/**
 * Scans {@code letterDirectory} (up to six levels deep) for .ncx files and adds a
 * {@code DocumentPointer} to {@code uriList} for every navPoint entry whose label qualifies.
 * <p>
 * For each .ncx file the title is the text of the last child of the last docTitle element
 * (empty when there is none). A pointer combines the .ncx file's directory, the entry's content
 * "src" with any "#fragment" removed, and that title. I/O and parsing failures are printed via
 * {@code printStackTrace} and not propagated.
 * <p>
 * NOTE(review): several string literals below ({@code " ? ? .*"}, {@code "?"}, {@code " ?"},
 * {@code "[?--?A-Za-z]"}) look like non-ASCII (presumably Cyrillic) text lost to a
 * character-encoding problem. As written, {@code "?"} is a dangling quantifier and {@code ?--}
 * is a reversed character range, so {@code String.matches} is expected to throw
 * PatternSyntaxException on the first non-empty navPoint label; the catch below would then end
 * the scan with nothing added. Restore the intended literals from the original source.
 *
 * @param uriList         receives the document pointers that were found
 * @param letterDirectory root directory searched for .ncx files
 * @param useOnlyNumber   when true, labels that merely contain a 1-3 digit number also qualify
 */
public static void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory,
        boolean useOnlyNumber) {
    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);
    List<Path> results = new ArrayList<>();
    int maxDepth = 6;
    // Collect every path ending in ".ncx"; the stream is closed by try-with-resources.
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth,
            (path, attr) -> String.valueOf(path).endsWith(".ncx"))) {
        stream.forEach(results::add);
        // String joined = stream
        //         .sorted()
        //         .map(String::valueOf)
        //         .collect(Collectors.joining("; "));
        //
        // System.out.println("\nFound: " + joined);
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.out.println("files: " + results.size());
    try {
        for (Path res : results) {
            Path parent = res.getParent();
            // System.out.println("---------------------------------------------");
            // System.out.println(parent.toString());
            // use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");
            String title = "";
            for (Element element : doc.getElementsByTag("docTitle")) {
                // Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    title = child.text();
                    // System.out.println("Title: " + title);
                }
            }
            // Diagnostic output only: nothing from "avantitul" is stored.
            for (Element element : doc.getElementsByTag("avantitul")) {
                for (Element child : element.children()) {
                    String label = child.text();
                    if (StringUtils.isNotEmpty(label)) {
                        if (label.matches(" ? ? .*")) {
                            System.out.println("------------------ " + label);
                        }
                    }
                }
            }
            for (Element element : doc.getElementsByTag("navPoint")) {
                // Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    String label = child.text();
                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): "?" is not a valid regex; literal appears mis-encoded.
                        if (label.matches("?")) {
                            System.out.println("------------------ " + "?"
                                    + " -------------------");
                        } else if (label.contains(" ?")) {
                            // Stops processing the remaining children of this navPoint.
                            break;
                        }
                        String url = child.getElementsByTag("content").attr("src");
                        // NOTE(review): "[?--?A-Za-z]" appears mis-encoded (reversed range).
                        if (label.matches(".*\\d{1,3}.*[?--?A-Za-z]+.*") && StringUtils.isNotEmpty(url)) {
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);
                            uriList.add(documentPointer);
                            // System.out.println("nav point: " + label + " src " + parent.toString()
                            //         + System.lineSeparator() + url.replaceAll("#.*",""));
                        } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) {
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);
                            uriList.add(documentPointer);
                            // System.out.println("nav point: " + label + " src " + parent.toString()
                            //         + System.lineSeparator() + url.replaceAll("#.*",""));
                        } else {
                            // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    // System.out.println("Size: " + uriList.size());
    // for (DocumentPointer pointer : uriList)
    // {
    //     // parse and
    //     System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
    // }
}