List of usage examples for org.jsoup.nodes Element text
public String text()
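Before the examples from real projects below, here is a minimal standalone sketch (not taken from any of the listed source files) illustrating what text() returns: the normalized, whitespace-collapsed text of the element and all of its children. The class name and HTML snippet are illustrative only.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ElementTextExample {
    public static void main(String[] args) {
        // Parse a small HTML fragment and read the combined text of the <div>.
        Document doc = Jsoup.parse("<div><p>Hello, <b>world</b>!</p></div>");
        Element div = doc.select("div").first();
        System.out.println(div.text()); // prints: Hello, world!
    }
}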
From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java
/**
 * Do not describe the mechanism behind following a link.
 */
public void validateRpd8s1() {
    List<String> forbiddenLinkTexts =
            Arrays.asList(messages.getString("rpd8s1.forbiddenLinkTexts").split(","));

    for (Element link : getElements(ELEM_LINK)) {
        for (Element linkChild : link.getAllElements()) {
            if (linkChild.hasText()) {
                for (String forbiddenLinkText : forbiddenLinkTexts) {
                    assertFalse(Type.ERROR, "rpd8s1.link",
                            StringUtils.containsIgnoreCase(linkChild.text(), forbiddenLinkText));
                }
            }
        }
    }
}
From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java
/**
 * Links to e-mail addresses: the e-mail address to which the message is addressed
 * must be visible in the link text.
 */
public void validateRpd8s16() {
    for (Element link : getElements(ELEM_LINK)) {
        String href = getAttributeValue(link, ATTR_HREF);
        if (href != null && href.startsWith(MAILTO)) {
            String email = StringUtils.substringAfter(href, MAILTO);
            if (email.contains(QUERY_STRING_SEPARATOR)) {
                email = StringUtils.substringBefore(email, QUERY_STRING_SEPARATOR);
            }
            assertTrue(Type.ERROR, "rpd8s16.email", link.text().contains(email));
        }
    }
}
From source file:Project.FILER.java
// Returns an array of important strings from the file:
// the first element is the title, the second is all headers, the third is the img alt text.
public static String[] Dealing_Files(File f) throws IOException {
    Text = "";
    String[] Importants = { "", "", "" };
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");
    Importants[0] = doc.title(); // get the title of the file
    //Text = Text + " " + doc.title();

    String tag = "h";
    String All_Headers = "";
    Elements Header;
    for (int i = 1; i < 20; i++) { // loop to get the text of the header tags of the file
        tag = "h" + String.valueOf(i);
        Header = doc.select(tag);
        if (Header.size() > 0) {
            Header = doc.getElementsByTag(tag);
            String pConcatenated = "";
            for (Element x : Header) {
                pConcatenated += x.text() + " ";
            }
            All_Headers = All_Headers + pConcatenated;
        } else {
            break;
        }
    }
    Importants[1] = All_Headers;

    Text = Text + " " + doc.text(); // get the text of the document

    Elements img = doc.getElementsByTag("img"); // get the text of the img alt attributes
    for (Element element : img) {
        if (element.attr("alt") != null && !(element.attr("alt").equals(""))) {
            Text = Text + " " + element.attr("alt");
            Importants[2] = Importants[2] + " " + element.attr("alt");
        }
    }
    return Importants;
}
From source file:ru.redcraft.pinterest4j.core.api.PinAPI.java
public List<Comment> getComments(Pin pin) {
    LOG.debug("Getting comments for pin = " + pin);
    List<Comment> comments = new ArrayList<Comment>();
    Document doc = null;
    String axajResponse = null;
    try {
        axajResponse = new APIRequestBuilder(pin.getURL()).setErrorMessage(PIN_API_ERROR)
                .build().getResponse().getEntity(String.class);
        doc = Jsoup.parse(new JSONObject(axajResponse).getString("footer"));
    } catch (JSONException e) {
        throw new PinterestRuntimeException(PIN_API_ERROR + axajResponse, e);
    }
    for (Element comment : doc.select("div.comment")) {
        long id = Long.valueOf(comment.getElementsByClass("DeleteComment").first().attr("data"));
        Element contentMeta = comment.getElementsByClass("CommenterMeta").first();
        User user = new LazyUser(
                contentMeta.getElementsByTag("a").first().attr("href").replace("/", ""),
                getApiManager());
        contentMeta.getElementsByTag("a").remove();
        String text = contentMeta.text();
        comments.add(new CommentImpl(id, text, user, pin));
    }
    LOG.debug("Comments extracted: " + comments);
    return comments;
}
From source file:solarrecorder.SolarRecorder.java
private void getSysData() throws IOException {
    org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy").get();
    Elements allh2 = doc.getElementsByTag("h2");
    for (Element h2 : allh2) {
        if (h2.text().equals("System Statistics")) {
            Elements tables = h2.parent().getElementsByTag("table");
            Elements alltr = tables.first().getElementsByTag("tbody").first().getElementsByTag("tr");
            for (Element tr : alltr) {
                Elements alltd = tr.getElementsByTag("td");
                String name = alltd.first().text();
                String value = alltd.last().text();
                if (name.equals("Number of Microinverters Online")) {
                    envoyData.add(new EnvoyData(name, value));
                }
            }
        }
    }
}
From source file:synapticloop.documentr.generator.Generator.java
/**
 * Render the table of contents. This will also render links to the headers
 * and back-to-top links, if those options are enabled. The first thing we do
 * is remove any code fence blocks; we then convert the markdown to HTML to
 * extract the headers and generate the table of contents. Then we go through
 * the actual markdown and add in the links (if applicable); finally we insert
 * the table of contents and re-insert the code fence blocks.
 *
 * @param rendered The previously rendered string
 *
 * @return the rendered content, with the table of contents inserted
 */
private String renderTableOfContents(String rendered) {
    // the first thing we are going to do is to remove code fences...
    String renderedClean = removeCodeFenceBlocks(rendered);
    int numHeader = 0;

    // here we are going to render the markdown to HTML and then get all of the
    // header items to build the table of contents.
    StringBuilder headerStringBuilder = new StringBuilder("\n\n");
    PegDownProcessor pegDownProcessor = new PegDownProcessor();
    String markdownToHtml = pegDownProcessor.markdownToHtml(renderedClean);
    numHeader = 0;

    Document document = Jsoup.parse(markdownToHtml);
    Elements headings = document.select("h1, h2, h3, h4, h5, h6");
    for (Element heading : headings) {
        int valueOf = Integer.parseInt(heading.nodeName().substring(1));
        if (valueOf <= tocLevel) {
            if (hasTocLinks) {
                headerStringBuilder.append(SPACING_LOOKUP.get(valueOf) + "[" + heading.text()
                        + "](#documentr_heading_" + numHeader + ")\n");
            } else {
                headerStringBuilder.append(SPACING_LOOKUP.get(valueOf) + heading.text() + "\n");
            }
        }
        numHeader++;
    }
    headerStringBuilder.append("\n\n");

    // Now we have the header all set up
    numHeader = 0;

    // go through and parse the markdown, get all of the headers
    char[] charArray = renderedClean.toCharArray();
    RootNode rootNode = pegDownProcessor.parseMarkdown(charArray);
    List<Node> children = rootNode.getChildren();
    for (Node node : children) {
        if (node instanceof HeaderNode) {
            HeaderNode headerNode = (HeaderNode) node;
            int level = headerNode.getLevel();
            if (level <= tocLevel) {
                HEADER_LOOKUP.put(new StartEndBean(headerNode.getStartIndex(), headerNode.getEndIndex()),
                        numHeader);
            }
            numHeader++;
        }
    }

    if (hasTocLinks) {
        Iterator<StartEndBean> iterator = HEADER_LOOKUP.keySet().iterator();
        int start = 0;
        StringBuilder renderedStringBuilder = new StringBuilder();
        while (iterator.hasNext()) {
            StartEndBean startEndBean = (StartEndBean) iterator.next();
            int headerStart = startEndBean.getStart();
            int headerEnd = startEndBean.getEnd();
            Integer headerNum = HEADER_LOOKUP.get(startEndBean);
            renderedStringBuilder.append(Arrays.copyOfRange(charArray, start, headerStart));
            renderedStringBuilder.append("\n\n<a name=\"documentr_heading_" + headerNum + "\"></a>\n\n");
            if (hasTocBackToTop) {
                renderedStringBuilder.append(Arrays.copyOfRange(charArray, headerStart, headerEnd - 1));
                renderedStringBuilder.append(tocBackToTop);
                start = headerEnd - 1;
            } else {
                start = headerStart;
            }
        }
        renderedStringBuilder.append(Arrays.copyOfRange(charArray, start, charArray.length));
        renderedClean = renderedStringBuilder.toString();
    }

    renderedClean = renderedClean.replace(DOCUMENTR_TABLE_OF_CONTENTS, headerStringBuilder.toString());

    // last but not least, we need to put back in the code fences
    Iterator<Integer> codeFenceBlocksIterator = codeFenceBlocks.keySet().iterator();
    while (codeFenceBlocksIterator.hasNext()) {
        Integer integer = (Integer) codeFenceBlocksIterator.next();
        renderedClean = renderedClean.replace(
                String.format("%s%d%s", DOCUMENTR_CODE_FENCE_PREFIX, integer, DOCUMENTR_DELIMETER),
                codeFenceBlocks.get(integer).toString());
    }
    return renderedClean;
}
From source file:uk.bl.wa.parsers.HtmlFeatureParser.java
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final long start = System.nanoTime();
    // Pick up the URL:
    String url = metadata.get(Metadata.RESOURCE_NAME_KEY);

    // Parse it using JSoup
    Document doc = null;
    try {
        doc = Jsoup.parse(stream, null, url, parser);
    } catch (java.nio.charset.IllegalCharsetNameException e) {
        log.warn("Jsoup parse had to assume UTF-8: " + e);
        doc = Jsoup.parse(stream, "UTF-8", url);
    } catch (Exception e) {
        log.error("Jsoup parse failed: " + e);
    } finally {
        if (doc == null)
            return;
    }
    Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#jsoupparse", start);
    final long nonJsoupStart = System.nanoTime();

    // Record the number of errors found:
    if (parser.getErrors() != null)
        metadata.set(NUM_PARSE_ERRORS, parser.getErrors().size());

    // Get the links (no image links):
    Set<String> links = this.extractLinks(doc);
    if (links != null && links.size() > 0) {
        metadata.set(LINK_LIST, links.toArray(new String[links.size()]));
    }

    // Get the image links
    if (extractImageLinks) {
        Set<String> imageLinks = this.extractImageLinks(doc);
        if (imageLinks != null && imageLinks.size() > 0) {
            metadata.set(IMAGE_LINKS, imageLinks.toArray(new String[imageLinks.size()]));
        }
    }

    // Get the publication date, from BBC pages:
    for (Element meta : doc.select("meta[name=OriginalPublicationDate]")) {
        metadata.set(ORIGINAL_PUB_DATE, meta.attr("content"));
        //log.debug(ORIGINAL_PUB_DATE + ": " + meta.attr("content"));
    }

    // Grab the first paragraph with text, and extract the text:
    for (Element p : doc.select("p")) {
        String pt = p.text();
        if (pt != null) {
            pt = pt.trim();
            if (pt.length() > 0) {
                metadata.set(FIRST_PARAGRAPH, p.text());
                //log.debug(FIRST_PARAGRAPH + ": " + p.text());
                break;
            }
        }
    }

    // Grab the list of distinct elements used in the page:
    Set<String> de = new HashSet<String>();
    for (Element e : doc.select("*")) {
        // ELEMENT_NAME matching to weed out the worst false positives caused by JavaScript.
        // This handles cases such as "<script> if (3<a) console.log('something');" where 'a' would have
        // been seen as a tag, but does not handle cases such as "<script> if ( 3<a ) console.log('something');"
        // where the 'a' is still seen as a tag because it is followed by a space.
        if (!"#root".equals(e.tag().getName()) && ELEMENT_NAME.matcher(e.tag().getName()).matches()) {
            de.add(StringUtils.left(e.tag().getName().toLowerCase(Locale.ENGLISH), 100));
        }
    }
    // For some elements, dig deeper and record attributes too:
    for (Element e : doc.select("link")) {
        de.add("link/@rel=" + e.attr("rel"));
    }
    // Store them:
    metadata.set(DISTINCT_ELEMENTS, de.toArray(new String[] {}));

    // Licence field, following:
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/links.html#link-type-license
    for (Element a : doc.select("a[rel=license]")) {
        metadata.add(Metadata.LICENSE_URL, a.attr("href"));
    }
    for (Element a : doc.select("link[rel=license]")) {
        metadata.add(Metadata.LICENSE_URL, a.attr("href"));
    }
    for (Element a : doc.select("area[rel=license]")) {
        metadata.add(Metadata.LICENSE_URL, a.attr("href"));
    }
    Instrument.timeRel("HTMLAnalyzer.analyze#parser", "HtmlFeatureParser.parse#featureextract", nonJsoupStart);
}
From source file:us.colloquy.sandbox.FileProcessor.java
@Test
public void listAllUzipedFiles() {
    ///Documents/Tolstoy/diaries
    //System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters"
    Path pathToLetters = FileSystems.getDefault()
            .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries");

    List<Path> results = new ArrayList<>();
    int maxDepth = 6;

    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {
        stream.forEach(results::add);

        // String joined = stream
        //         .sorted()
        //         .map(String::valueOf)
        //         .collect(Collectors.joining("; "));
        //
        // System.out.println("\nFound: " + joined);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    Set<String> uriList = new TreeSet<>();

    try {
        for (Path res : results) {
            Path parent = res.getParent();

            System.out.println("---------------------------------------------");
            System.out.println(parent.toString());

            //use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            for (Element element : doc.getElementsByTag("docTitle")) {
                //Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    System.out.println("Title: " + child.text());
                }
            }

            for (Element element : doc.getElementsByTag("navPoint")) {
                //Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        if (label.matches("?")) {
                            System.out.println("------------------");
                        }

                        String url = child.getElementsByTag("content").attr("src");

                        if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
                            uriList.add(parent.toString() + File.separator + url.replaceAll("#.*", ""));

                            // System.out.println("nav point: " + label + " src " + parent.toString()
                            //         + System.lineSeparator() + url.replaceAll("#.*", ""));
                        } else {
                            // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (String uri : uriList) {
        //parse and
        System.out.println(uri);
    }
}
From source file:us.colloquy.sandbox.FileProcessor.java
@Test
public void listAllUzipedFilesContent() {
    ///Documents/Tolstoy/diaries
    Path pathToLetters = FileSystems.getDefault()
            .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/unzipLetters");

    List<Path> results = new ArrayList<>();
    int maxDepth = 6;

    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".opf");
    })) {
        stream.forEach(results::add);

        // String joined = stream
        //         .sorted()
        //         .map(String::valueOf)
        //         .collect(Collectors.joining("; "));
        //
        // System.out.println("\nFound: " + joined);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    Set<String> uriList = new TreeSet<>();

    try {
        for (Path res : results) {
            Path parent = res.getParent();

            System.out.println("---------------------------------------------");
            System.out.println(parent.toString());

            //use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            for (Element element : doc.getElementsByTag("dc:title")) {
                //Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                System.out.println(element.text());

                // for (Element child : element.children()) {
                //     System.out.println(child.tagName() + "\t" + child.text());
                // }
            }

            // for (Element element : doc.getElementsByTag("navPoint")) {
            //     //Letter letter = new Letter();
            //     // StringBuilder content = new StringBuilder();
            //
            //     for (Element child : element.children()) {
            //         String label = child.text();
            //
            //         if (StringUtils.isNotEmpty(label)) {
            //             if (label.matches("?")) {
            //                 System.out.println("------------------");
            //             }
            //
            //             String url = child.getElementsByTag("content").attr("src");
            //
            //             if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
            //                 uriList.add(parent.toString() + File.separator + url.replaceAll("#.*", ""));
            //                 // System.out.println("nav point: " + label + " src " + parent.toString()
            //                 //         + System.lineSeparator() + url.replaceAll("#.*", ""));
            //             } else {
            //                 // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
            //             }
            //         }
            //     }
            // }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (String uri : uriList) {
        //parse and
        System.out.println(uri);
    }
}
From source file:us.colloquy.sandbox.FileProcessor.java
@Test
public void getURIForAllDiaries() {
    Set<DocumentPointer> uriList = new HashSet<>();

    //String letterDirectory = System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries";

    String letterDirectory = System.getProperty("user.home")
            + "/Documents/Tolstoy/90-volume-set/diaries/uzip/dnevnik_1881-1887_vol_49";

    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> results = new ArrayList<>();
    int maxDepth = 6;

    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {
        stream.forEach(results::add);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    try {
        for (Path res : results) {
            Path parent = res.getParent();

            // System.out.println("---------------------------------------------");
            // System.out.println(parent.toString());

            //use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            for (Element element : doc.getElementsByTag("docTitle")) {
                //Letter letter = new Letter();
                // StringBuilder content = new StringBuilder();
                for (Element child : element.children()) {
                    title = child.text();
                    // System.out.println("Title: " + title);
                }
            }

            // System.out.println("========================== " + res.toString() + " ==========================");

            boolean startPrinting = false;
            boolean newFile = true;

            for (Element element : doc.getElementsByTag("navPoint")) {
                //get nav label and content
                Element navLabelElement = element.select("navLabel").first();
                Element srsElement = element.select("content").first();

                String navLabel = "";
                String srs = "";

                if (navLabelElement != null) {
                    navLabel = navLabelElement.text().replaceAll("\\*", "").trim();
                }

                if (srsElement != null) {
                    srs = srsElement.attr("src");
                }

                if ("??".matches(navLabel)) {
                    startPrinting = false;
                    // System.out.println("----------------- end of file pointer ---------------");
                }

                if (StringUtils.isNotEmpty(navLabel) && navLabel.matches("??.*|?? ?.*") && newFile) {
                    newFile = false;
                    startPrinting = true;
                    title = navLabel;
                }

                if (startPrinting) {
                    // System.out.println("----------------- file pointer ---------------");
                    // System.out.println(navLabel + "\t" + srs);

                    DocumentPointer documentPointer = new DocumentPointer(
                            parent.toString() + File.separator + srs.replaceAll("#.*", ""), title);

                    uriList.add(documentPointer);
                }

                // for (Element child : element.children()) {
                //     String label = child.text();
                //
                //     if (StringUtils.isNotEmpty(label)) {
                //         if (label.matches("??\\s\\d{4}.*")) {
                //             System.out.println("------------------");
                //         }
                //
                //         String url = child.getElementsByTag("content").attr("src");
                //
                //         if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
                //             DocumentPointer letterPointer = new DocumentPointer(parent.toString()
                //                     + File.separator + url.replaceAll("#.*", ""), title);
                //
                //             uriList.add(letterPointer);
                //         } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) {
                //             DocumentPointer letterPointer = new DocumentPointer(parent.toString()
                //                     + File.separator + url.replaceAll("#.*", ""), title);
                //
                //             uriList.add(letterPointer);
                //         } else {
                //             // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                //         }
                //     }
                // }
            }

            // System.out.println("========================== END OF FILE ==========================");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    for (DocumentPointer pointer : uriList) {
        //parse and
        System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
    }
}