List of usage examples for org.jsoup.nodes.Element.getAllElements()
public Elements getAllElements()
From source file: org.norvelle.addressdiscoverer.parse.structured.StructuredPageWebContactLink.java
/**
 * Attempt to find a URL-type link associated with the given Jsoup Element,
 * by looking at all the HREF attributes of the various subelements.
 *
 * <p>Every element returned by {@code element.getAllElements()} is inspected.
 * An {@code href} value counts as a web link unless it starts with
 * {@code mailto:}. Exactly one such link must be found; it is stored in
 * {@code address}.
 *
 * @param element the element whose subelements are searched for links
 * @throws DoesNotContainContactLinkException if no non-mailto href is found
 * @throws MultipleContactLinksOfSameTypeFoundException if more than one
 *         non-mailto href is found
 */
public StructuredPageWebContactLink(Element element)
        throws DoesNotContainContactLinkException, MultipleContactLinksOfSameTypeFoundException {
    super(element);

    // Parameterized (was a raw ArrayList) so the compiler checks element types.
    ArrayList<String> hrefs = new ArrayList<>();
    for (Element child : element.getAllElements()) {
        if (!child.hasAttr("href")) {
            continue;
        }
        String href = child.attr("href");
        // mailto: links are not web links, so they are skipped here.
        if (!href.startsWith("mailto:")) {
            hrefs.add(href);
        }
    }

    if (hrefs.isEmpty()) {
        throw new DoesNotContainContactLinkException();
    }
    if (hrefs.size() > 1) {
        // Duplicates are not collapsed: two hrefs with the same value still count as two.
        throw new MultipleContactLinksOfSameTypeFoundException("Multiple web links");
    }
    this.address = hrefs.get(0);
}
From source file: org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java
/** * Images placed in a link should have a non-empty text alternative to enable visitors who do not see the image to * follow the link.// ww w .j a va2s. c o m */ public void validateRpd7s4() { for (Element link : getElements(ELEM_LINK)) { // Look for images in the link. boolean hasNonEmptyAlt = false; for (Element child : getChildren(link, ELEM_IMG)) { if (StringUtils.isNotEmpty(getAttributeValue(child, ATTR_ALT))) { hasNonEmptyAlt = true; } } // Look for text in the link. boolean hasText = false; for (Element linkChild : link.getAllElements()) { if (linkChild.hasText()) { hasText = true; } } // Images in links must have a not empty alt attribute if there's no text in the link. assertTrue(Type.ERROR, "rpd7s4.links", hasNonEmptyAlt || hasText); } }
From source file: org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java
/**
 * Do not describe the mechanism behind following a link.
 *
 * <p>The forbidden phrases come from the comma-separated message
 * {@code rpd8s1.forbiddenLinkTexts}. Each element of every link that has text
 * is checked, case-insensitively, against each phrase.
 */
public void validateRpd8s1() {
    String[] rawTexts = messages.getString("rpd8s1.forbiddenLinkTexts").split(",");
    List<String> forbiddenLinkTexts = Arrays.asList(rawTexts);

    for (Element anchor : getElements(ELEM_LINK)) {
        for (Element descendant : anchor.getAllElements()) {
            if (!descendant.hasText()) {
                continue;
            }
            for (String forbidden : forbiddenLinkTexts) {
                boolean mentionsForbidden =
                        StringUtils.containsIgnoreCase(descendant.text(), forbidden);
                assertFalse(Type.ERROR, "rpd8s1.link", mentionsForbidden);
            }
        }
    }
}
From source file: us.colloquy.sandbox.TestExtractor.java
@Test public void useJsoup() { String homeDir = System.getProperty("user.home"); System.out.println(homeDir);/*from www.ja va 2 s . com*/ //JSOUP API allows to extract all elements of letters in files // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml"); File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html"); try { Document doc = Jsoup.parse(input, "UTF-8"); List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields String previousYear = ""; for (Element element : doc.getElementsByClass("section")) { Letter letter = new Letter(); StringBuilder content = new StringBuilder(); for (Element child : element.children()) { for (Attribute att : child.attributes()) { System.out.println(att.getKey() + " " + att.getValue()); } if ("center".equalsIgnoreCase(child.className())) { String toWhom = child.getElementsByTag("strong").text(); if (StringUtils.isEmpty(toWhom)) { toWhom = child.text(); // System.out.println(toWhom); } String[] toWhomArray = toWhom.split("(\\s\\s)|(,)"); for (String to : toWhomArray) { RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content } //check if there is anything else here and find date and place - it will be replaced if exists below String entireText = child.text(); String tail = entireText.replace(toWhom, ""); if (StringUtils.isNotEmpty(tail)) { RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present } // System.out.println("two whom\t " + child.getElementsByTag("strong").text() ); } else if ("Data".equalsIgnoreCase(child.className())) { if (child.getElementsByTag("em") != null && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) { RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(), previousYear); //most often date and place are enclosed in em tag if (letter.getDate() != null) { LocalDate localDate = 
letter.getDate().toInstant().atZone(ZoneId.systemDefault()) .toLocalDate(); int year = localDate.getYear(); previousYear = year + ""; } } // System.out.println("when and where\t " + child.getElementsByTag("em").text()); } else if ("petit".equalsIgnoreCase(child.className()) || "Textpetit_otstup".equalsIgnoreCase(child.className())) { letter.getNotes().add(child.text()); } else { //System.out.println(child.text() ); Elements elements = child.getElementsByTag("sup"); for (Element e : elements) { String value = e.text(); e.replaceWith(new TextNode("[" + value + "]", null)); } for (Element el : child.getAllElements()) { // System.out.println(el.tagName()); if ("sup".equalsIgnoreCase(el.tagName())) { content.append(" [" + el.text() + "] "); } else { content.append(el.text()); } } content.append("\n"); } // System.out.println(child.tag() + "\n" ); // System.out.println(child.outerHtml() + "\n" + child.text()); } letter.setContent(content.toString()); letters.add(letter); } ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter(); for (Letter letter : letters) { // if (letter.getDate() == null) // { // if (StringUtils.isNotEmpty(person.getLastName())) // { String json = ow.writeValueAsString(letter); System.out.println(json); // } //} } } catch (IOException e) { e.printStackTrace(); } }