List of usage examples for org.jsoup.nodes Element tagName
public String tagName()
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
private void parseCast(Elements el, MediaCastMember.CastType type, MediaMetadata md) { if (el != null && !el.isEmpty()) { Element castEl = null;//w w w . j av a 2s . co m for (Element element : el) { if (!element.tagName().equals("option")) { // we get more, just do not take the optionbox castEl = element; } } if (castEl == null) { LOGGER.debug("meh, no " + type.name() + " found"); return; } // walk up to table TR... while (!((castEl == null) || (castEl.tagName().equalsIgnoreCase("tr")))) { castEl = castEl.parent(); } // ... and take the next table row ^^ Element tr = castEl.nextElementSibling(); if (tr != null) { for (Element a : tr.getElementsByAttributeValue("valign", "middle")) { String act = a.toString(); String aname = StrgUtils.substr(act, "alt=\"(.*?)\""); if (!aname.isEmpty()) { MediaCastMember cm = new MediaCastMember(); cm.setName(aname); String id = StrgUtils.substr(act, "id=(.*?)[^\"]\">"); if (!id.isEmpty()) { cm.setId(id); // thumb // http://www.ofdb.de/thumbnail.php?cover=images%2Fperson%2F7%2F7689.jpg&size=6 // fullsize ;) http://www.ofdb.de/images/person/7/7689.jpg try { String imgurl = URLDecoder .decode(StrgUtils.substr(act, "images%2Fperson%2F(.*?)&size"), "UTF-8"); if (!imgurl.isEmpty()) { imgurl = BASE_URL + "/images/person/" + imgurl; } cm.setImageUrl(imgurl); } catch (Exception e) { } } String arole = StrgUtils.substr(act, "\\.\\.\\. (.*?)</font>").replaceAll("<[^>]*>", ""); cm.setCharacter(arole); cm.setType(type); md.addCastMember(cm); } } } } }
From source file:org.wandora.application.tools.extractors.bookmark.BookmarkExtractor.java
private void parseCategory(Element c, TopicMap t) throws TopicMapException, ParseException { Topic wc = getWandoraClass(t);// w w w . jav a 2 s .c o m Topic root = getOrCreateTopic(t, ROOT_SI, "Bookmark"); root.setSubjectLocator(new Locator(ROOT_SI)); root.setDisplayName(LANG, "Bookmark"); makeSubclassOf(t, root, wc); for (Element child : c.children()) { if (child.tagName().equals("dt")) { for (Element grandChild : child.children()) { if (grandChild.tagName().equals("a")) parseItem(grandChild, root, t); else if (grandChild.tagName().equals("dl")) parseCategory(child, root, t); } } } parseCategory(c, root, t); }
From source file:org.wandora.application.tools.extractors.bookmark.BookmarkExtractor.java
/**
 * Processes one category element below the given parent topic.
 *
 * <p>First pass: every {@code h3} child creates (or fetches) a category topic whose subject
 * locator is the parent's locator plus the URL-encoded heading HTML; the last such topic becomes
 * the current category. Second pass: every {@code dl > dt} entry is scanned, links are handed to
 * {@code parseItem} and nested {@code dl} elements are processed recursively.
 *
 * @param c      the element whose children are inspected
 * @param parent the topic used as parent (and as category when no {@code h3} child exists)
 * @param t      the topic map receiving the topics
 * @throws TopicMapException declared by the topic map operations used here
 * @throws ParseException    declared by the item parsing used here
 */
private void parseCategory(Element c, Topic parent, TopicMap t) throws TopicMapException, ParseException {
    Elements directChildren = c.children();

    // pass 1: headings define the category topic
    Topic category = parent;
    for (Element heading : directChildren) {
        if (!"h3".equals(heading.tagName())) {
            continue;
        }
        String locator = parent.getSubjectLocator().toString() + "/" + urlEncode(heading.html());
        String title = heading.ownText();

        category = getOrCreateTopic(t, locator);
        category.setSubjectLocator(new Locator(locator));
        category.setBaseName(title + " (Bookmark)");
        category.setDisplayName(LANG, title);
        makeSubclassOf(t, category, parent);
    }

    // pass 2: definition lists hold the items and the nested categories
    for (Element list : directChildren) {
        if ("dl".equals(list.tagName())) {
            for (Element entry : list.children()) {
                if ("dt".equals(entry.tagName())) {
                    for (Element inner : entry.children()) {
                        String innerTag = inner.tagName();
                        if ("a".equals(innerTag)) {
                            parseItem(inner, category, t);
                        } else if ("dl".equals(innerTag)) {
                            parseCategory(entry, category, t);
                        }
                    }
                }
            }
        }
    }
}
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
private void findRecommendations(@NotNull Document doc, @NotNull BilingualQueryResultBuilder resultBuilder) { // Determine all candidate nodes: Elements alternativeNodes = doc.select("div.cc > p > *"); Language currentLanguage = null;//ww w .ja v a2s . co m for (Element node : alternativeNodes) { // If the next node is a flagicon, try to determine the language for the next entries from the class name if (node.tagName().equals("span") && node.hasClass("flagicon")) { Set<String> classNames = node.classNames(); classNames.remove("flagicon"); for (String className : classNames) { Language candidate = Language.getExistingLanguageById(className); if (candidate != null) { currentLanguage = candidate; break; } } } else if (node.tagName().equals("a")) { String recommendationText = node.text(); DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder(); objectBuilder.setLanguage(currentLanguage).setGeneralForm(recommendationText); resultBuilder.addSimilarRecommendation(objectBuilder.build()); } } }
From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java
/**
 * Looks up the first {@code meta} element carrying {@code http-equiv="Content-Type"}.
 *
 * @return the value of that element's {@code content} attribute, or {@code null} when the
 *         document has no such element
 */
private String getMetaCharset() {
    String content = null;
    for (Element candidate : this.html5Document.getElementsByAttributeValue("http-equiv", "Content-Type")) {
        if (candidate.tagName().equals("meta")) {
            content = candidate.attr("content");
            break;
        }
    }
    return content;
}
From source file:uk.co.certait.htmlexporter.css.StyleMap.java
/**
 * Looks up the style registered under the element's tag name.
 *
 * @param element the element whose tag name is used as lookup key
 * @return whatever {@code styles} holds for that tag name
 */
private Style getStyleForTag(Element element) {
    String tag = element.tagName();
    return styles.get(tag);
}
From source file:uk.co.certait.htmlexporter.css.StyleMap.java
/**
 * Builds a style from the element's inline {@code style} attribute.
 *
 * @param element the element to inspect
 * @return the style created from the first parsed rule, or {@code null} when the element has
 *         no {@code style} attribute
 * @throws RuntimeException if the attribute value cannot be parsed; the parser's exception is
 *                          attached as the cause
 */
private Style getInlineStyle(Element element) {
    if (!element.hasAttr("style")) {
        return null;
    }

    // make sure the declaration list is terminated before wrapping it
    String inlineStyle = element.attr("style");
    if (!inlineStyle.endsWith(";")) {
        inlineStyle = inlineStyle + ";";
    }

    List<Rule> inlineRules;
    try {
        // wrap the declarations in a dummy "x" selector so they parse as a complete rule
        inlineRules = CSSParser.parse("x{" + inlineStyle + "}");
    } catch (Exception e) {
        // keep the parser's exception as cause so the real reason is not lost
        throw new RuntimeException("Error parsing inline style for element " + element.tagName(), e);
    }

    Rule rule = inlineRules.get(0);
    return generator.createStyle(rule, rule.getSelectors().get(0));
}
From source file:us.colloquy.sandbox.TestExtractor.java
@Test public void useJsoup() { String homeDir = System.getProperty("user.home"); System.out.println(homeDir);// w w w .j ava2s .c o m //JSOUP API allows to extract all elements of letters in files // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml"); File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html"); try { Document doc = Jsoup.parse(input, "UTF-8"); List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields String previousYear = ""; for (Element element : doc.getElementsByClass("section")) { Letter letter = new Letter(); StringBuilder content = new StringBuilder(); for (Element child : element.children()) { for (Attribute att : child.attributes()) { System.out.println(att.getKey() + " " + att.getValue()); } if ("center".equalsIgnoreCase(child.className())) { String toWhom = child.getElementsByTag("strong").text(); if (StringUtils.isEmpty(toWhom)) { toWhom = child.text(); // System.out.println(toWhom); } String[] toWhomArray = toWhom.split("(\\s\\s)|(,)"); for (String to : toWhomArray) { RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content } //check if there is anything else here and find date and place - it will be replaced if exists below String entireText = child.text(); String tail = entireText.replace(toWhom, ""); if (StringUtils.isNotEmpty(tail)) { RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present } // System.out.println("two whom\t " + child.getElementsByTag("strong").text() ); } else if ("Data".equalsIgnoreCase(child.className())) { if (child.getElementsByTag("em") != null && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) { RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(), previousYear); //most often date and place are enclosed in em tag if (letter.getDate() != null) { LocalDate localDate = 
letter.getDate().toInstant().atZone(ZoneId.systemDefault()) .toLocalDate(); int year = localDate.getYear(); previousYear = year + ""; } } // System.out.println("when and where\t " + child.getElementsByTag("em").text()); } else if ("petit".equalsIgnoreCase(child.className()) || "Textpetit_otstup".equalsIgnoreCase(child.className())) { letter.getNotes().add(child.text()); } else { //System.out.println(child.text() ); Elements elements = child.getElementsByTag("sup"); for (Element e : elements) { String value = e.text(); e.replaceWith(new TextNode("[" + value + "]", null)); } for (Element el : child.getAllElements()) { // System.out.println(el.tagName()); if ("sup".equalsIgnoreCase(el.tagName())) { content.append(" [" + el.text() + "] "); } else { content.append(el.text()); } } content.append("\n"); } // System.out.println(child.tag() + "\n" ); // System.out.println(child.outerHtml() + "\n" + child.text()); } letter.setContent(content.toString()); letters.add(letter); } ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter(); for (Letter letter : letters) { // if (letter.getDate() == null) // { // if (StringUtils.isNotEmpty(person.getLastName())) // { String json = ow.writeValueAsString(letter); System.out.println(json); // } //} } } catch (IOException e) { e.printStackTrace(); } }
From source file:utils.AutoLinkRenderer.java
/**
 * Checks whether the element's tag name is one of the ignored tag names (per the original
 * author: links and code tags).
 *
 * <p>The tag name is upper-cased with {@code Locale.ROOT} before the lookup, so the result does
 * not depend on the JVM's default locale (for example the Turkish dotted/dotless i).
 *
 * @param el the element to check
 * @return {@code true} if the upper-cased tag name is contained in {@code IGNORE_TAGNAME}
 */
private boolean isIgnoreElement(Element el) {
    String upperCaseTag = el.tagName().toUpperCase(java.util.Locale.ROOT);
    return ArrayUtils.contains(IGNORE_TAGNAME, upperCaseTag);
}
From source file:xxx.web.comments.debates.impl.ProConOrgCommentsParser.java
/** * Extracts the document of the quote/*from ww w . j a v a 2 s. c o m*/ * * @param textElement text quote element * @return plain string with paragraphs kept */ protected static String extractPlainTextFromTextElement(Element textElement) { StringBuilder sb = new StringBuilder(); for (Node childNode : textElement.childNodes()) { if (childNode instanceof Element) { Element childElement = (Element) childNode; String tagName = childElement.tagName(); if ("p".equals(tagName) || "span".equals(tagName)) { sb.append(childElement.text()); sb.append("\n"); } else if ("br".equals(tagName)) { // prevent double newlines sb = new StringBuilder(sb.toString().trim()); sb.append("\n"); } } else if (childNode instanceof TextNode) { TextNode textNode = (TextNode) childNode; sb.append(textNode.text()); } } // remove leading + ending quotes return Utils.normalize(sb.toString()).replaceAll("[(^\")(\"$)]", ""); }