List of usage examples for org.jsoup.nodes.Element#nextElementSibling()
public Element nextElementSibling()
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
private static List<DocBlock> getDocBlock(String jdocBase, Element elem, ClassDocumentation reference) { if (elem != null) { String baseLink = JDocUtil.getLink(jdocBase, reference); List<DocBlock> blocks = new ArrayList<>(10); String hashLink = null;/* w w w. ja va2 s .c om*/ for (elem = elem.nextElementSibling(); elem != null; elem = elem.nextElementSibling()) { if (elem.tagName().equals("a")) { hashLink = '#' + elem.attr("name"); } else if (elem.tagName().equals("ul")) { Element tmp = elem.getElementsByTag("h4").first(); String title = JDocUtil.fixSpaces(tmp.text().trim()); String description = "", signature = ""; OrderedMap<String, List<String>> fields = new ListOrderedMap<>(); for (; tmp != null; tmp = tmp.nextElementSibling()) { if (tmp.tagName().equals("pre")) { //contains full signature signature = JDocUtil.fixSpaces(tmp.text().trim()); } else if (tmp.tagName().equals("div") && tmp.className().equals("block")) { //main block of content (description or deprecation) Element deprecationElem = tmp.getElementsByClass("deprecationComment").first(); if (deprecationElem != null) { //deprecation block fields.put("Deprecated:", Collections .singletonList(JDocUtil.formatText(deprecationElem.html(), baseLink))); } else { //description block description = JDocUtil.formatText(tmp.html(), baseLink); } } else if (tmp.tagName().equals("dl")) { //a field String fieldName = null; List<String> fieldValues = new ArrayList<>(); for (Element element : tmp.children()) { if (element.tagName().equals("dt")) { if (fieldName != null) { fields.put(fieldName, fieldValues); fieldValues = new ArrayList<>(); } fieldName = JDocUtil.fixSpaces(element.text().trim()); } else if (element.tagName().equals("dd")) { fieldValues.add(JDocUtil.formatText(element.html(), baseLink)); } } if (fieldName != null) { fields.put(fieldName, fieldValues); } } } blocks.add(new DocBlock(title, hashLink, signature, description, fields)); } } return blocks; } return null; }
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
/**
 * Maps lower-cased inherited method names to the lower-cased name of the class they are inherited from.
 *
 * <p>Looks in the parent of {@code summaryAnchor} for anchors whose {@code name} starts with
 * {@code methods.inherited.from.class}. Each such anchor is expected to have exactly two sibling
 * elements: an {@code <h3>} directly after it, followed by a {@code <code>} element whose {@code <a>}
 * children are the inherited methods. When the last child of the {@code <h3>} is not an {@code <a>},
 * the anchor is skipped. The first class seen for a method name wins ({@code putIfAbsent}).
 *
 * @param summaryAnchor anchor of the method summary section; may be {@code null}
 * @return the mapping, empty when {@code summaryAnchor} is {@code null} or has no parent
 * @throws RuntimeException if the HTML around an inherited-methods anchor does not have the expected shape
 */
private static Map<String, String> getInheritedMethods(Element summaryAnchor) {
    Map<String, String> inherited = new HashMap<>();
    if (summaryAnchor == null) {
        return inherited;
    }
    Element container = summaryAnchor.parent();
    if (container == null) {
        // A detached anchor has no surrounding section to search.
        return inherited;
    }
    Elements inheritAnchors = container.select("a[name^=\"methods.inherited.from.class\"]");
    for (Element inheritAnchor : inheritAnchors) {
        String unexpectedHtml = "Got unexpected html while parsing inherited methods from class "
                + inheritAnchor.attr("name");
        if (inheritAnchor.siblingElements().size() != 2) {
            throw new RuntimeException(unexpectedHtml);
        }
        // The null checks turn a missing sibling into the same descriptive error as a wrong tag.
        Element heading = inheritAnchor.nextElementSibling();
        if (heading == null || !heading.tagName().equals("h3")) {
            throw new RuntimeException(unexpectedHtml);
        }
        Element classLink = heading.children().last();
        if (classLink == null || !classLink.tagName().equals("a")) {
            continue;
        }
        String parent = classLink.text().toLowerCase();
        Element methodList = heading.nextElementSibling();
        if (methodList == null || !methodList.tagName().equals("code")) {
            throw new RuntimeException(unexpectedHtml);
        }
        for (Element method = methodList.children().first(); method != null; method = method.nextElementSibling()) {
            if (method.tagName().equals("a")) {
                inherited.putIfAbsent(method.text().toLowerCase(), parent);
            }
        }
    }
    return inherited;
}
From source file:com.screenslicer.common.CommonUtil.java
public static List<Element> getNextSiblingElementsByOwnText(Document doc, String text) { Elements elements = doc.getElementsMatchingOwnText(text); List<Element> siblings = new ArrayList<Element>(); if (elements == null || elements.isEmpty()) { return siblings; }//from w ww.j ava 2 s . c om Element element = elements.get(0).nextElementSibling(); while (element != null) { siblings.add(element); String tagName = element.tagName(); element = element.nextElementSibling(); if (element != null && !element.tagName().equalsIgnoreCase(tagName)) { break; } } return siblings; }
From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java
/**
 * Extracts the text of each content section that follows a {@code name="generator"} element.
 *
 * <p>For every element whose {@code name} attribute is {@code generator}, the text of its following
 * element siblings is concatenated until the next {@code <meta>} element (or the end of the sibling
 * list). Sections whose text is empty after {@code removeNonBreakingSpacesAndTrim} are dropped.
 *
 * @param htmlDoc parsed document
 * @return the non-empty section texts, in document order
 */
private ImmutableList<String> druckSachenContents(Document htmlDoc) {
    /*
     * In this way we can identify the bits of "RTF" like text inserted into the overall HTML.
     * JSoup cleans up the broken HTML removing the xml declaration and inserted html roots
     * that ALLRIS manages to put in.
     */
    Elements contentMetaElements = htmlDoc.getElementsByAttributeValue("name", "generator");
    ImmutableList.Builder<String> listBuilder = ImmutableList.builder();

    // Iterate over our candidates. Sometimes there are several.
    for (Element contentMetaElement : contentMetaElements) {
        StringBuilder contentAsTextBuilder = new StringBuilder();
        /*
         * In the cleaned up HTML DOM returned by JSoup the "RTF" content is rendered as siblings of
         * the meta node. Compare the tag NAME: Element.tag() returns a Tag object, which is never
         * equal to a String, so the previous check could not stop at the next meta element.
         */
        for (Element sibling = contentMetaElement.nextElementSibling();
                sibling != null && !"meta".equals(sibling.tagName());
                sibling = sibling.nextElementSibling()) {
            contentAsTextBuilder.append(sibling.text());
        }

        // Only carry over non-empty content.
        String contentAsText = contentAsTextBuilder.toString();
        if (!removeNonBreakingSpacesAndTrim(contentAsText).isEmpty()) {
            listBuilder.add(contentAsText);
        }
    }
    return listBuilder.build();
}
From source file:com.johan.vertretungsplan.parser.SVPlanParser.java
public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); // JSONArray urls = schule.getData().getJSONArray("urls"); String encoding = schule.getData().getString("encoding"); List<Document> docs = new ArrayList<Document>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); loadUrl(url.getString("url"), encoding, docs); }/* www . j a v a 2s .c o m*/ LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>(); for (Document doc : docs) { if (doc.select(".svp-tabelle").size() > 0) { VertretungsplanTag tag = new VertretungsplanTag(); String date = "Unbekanntes Datum"; if (doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").size() > 0) date = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").text(); else if (doc.title().startsWith("Vertretungsplan fr ")) date = doc.title().substring("Vertretungsplan fr ".length()); tag.setDatum(date); if (doc.select(".svp-uploaddatum").size() > 0) tag.setStand(doc.select(".svp-uploaddatum").text().replace("Aktualisierung: ", "")); Elements rows = doc.select(".svp-tabelle tr"); String lastLesson = ""; for (Element row : rows) { if (row.hasClass("svp-header")) continue; Vertretung vertretung = new Vertretung(); List<String> affectedClasses = new ArrayList<String>(); for (Element column : row.select("td")) { if (!hasData(column.text())) { continue; } String type = column.className(); if (type.startsWith("svp-stunde")) { vertretung.setLesson(column.text()); lastLesson = column.text(); } else if (type.startsWith("svp-klasse")) affectedClasses = Arrays.asList(column.text().split(", ")); else if (type.startsWith("svp-esfehlt")) vertretung.setPreviousTeacher(column.text()); else if (type.startsWith("svp-esvertritt")) vertretung.setTeacher(column.text()); else if (type.startsWith("svp-fach")) vertretung.setSubject(column.text()); else if 
(type.startsWith("svp-bemerkung")) { vertretung.setDesc(column.text()); vertretung.setType(recognizeType(column.text())); } else if (type.startsWith("svp-raum")) vertretung.setRoom(column.text()); if (vertretung.getLesson() == null) vertretung.setLesson(lastLesson); } if (vertretung.getType() == null) { vertretung.setType("Vertretung"); } for (String klasse : affectedClasses) { KlassenVertretungsplan kv = tag.getKlassen().get(klasse); if (kv == null) kv = new KlassenVertretungsplan(klasse); kv.add(vertretung); tag.getKlassen().put(klasse, kv); } } List<String> nachrichten = new ArrayList<String>(); if (doc.select("h2:contains(Mitteilungen)").size() > 0) { Element h2 = doc.select("h2:contains(Mitteilungen)").first(); Element sibling = h2.nextElementSibling(); while (sibling != null && sibling.tagName().equals("p")) { for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText() .split("<br />\\s*<br />")) { if (hasData(nachricht)) nachrichten.add(nachricht); } sibling = sibling.nextElementSibling(); } } tag.setNachrichten(nachrichten); tage.put(date, tag); } else { throw new IOException("keine SVPlan-Tabelle gefunden"); } } Vertretungsplan v = new Vertretungsplan(); v.setTage(new ArrayList<VertretungsplanTag>(tage.values())); return v; }
From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java
private ImmutableMap<String, String> druckSachenProperties(Document htmlDoc) { ImmutableMap.Builder<String, String> mapBuilder = ImmutableMap.builder(); Elements keyElements = htmlDoc.getElementsByClass("kb1"); // td elements for (Element element : keyElements) { String key = removeNonBreakingSpacesAndTrim(element.text()); if (key.endsWith(":")) { key = key.substring(0, key.length() - 1); }//w w w . j av a 2s .c om if (element.nextElementSibling() != null && !element.nextElementSibling().hasAttr("kb1")) { String value = removeNonBreakingSpacesAndTrim(element.nextElementSibling().text()); if ((!key.isEmpty()) && (!value.isEmpty())) { mapBuilder.put(key, value); } } } return mapBuilder.build(); }
From source file:gov.medicaid.screening.dao.impl.BusinessLienDAOBean.java
/**
 * Returns the text of the element that directly follows the first {@code <dt>} whose own text
 * contains {@code label}.
 *
 * @param elements group of elements to search
 * @param label label to look for
 * @return the text of the following element, or an empty string when no such {@code <dt>} exists or it
 *         has no following element
 */
private String getValuePairOfLabel(Elements elements, String label) {
    Element labelElement = elements.select("dt:containsOwn(" + label + ")").first();
    if (labelElement == null) {
        return "";
    }
    Element valueElement = labelElement.nextElementSibling();
    if (valueElement == null) {
        return "";
    }
    return valueElement.text();
}
From source file:me.vertretungsplan.parser.SVPlanParser.java
/**
 * Parses one day of an SVPlan page from {@code svp} and adds it to {@code v}.
 *
 * <p>A day is recognised when {@code svp} contains a date element ({@code .svp-plandatum-heute},
 * {@code .svp-plandatum-morgen} or {@code .Titel}) or when the document title starts with the
 * "Vertretungsplan für " prefix. Table rows become {@link Substitution}s; empty lesson/class cells
 * inherit the value of the previous row. Planned teachers, absences and "Mitteilungen" paragraphs are
 * added to the day as messages.
 *
 * @param v   schedule the parsed day is added to
 * @param svp element containing the day's plan
 * @param doc the whole document (used for its title and for detecting {@code .svp-header} rows)
 * @throws IOException if no date can be found for the day
 */
private void parseSvPlanDay(SubstitutionSchedule v, Element svp, Document doc) throws IOException {
    // Written with a unicode escape so the umlaut survives re-encoding of this source file;
    // the prefix had degraded to "Vertretungsplan fr ", which no real page title starts with.
    // NOTE(review): setDate is not visible here; confirm it uses the same prefix.
    final String titlePrefix = "Vertretungsplan f\u00fcr ";

    SubstitutionScheduleDay day = new SubstitutionScheduleDay();
    if (svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() == 0
            && !doc.title().startsWith(titlePrefix)) {
        throw new IOException("keine SVPlan-Tabelle gefunden");
    }
    setDate(svp, doc, day);

    if (svp.select(".svp-tabelle, table:has(.Klasse)").size() > 0) {
        Elements rows = svp.select(".svp-tabelle tr, table:has(.Klasse) tr");
        // Loop-invariant: whether the document marks header rows with the svp-header class at all.
        boolean usesHeaderClass = !rows.isEmpty() && doc.select(".svp-header").size() > 0;
        String lastLesson = "";
        String lastClass = "";
        for (Element row : rows) {
            if ((usesHeaderClass && row.hasClass("svp-header"))
                    || row.select("th").size() > 0
                    || row.text().trim().equals("")) {
                continue;
            }

            Substitution substitution = new Substitution();

            for (Element column : row.select("td")) {
                String type = column.className();
                if (!hasData(column.text())) {
                    // Empty lesson/class cells repeat the value of the previous row.
                    if ((type.startsWith("svp-stunde") || type.startsWith("Stunde")) && hasData(lastLesson)) {
                        substitution.setLesson(lastLesson);
                    } else if ((type.startsWith("svp-klasse") || type.startsWith("Klasse"))
                            && hasData(lastClass)) {
                        substitution.getClasses().addAll(Arrays
                                .asList(lastClass.split(data.optString(PARAM_CLASS_SEPARATOR, ", "))));
                    }
                    continue;
                }
                if (type.startsWith("svp-stunde") || type.startsWith("Stunde")) {
                    substitution.setLesson(column.text());
                    lastLesson = column.text();
                } else if (type.startsWith("svp-klasse") || type.startsWith("Klasse")) {
                    substitution.getClasses().addAll(Arrays
                            .asList(column.text().split(data.optString(PARAM_CLASS_SEPARATOR, ", "))));
                    lastClass = column.text();
                } else if (type.startsWith("svp-esfehlt") || type.startsWith("Lehrer")) {
                    if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                        substitution.setPreviousTeacher(column.text());
                    }
                } else if (type.startsWith("svp-esvertritt") || type.startsWith("Vertretung")) {
                    if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                        substitution.setTeacher(column.text().replaceAll(" \\+$", ""));
                    }
                } else if (type.startsWith("svp-fach") || type.startsWith("Fach")) {
                    substitution.setSubject(column.text());
                } else if (type.startsWith("svp-bemerkung") || type.startsWith("Anmerkung")) {
                    substitution.setDesc(column.text());
                    String recognizedType = recognizeType(column.text());
                    substitution.setType(recognizedType);
                    substitution.setColor(colorProvider.getColor(recognizedType));
                } else if (type.startsWith("svp-raum") || type.startsWith("Raum")) {
                    substitution.setRoom(column.text());
                }
            }

            if (substitution.getType() == null) {
                substitution.setType("Vertretung");
                substitution.setColor(colorProvider.getColor("Vertretung"));
            }

            day.addSubstitution(substitution);
        }
    }

    Elements plannedTeachers = svp.select(".LehrerVerplant");
    if (plannedTeachers.size() > 0) {
        day.addMessage("<b>Verplante Lehrer:</b> " + plannedTeachers.text());
    }
    Elements absences = svp.select(".Abwesenheiten");
    if (absences.size() > 0) {
        day.addMessage("<b>Abwesenheiten:</b> " + absences.text());
    }

    Elements messageHeadings = svp.select("h2:contains(Mitteilungen)");
    if (messageHeadings.size() > 0) {
        Element sibling = messageHeadings.first().nextElementSibling();
        while (sibling != null && sibling.tagName().equals("p")) {
            for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText()
                    .split("<br />\\s*<br />")) {
                if (hasData(nachricht)) {
                    day.addMessage(nachricht);
                }
            }
            sibling = sibling.nextElementSibling();
        }
    } else {
        // Alternative layout: messages live in elements carrying the Mitteilungen class.
        for (Element p : svp.select(".Mitteilungen")) {
            for (String nachricht : TextNode.createFromEncoded(p.html(), null).getWholeText()
                    .split("<br />\\s*<br />")) {
                if (hasData(nachricht)) {
                    day.addMessage(nachricht);
                }
            }
        }
    }
    v.addDay(day);
}
From source file:edu.harvard.iq.safe.lockss.impl.LOCKSSPlatformStatusHtmlParser.java
/** * * @param is/*ww w . jav a 2 s .com*/ */ @Override public void getPlatformStatusData(InputStream is) { try { Document doc = DataUtil.load(is, "UTF-8", ""); Element body = doc.body(); // most of the target items are sandwitched by <b> tag // this can be used to reach each target item. String tmpCurrentTime = null; String tmpUpTime = null; String currentTime = null; Elements tags = body.getElementsByTag("b"); for (Element tag : tags) { // get the current-time string: for 1.52.3 or older daemons // this is the ony place to get it. String tagText = tag.text(); logger.log(Level.FINE, "working on tagText={0}", tagText); if (tagText.equals("Daemon Status")) { // find current time and up running currentTime = tag.parent().parent().text(); logger.log(Level.INFO, "currentTime text=[{0}]", currentTime); // "currentTime =Daemon Status lockss.statelib.lib.in.us (usdocspln group) 01:25:55 03/01/12, up 7d5h21m" tmstmpMatcher = currentTimeStampPattern.matcher(currentTime); if (tmstmpMatcher.find()) { logger.log(Level.INFO, "group 0={0}", tmstmpMatcher.group(0)); tmpCurrentTime = tmstmpMatcher.group(1); logger.log(Level.INFO, "Current Time:group 1={0}", tmpCurrentTime); tmpUpTime = tmstmpMatcher.group(2); logger.log(Level.INFO, "UpTime:group 2={0}", tmpUpTime); } } // get the remaining key-value sets if (fieldNameSet.contains(tagText)) { Element parent = tag.parent(); String fieldValue = parent.nextElementSibling().text(); logger.log(Level.FINE, "{0}={1}", new Object[] { tagText, fieldValue }); summaryInfoMap.put(tagText, fieldValue); } } // extract the daemon version and platform info that are located // at the bottom // these data are sandwitched by a <center> tag Elements ctags = body.getElementsByTag("center"); String version = null; String platform = null; for (Element ctag : ctags) { String cText = ctag.text(); logger.log(Level.FINE, "center tag Text={0}", cText); // cText is like this: // Daemon 1.53.3 built 28-Jan-12 01:06:36 on build7.lockss.org, Linux RPM 1 if 
(StringUtils.isNotBlank(cText) && ctag.child(0).nodeName().equals("font")) { String[] versionPlatform = cText.split(", "); if (versionPlatform.length == 2) { logger.log(Level.INFO, "daemon version={0};platform={1}", versionPlatform); version = DaemonStatusDataUtil.getDaemonVersion(versionPlatform[0]); platform = versionPlatform[1]; } else { // the above regex failed logger.log(Level.WARNING, "String-formatting differs; use pattern matching"); version = DaemonStatusDataUtil.getDaemonVersion(cText); int platformOffset = cText.lastIndexOf(", ") + 2; platform = cText.substring(platformOffset); logger.log(Level.INFO, "platform={0}", platform); } } } if (summaryInfoMap.containsKey("V3 Identity")) { String ipAddress = DaemonStatusDataUtil.getPeerIpAddress(summaryInfoMap.get("V3 Identity")); logger.log(Level.INFO, "ipAddress={0}", ipAddress); if (StringUtils.isNotBlank(ipAddress)) { boxInfoMap.put("host", ipAddress); if (!ipAddress.equals(summaryInfoMap.get("IP Address"))) { summaryInfoMap.put("IP Address", ipAddress); } } else { logger.log(Level.WARNING, "host token is blank or null: use IP Address instead"); logger.log(Level.INFO, "IP Address={0}", summaryInfoMap.get("IP Address")); boxInfoMap.put("host", summaryInfoMap.get("IP Address")); } } // for pre-1.53.3 versions boxInfoMap.put("time", tmpCurrentTime); if (!summaryInfoMap.containsKey("Current Time")) { summaryInfoMap.put("Current Time", tmpCurrentTime); } boxInfoMap.put("up", tmpUpTime); if (!summaryInfoMap.containsKey("Uptime")) { summaryInfoMap.put("Uptime", tmpUpTime); } boxInfoMap.put("version", version); if (!summaryInfoMap.containsKey("Daemon Version")) { summaryInfoMap.put("Daemon Version", version); } boxInfoMap.put("platform", platform); if (!summaryInfoMap.containsKey("Platform")) { summaryInfoMap.put("Platform", platform); } } catch (IOException ex) { logger.log(Level.SEVERE, "IO error", ex); } logger.log(Level.INFO, "boxInfoMap={0}", boxInfoMap); logger.log(Level.INFO, "summaryInfo={0}", 
summaryInfoMap); }
From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java
/**
 * Extracts the people listed under "Principal Author" headings of a specification page.
 *
 * <p>Every {@code <dd>} that follows a {@code <dt>} containing "Principal Author" is a candidate.
 * Once a {@code <dd>} directly follows a {@code <dt>} that does not start with "principal author",
 * candidates are skipped until one is directly followed by an element whose text starts with
 * "principal author" again. The HTML of each remaining candidate is split on commas; each fragment
 * is parsed into a {@link Person}, and its links are stored as e-mail (href contains {@code @}) or
 * website.
 *
 * @param url URL of the specification, used only in the warning log message
 * @param doc parsed specification page
 * @return the people found, or {@code null} when there are no candidates or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> authors = new ArrayList<>();

    Elements candidates = doc.select("dt:contains(Principal Author) ~ dd");
    if (candidates.size() == 0) {
        return null;
    }

    boolean skipping = false;
    for (Element candidate : candidates) {
        Element before = candidate.previousElementSibling();
        if (before.tagName().equals("dt")
                && !before.text().trim().toLowerCase().startsWith("principal author")) {
            skipping = true;
        }

        if (skipping) {
            // Stop skipping once the element after this one announces principal authors again;
            // the current candidate is skipped either way.
            Element after = candidate.nextElementSibling();
            if (after != null && after.text().trim().toLowerCase().startsWith("principal author")) {
                skipping = false;
            }
            continue;
        }

        for (String fragment : candidate.html().split(",")) {
            if (fragment.isEmpty()) {
                continue;
            }

            String lowered = fragment.toLowerCase();
            boolean refersElsewhere = lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:");
            if (refersElsewhere) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (fragment.equals("WHATWG:") || fragment.equals("W3C:")) {
                continue;
            }

            Document parsed = Jsoup.parse(fragment.replaceAll("\n", ""));
            Person person = NameParser.parse(parsed.text());
            if (person == null) {
                continue;
            }

            for (Element link : parsed.select("a")) {
                String href = link.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }

            authors.add(person);
        }
    }

    if (authors.size() == 0) {
        return null;
    }
    return authors;
}