List of usage examples for org.jsoup.nodes Element nextElementSibling
public Element nextElementSibling()
From source file:de.geeksfactory.opacclient.apis.Littera.java
/**
 * Returns the text of the element that follows the first {@code td.label}
 * cell whose own text matches {@code pattern}.
 *
 * @param detailTable element that is searched for label cells
 * @param pattern     regular expression, inserted verbatim into a
 *                    {@code :matchesOwn(...)} selector
 * @return the text of the element following the matching label cell, or
 *         {@code null} when no label cell matches or the matching cell has
 *         no following sibling element
 */
private String getCellContent(Element detailTable, String pattern) {
    final Element label = detailTable.select("td.label:matchesOwn(" + pattern + ")").first();
    if (label == null) {
        return null;
    }
    // nextElementSibling() yields null when the label is the last element of
    // its parent; report that the same way as "label not found".
    final Element value = label.nextElementSibling();
    return value == null ? null : value.text();
}
From source file:net.parser.JobParser.java
/**
 * Finds the first element selected by {@code cssQuery} whose {@code strong}
 * descendants have the combined text {@code helpStr} and returns the text of
 * the element that follows it.
 *
 * @param helpStr  text the {@code strong} content must equal
 * @param cssQuery selector evaluated against {@code doc}
 * @return the text of the element following the first match, or {@code null}
 *         when nothing matches or the first match has no following sibling
 *         element
 */
private String getPropertyString(String helpStr, String cssQuery) {
    for (Element element : doc.select(cssQuery)) {
        if (!element.getElementsByTag("strong").text().equals(helpStr)) {
            continue;
        }
        // Only the first match is considered, as before; a missing sibling
        // now yields null instead of a NullPointerException.
        Element value = element.nextElementSibling();
        return value == null ? null : value.text();
    }
    return null;
}
From source file:mergedoc.core.APIDocument.java
/**
 * Walks the entries selected from the {@code div.details} section of an API
 * page and stores one {@link Comment} per entry in {@code contextTable},
 * keyed by the signature built from the entry's {@code pre} element.
 *
 * @param className class name handed to the signature and link-formatting helpers
 * @param doc       parsed API document to scan
 */
private void parseMethodComment(String className, Document doc) {
    Elements elements = doc.select("body > div.contentContainer > div.details > ul > li > ul > li > ul > li");
    for (Element element : elements) {
        // Entries without a <pre> carry no signature and are skipped.
        Element sigElm = element.select("pre").first();
        if (sigElm == null) {
            continue;
        }
        String sigStr = sigElm.html();
        Signature sig = createSignature(className, sigStr);
        Comment comment = new Comment(sig);
        // deprecated text: taken from the first <div> only when the entry has exactly two
        String depre = "";
        Elements divs = element.select("div");
        if (divs.size() == 2) {
            depre = divs.get(0).html();
        }
        // The last <div> is used as the document body.
        if (divs.size() > 0) {
            String body = divs.last().html();
            body = formatLinkTag(className, body);
            comment.setDocumentBody(body);
        }
        Elements dtTags = element.select("dl dt");
        for (Element dtTag : dtTags) {
            String dtText = dtTag.text();
            // NOTE(review): this condition and the two dtText.contains(":")
            // checks further down are textually identical in this copy. Because
            // this branch ends in `continue`, the addReturn and addThrows
            // branches can never run as written. The literals presumably lost
            // non-ASCII label text -- confirm against the upstream file.
            if (dtText.contains(":")) {
                Element dd = dtTag;
                // Consume every <dd> that directly follows this <dt>.
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    // NOTE(review): first() is dereferenced unchecked; a <dd>
                    // without <code> would raise a NullPointerException.
                    String name = dd.select("code").first().text();
                    // NOTE(review): same literal as the enclosing condition, so
                    // the name is always wrapped in angle brackets here.
                    if (dtText.contains(":")) {
                        name = "<" + name + ">";
                    }
                    String items = dd.html();
                    // group(2) is the text after "<code>name</code> - ".
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        comment.addParam(name, desc);
                    }
                }
                continue;
            }
            if (dtText.contains(":")) {
                // NOTE(review): the sibling is dereferenced without a null check.
                Element dd = dtTag.nextElementSibling();
                String str = dd.html();
                str = formatLinkTag(className, str);
                comment.addReturn(str);
                continue;
            }
            if (dtText.contains(":")) {
                Element dd = dtTag;
                // Same <dd> walk as above, recording "name description" as a throws entry.
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    String name = dd.select("code").first().text();
                    String items = dd.html();
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        String param = name + " " + desc;
                        comment.addThrows(param);
                    }
                }
                continue;
            }
        }
        // deprecated tag
        parseDeprecatedTag(className, depre, comment);
        // NOTE(review): in the collapsed source this call directly follows an
        // empty "//" marker; it is read here as a live call -- confirm.
        parseCommonTag(className, element, comment);
        contextTable.put(sig, comment);
    }
}
From source file:mergedoc.core.APIDocument.java
/** * Javadoc ? ??????// w w w .j a v a 2 s . c o m * @param className ?? * @param context * @param comment */ private void parseCommonTag(String className, Element element, Comment comment) { Elements dts = element.select("dl dt"); for (Element dt : dts) { String dtText = dt.text(); if (dtText.contains("")) { Elements aTags = dt.nextElementSibling().select("a:has(code)"); for (Element a : aTags) { String url = a.attr("href"); String ref; if (a.childNodeSize() != 1) { ref = aTags.outerHtml(); } else { ref = formatClassName(className, url); ref = FastStringUtils.replace(ref, "%28", "("); ref = FastStringUtils.replace(ref, "%29", ")"); Pattern methodRefPat = PatternCache.getPattern("-(.*)-$"); Matcher methodRefMat = methodRefPat.matcher(ref); if (methodRefMat.find()) { ref = FastStringUtils.replaceAll(ref, "-(.*)-$", "($1)"); // for Java8 ref = FastStringUtils.replace(ref, "-", ","); // for Java8 ref = FastStringUtils.replace(ref, ":A", "[]"); // for Java8 } } comment.addSee(ref); } } else if (dtText.contains("???:")) { comment.addSince(dt.nextElementSibling().text()); } } }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("authors/editors") && !prev.text().trim().toLowerCase().startsWith("author/editor")) { skip = true;//from w w w.ja v a2 s . c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("authors/editors") || next.text().trim().toLowerCase().startsWith("author/editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", url + ": This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if 
(!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:gov.medicaid.screening.dao.impl.SocialWorkLicenseDAOBean.java
/** * Retrieves all results from the source site. * * @param searchCriteria the search criteria. * @return the providers matched//from w w w . java2 s. c o m * @throws URISyntaxException if the URL could not be correctly constructed * @throws IOException for any I/O related errors * @throws ServiceException for any other errors encountered */ private SearchResult<License> getAllResults(SocialWorkCriteria searchCriteria) throws URISyntaxException, IOException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager()); client.setRedirectStrategy(new LaxRedirectStrategy()); HttpGet getSearchPage = new HttpGet(new URIBuilder(getSearchURL()).build()); HttpResponse response = client.execute(getSearchPage); verifyAndAuditCall(getSearchURL(), response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); String licenseNo = ""; if (searchCriteria instanceof SocialWorkLicenseSearchByLicenseNumberCriteria) { licenseNo = "" + ((SocialWorkLicenseSearchByLicenseNumberCriteria) searchCriteria).getLicenseNumber(); } String level = "none"; if (searchCriteria.getLevel() != null) { level = Util.defaultString(searchCriteria.getLevel().getName()); } HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build()); HttpEntity entity = postForm(getSearchURL(), client, search, buildParams(searchCriteria, page, licenseNo, level, null), true); page = Jsoup.parse(EntityUtils.toString(entity)); List<License> allLicenses = new ArrayList<License>(); // check if detail page (single match) if (page.select("#lblFormTitle").text().equals("License Details")) { allLicenses.add(parseLicenseDetail(page)); } else { Elements rows = page.select(RESULT_ROWS_SELECTOR); while (rows.size() > 0) { for (Element row : rows) { License license = parseLicense(row.children()); if (license != null) { allLicenses.add(license); } } rows.clear(); // check for next page Element currentPage = page.select("#_ctl7_grdSearchResults tr.TablePager 
span").first(); getLog().log(Level.DEBUG, "Current page is: " + currentPage.text()); Element pageLink = currentPage.nextElementSibling(); if (pageLink != null && pageLink.hasAttr("href")) { getLog().log(Level.DEBUG, "There are more results, getting the next page."); String target = parseEventTarget(pageLink.attr("href")); entity = postForm(getSearchURL(), client, search, buildParams(searchCriteria, page, licenseNo, level, target), true); page = Jsoup.parse(EntityUtils.toString(entity)); rows = page.select(RESULT_ROWS_SELECTOR); } } } SearchResult<License> results = new SearchResult<License>(); results.setItems(allLicenses); return results; }
From source file:gov.medicaid.screening.dao.impl.ChiropracticLicenseDAOBean.java
/** * Parses the Chiropractic license details page. * /*from w w w .j a v a 2s .c o m*/ * @param page * the details page * @param licenseType * if user has multiple licenses, this one will be used * @return the parsed license details * @throws ParsingException * if the expected tags were not found */ private License parseLicense(Document page, String licenseType) throws ParsingException { License license = new License(); ProviderProfile profile = new ProviderProfile(); license.setProfile(profile); User user = new User(); profile.setUser(user); Elements tables = page.select("table"); for (Element cell : tables.get(0).select("td")) { if (cell.text().equals("First Name")) { user.setFirstName(cell.nextElementSibling().text()); } else if (cell.text().equals("Middle Name")) { user.setMiddleName(cell.nextElementSibling().text()); } else if (cell.text().equals("Last Name")) { user.setLastName(cell.nextElementSibling().text()); } else if (cell.text().equals("Gender")) { String gender = cell.nextElementSibling().text(); if (Util.isNotBlank(gender)) { if ("M".equalsIgnoreCase(gender)) { profile.setSex(Sex.MALE); } else { profile.setSex(Sex.FEMALE); } } } } List<Address> addresses = new ArrayList<Address>(); Address address = new Address(); addresses.add(address); profile.setAddresses(addresses); StringBuffer locBuffer = new StringBuffer(); for (Element cell : tables.get(1).select("td")) { if (cell.text().equals("Address Line1")) { locBuffer.insert(0, cell.nextElementSibling().text() + " "); } else if (cell.text().equals("Address Line2")) { locBuffer.append(cell.nextElementSibling().text()); } else if (cell.text().equals("City")) { address.setCity(cell.nextElementSibling().text()); } else if (cell.text().equals("State")) { address.setState(cell.nextElementSibling().text()); } else if (cell.text().equals("ZIP")) { address.setZipcode(cell.nextElementSibling().text()); } else if (cell.text().equals("Phone Number")) { 
profile.setContactPhoneNumber(cell.nextElementSibling().text()); } } address.setLocation(locBuffer.toString().trim()); for (Element row : tables.get(2).select("tr")) { String lType = row.select("td:eq(0)").text(); if (licenseType != null && !lType.startsWith(licenseType)) { // user has multiple licenses, the results will show this user twice (search by name) continue; } LicenseType type = new LicenseType(); type.setName(row.select("td:eq(0)").text()); license.setType(type); license.setLicenseNumber(row.select("td:eq(1)").text()); LicenseStatus status = new LicenseStatus(); status.setName(row.select("td:eq(2)").text()); license.setStatus(status); String issueDate = row.select("td:eq(3)").text(); if (Util.isNotBlank(issueDate)) { license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT)); } String renewalDate = row.select("td:eq(4)").text(); if (Util.isNotBlank(renewalDate)) { license.setRenewalDate(parseDate(renewalDate, DATE_FORMAT)); } String expirationDate = row.select("td:eq(5)").text(); if (Util.isNotBlank(expirationDate)) { license.setExpireDate(parseDate(expirationDate, DATE_FORMAT)); } } return license; }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.version.VersionEditorRule1.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(version 1), dt:contains(version 1) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; String version = ""; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev != null) { if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("version 1") && !prev.text().trim().toLowerCase().startsWith("editors (version 1")) { skip = true;//from w ww. jav a 2 s .c o m } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("version 1") || next.text().trim().toLowerCase().startsWith("editors (version 1")) { skip = false; continue; } } continue; } } if (editor.tagName().equals("dt")) { version = editor.text(); continue; } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().toLowerCase().startsWith("(in alphabetic") || editor.text().toLowerCase().startsWith("see acknowl") || editor.text().toLowerCase().startsWith("the w3") || editor.text().toLowerCase().startsWith("(see ac") || editor.text().toLowerCase().startsWith("see participants") || editor.text().toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; result.setVersion(version); for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for 
(String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; result.setVersion(version); for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt") && !next.text().trim().toLowerCase().startsWith("editors (version 1")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:com.johan.vertretungsplan.parser.UntisInfoHeadlessParser.java
@Override public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); Vertretungsplan v = new Vertretungsplan(); List<VertretungsplanTag> tage = new ArrayList<VertretungsplanTag>(); Document doc = Jsoup.parse(httpGet(url, schule.getData().getString("encoding"))); Elements days = doc.select("#vertretung > p > b, #vertretung > b"); for (Element day : days) { VertretungsplanTag tag = new VertretungsplanTag(); tag.setStand(""); tag.setDatum(day.text());// w w w . j a va 2s.c o m Element next = null; if (day.parent().tagName().equals("p")) { next = day.parent().nextElementSibling().nextElementSibling(); } else next = day.parent().select("p").first().nextElementSibling(); if (next.className().equals("subst")) { //Vertretungstabelle if (next.text().contains("Vertretungen sind nicht freigegeben")) continue; parseVertretungsplanTable(next, data, tag); } else { //Nachrichten parseNachrichten(next, data, tag); next = next.nextElementSibling().nextElementSibling(); parseVertretungsplanTable(next, data, tag); } tage.add(tag); } v.setTage(tage); return v; }
From source file:com.johan.vertretungsplan.parser.UntisCommonParser.java
/**
 * Parses a substitution table of an Untis substitution plan.
 *
 * <p>Two layouts are handled, chosen by the {@code class_in_extra_line}
 * flag in {@code data}: either every class has its own header row followed
 * by its entries, or every entry row carries its classes in a column. The
 * meaning of each column is read from the {@code columns} array in
 * {@code data}. When {@code sort_classes} is set, the classes stored in
 * {@code tag} are re-inserted in the natural order of their names.
 *
 * @param table
 *            the <code>table</code> element of the HTML document that
 *            should be parsed
 * @param data
 *            data from the school (from <code>Schule.getData()</code>)
 * @param tag
 *            the {@link VertretungsplanTag} in which the substitutions
 *            should be stored
 * @throws JSONException
 *             if the {@code columns} array cannot be read from {@code data}
 */
protected void parseVertretungsplanTable(Element table, JSONObject data, VertretungsplanTag tag)
        throws JSONException {
    if (data.optBoolean("class_in_extra_line")) {
        // Layout 1: each "td.inline_header" cell names a class; the rows after it belong to that class.
        for (Element element : table.select("td.inline_header")) {
            String className = getClassName(element.text(), data);
            if (isValidClass(className)) {
                KlassenVertretungsplan kv = new KlassenVertretungsplan(className);
                Element zeile = null;
                try {
                    zeile = element.parent().nextElementSibling();
                    // NOTE(review): jsoup's select() is documented to return an
                    // empty Elements, never null, so this branch looks
                    // unreachable -- confirm the intent (skip a row without cells?).
                    if (zeile.select("td") == null) {
                        zeile = zeile.nextElementSibling();
                    }
                    // Walk the rows until the table ends or the next class header row starts.
                    while (zeile != null && !zeile.select("td").attr("class").equals("list inline_header")) {
                        Vertretung v = new Vertretung();
                        int i = 0;
                        for (Element spalte : zeile.select("td")) {
                            // Cells without data still advance the column index.
                            if (!hasData(spalte.text())) {
                                i++;
                                continue;
                            }
                            String type = data.getJSONArray("columns").getString(i);
                            if (type.equals("lesson"))
                                v.setLesson(spalte.text());
                            else if (type.equals("subject"))
                                v.setSubject(spalte.text());
                            else if (type.equals("previousSubject"))
                                v.setPreviousSubject(spalte.text());
                            else if (type.equals("type"))
                                v.setType(spalte.text());
                            else if (type.equals("type-entfall")) {
                                // An "x" in this column marks the entry as "Entfall".
                                if (spalte.text().equals("x"))
                                    v.setType("Entfall");
                                else
                                    v.setType("Vertretung");
                            } else if (type.equals("room"))
                                v.setRoom(spalte.text());
                            else if (type.equals("teacher"))
                                v.setTeacher(spalte.text());
                            else if (type.equals("previousTeacher"))
                                v.setPreviousTeacher(spalte.text());
                            else if (type.equals("desc"))
                                v.setDesc(spalte.text());
                            else if (type.equals("desc-type")) {
                                v.setDesc(spalte.text());
                                v.setType(recognizeType(spalte.text()));
                            } else if (type.equals("previousRoom"))
                                v.setPreviousRoom(spalte.text());
                            i++;
                        }
                        if (v.getDesc() != null && v.getLesson() == null && v.getPreviousRoom() == null
                                && v.getPreviousSubject() == null && v.getPreviousTeacher() == null
                                && v.getRoom() == null && v.getSubject() == null && v.getTeacher() == null
                                && v.getType() == null) {
                            // Description continued from the previous row
                            // NOTE(review): if no entry was added yet, get(size() - 1)
                            // throws; the catch below then drops the whole class.
                            Vertretung previousVertretung = kv.getVertretung()
                                    .get(kv.getVertretung().size() - 1);
                            previousVertretung.setDesc(previousVertretung.getDesc() + " " + v.getDesc());
                            zeile = zeile.nextElementSibling();
                            continue;
                        }
                        if (v.getType() == null)
                            v.setType("Vertretung");
                        // NOTE(review): getLesson() may still be null here
                        // (e.g. only a subject was set), which would throw.
                        if (!v.getLesson().equals("")) {
                            kv.add(v);
                        }
                        zeile = zeile.nextElementSibling();
                    }
                    tag.getKlassen().put(className, kv);
                } catch (Throwable e) {
                    // Any failure while reading a class is printed and the
                    // class is skipped; the put() above is not reached.
                    e.printStackTrace();
                }
            }
        }
    } else {
        // Layout 2: one row per entry; the affected classes come from a "class" column.
        boolean hasType = false;
        for (int i = 0; i < data.getJSONArray("columns").length(); i++) {
            if (data.getJSONArray("columns").getString(i).equals("type"))
                hasType = true;
        }
        Vertretung previousVertretung = null;
        for (Element zeile : table.select("tr.list.odd:not(:has(td.inline_header)), "
                + "tr.list.even:not(:has(td.inline_header)), " + "tr:has(td[align=center]:has(font[color]))")) {
            Vertretung v = new Vertretung();
            String klassen = "";
            int i = 0;
            for (Element spalte : zeile.select("td")) {
                // Cells without data still advance the column index.
                if (!hasData(spalte.text())) {
                    i++;
                    continue;
                }
                String type = data.getJSONArray("columns").getString(i);
                if (type.equals("lesson"))
                    v.setLesson(spalte.text());
                else if (type.equals("subject"))
                    v.setSubject(spalte.text());
                else if (type.equals("previousSubject"))
                    v.setPreviousSubject(spalte.text());
                else if (type.equals("type"))
                    v.setType(spalte.text());
                else if (type.equals("type-entfall")) {
                    // The "Vertretung" default is only applied here when there is no "type" column.
                    if (spalte.text().equals("x"))
                        v.setType("Entfall");
                    else if (!hasType)
                        v.setType("Vertretung");
                } else if (type.equals("room"))
                    v.setRoom(spalte.text());
                else if (type.equals("previousRoom"))
                    v.setPreviousRoom(spalte.text());
                else if (type.equals("desc"))
                    v.setDesc(spalte.text());
                else if (type.equals("desc-type")) {
                    v.setDesc(spalte.text());
                    v.setType(recognizeType(spalte.text()));
                } else if (type.equals("teacher"))
                    v.setTeacher(spalte.text());
                else if (type.equals("previousTeacher"))
                    v.setPreviousTeacher(spalte.text());
                else if (type.equals("class"))
                    klassen = getClassName(spalte.text(), data);
                i++;
            }
            if (v.getDesc() != null && v.getLesson() == null && v.getPreviousRoom() == null
                    && v.getPreviousSubject() == null && v.getPreviousTeacher() == null && v.getRoom() == null
                    && v.getSubject() == null && v.getTeacher() == null && v.getType() == null
                    && previousVertretung != null) {
                // Description continued from the previous row
                previousVertretung.setDesc(previousVertretung.getDesc() + " " + v.getDesc());
                continue;
            }
            if (v.getType() == null) {
                // No explicit type: struck-through rows, or rows that only
                // name a previous subject, are treated as "Entfall".
                if (zeile.select("strike").size() > 0 || (v.getSubject() == null && v.getRoom() == null
                        && v.getTeacher() == null && v.getPreviousSubject() != null))
                    v.setType("Entfall");
                else
                    v.setType("Vertretung");
            }
            List<String> affectedClasses;
            // Detect things like "5-12"
            Pattern pattern = Pattern.compile("(\\d+) ?- ?(\\d+)");
            Matcher matcher = pattern.matcher(klassen);
            if (matcher.find()) {
                // Range form: every known class whose first number lies inside the range is affected.
                affectedClasses = new ArrayList<String>();
                int min = Integer.parseInt(matcher.group(1));
                int max = Integer.parseInt(matcher.group(2));
                try {
                    for (String klasse : getAllClasses()) {
                        Pattern pattern2 = Pattern.compile("\\d+");
                        Matcher matcher2 = pattern2.matcher(klasse);
                        if (matcher2.find()) {
                            int num = Integer.parseInt(matcher2.group());
                            if (min <= num && num <= max)
                                affectedClasses.add(klasse);
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else {
                if (data.optBoolean("classes_separated", true)) {
                    affectedClasses = Arrays.asList(klassen.split(", "));
                } else {
                    // Classes are not separated: a known class matches when
                    // its characters occur in order within the cell text.
                    affectedClasses = new ArrayList<String>();
                    try {
                        for (String klasse : getAllClasses()) { // TODO: Is there a better way?
                            StringBuilder regex = new StringBuilder();
                            // NOTE(review): characters are appended unescaped,
                            // so regex metacharacters in a class name change
                            // the pattern -- confirm class names are plain text.
                            for (char character : klasse.toCharArray()) {
                                regex.append(character);
                                regex.append(".*");
                            }
                            if (klassen.matches(regex.toString()))
                                affectedClasses.add(klasse);
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            // Attach the entry to every valid affected class, creating the class plan on first use.
            for (String klasse : affectedClasses) {
                if (isValidClass(klasse)) {
                    KlassenVertretungsplan kv = tag.getKlassen().get(klasse);
                    if (kv == null)
                        kv = new KlassenVertretungsplan(klasse);
                    kv.add(v);
                    tag.getKlassen().put(klasse, kv);
                }
            }
            previousVertretung = v;
        }
    }
    if (data.optBoolean("sort_classes")) {
        // Rebuild the class map in the natural String order of the class names.
        List<KlassenVertretungsplan> list = new ArrayList<>(tag.getKlassen().values());
        Collections.sort(list, new Comparator<KlassenVertretungsplan>() {
            @Override
            public int compare(KlassenVertretungsplan o1, KlassenVertretungsplan o2) {
                return o1.getKlasse().compareTo(o2.getKlasse());
            }
        });
        LinkedHashMap<String, KlassenVertretungsplan> hashMap = new LinkedHashMap<>();
        for (KlassenVertretungsplan klasse : list) {
            hashMap.put(klasse.getKlasse(), klasse);
        }
        tag.setKlassen(hashMap);
    }
}