List of usage examples for org.jsoup.nodes.Element.children()
public Elements children()
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
private Map<String, ConfluenceLink> buildTableOfContentsLinkMap() { final Map<String, ConfluenceLink> titleLinkMap = new HashMap<>(); final Document document = SWAGGER_DOCUMENT.get(); final Elements tocElements = document.select(".toc"); final Elements tocCategoryElements = tocElements.select(".sectlevel1").first().children(); final Elements tocFilteredCategoryElements = new Elements(); for (final Element tocCategoryElement : tocCategoryElements) { final Element categoryLinkElement = tocCategoryElement.children().first(); tocFilteredCategoryElements.add(categoryLinkElement); }//from ww w . j a v a2 s. c o m final Elements tocIndividualElements = tocElements.select(".sectlevel2"); addLinksByType(titleLinkMap, tocFilteredCategoryElements, PageType.CATEGORY, null); int categoryCount = 1; for (final Element tocIndividualElement : tocIndividualElements) { final Elements tocIndividualElementLinks = tocIndividualElement.select("a"); addLinksByType(titleLinkMap, tocIndividualElementLinks, INDIVIDUAL, categoryCount); categoryCount++; } return titleLinkMap; }
From source file:me.vertretungsplan.parser.IndiwareParser.java
SubstitutionScheduleDay parseIndiwareDay(Element doc, boolean html) throws IOException { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); DataSource ds;//from w w w . j a v a 2 s .c o m if (html) { ds = new HTMLDataSource(doc); } else { ds = new XMLDataSource(doc); } Matcher matcher = datePattern.matcher(ds.titel().text()); if (!matcher.find()) throw new IOException("malformed date: " + ds.titel().text()); String date = matcher.group(); day.setDate( DateTimeFormat.forPattern("EEEE, dd. MMMM yyyy").withLocale(Locale.GERMAN).parseLocalDate(date)); String lastChange = ds.datum().text(); day.setLastChange(DateTimeFormat.forPattern("dd.MM.yyyy, HH:mm").withLocale(Locale.GERMAN) .parseLocalDateTime(lastChange)); if (ds.kopfinfos().size() > 0) { for (Element kopfinfo : ds.kopfinfos()) { String title = html ? kopfinfo.select("th").text() : kopfinfoTitle(kopfinfo.tagName()) + ":"; StringBuilder message = new StringBuilder(); if (title != null && !title.isEmpty()) { message.append("<b>").append(title).append("</b>").append(" "); } message.append(html ? kopfinfo.select("td").text() : kopfinfo.text()); day.addMessage(message.toString()); } } if (ds.fuss() != null) { StringBuilder message = new StringBuilder(); boolean first = true; for (Element fusszeile : ds.fusszeilen()) { if (first) { first = false; } else { message.append("\n"); } message.append(fusszeile.text()); } day.addMessage(message.toString()); } List<String> columnTypes = null; if (html) { columnTypes = new ArrayList<>(); for (Element th : ((HTMLDataSource) ds).headers()) { columnTypes.add(th.className().replace("thplan", "").replace("thlplan", "")); } } for (Element aktion : ds.aktionen()) { Substitution substitution = new Substitution(); String type = "Vertretung"; String course = null; int i = 0; for (Element info : aktion.children()) { String value = info.text().replace("\u00a0", ""); if (value.equals("---")) { i++; continue; } final String columnType = html ? 
columnTypes.get(i) : info.tagName(); switch (columnType) { case "klasse": Set<String> classes = new HashSet<>(); for (String klasse : value.split(",")) { Matcher courseMatcher = coursePattern.matcher(klasse); if (courseMatcher.matches()) { classes.add(courseMatcher.group(1)); course = courseMatcher.group(2); } else { classes.add(klasse); } } substitution.setClasses(classes); break; case "stunde": substitution.setLesson(value); break; case "fach": String subject = subjectAndCourse(course, value); if (columnTypes != null && columnTypes.contains("vfach")) { substitution.setPreviousSubject(subject); } else { substitution.setSubject(subject); } break; case "vfach": substitution.setSubject(subjectAndCourse(course, value)); case "lehrer": Matcher bracesMatcher = bracesPattern.matcher(value); if (bracesMatcher.matches()) value = bracesMatcher.group(1); substitution.setTeacher(value); break; case "raum": if (columnTypes != null && columnTypes.contains("vraum")) { substitution.setPreviousRoom(value); } else { substitution.setRoom(value); } break; case "vraum": substitution.setRoom(value); case "info": Matcher substitutionMatcher = substitutionPattern.matcher(value); Matcher cancelMatcher = cancelPattern.matcher(value); Matcher delayMatcher = delayPattern.matcher(value); Matcher selfMatcher = selfPattern.matcher(value); if (substitutionMatcher.matches()) { substitution.setPreviousSubject(substitutionMatcher.group(1)); substitution.setPreviousTeacher(substitutionMatcher.group(2)); if (!substitutionMatcher.group(3).isEmpty()) { substitution.setDesc(substitutionMatcher.group(3)); } } else if (cancelMatcher.matches()) { type = "Entfall"; substitution.setPreviousSubject(cancelMatcher.group(1)); substitution.setPreviousTeacher(cancelMatcher.group(2)); } else if (delayMatcher.matches()) { type = "Verlegung"; substitution.setPreviousSubject(delayMatcher.group(1)); substitution.setPreviousTeacher(delayMatcher.group(2)); substitution.setDesc(delayMatcher.group(3)); } else if 
(selfMatcher.matches()) { type = "selbst."; if (!selfMatcher.group(1).isEmpty()) substitution.setDesc(selfMatcher.group(1)); } else if (value.equals("fllt aus") || value.equals("Klausur") || value.equals("Aufg.")) { type = value; } else { substitution.setDesc(value); } break; } i++; } substitution.setType(type); substitution.setColor(colorProvider.getColor(substitution.getType())); if (course != null && substitution.getSubject() == null) { substitution.setSubject(course); } day.addSubstitution(substitution); } return day; }
From source file:us.colloquy.index.IndexHandler.java
public void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) { ///Documents/Tolstoy/diaries Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory); List<Path> results = new ArrayList<>(); int maxDepth = 6; try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> { return String.valueOf(path).endsWith(".ncx"); })) {//from www . j ava 2s .com stream.forEach(results::add); // String joined = stream // .sorted() // .map(String::valueOf) // .collect(Collectors.joining("; ")); // // System.out.println("\nFound: " + joined); } catch (IOException e) { e.printStackTrace(); } System.out.println("files: " + results.size()); try { for (Path res : results) { Path parent = res.getParent(); // System.out.println("---------------------------------------------"); // System.out.println(parent.toString()); //use jsoup to list all files that contain something useful Document doc = Jsoup.parse(res.toFile(), "UTF-8"); String title = ""; for (Element element : doc.getElementsByTag("docTitle")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { title = child.text(); // System.out.println("Title: " + title); } } for (Element element : doc.getElementsByTag("navPoint")) { //Letter letter = new Letter(); // StringBuilder content = new StringBuilder(); for (Element child : element.children()) { String label = child.text(); if (StringUtils.isNotEmpty(label)) { if (label.matches("?")) { System.out.println("------------------"); } String url = child.getElementsByTag("content").attr("src"); if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + 
url.replaceAll("#.*","")); } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url) && useOnlyNumber) { DocumentPointer documentPointer = new DocumentPointer( parent.toString() + File.separator + url.replaceAll("#.*", ""), title); uriList.add(documentPointer); // System.out.println("nav point: " + label + " src " + parent.toString() // + System.lineSeparator() + url.replaceAll("#.*","")); } else { // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src")); } } } } } } catch (Exception e) { e.printStackTrace(); } // System.out.println("Size: " + uriList.size()); // for (DocumentPointer pointer : uriList) // { // //parse and // System.out.println(pointer.getSourse() + "\t" + pointer.getUri()); // } }
From source file:us.colloquy.index.IndexHandler.java
/**
 * Searches {@code pathToLetters} (up to six levels deep) for files whose path
 * ends in {@code .ncx}, parses each with jsoup and appends a
 * {@code DocumentPointer} to {@code documentPointers} for the nav points that
 * fall inside the "printing" window described below.
 *
 * <p>I/O and parsing failures are printed to standard error and not rethrown.
 *
 * NOTE(review): several string literals in this method consist of "?"
 * characters and look like non-ASCII text that was lost in transit; the
 * intended label values cannot be determined from this file - restore them
 * from the upstream source.
 *
 * @param documentPointers list that receives the pointers that were found
 * @param pathToLetters    directory to search
 */
public void getURIForAllDiaries(List<DocumentPointer> documentPointers, Path pathToLetters) {
    List<Path> results = new ArrayList<>();
    int maxDepth = 6;
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {
        stream.forEach(results::add);
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.out.println("files: " + results.size());
    try {
        for (Path res : results) {
            Path parent = res.getParent();
            // Use jsoup to list all entries of the .ncx file.
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");
            // Title = text of the last child under the docTitle elements.
            String title = "";
            for (Element element : doc.getElementsByTag("docTitle")) {
                for (Element child : element.children()) {
                    title = child.text();
                }
            }
            // startPrinting: pointers are emitted only while this is true.
            // newFile: lets the start label switch printing on at most once per file.
            boolean startPrinting = false;
            boolean newFile = true;
            for (Element element : doc.getElementsByTag("navPoint")) {
                // Get nav label and content source of this nav point.
                Element navLabelElement = element.select("navLabel").first();
                Element srsElement = element.select("content").first();
                String navLabel = "";
                String srs = "";
                if (navLabelElement != null) {
                    // Asterisks are stripped from the label before comparison.
                    navLabel = navLabelElement.text().replaceAll("\\*", "").trim();
                }
                if (srsElement != null) {
                    srs = srsElement.attr("src");
                }
                // NOTE(review): here navLabel is used as the regular expression and the
                // literal as the input - presumably an equality check was intended; confirm.
                if ("??".matches(navLabel)) {
                    startPrinting = false;
                }
                if (StringUtils.isNotEmpty(navLabel) && navLabel.matches("??.*|?? ?.*") && newFile) {
                    newFile = false;
                    startPrinting = true;
                }
                // NOTE(review): this literal was split across a line wrap in the listing;
                // the whitespace inside it is a best guess - verify.
                if (startPrinting && !navLabel.matches("(|??? ??)")) {
                    // Pointer location: parent directory + src without its "#..." fragment.
                    DocumentPointer documentPointer = new DocumentPointer(
                            parent.toString() + File.separator + srs.replaceAll("#.*", ""), title);
                    documentPointers.add(documentPointer);
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    System.out.println("Size: " + documentPointers.size());
}
From source file:de.geeksfactory.opacclient.apis.Heidi.java
/**
 * Requests prolongation of the whole account and converts the result table
 * into a list of per-item maps.
 *
 * <p>If the response contains a password input, {@code login(account)} is
 * called and the request is repeated; a failed login yields an
 * {@code ERROR} result carrying the exception message. Rows of
 * {@code .kontobox table tbody tr} with at least two cells contribute a
 * labelled value to the current item; a row with fewer cells closes the current
 * item (if it has any entries) and stores that row's first cell as the message.
 *
 * NOTE(review): the {@code verl} query value looks like it lost a non-ASCII
 * character in transit - confirm against the upstream source.
 *
 * @param account    account used for a re-login when the session expired
 * @param useraction passed through unchanged on the retry
 * @param selection  passed through unchanged on the retry
 * @return {@code OK} with the parsed lines, or {@code ERROR} on login failure
 * @throws IOException if the HTTP request fails
 */
@Override
public ProlongAllResult prolongAll(Account account, int useraction, String selection) throws IOException {
    String url = opac_url + "/konto.cgi?sess=" + sessid + "&email=&verl=Gesamtkontoverlngerung";
    Document doc = Jsoup.parse(httpGet(url, ENCODING));

    boolean loginRequired = doc.select("input[name=pw]").size() > 0;
    if (loginRequired) {
        try {
            login(account);
        } catch (OpacErrorException e) {
            return new ProlongAllResult(MultiStepResult.Status.ERROR, e.getMessage());
        }
        return prolongAll(account, useraction, selection);
    }

    List<Map<String, String>> lines = new ArrayList<>();
    Map<String, String> current = new HashMap<>();

    for (Element row : doc.select(".kontobox table tbody tr")) {
        if (row.children().size() >= 2) {
            String label = row.child(0).text();
            String text = row.child(1).text().trim();

            // Map the row label to the result key; unknown labels are ignored.
            String key = null;
            if (label.contains("Verfasser")) {
                key = ProlongAllResult.KEY_LINE_AUTHOR;
            } else if (label.contains("Titel")) {
                key = ProlongAllResult.KEY_LINE_TITLE;
            } else if (label.contains("Altes Leihfristende")) {
                key = ProlongAllResult.KEY_LINE_OLD_RETURNDATE;
            } else if (label.contains("Neues")) {
                key = ProlongAllResult.KEY_LINE_NEW_RETURNDATE;
            }
            if (key != null) {
                current.put(key, text);
            }
        } else if (!current.isEmpty()) {
            // Separator/message row: finish the item collected so far.
            current.put(ProlongAllResult.KEY_LINE_MESSAGE, row.child(0).text().trim());
            lines.add(current);
            current = new HashMap<>();
        }
    }

    return new ProlongAllResult(MultiStepResult.Status.OK, lines);
}
From source file:de.geeksfactory.opacclient.apis.Zones22.java
/**
 * Builds a {@code DetailledItem} from the HTML of a detail page.
 *
 * <p>Steps, in order: read title and details from the detail table rows,
 * detect a reservation link, add the text of each {@code div.record-item-new}
 * as an untitled detail, read fallback title/author data from a
 * {@code span.z3988} title attribute, and finally collect copy information from
 * the {@code stock_} divs.
 *
 * @param id   identifier stored on the result
 * @param html markup of the detail page
 * @return the populated item
 * @throws IOException declared by the signature; nothing in the visible body throws it explicitly
 */
private DetailledItem parse_result(String id, String html) throws IOException {
    Document doc = Jsoup.parse(html);
    DetailledItem result = new DetailledItem();
    result.setTitle("");
    boolean title_is_set = false;
    result.setId(id);
    Elements detaildiv = doc.select("div.record-item-new");
    Elements detailtrs1 = doc.select(".DetailDataCell table table:not(.inRecordHeader) tr");
    for (int i = 0; i < detailtrs1.size(); i++) {
        Element tr = detailtrs1.get(i);
        int s = tr.children().size();
        // NOTE(review): child(0) is read before s is checked - a row without
        // cells would fail here; confirm such rows cannot occur.
        if (tr.child(0).text().trim().equals("Titel") && !title_is_set) {
            // Only the first "Titel" row sets the title; the value is the last cell.
            result.setTitle(tr.child(s - 1).text().trim());
            title_is_set = true;
        } else if (s > 1) {
            Element valchild = tr.child(s - 1);
            // Value cells that contain a nested table are skipped.
            if (valchild.select("table").isEmpty()) {
                String val = valchild.text().trim();
                if (val.length() > 0)
                    result.addDetail(new Detail(tr.child(0).text().trim(), val));
            }
        }
    }
    // A summary action link whose text contains "Vormerken" marks the item reservable.
    for (Element a : doc.select("a.SummaryActionLink")) {
        if (a.text().contains("Vormerken")) {
            result.setReservable(true);
            result.setReservation_info(a.attr("href"));
        }
    }
    if (!detaildiv.isEmpty()) {
        for (int i = 0; i < detaildiv.size(); i++) {
            Element dd = detaildiv.get(i);
            // Flatten the direct child nodes to text: <br> becomes a newline,
            // other elements contribute their trimmed text.
            String text = "";
            for (Node node : dd.childNodes()) {
                if (node instanceof TextNode) {
                    String snip = ((TextNode) node).text();
                    if (snip.length() > 0)
                        text += snip;
                } else if (node instanceof Element) {
                    if (((Element) node).tagName().equals("br"))
                        text += "\n";
                    else {
                        String snip = ((Element) node).text().trim();
                        if (snip.length() > 0)
                            text += snip;
                    }
                }
            }
            result.addDetail(new Detail("", text));
        }
    }
    if (doc.select("span.z3988").size() > 0) {
        // Sometimes there is a <span class="Z3988"> item which provides
        // data in a standardized format: "&"-separated name=value pairs
        // in its title attribute.
        String z3988data = doc.select("span.z3988").first().attr("title").trim();
        for (String pair : z3988data.split("\\&")) {
            String[] nv = pair.split("=", 2);
            if (nv.length == 2) {
                if (!nv[1].trim().equals("")) {
                    // Titles from this source are used only while the title is still empty.
                    if (nv[0].equals("rft.btitle") && result.getTitle().length() == 0) {
                        result.setTitle(nv[1]);
                    } else if (nv[0].equals("rft.atitle") && result.getTitle().length() == 0) {
                        result.setTitle(nv[1]);
                    } else if (nv[0].equals("rft.au")) {
                        result.addDetail(new Detail("Author", nv[1]));
                    }
                }
            }
        }
    }
    Elements copydivs = doc.select(".DetailDataCell div[id^=stock_]");
    // Text of the most recent "stock_head" div; stored as the branch of following copies.
    String pop = "";
    for (int i = 0; i < copydivs.size(); i++) {
        Element div = copydivs.get(i);
        if (div.attr("id").startsWith("stock_head")) {
            pop = div.text().trim();
            continue;
        }
        // One map per div. NOTE(review): the same instance is passed to addCopy
        // at every <br> inside this div and is never replaced - whether addCopy
        // copies it is not visible here; verify.
        Map<String, String> copy = new HashMap<String, String>();
        // This is getting very ugly - check if it is valid for libraries
        // other than Hamburg.
        // j counts the child nodes seen since the last <br>; its value decides
        // which field a node is stored in.
        int j = 0;
        for (Node node : div.childNodes()) {
            try {
                if (node instanceof Element) {
                    if (((Element) node).tag().getName().equals("br")) {
                        copy.put(DetailledItem.KEY_COPY_BRANCH, pop);
                        result.addCopy(copy);
                        // Becomes 0 again after the j++ below.
                        j = -1;
                    } else if (((Element) node).tag().getName().equals("b") && j == 1) {
                        copy.put(DetailledItem.KEY_COPY_LOCATION, ((Element) node).text());
                    } else if (((Element) node).tag().getName().equals("b") && j > 1) {
                        copy.put(DetailledItem.KEY_COPY_STATUS, ((Element) node).text());
                    }
                    j++;
                } else if (node instanceof TextNode) {
                    if (j == 0)
                        copy.put(DetailledItem.KEY_COPY_DEPARTMENT, ((TextNode) node).text());
                    if (j == 2)
                        // First line of the raw text only.
                        copy.put(DetailledItem.KEY_COPY_BARCODE, ((TextNode) node).getWholeText().trim().split("\n")[0].trim());
                    if (j == 6) {
                        // Keeps the last 10 characters; shorter text throws and is
                        // reported by the catch below.
                        String text = ((TextNode) node).text().trim();
                        copy.put(DetailledItem.KEY_COPY_RETURN, text.substring(text.length() - 10));
                    }
                    j++;
                }
            } catch (Exception e) {
                // A failure on one node is printed and parsing continues with the next node.
                e.printStackTrace();
            }
        }
    }
    return result;
}
From source file:com.jimplush.goose.ContentExtractor.java
/**
 * Collects the non-empty texts of all elements below {@code node} that match
 * {@code A_REL_TAG_SELECTOR}.
 *
 * @param node element to search in
 * @return the collected texts, or {@code NO_STRINGS} when {@code node} has no
 *         child elements or nothing matches the selector
 */
private Set<String> extractTags(Element node) {
    boolean hasChildren = node.children().size() != 0;
    if (!hasChildren) {
        return NO_STRINGS;
    }

    Elements matches = Selector.select(A_REL_TAG_SELECTOR, node);
    int matchCount = matches.size();
    if (matchCount == 0) {
        return NO_STRINGS;
    }

    Set<String> tags = new HashSet<String>(matchCount);
    for (Element match : matches) {
        String text = match.text();
        if (string.isNullOrEmpty(text)) {
            continue;
        }
        tags.add(text);
    }
    return tags;
}
From source file:de.geeksfactory.opacclient.apis.IOpac.java
/**
 * Assembles the list of search fields offered by the catalogue.
 *
 * <p>Fields are read from the table rows of the search form
 * ({@code search_expert.htm}, falling back to {@code iopacie.htm} when the
 * first page is not reachable). If no field was found but an element named
 * {@code sleStichwort} exists, a single free-text field is added. Media types
 * are read from {@code mtyp.js} with regular expressions, or - when that fails
 * with an {@code IOException} - from the {@code #imtyp} options of the search
 * form page; they are appended as a dropdown field when at least one value was
 * found.
 *
 * @return the search fields in the order they were discovered
 * @throws IOException if loading the search form page fails
 */
@Override
public List<SearchField> getSearchFields() throws IOException {
    List<SearchField> fields = new ArrayList<>();

    // Extract all search fields, except media types.
    String formHtml;
    try {
        formHtml = httpGet(opac_url + dir + "/search_expert.htm", getDefaultEncoding());
    } catch (NotReachableException e) {
        formHtml = httpGet(opac_url + dir + "/iopacie.htm", getDefaultEncoding());
    }
    Document formDoc = Jsoup.parse(formHtml);

    Elements rows = formDoc
            .select("form tr:has(input:not([type=submit], [type=reset])), form tr:has(select)");
    for (Element row : rows) {
        Elements cells = row.children();
        int cellCount = cells.size();

        if (cellCount == 4) {
            // Two search fields next to each other in one row.
            SearchField left = createSearchField(cells.get(0), cells.get(1));
            SearchField right = createSearchField(cells.get(2), cells.get(3));
            if (left != null) {
                fields.add(left);
            }
            if (right != null) {
                fields.add(right);
            }
        } else if (cellCount == 2 || (cellCount == 3 && cells.get(2).children().size() == 0)) {
            // A single field, optionally followed by an empty third cell.
            SearchField single = createSearchField(cells.get(0), cells.get(1));
            if (single != null) {
                fields.add(single);
            }
        }
    }

    if (fields.size() == 0 && formDoc.select("[name=sleStichwort]").size() > 0) {
        // NOTE(review): the check above accepts any element with this name, while
        // the lookup below only finds <input> elements - confirm they always agree.
        TextSearchField freeSearch = new TextSearchField();
        Element input = formDoc.select("input[name=sleStichwort]").first();
        freeSearch.setDisplayName(stringProvider.getString(StringProvider.FREE_SEARCH));
        freeSearch.setId(input.attr("name"));
        freeSearch.setHint("");
        fields.add(freeSearch);
    }

    // Extract available media types.
    // We have to parse JavaScript. Doing this with RegEx is evil.
    // But not as evil as including a JavaScript VM into the app.
    // And I honestly do not see another way.
    Pattern keyPattern = Pattern.compile("mtyp\\[[0-9]+\\]\\[\"typ\"\\] = \"([^\"]+)\";");
    Pattern valuePattern = Pattern.compile("mtyp\\[[0-9]+\\]\\[\"bez\"\\] = \"([^\"]+)\";");

    DropdownSearchField mtyp = new DropdownSearchField();
    try {
        String script;
        try {
            script = httpGet(opac_url + dir + "/mtyp.js", getDefaultEncoding());
        } catch (NotReachableException e) {
            script = httpGet(opac_url + "/mtyp.js", getDefaultEncoding());
        }

        // Each "new Array();" starts the definition of one media type.
        for (String part : script.split("new Array\\(\\);")) {
            Matcher keyMatcher = keyPattern.matcher(part);
            String key = keyMatcher.find() ? keyMatcher.group(1) : "";
            Matcher valueMatcher = valuePattern.matcher(part);
            String value = valueMatcher.find() ? valueMatcher.group(1) : "";
            if (!value.isEmpty()) {
                mtyp.addDropdownValue(key, value);
            }
        }
    } catch (IOException e) {
        // Fallback: read the options from the search form page instead.
        try {
            String fallbackHtml = httpGet(opac_url + dir + "/frames/search_form.php?bReset=1?bReset=1",
                    getDefaultEncoding());
            Document fallbackDoc = Jsoup.parse(fallbackHtml);
            for (Element option : fallbackDoc.select("#imtyp option")) {
                mtyp.addDropdownValue(option.attr("value"), option.text());
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }

    boolean hasMediaTypes = mtyp.getDropdownValues() != null && !mtyp.getDropdownValues().isEmpty();
    if (hasMediaTypes) {
        mtyp.setDisplayName("Medientypen");
        mtyp.setId("Medientyp");
        fields.add(mtyp);
    }

    return fields;
}
From source file:org.shareok.data.sagedata.SageSourceDataHandlerImpl.java
private String[] getArticleAuthorsFromFullTextDoc(Document doc) throws NoHtmlComponentsFoundException { String[] authors = null;//from w w w .j a v a2 s. co m List<String> auList = new ArrayList<>(); try { Elements authorElements = doc.select("div.authors").get(0).select("span.contribDegrees"); for (Element authSpan : authorElements) { String author = authSpan.children().get(0).text(); if (null != author && !author.equals("")) { auList.add(author); } } } catch (Exception ex) { logger.error("Cannot get the authors for SAGE article!", ex); return null; } if (auList.size() > 0) { authors = auList.toArray(new String[auList.size()]); } return authors; }