List of usage examples for org.jsoup.nodes.Element.children()
public Elements children()
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Pulls a text from a Wikipedia URL without images, tags, etc. * /* w w w . ja v a 2 s . com*/ * @param url * Address of the targetted text. * @return * An Article object representing the retrieved object. * * @throws ReaderException * Problem while retrieving the text. */ @Override public Article read(URL url) throws ReaderException { Article result = null; String name = getName(url); try { // get the page String address = url.toString(); logger.log("Retrieving page " + address); long startTime = System.currentTimeMillis(); Document document = retrieveSourceCode(name, url); // get its title Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0); String title = firstHeadingElt.text(); logger.log("Get title: " + title); // get raw and linked texts logger.log("Get raw and linked texts."); StringBuilder rawStr = new StringBuilder(); StringBuilder linkedStr = new StringBuilder(); Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0); // processing each element in the content part boolean ignoringSection = false; boolean first = true; for (Element element : bodyContentElt.children()) { String eltName = element.tag().getName(); String eltClass = element.attr(XmlNames.ATT_CLASS); // section headers if (eltName.equals(XmlNames.ELT_H2)) { first = false; // get section name StringBuilder fakeRaw = new StringBuilder(); StringBuilder fakeLinked = new StringBuilder(); processParagraphElement(element, fakeRaw, fakeLinked); String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH); // check section name if (IGNORED_SECTIONS.contains(str)) ignoringSection = true; else { ignoringSection = false; rawStr.append("\n-----"); linkedStr.append("\n-----"); processParagraphElement(element, rawStr, linkedStr); } } else if (!ignoringSection) { // lower sections if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4) || eltName.equals(XmlNames.ELT_H5) || 
eltName.equals(XmlNames.ELT_H6)) { first = false; processParagraphElement(element, rawStr, linkedStr); } // paragraph else if (eltName.equals(XmlNames.ELT_P)) { String str = element.text(); // ignore possible initial disambiguation link if (!first || !str.startsWith(PARAGRAPH_FORTHE)) { first = false; processParagraphElement(element, rawStr, linkedStr); } } // list else if (eltName.equals(XmlNames.ELT_UL)) { first = false; processListElement(element, rawStr, linkedStr, false); } else if (eltName.equals(XmlNames.ELT_OL)) { first = false; processListElement(element, rawStr, linkedStr, true); } else if (eltName.equals(XmlNames.ELT_DL)) { first = false; processDescriptionListElement(element, rawStr, linkedStr); } // tables else if (eltName.equals(XmlNames.ELT_TABLE)) { first = !processTableElement(element, rawStr, linkedStr); } // divisions else if (eltName.equals(XmlNames.ELT_DIV)) { // ignore possible initial picture if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB)) first = !processDivisionElement(element, rawStr, linkedStr); } // we ignore certain types of span (phonetic trancription, WP buttons...) 
else if (eltName.equals(XmlNames.ELT_SPAN)) { first = !processSpanElement(element, rawStr, linkedStr); } // hyperlinks must be included in the linked string, provided they are not external else if (eltName.equals(XmlNames.ELT_A)) { first = !processHyperlinkElement(element, rawStr, linkedStr); } // quotes are just processed recursively else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) { first = !processQuoteElement(element, rawStr, linkedStr); } // other tags are ignored } } // create article object result = new Article(name); result.setTitle(title); result.setUrl(url); result.initDate(); // clean text String rawText = rawStr.toString(); rawText = cleanText(rawText); // rawText = ArticleCleaning.replaceChars(rawText); result.setRawText(rawText); logger.log("Length of the raw text: " + rawText.length() + " chars."); String linkedText = linkedStr.toString(); linkedText = cleanText(linkedText); // linkedText = ArticleCleaning.replaceChars(linkedText); result.setLinkedText(linkedText); logger.log("Length of the linked text: " + linkedText.length() + " chars."); // get original html source code logger.log("Get original HTML source code."); String originalPage = document.toString(); result.setOriginalPage(originalPage); logger.log("Length of the original page: " + originalPage.length() + " chars."); // get the categories of the article List<ArticleCategory> categories = getArticleCategories(result); result.setCategories(categories); long endTime = System.currentTimeMillis(); logger.log("Total duration: " + (endTime - startTime) + " ms."); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (org.json.simple.parser.ParseException e) { e.printStackTrace(); } return result; }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
static void parseMediaList(List<LentItem> media, Document doc, JSONObject data) { if (doc.select("a[name=AUS]").size() == 0) return;//from w w w .j av a 2s . c o m Elements copytrs = doc.select("a[name=AUS] ~ table, a[name=AUS] ~ form table").first().select("tr"); doc.setBaseUri(data.optString("baseurl")); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); int trs = copytrs.size(); if (trs < 2) { return; } assert (trs > 0); JSONObject copymap = new JSONObject(); try { if (data.has("accounttable")) { copymap = data.getJSONObject("accounttable"); } } catch (JSONException e) { } Pattern datePattern = Pattern.compile("\\d{2}\\.\\d{2}\\.\\d{4}"); for (int i = 1; i < trs; i++) { Element tr = copytrs.get(i); LentItem item = new LentItem(); if (copymap.optInt("title", 0) >= 0) { item.setTitle(tr.child(copymap.optInt("title", 0)).text().trim().replace("\u00a0", "")); } if (copymap.optInt("author", 1) >= 0) { item.setAuthor(tr.child(copymap.optInt("author", 1)).text().trim().replace("\u00a0", "")); } if (copymap.optInt("format", 2) >= 0) { item.setFormat(tr.child(copymap.optInt("format", 2)).text().trim().replace("\u00a0", "")); } int prolongCount = 0; if (copymap.optInt("prolongcount", 3) >= 0) { prolongCount = Integer .parseInt(tr.child(copymap.optInt("prolongcount", 3)).text().trim().replace("\u00a0", "")); item.setStatus(String.valueOf(prolongCount) + "x verl."); } if (data.optInt("maxprolongcount", -1) != -1) { item.setRenewable(prolongCount < data.optInt("maxprolongcount", -1)); } if (copymap.optInt("returndate", 4) >= 0) { String value = tr.child(copymap.optInt("returndate", 4)).text().trim().replace("\u00a0", ""); Matcher matcher = datePattern.matcher(value); if (matcher.find()) { try { item.setDeadline(fmt.parseLocalDate(matcher.group())); } catch (IllegalArgumentException e1) { e1.printStackTrace(); } } } if (copymap.optInt("prolongurl", 5) >= 0) { if (tr.children().size() > copymap.optInt("prolongurl", 5)) { Element cell = 
tr.child(copymap.optInt("prolongurl", 5)); if (cell.select("input[name=MedNrVerlAll]").size() > 0) { // new iOPAC Version 1.45 - checkboxes to prolong multiple items // internal convention: We add "NEW" to the media ID to show that we have // the new iOPAC version Element input = cell.select("input[name=MedNrVerlAll]").first(); String value = input.val(); item.setProlongData("NEW" + value); item.setId(value.split(";")[0]); if (input.hasAttr("disabled")) item.setRenewable(false); } else { // previous versions - link for prolonging on every medium String link = cell.select("a").attr("href"); item.setProlongData(link); // find media number with regex Pattern pattern = Pattern.compile("mednr=([^&]*)&"); Matcher matcher = pattern.matcher(link); if (matcher.find() && matcher.group() != null) item.setId(matcher.group(1)); } } } media.add(item); } assert (media.size() == trs - 1); }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected DetailledItem parse_result(String html) throws IOException { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);//from w w w.j a v a 2 s .c o m String html2 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showTitleActive", ENCODING); Document doc2 = Jsoup.parse(html2); doc2.setBaseUri(opac_url); String html3 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showAvailabilityActive", ENCODING); Document doc3 = Jsoup.parse(html3); doc3.setBaseUri(opac_url); DetailledItem result = new DetailledItem(); try { result.setId(doc.select("#bibtip_id").text().trim()); } catch (Exception ex) { ex.printStackTrace(); } List<String> reservationlinks = new ArrayList<>(); for (Element link : doc3.select("#vormerkung a, #tab-content a")) { String href = link.absUrl("href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (result.getId() == null) { // ID retrieval String key = hrefq.get("katkey"); if (key != null) { result.setId(key); break; } } // Vormerken if (hrefq.get("methodToCall") != null) { if (hrefq.get("methodToCall").equals("doVormerkung") || hrefq.get("methodToCall").equals("doBestellung")) { reservationlinks.add(href.split("\\?")[1]); } } } if (reservationlinks.size() == 1) { result.setReservable(true); result.setReservation_info(reservationlinks.get(0)); } else if (reservationlinks.size() == 0) { result.setReservable(false); } else { // TODO: Multiple options - handle this case! 
} if (doc.select(".data td img").size() == 1) { result.setCover(doc.select(".data td img").first().attr("abs:src")); try { downloadCover(result); } catch (Exception e) { } } if (doc.select(".aw_teaser_title").size() == 1) { result.setTitle(doc.select(".aw_teaser_title").first().text().trim()); } else if (doc.select(".data td strong").size() > 0) { result.setTitle(doc.select(".data td strong").first().text().trim()); } else { result.setTitle(""); } if (doc.select(".aw_teaser_title_zusatz").size() > 0) { result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim())); } String title = ""; String text = ""; boolean takeover = false; Element detailtrs = doc2.select(".box-container .data td").first(); for (Node node : detailtrs.childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("strong")) { title = ((Element) node).text().trim(); text = ""; } else { if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) { text = text + node.attr("href"); takeover = true; break; } } } else if (node instanceof TextNode) { text = text + ((TextNode) node).text(); } } if (!takeover) { text = ""; title = ""; } detailtrs = doc2.select("#tab-content .data td").first(); if (detailtrs != null) { for (Node node : detailtrs.childNodes()) { if (node instanceof Element) { if (((Element) node).tagName().equals("strong")) { if (!text.equals("") && !title.equals("")) { result.addDetail(new Detail(title.trim(), text.trim())); if (title.equals("Titel:")) { result.setTitle(text.trim()); } text = ""; } title = ((Element) node).text().trim(); } else { if (((Element) node).tagName().equals("a") && (((Element) node).text().trim().contains("hier klicken") || title.equals("Link:"))) { text = text + node.attr("href"); } else { text = text + ((Element) node).text(); } } } else if (node instanceof TextNode) { text = text + ((TextNode) node).text(); } } } else { if 
(doc2.select("#tab-content .fulltitle tr").size() > 0) { Elements rows = doc2.select("#tab-content .fulltitle tr"); for (Element tr : rows) { if (tr.children().size() == 2) { Element valcell = tr.child(1); String value = valcell.text().trim(); if (valcell.select("a").size() == 1) { value = valcell.select("a").first().absUrl("href"); } result.addDetail(new Detail(tr.child(0).text().trim(), value)); } } } else { result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR), stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL))); } } if (!text.equals("") && !title.equals("")) { result.addDetail(new Detail(title.trim(), text.trim())); if (title.equals("Titel:")) { result.setTitle(text.trim()); } } for (Element link : doc3.select("#tab-content a")) { Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href")); if (result.getId() == null) { // ID retrieval String key = hrefq.get("katkey"); if (key != null) { result.setId(key); break; } } } for (Element link : doc3.select(".box-container a")) { if (link.text().trim().equals("Download")) { result.addDetail( new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href"))); } } Map<String, Integer> copy_columnmap = new HashMap<>(); // Default values copy_columnmap.put("barcode", 1); copy_columnmap.put("branch", 3); copy_columnmap.put("status", 4); Elements copy_columns = doc.select("#tab-content .data tr#bg2 th"); for (int i = 0; i < copy_columns.size(); i++) { Element th = copy_columns.get(i); String head = th.text().trim(); if (head.contains("Status")) { copy_columnmap.put("status", i); } if (head.contains("Zweigstelle")) { copy_columnmap.put("branch", i); } if (head.contains("Mediennummer")) { copy_columnmap.put("barcode", i); } if (head.contains("Standort")) { copy_columnmap.put("location", i); } if (head.contains("Signatur")) { copy_columnmap.put("signature", i); } } Pattern status_lent = Pattern.compile( "^(entliehen) bis ([0-9]{1,2}.[0-9]{1,2}.[0-9]{2," + "4}) 
\\(gesamte Vormerkungen: ([0-9]+)\\)$"); Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$"); Elements exemplartrs = doc.select("#tab-content .data tr").not("#bg2"); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); for (Element tr : exemplartrs) { try { Copy copy = new Copy(); Element status = tr.child(copy_columnmap.get("status")); Element barcode = tr.child(copy_columnmap.get("barcode")); String barcodetext = barcode.text().trim().replace(" Wegweiser", ""); // STATUS String statustext; if (status.getElementsByTag("b").size() > 0) { statustext = status.getElementsByTag("b").text().trim(); } else { statustext = status.text().trim(); } if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) { Matcher matcher1 = status_and_barcode.matcher(statustext); if (matcher1.matches()) { statustext = matcher1.group(1); barcodetext = matcher1.group(2); } } Matcher matcher = status_lent.matcher(statustext); if (matcher.matches()) { copy.setStatus(matcher.group(1)); copy.setReservations(matcher.group(3)); copy.setReturnDate(fmt.parseLocalDate(matcher.group(2))); } else { copy.setStatus(statustext); } copy.setBarcode(barcodetext); if (status.select("a[href*=doVormerkung]").size() == 1) { copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]); } String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", ""); copy.setBranch(branchtext); if (copy_columnmap.containsKey("location")) { copy.setLocation( tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", "")); } if (copy_columnmap.containsKey("signature")) { copy.setShelfmark( tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", "")); } result.addCopy(copy); } catch (Exception ex) { ex.printStackTrace(); } } try { Element isvolume = null; Map<String, String> volume = new HashMap<>(); Elements links = doc.select(".data td a"); int elcount = 
links.size(); for (int eli = 0; eli < elcount; eli++) { List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8"); for (NameValuePair nv : anyurl) { if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) { isvolume = links.get(eli); } else if (nv.getName().equals("catKey")) { volume.put("catKey", nv.getValue()); } else if (nv.getName().equals("dbIdentifier")) { volume.put("dbIdentifier", nv.getValue()); } } if (isvolume != null) { volume.put("volume", "true"); result.setVolumesearch(volume); break; } } } catch (Exception e) { e.printStackTrace(); } return result; }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
protected SearchRequestResult parse_search(String html, int page) { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);/*from www .j a v a 2s .c o m*/ Elements table = doc.select(".resulttab tr.result_trefferX, .resulttab tr.result_treffer"); List<SearchResult> results = new ArrayList<>(); for (int i = 0; i < table.size(); i++) { Element tr = table.get(i); SearchResult sr = new SearchResult(); int contentindex = 1; if (tr.select("td a img").size() > 0) { String[] fparts = tr.select("td a img").get(0).attr("src").split("/"); String fname = fparts[fparts.length - 1]; if (data.has("mediatypes")) { try { sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname))); } catch (JSONException | IllegalArgumentException e) { sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", ""))); } } else { sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", ""))); } } else { if (tr.children().size() == 3) { contentindex = 2; } } sr.setInnerhtml(tr.child(contentindex).child(0).html()); sr.setNr(i); Element link = tr.child(contentindex).select("a").first(); try { if (link != null && link.attr("href").contains("detmediennr")) { Map<String, String> params = getQueryParamsFirst(link.attr("abs:href")); String nr = params.get("detmediennr"); if (Integer.parseInt(nr) > i + 1) { // Seems to be an ID if (params.get("detDB") != null) { sr.setId("&detmediennr=" + nr + "&detDB=" + params.get("detDB")); } else { sr.setId("&detmediennr=" + nr); } } } } catch (Exception e) { } try { if (tr.child(1).childNode(0) instanceof Comment) { Comment c = (Comment) tr.child(1).childNode(0); String comment = c.getData().trim(); String id = comment.split(": ")[1]; sr.setId(id); } } catch (Exception e) { e.printStackTrace(); } results.add(sr); } int results_total = -1; if (doc.select(".result_gefunden").size() > 0) { try { results_total = Integer.parseInt( 
doc.select(".result_gefunden").text().trim().replaceAll(".*[^0-9]+([0-9]+).*", "$1")); } catch (NumberFormatException e) { e.printStackTrace(); results_total = -1; } } return new SearchRequestResult(results, results_total, page); }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Retrieve the text located in a table (TABLE) HTML element. * <br/>/*from www . j av a2 s .c o m*/ * We process each cell in the table as a text element. * Some tables are ignored: infoboxes, wikitables, navboxes, * metadata, persondata, etc. * * @param element * Element to be processed. * @param rawStr * Current raw text string. * @param linkedStr * Current text with hyperlinks. * @return * {@code true} iff the element was processed. */ private boolean processTableElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) { boolean result; String eltClass = element.attr(XmlNames.ATT_CLASS); if (eltClass == null || // we ignore infoboxes (!eltClass.contains(CLASS_INFORMATIONBOX) // and wikitables && !eltClass.contains(CLASS_WIKITABLE) // navigation boxes && !eltClass.contains(CLASS_NAVIGATIONBOX) // navigation boxes, WP warnings (incompleteness, etc.) && !eltClass.contains(CLASS_METADATA) // personal data box (?) && !eltClass.contains(CLASS_PERSONDATA))) { result = true; Element tbodyElt = element.children().get(0); for (Element rowElt : tbodyElt.children()) { for (Element colElt : rowElt.children()) { // process cell content processTextElement(colElt, rawStr, linkedStr); // possibly add final dot and space. if (rawStr.charAt(rawStr.length() - 1) != ' ') { if (rawStr.charAt(rawStr.length() - 1) == '.') { rawStr.append(" "); linkedStr.append(" "); } else { rawStr.append(". "); linkedStr.append(". "); } } } } } else result = false; return result; }
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Retrieve the text located in //from ww w . jav a 2 s . co m * a description list (DL) HTML element. * * @param element * Element to be processed. * @param rawStr * Current raw text string. * @param linkedStr * Current text with hyperlinks. */ private void processDescriptionListElement(Element element, StringBuilder rawStr, StringBuilder linkedStr) { // possibly remove the last new line character char c = rawStr.charAt(rawStr.length() - 1); if (c == '\n') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // possibly remove preceeding space c = rawStr.charAt(rawStr.length() - 1); if (c == ' ') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // possibly add a column c = rawStr.charAt(rawStr.length() - 1); if (c != '.' && c != ':' && c != ';') { rawStr.append(":"); linkedStr.append(":"); } // process each list element Elements elements = element.children(); Iterator<Element> it = elements.iterator(); Element tempElt = null; if (it.hasNext()) tempElt = it.next(); while (tempElt != null) { // add leading space rawStr.append(" "); linkedStr.append(" "); // get term String tempName = tempElt.tagName(); if (tempName.equals(XmlNames.ELT_DT)) { // process term processTextElement(tempElt, rawStr, linkedStr); // possibly remove the last new line character c = rawStr.charAt(rawStr.length() - 1); if (c == '\n') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // possibly remove preceeding space c = rawStr.charAt(rawStr.length() - 1); if (c == ' ') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // possibly add a column and space c = rawStr.charAt(rawStr.length() - 1); if (c != '.' 
&& c != ':' && c != ';') { rawStr.append(": "); linkedStr.append(": "); } // go to next element if (it.hasNext()) tempElt = it.next(); else tempElt = null; } // get definition // if(tempName.equals(XmlNames.ELT_DD)) if (tempElt != null) { // process term processTextElement(tempElt, rawStr, linkedStr); // possibly remove the last new line character c = rawStr.charAt(rawStr.length() - 1); if (c == '\n') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // possibly remove preceeding space c = rawStr.charAt(rawStr.length() - 1); if (c == ' ') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); } // possibly add a semi-column c = rawStr.charAt(rawStr.length() - 1); if (c != '.' && c != ':' && c != ';') { rawStr.append(";"); linkedStr.append(";"); } // go to next element if (it.hasNext()) tempElt = it.next(); else tempElt = null; } } // possibly remove last separator c = rawStr.charAt(rawStr.length() - 1); if (c == ';') { rawStr.deleteCharAt(rawStr.length() - 1); linkedStr.deleteCharAt(linkedStr.length() - 1); c = rawStr.charAt(rawStr.length() - 1); if (c != '.') { rawStr.append("."); linkedStr.append("."); } rawStr.append("\n"); linkedStr.append("\n"); } }
From source file:cn.wanghaomiao.xpath.core.XpathEvaluator.java
/** * ?xpath/*w w w . j a v a 2s . co m*/ * * @param xpath * @param root * @return */ public List<JXNode> evaluate(String xpath, Elements root) throws NoSuchAxisException, NoSuchFunctionException { List<JXNode> res = new LinkedList<JXNode>(); Elements context = root; List<Node> xpathNodes = getXpathNodeTree(xpath); for (int i = 0; i < xpathNodes.size(); i++) { Node n = xpathNodes.get(i); LinkedList<Element> contextTmp = new LinkedList<Element>(); if (n.getScopeEm() == ScopeEm.RECURSIVE || n.getScopeEm() == ScopeEm.CURREC) { if (n.getTagName().startsWith("@")) { for (Element e : context) { //? String key = n.getTagName().substring(1); if (key.equals("*")) { res.add(JXNode.t(e.attributes().toString())); } else { String value = e.attr(key); if (StringUtils.isNotBlank(value)) { res.add(JXNode.t(value)); } } //?? for (Element dep : e.getAllElements()) { if (key.equals("*")) { res.add(JXNode.t(dep.attributes().toString())); } else { String value = dep.attr(key); if (StringUtils.isNotBlank(value)) { res.add(JXNode.t(value)); } } } } } else if (n.getTagName().endsWith("()")) { //??text() res.add(JXNode.t(context.text())); } else { Elements searchRes = context.select(n.getTagName()); for (Element e : searchRes) { Element filterR = filter(e, n); if (filterR != null) { contextTmp.add(filterR); } } context = new Elements(contextTmp); if (i == xpathNodes.size() - 1) { for (Element e : contextTmp) { res.add(JXNode.e(e)); } } } } else { if (n.getTagName().startsWith("@")) { for (Element e : context) { String key = n.getTagName().substring(1); if (key.equals("*")) { res.add(JXNode.t(e.attributes().toString())); } else { String value = e.attr(key); if (StringUtils.isNotBlank(value)) { res.add(JXNode.t(value)); } } } } else if (n.getTagName().endsWith("()")) { res = (List<JXNode>) callFunc(n.getTagName().substring(0, n.getTagName().length() - 2), context); } else { for (Element e : context) { Elements filterScope = e.children(); if (StringUtils.isNotBlank(n.getAxis())) { filterScope 
= getAxisScopeEls(n.getAxis(), e); } for (Element chi : filterScope) { Element fchi = filter(chi, n); if (fchi != null) { contextTmp.add(fchi); } } } context = new Elements(contextTmp); if (i == xpathNodes.size() - 1) { for (Element e : contextTmp) { res.add(JXNode.e(e)); } } } } } return res; }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected SearchRequestResult parse_search(String html, int page) throws OpacErrorException { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url + "/searchfoo"); if (doc.select(".error").size() > 0) { throw new OpacErrorException(doc.select(".error").text().trim()); } else if (doc.select(".nohits").size() > 0) { throw new OpacErrorException(doc.select(".nohits").text().trim()); } else if (doc.select(".box-header h2, #nohits").text().contains("keine Treffer")) { return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1); }//from w w w . jav a 2s.c o m int results_total = -1; String resultnumstr = doc.select(".box-header h2").first().text(); if (resultnumstr.contains("(1/1)") || resultnumstr.contains(" 1/1")) { reusehtml = html; throw new OpacErrorException("is_a_redirect"); } else if (resultnumstr.contains("(")) { results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1")); } else if (resultnumstr.contains(": ")) { results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1")); } Elements table = doc.select("table.data tbody tr"); identifier = null; Elements links = doc.select("table.data a"); boolean haslink = false; for (int i = 0; i < links.size(); i++) { Element node = links.get(i); if (node.hasAttr("href") & node.attr("href").contains("singleHit.do") && !haslink) { haslink = true; try { List<NameValuePair> anyurl = URLEncodedUtils .parse(new URI(node.attr("href").replace(" ", "%20").replace("&", "&")), ENCODING); for (NameValuePair nv : anyurl) { if (nv.getName().equals("identifier")) { identifier = nv.getValue(); break; } } } catch (Exception e) { e.printStackTrace(); } } } List<SearchResult> results = new ArrayList<>(); for (int i = 0; i < table.size(); i++) { Element tr = table.get(i); SearchResult sr = new SearchResult(); if (tr.select("td img[title]").size() > 0) { String title = tr.select("td img").get(0).attr("title"); String[] fparts = tr.select("td img").get(0).attr("src").split("/"); 
String fname = fparts[fparts.length - 1]; MediaType default_by_fname = defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", "")); MediaType default_by_title = defaulttypes.get(title); MediaType default_name = default_by_title != null ? default_by_title : default_by_fname; if (data.has("mediatypes")) { try { sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname))); } catch (JSONException | IllegalArgumentException e) { sr.setType(default_name); } } else { sr.setType(default_name); } } String alltext = tr.text(); if (alltext.contains("eAudio") || alltext.contains("eMusic")) { sr.setType(MediaType.MP3); } else if (alltext.contains("eVideo")) { sr.setType(MediaType.EVIDEO); } else if (alltext.contains("eBook")) { sr.setType(MediaType.EBOOK); } else if (alltext.contains("Munzinger")) { sr.setType(MediaType.EDOC); } if (tr.children().size() > 3 && tr.child(3).select("img[title*=cover]").size() == 1) { sr.setCover(tr.child(3).select("img[title*=cover]").attr("abs:src")); if (sr.getCover().contains("showCover.do")) { downloadCover(sr); } } Element middlething; if (tr.children().size() > 2 && tr.child(2).select("a").size() > 0) { middlething = tr.child(2); } else { middlething = tr.child(1); } List<Node> children = middlething.childNodes(); if (middlething.select("div").not("#hlrightblock,.bestellfunktionen").size() == 1) { Element indiv = middlething.select("div").not("#hlrightblock,.bestellfunktionen").first(); if (indiv.children().size() > 1) { children = indiv.childNodes(); } } else if (middlething.select("span.titleData").size() == 1) { children = middlething.select("span.titleData").first().childNodes(); } int childrennum = children.size(); List<String[]> strings = new ArrayList<>(); for (int ch = 0; ch < childrennum; ch++) { Node node = children.get(ch); if (node instanceof TextNode) { String text = ((TextNode) node).text().trim(); if (text.length() > 3) { strings.add(new String[] { "text", 
"", text }); } } else if (node instanceof Element) { List<Node> subchildren = node.childNodes(); for (int j = 0; j < subchildren.size(); j++) { Node subnode = subchildren.get(j); if (subnode instanceof TextNode) { String text = ((TextNode) subnode).text().trim(); if (text.length() > 3) { strings.add(new String[] { ((Element) node).tag().getName(), "text", text, ((Element) node).className(), node.attr("style") }); } } else if (subnode instanceof Element) { String text = ((Element) subnode).text().trim(); if (text.length() > 3) { strings.add(new String[] { ((Element) node).tag().getName(), ((Element) subnode).tag().getName(), text, ((Element) node).className(), node.attr("style") }); } } } } } StringBuilder description = null; if (tr.select("span.Z3988").size() == 1) { // Sometimes there is a <span class="Z3988"> item which provides // data in a standardized format. List<NameValuePair> z3988data; boolean hastitle = false; try { description = new StringBuilder(); z3988data = URLEncodedUtils .parse(new URI("http://dummy/?" 
+ tr.select("span.Z3988").attr("title")), "UTF-8"); for (NameValuePair nv : z3988data) { if (nv.getValue() != null) { if (!nv.getValue().trim().equals("")) { if (nv.getName().equals("rft.btitle") && !hastitle) { description.append("<b>").append(nv.getValue()).append("</b>"); hastitle = true; } else if (nv.getName().equals("rft.atitle") && !hastitle) { description.append("<b>").append(nv.getValue()).append("</b>"); hastitle = true; } else if (nv.getName().equals("rft.au")) { description.append("<br />").append(nv.getValue()); } else if (nv.getName().equals("rft.date")) { description.append("<br />").append(nv.getValue()); } } } } } catch (URISyntaxException e) { description = null; } } boolean described = false; if (description != null && description.length() > 0) { sr.setInnerhtml(description.toString()); described = true; } else { description = new StringBuilder(); } int k = 0; boolean yearfound = false; boolean titlefound = false; boolean sigfound = false; for (String[] part : strings) { if (!described) { if (part[0].equals("a") && (k == 0 || !titlefound)) { if (k != 0) { description.append("<br />"); } description.append("<b>").append(part[2]).append("</b>"); titlefound = true; } else if (part[2].matches("\\D*[0-9]{4}\\D*") && part[2].length() <= 10) { yearfound = true; if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) { if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) { if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k > 1 && k < 4 && !sigfound && part[0].equals("text") && part[2].matches("^[A-Za-z0-9,\\- ]+$")) { description.append("<br />"); description.append(part[2]); } } if (part.length == 4) { if (part[0].equals("span") && part[3].equals("textgruen")) { sr.setStatus(SearchResult.Status.GREEN); } else if 
(part[0].equals("span") && part[3].equals("textrot")) { sr.setStatus(SearchResult.Status.RED); } } else if (part.length == 5) { if (part[4].contains("purple")) { sr.setStatus(SearchResult.Status.YELLOW); } } if (sr.getStatus() == null) { if ((part[2].contains("entliehen") && part[2].startsWith("Vormerkung ist leider nicht mglich")) || part[2].contains("nur in anderer Zweigstelle ausleihbar und nicht bestellbar")) { sr.setStatus(SearchResult.Status.RED); } else if (part[2].startsWith("entliehen") || part[2].contains("Ein Exemplar finden Sie in einer anderen Zweigstelle")) { sr.setStatus(SearchResult.Status.YELLOW); } else if ((part[2].startsWith("bestellbar") && !part[2].contains("nicht bestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vormerkbar") && !part[2].contains("nicht vormerkbar")) || (part[2].contains("heute zurckgebucht")) || (part[2].contains("ausleihbar") && !part[2].contains("nicht ausleihbar"))) { sr.setStatus(SearchResult.Status.GREEN); } if (sr.getType() != null) { if (sr.getType().equals(MediaType.EBOOK) || sr.getType().equals(MediaType.EVIDEO) || sr.getType().equals(MediaType.MP3)) // Especially Onleihe.de ebooks are often marked // green though they are not available. { sr.setStatus(SearchResult.Status.UNKNOWN); } } } k++; } if (!described) { sr.setInnerhtml(description.toString()); } sr.setNr(10 * (page - 1) + i); sr.setId(null); results.add(sr); } resultcount = results.size(); return new SearchRequestResult(results, results_total, page); }
From source file:com.lloydtorres.stately.issues.IssueDecisionActivity.java
/**
 * Process the received page into the Issue and its IssueOptions.
 *
 * <p>If the page cannot be parsed as expected (missing dilemma container, too few paragraphs,
 * or an option without text), the refresh indicator is stopped, a parsing-error snackbar is
 * shown and the method returns early.
 *
 * @param v Activity view
 * @param d Document received from NationStates
 */
private void processIssueInfo(View v, Document d) {
    // First check if the issue is still available
    if (d.text().contains(NOT_AVAILABLE)) {
        mSwipeRefreshLayout.setRefreshing(false);
        SparkleHelper.makeSnackbar(v,
                String.format(Locale.US, getString(R.string.issue_unavailable), mNation.name));
        return;
    }

    Element issueInfoContainer = d.select("div#dilemma").first();
    if (issueInfoContainer == null) {
        // safety check
        mSwipeRefreshLayout.setRefreshing(false);
        SparkleHelper.makeSnackbar(v, getString(R.string.login_error_parsing));
        return;
    }

    Elements issueInfoRaw = issueInfoContainer.children();
    Elements paragraphs = issueInfoRaw.select("p");

    // A regular issue uses the first paragraph. An issue chain uses the second one, plus the
    // third when the page contains the "story so far" marker.
    boolean isChain = d.select("div.dilemmachain").first() != null;
    boolean hasStorySoFar = isChain && d.text().contains(STORY_SO_FAR);
    int requiredParagraphs = hasStorySoFar ? 3 : (isChain ? 2 : 1);
    if (paragraphs.size() < requiredParagraphs) {
        // safety check: previously a page with too few paragraphs crashed with a
        // NullPointerException / IndexOutOfBoundsException instead of reporting the problem.
        mSwipeRefreshLayout.setRefreshing(false);
        SparkleHelper.makeSnackbar(v, getString(R.string.login_error_parsing));
        return;
    }

    String issueText = paragraphs.get(isChain ? 1 : 0).text();
    if (hasStorySoFar) {
        issueText = issueText + "<br><br>" + paragraphs.get(2).text();
    }
    issue.content = issueText;

    issue.options = new ArrayList<IssueOption>();
    Element optionHolderMain = issueInfoRaw.select("ol.diloptions").first();
    if (optionHolderMain != null) {
        Elements optionsHolder = optionHolderMain.select("li");
        int i = 0;
        for (Element option : optionsHolder) {
            IssueOption issueOption = new IssueOption();
            issueOption.index = i++;

            // An option without a button gets the "selected" header instead of a button name.
            Element button = option.select("button").first();
            if (button != null) {
                issueOption.header = button.attr("name");
            } else {
                issueOption.header = IssueOption.SELECTED_HEADER;
            }

            Element optionContentHolder = option.select("p").first();
            if (optionContentHolder == null) {
                // safety check
                mSwipeRefreshLayout.setRefreshing(false);
                SparkleHelper.makeSnackbar(v, getString(R.string.login_error_parsing));
                return;
            }
            issueOption.content = optionContentHolder.text();
            issue.options.add(issueOption);
        }
    }

    // The dismiss option is always appended last, with index -1 and empty content.
    IssueOption dismissOption = new IssueOption();
    dismissOption.index = -1;
    dismissOption.header = IssueOption.DISMISS_HEADER;
    dismissOption.content = "";
    issue.options.add(dismissOption);

    setRecyclerAdapter(issue);
    mSwipeRefreshLayout.setRefreshing(false);
    mSwipeRefreshLayout.setEnabled(false);
}
From source file:com.lloydtorres.stately.issues.IssuesFragment.java
/** * Process the HTML contents of the issues into actual Issue objects * @param d// ww w .j a v a2 s .c om */ private void processIssues(View v, Document d) { issues = new ArrayList<Object>(); Element issuesContainer = d.select("ul.dilemmalist").first(); if (issuesContainer == null) { // safety check mSwipeRefreshLayout.setRefreshing(false); SparkleHelper.makeSnackbar(v, getString(R.string.login_error_parsing)); return; } Elements issuesRaw = issuesContainer.children(); for (Element i : issuesRaw) { Issue issueCore = new Issue(); Elements issueContents = i.children(); // Get issue ID and name Element issueMain = issueContents.select("a").first(); if (issueMain == null) { continue; } String issueLink = issueMain.attr("href"); issueCore.id = Integer.valueOf(issueLink.replace("page=show_dilemma/dilemma=", "")); Matcher chainMatcher = CHAIN_ISSUE_REGEX.matcher(issueMain.text()); if (chainMatcher.find()) { issueCore.chain = chainMatcher.group(1); issueCore.title = chainMatcher.group(2); } else { issueCore.title = issueMain.text(); } issues.add(issueCore); } Element nextIssueUpdate = d.select("p.dilemmanextupdate").first(); if (nextIssueUpdate != null) { String nextUpdate = nextIssueUpdate.text(); issues.add(nextUpdate); } if (issuesRaw.size() <= 0) { String nextUpdate = getString(R.string.no_issues); Matcher m = NEXT_ISSUE_REGEX.matcher(d.html()); if (m.find()) { long nextUpdateTime = Long.valueOf(m.group(1)) / 1000L; nextUpdate = String.format(Locale.US, getString(R.string.next_issue), SparkleHelper.getReadableDateFromUTC(getContext(), nextUpdateTime)); } issues.add(nextUpdate); } if (mRecyclerAdapter == null) { mRecyclerAdapter = new IssuesRecyclerAdapter(getContext(), issues, mNation); mRecyclerView.setAdapter(mRecyclerAdapter); } else { ((IssuesRecyclerAdapter) mRecyclerAdapter).setIssueCards(issues); } mSwipeRefreshLayout.setRefreshing(false); }