List of usage examples for org.jsoup.nodes Element select
public Elements select(String cssQuery)
From source file:me.vertretungsplan.parser.UntisCommonParser.java
/** * Parses an Untis substitution table ({@link UntisSubstitutionParser}). * * @param v/*from w w w . j av a2 s . c o m*/ * @param lastChange * @param doc * @throws JSONException * @throws CredentialInvalidException */ protected void parseSubstitutionTable(SubstitutionSchedule v, String lastChange, Document doc) throws JSONException, CredentialInvalidException { JSONObject data = scheduleData.getData(); LocalDateTime lastChangeDate = ParserUtils.parseDateTime(lastChange); Pattern dayPattern = Pattern.compile("\\d\\d?.\\d\\d?. / \\w+"); int dateColumn = -1; JSONArray columns = data.getJSONArray("columns"); for (int i = 0; i < columns.length(); i++) { if (columns.getString(i).equals("date")) { dateColumn = i; break; } } Element table = doc.select("table[rules=all], table:has(tr:has(td[align=center]))").first(); if (table.text().replace("\u00a0", "").trim().equals("Keine Vertretungen")) return; if (dateColumn == -1) { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); day.setLastChangeString(lastChange); day.setLastChange(lastChangeDate); String title = doc.select("font[size=5], font[size=4], font[size=3] b").text(); Matcher matcher = dayPattern.matcher(title); if (matcher.find()) { String date = matcher.group(); day.setDateString(date); day.setDate(ParserUtils.parseDate(date)); } parseSubstitutionScheduleTable(table, data, day); v.addDay(day); } else { for (Element line : table.select("tr.list.odd:not(:has(td.inline_header)), " + "tr.list.even:not(:has(td.inline_header)), " + "tr:has(td[align=center]):gt(0)")) { SubstitutionScheduleDay day = null; String date = line.select("td").get(dateColumn).text().trim(); if (date.indexOf("-") > 0) { date = date.substring(0, date.indexOf("-") - 1).trim(); } LocalDate parsedDate = ParserUtils.parseDate(date); for (SubstitutionScheduleDay search : v.getDays()) { if (Objects.equals(search.getDate(), parsedDate) || Objects.equals(search.getDateString(), date)) { day = search; break; } } if (day == null) { day = new 
SubstitutionScheduleDay(); day.setDateString(date); day.setDate(parsedDate); day.setLastChangeString(lastChange); day.setLastChange(lastChangeDate); v.addDay(day); } parseSubstitutionScheduleTable(line, data, day); } } }
From source file:com.vaushell.shaarlijavaapi.ShaarliClient.java
/**
 * Extracts one trimmed text value from {@code source} according to the named template.
 *
 * <p>The template's CSS path (if not empty) selects the first matching descendant; its attribute name
 * (if not empty) picks an attribute instead of the element text; its regex (if not empty) narrows the
 * value to the first match when there is one.
 *
 * @param source       element to read from; must not be {@code null}
 * @param templateName key of the template to apply
 * @return the extracted value, or {@code null} when nothing matches the CSS path or the value is empty
 * @throws IllegalArgumentException if {@code source} is {@code null} or the template is unknown
 */
private String extract(final Element source, final String templateName) {
    if (source == null) {
        throw new IllegalArgumentException();
    }

    final ShaarliTemplates.Template tpl = templates.get(templateName);
    if (tpl == null) {
        throw new IllegalArgumentException("template '" + templateName + "' not found");
    }

    // Resolve the element to read: the source itself, or the first hit of the CSS path.
    Element target = source;
    if (!tpl.cssPath.isEmpty()) {
        final Elements matches = source.select(tpl.cssPath);
        if (matches.isEmpty()) {
            return null;
        }
        target = matches.first();
    }

    final String raw = tpl.attribut.isEmpty() ? target.text() : target.attr(tpl.attribut);
    if (raw == null) {
        return null;
    }

    String value = raw.trim();
    if (!tpl.regex.isEmpty()) {
        final Matcher matcher = Pattern.compile(tpl.regex).matcher(value);
        if (matcher.find()) {
            value = matcher.group().trim();
        }
    }

    return value.isEmpty() ? null : value;
}
From source file:info.dolezel.fatrat.plugins.UloztoDownload.java
@Override public void processLink(String link) { //if (link.contains("/live/")) // link = link.replace("/live/", "/"); if (link.startsWith("http://uloz.to") || link.startsWith("https://uloz.to")) link = link.replace("https?://uloz.to", "https://www.uloz.to"); if (link.startsWith("http://m.uloz.to") || link.startsWith("https://m.uloz.to")) link = link.replace("https?://m.uloz.to", "https://www.uloz.to"); if (!logIn(link)) return;/* w ww. ja va2 s.com*/ final String downloadLink = link; // I can't make 'link' final fetchPage(link, new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { try { if (headers.containsKey("location")) { String location = headers.get("location"); if (location.contains("smazano") || location.contains("nenalezeno")) setFailed("The file has been removed"); else processLink(location); return; } CharBuffer cb = charsetUtf8.decode(buf); if (cb.toString().contains("?disclaimer=1")) { processLink(downloadLink + "?disclaimer=1"); return; } final Document doc = Jsoup.parse(cb.toString()); final Element freeForm = doc.getElementById("frm-download-freeDownloadTab-freeDownloadForm"); final Element premiumLink = doc.getElementById("#quickDownloadButton"); boolean usePremium = usePremium(downloadLink); if (cb.toString().contains("Nem dostatek kreditu")) setMessage("Credit depleted, using FREE download"); else if (usePremium && premiumLink != null) { String msg = "Using premium download"; Elements aCredits = doc.getElementsByAttributeValue("href", "/kredit"); if (!aCredits.isEmpty()) msg += " (" + aCredits.get(0).ownText() + " left)"; setMessage(msg); startDownload("http://www.uloz.to" + premiumLink.attr("href")); return; } else if (loggedIn) setMessage("Login failed, using FREE download"); Elements aNames = doc.getElementsByClass("jsShowDownload"); if (!aNames.isEmpty()) reportFileName(aNames.get(0).ownText()); final PostQuery pq = new PostQuery(); final Map<String, String> hdr = new HashMap<String, 
String>(); Elements eHiddens = freeForm.select("input[type=hidden]"); hdr.put("X-Requested-With", "XMLHttpRequest"); hdr.put("Referer", downloadLink); hdr.put("Accept", "application/json, text/javascript, */*; q=0.01"); for (Element e : eHiddens) pq.add(e.attr("name"), e.attr("value")); fetchPage("https://uloz.to/reloadXapca.php?rnd=" + Math.abs(new Random().nextInt()), new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { CharBuffer cb = charsetUtf8.decode(buf); String captchaUrl; try { JSONObject json = new JSONObject(cb.toString()); captchaUrl = "https:" + json.getString("image"); pq.add("hash", json.getString("hash")); pq.add("timestamp", "" + json.getInt("timestamp")); pq.add("salt", "" + json.getInt("salt")); } catch (JSONException e) { setFailed("Error parsing captcha JSON"); return; } solveCaptcha(captchaUrl, new CaptchaListener() { @Override public void onFailed() { setFailed("Failed to decode the captcha code"); } @Override public void onSolved(String text) { String action = freeForm.attr("action"); pq.add("captcha_value", text); fetchPage("https://www.uloz.to" + action, new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { try { CharBuffer cb = charsetUtf8.decode(buf); JSONObject obj = new JSONObject(cb.toString()); startDownload(obj.getString("url")); } catch (Exception e) { setFailed("" + e); } } @Override public void onFailed(String error) { setFailed(error); } }, pq.toString(), hdr); } }); } @Override public void onFailed(String error) { setFailed("Failed to load captcha AJAX page"); } }); } catch (Exception e) { e.printStackTrace(); setFailed(e.toString()); } } @Override public void onFailed(String error) { setFailed("Failed to load the initial page"); } }, null); }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
/**
 * Performs one step of the multi-step reservation/order flow for {@code item}.
 *
 * <p>Depending on {@code useraction} this either confirms a pending request, opens the availability page
 * (logging in and asking for a branch when the page requires it), or submits a previously selected
 * branch. The resulting page is then classified as error, confirmation needed, or success.
 *
 * @param item       item to reserve; its reservation info is used as the query string
 * @param acc        account whose name and password are posted when a login form is shown
 * @param useraction step identifier ({@code 0}, {@code MultiStepResult.ACTION_CONFIRMATION} or
 *                   {@code ReservationResult.ACTION_BRANCH})
 * @param selection  value chosen by the user for the branch selection, or {@code null}
 * @return the outcome of this step
 * @throws IOException declared by the signature (presumably raised by the HTTP helpers - confirm)
 */
@Override
public ReservationResult reservation(DetailledItem item, Account acc, int useraction, String selection)
        throws IOException {
    String reservation_info = item.getReservation_info();
    final String branch_inputfield = "issuepoint";

    Document doc = null;

    // "doBestellung" links are posted to order.do, everything else to reservation.do.
    String action = "reservation";
    if (reservation_info.contains("doBestellung")) {
        action = "order";
    }

    if (useraction == MultiStepResult.ACTION_CONFIRMATION) {
        // The user confirmed: submit the pending request.
        List<NameValuePair> nameValuePairs = new ArrayList<>(2);
        nameValuePairs.add(new BasicNameValuePair("methodToCall", action));
        nameValuePairs.add(new BasicNameValuePair("CSId", CSId));
        String html = httpPost(opac_url + "/" + action + ".do", new UrlEncodedFormEntity(nameValuePairs),
                ENCODING);
        doc = Jsoup.parse(html);
    } else if (selection == null || useraction == 0) {
        // First step: open the availability page for this reservation link.
        String html = httpGet(opac_url + "/availability.do?" + reservation_info, ENCODING);
        doc = Jsoup.parse(html);

        if (doc.select("input[name=username]").size() > 0) {
            // Login required
            List<NameValuePair> nameValuePairs = new ArrayList<>(2);
            nameValuePairs.add(new BasicNameValuePair("username", acc.getName()));
            nameValuePairs.add(new BasicNameValuePair("password", acc.getPassword()));
            nameValuePairs.add(new BasicNameValuePair("methodToCall", "submit"));
            nameValuePairs.add(new BasicNameValuePair("CSId", CSId));
            nameValuePairs.add(new BasicNameValuePair("login_action", "Login"));

            html = handleLoginMessage(
                    httpPost(opac_url + "/login.do", new UrlEncodedFormEntity(nameValuePairs), ENCODING));
            doc = Jsoup.parse(html);

            // No element with class "error" on the response is treated as a successful login.
            if (doc.getElementsByClass("error").size() == 0) {
                logged_in = System.currentTimeMillis();
                logged_in_as = acc;
            }
        }
        if (doc.select("input[name=expressorder]").size() > 0) {
            // NOTE(review): selection may be null in this branch and is posted as the branch value - confirm
            // that this is intended.
            List<NameValuePair> nameValuePairs = new ArrayList<>(2);
            nameValuePairs.add(new BasicNameValuePair(branch_inputfield, selection));
            nameValuePairs.add(new BasicNameValuePair("methodToCall", action));
            nameValuePairs.add(new BasicNameValuePair("CSId", CSId));
            nameValuePairs.add(new BasicNameValuePair("expressorder", " "));
            html = httpPost(opac_url + "/" + action + ".do", new UrlEncodedFormEntity(nameValuePairs),
                    ENCODING);
            doc = Jsoup.parse(html);
        }
        if (doc.select("input[name=" + branch_inputfield + "]").size() > 0) {
            // The page offers a choice of branches: collect them and ask the user to pick one.
            List<Map<String, String>> branches = new ArrayList<>();
            for (Element option : doc.select("input[name=" + branch_inputfield + "]").first().parent().parent()
                    .parent().select("td")) {
                // Only cells holding exactly one input describe an option.
                if (option.select("input").size() != 1) {
                    continue;
                }
                String value = option.text().trim();
                String key = option.select("input").val();
                Map<String, String> selopt = new HashMap<>();
                selopt.put("key", key);
                selopt.put("value", value);
                branches.add(selopt);
            }
            ReservationResult result = new ReservationResult(MultiStepResult.Status.SELECTION_NEEDED);
            result.setActionIdentifier(ReservationResult.ACTION_BRANCH);
            result.setSelection(branches);
            return result;
        }
    } else if (useraction == ReservationResult.ACTION_BRANCH) {
        // The user picked a branch: submit it.
        List<NameValuePair> nameValuePairs = new ArrayList<>(2);
        nameValuePairs.add(new BasicNameValuePair(branch_inputfield, selection));
        nameValuePairs.add(new BasicNameValuePair("methodToCall", action));
        nameValuePairs.add(new BasicNameValuePair("CSId", CSId));

        String html = httpPost(opac_url + "/" + action + ".do", new UrlEncodedFormEntity(nameValuePairs),
                ENCODING);
        doc = Jsoup.parse(html);
    }

    // No branch above produced a page (unknown useraction with a non-null selection).
    if (doc == null) {
        return new ReservationResult(MultiStepResult.Status.ERROR);
    }

    if (doc.getElementsByClass("error").size() >= 1) {
        return new ReservationResult(MultiStepResult.Status.ERROR,
                doc.getElementsByClass("error").get(0).text());
    }

    if (doc.select("#CirculationForm p").size() > 0 && doc.select("input[type=button]").size() >= 2) {
        // Confirmation page: each <br>-separated fragment becomes a "label:" / value pair, or a value
        // with an empty label when the fragment contains no colon.
        List<String[]> details = new ArrayList<>();
        for (String row : doc.select("#CirculationForm p").first().html().split("<br>")) {
            Document frag = Jsoup.parseBodyFragment(row);
            if (frag.text().contains(":")) {
                String[] split = frag.text().split(":");
                // Only the text between the first and second colon is kept as the value.
                if (split.length >= 2) {
                    details.add(new String[] { split[0].trim() + ":", split[1].trim() });
                }
            } else {
                details.add(new String[] { "", frag.text().trim() });
            }
        }
        ReservationResult result = new ReservationResult(Status.CONFIRMATION_NEEDED);
        result.setDetails(details);
        return result;
    }

    if (doc.select("#CirculationForm .textrot").size() >= 1) {
        String errmsg = doc.select("#CirculationForm .textrot").get(0).text();
        if (errmsg.contains("Dieses oder andere Exemplare in anderer Zweigstelle ausleihbar")) {
            // Retry with another copy: the one with the fewest reservations, ties broken by the earlier
            // return date.
            Copy best = null;
            for (Copy copy : item.getCopies()) {
                if (copy.getResInfo() == null) {
                    continue;
                }
                if (best == null) {
                    best = copy;
                    continue;
                }
                try {
                    if (Integer.parseInt(copy.getReservations()) < Long.parseLong(best.getReservations())) {
                        best = copy;
                    } else if (Integer.parseInt(copy.getReservations()) == Long
                            .parseLong(best.getReservations())) {
                        // NOTE(review): no null check on the return dates here - confirm they are always
                        // set when the reservation counts parse.
                        if (copy.getReturnDate().isBefore(best.getReturnDate())) {
                            best = copy;
                        }
                    }
                } catch (NumberFormatException e) {
                    // A copy whose reservation count is not numeric is skipped in the comparison.
                }
            }
            if (best != null) {
                item.setReservation_info(best.getResInfo());
                return reservation(item, acc, 0, null);
            }
        }
        return new ReservationResult(MultiStepResult.Status.ERROR, errmsg);
    }

    if (doc.select("#CirculationForm td[colspan=2] strong").size() >= 1) {
        return new ReservationResult(MultiStepResult.Status.OK,
                doc.select("#CirculationForm td[colspan=2] strong").get(0).text());
    }
    return new ReservationResult(Status.OK);
}
From source file:ExtractorContentTest.java
private void treatSection(Element section, List<Catalog> catalogs) { // 1. get section name // FIXME what is it does not exist? // FIXME can be "h3" Elements sect2 = section.getElementsByTag("h2"); String s2 = null;/*from ww w. j a v a2 s .c om*/ if (!sect2.isEmpty()) s2 = sect2.first().text(); // FIXME what about more than 1 ? String s3 = null; Elements sect3 = section.getElementsByTag("h3"); if (!sect3.isEmpty()) s3 = sect3.first().text(); String dt = null; Elements sectDT = section.getElementsByTag("p"); if (!sectDT.isEmpty()) { String contentDT = sectDT.first().text(); if (contentDT.startsWith(";")) dt = contentDT.replaceAll(";", ""); } // FIXME can be subsection // FIXME (1. optional step) some comments // 2. retrieve tabular Elements tables = section.getElementsByTag("table"); //if (!tables.isEmpty()) //System.err.println("\n****** " + s2 + " " + s3 + " *******\n"); for (Element table : tables) { // (0. optional step) act as subviewname Elements caption = table.select("caption"); String captionName = null; if (!caption.isEmpty()) captionName = caption.first().text(); /*** * Headers */ // List<Header> rHeaders = collectHeaders(table); boolean sortable = !table.select("[class=sortable wikitable]").isEmpty() || !table.select("[class=wikitable sortable]").isEmpty(); // FIXME: other cases Elements heads = table.select("thead"); if (sortable && (!heads.isEmpty())) { rHeaders = collectHeaders(heads.first()); } // 2 treat row Catalog product = null; Tree<String> structuralInformation = mkStructuralInformation(s2, s3, dt, captionName); if (sortable) { product = treatRows(table.select("tbody").first(), structuralInformation, rHeaders, sortable); } else product = treatRows(table, structuralInformation, rHeaders, sortable); catalogs.add(product); // } // set the "ID" / names // clean up for (Catalog catalog : catalogs) { for (Product p : catalog) { Header primaryHeader = p.getHeaders().get(0); p.setName(p.getValue(primaryHeader.getName())); } } }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected SearchRequestResult parse_search(String html, int page) throws OpacErrorException { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url + "/searchfoo"); if (doc.select(".error").size() > 0) { throw new OpacErrorException(doc.select(".error").text().trim()); } else if (doc.select(".nohits").size() > 0) { throw new OpacErrorException(doc.select(".nohits").text().trim()); } else if (doc.select(".box-header h2, #nohits").text().contains("keine Treffer")) { return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1); }// w ww .j a va 2 s.c o m int results_total = -1; String resultnumstr = doc.select(".box-header h2").first().text(); if (resultnumstr.contains("(1/1)") || resultnumstr.contains(" 1/1")) { reusehtml = html; throw new OpacErrorException("is_a_redirect"); } else if (resultnumstr.contains("(")) { results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1")); } else if (resultnumstr.contains(": ")) { results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1")); } Elements table = doc.select("table.data tbody tr"); identifier = null; Elements links = doc.select("table.data a"); boolean haslink = false; for (int i = 0; i < links.size(); i++) { Element node = links.get(i); if (node.hasAttr("href") & node.attr("href").contains("singleHit.do") && !haslink) { haslink = true; try { List<NameValuePair> anyurl = URLEncodedUtils .parse(new URI(node.attr("href").replace(" ", "%20").replace("&", "&")), ENCODING); for (NameValuePair nv : anyurl) { if (nv.getName().equals("identifier")) { identifier = nv.getValue(); break; } } } catch (Exception e) { e.printStackTrace(); } } } List<SearchResult> results = new ArrayList<>(); for (int i = 0; i < table.size(); i++) { Element tr = table.get(i); SearchResult sr = new SearchResult(); if (tr.select("td img[title]").size() > 0) { String title = tr.select("td img").get(0).attr("title"); String[] fparts = tr.select("td img").get(0).attr("src").split("/"); String 
fname = fparts[fparts.length - 1]; MediaType default_by_fname = defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", "")); MediaType default_by_title = defaulttypes.get(title); MediaType default_name = default_by_title != null ? default_by_title : default_by_fname; if (data.has("mediatypes")) { try { sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname))); } catch (JSONException | IllegalArgumentException e) { sr.setType(default_name); } } else { sr.setType(default_name); } } String alltext = tr.text(); if (alltext.contains("eAudio") || alltext.contains("eMusic")) { sr.setType(MediaType.MP3); } else if (alltext.contains("eVideo")) { sr.setType(MediaType.EVIDEO); } else if (alltext.contains("eBook")) { sr.setType(MediaType.EBOOK); } else if (alltext.contains("Munzinger")) { sr.setType(MediaType.EDOC); } if (tr.children().size() > 3 && tr.child(3).select("img[title*=cover]").size() == 1) { sr.setCover(tr.child(3).select("img[title*=cover]").attr("abs:src")); if (sr.getCover().contains("showCover.do")) { downloadCover(sr); } } Element middlething; if (tr.children().size() > 2 && tr.child(2).select("a").size() > 0) { middlething = tr.child(2); } else { middlething = tr.child(1); } List<Node> children = middlething.childNodes(); if (middlething.select("div").not("#hlrightblock,.bestellfunktionen").size() == 1) { Element indiv = middlething.select("div").not("#hlrightblock,.bestellfunktionen").first(); if (indiv.children().size() > 1) { children = indiv.childNodes(); } } else if (middlething.select("span.titleData").size() == 1) { children = middlething.select("span.titleData").first().childNodes(); } int childrennum = children.size(); List<String[]> strings = new ArrayList<>(); for (int ch = 0; ch < childrennum; ch++) { Node node = children.get(ch); if (node instanceof TextNode) { String text = ((TextNode) node).text().trim(); if (text.length() > 3) { strings.add(new String[] { "text", "", 
text }); } } else if (node instanceof Element) { List<Node> subchildren = node.childNodes(); for (int j = 0; j < subchildren.size(); j++) { Node subnode = subchildren.get(j); if (subnode instanceof TextNode) { String text = ((TextNode) subnode).text().trim(); if (text.length() > 3) { strings.add(new String[] { ((Element) node).tag().getName(), "text", text, ((Element) node).className(), node.attr("style") }); } } else if (subnode instanceof Element) { String text = ((Element) subnode).text().trim(); if (text.length() > 3) { strings.add(new String[] { ((Element) node).tag().getName(), ((Element) subnode).tag().getName(), text, ((Element) node).className(), node.attr("style") }); } } } } } StringBuilder description = null; if (tr.select("span.Z3988").size() == 1) { // Sometimes there is a <span class="Z3988"> item which provides // data in a standardized format. List<NameValuePair> z3988data; boolean hastitle = false; try { description = new StringBuilder(); z3988data = URLEncodedUtils .parse(new URI("http://dummy/?" 
+ tr.select("span.Z3988").attr("title")), "UTF-8"); for (NameValuePair nv : z3988data) { if (nv.getValue() != null) { if (!nv.getValue().trim().equals("")) { if (nv.getName().equals("rft.btitle") && !hastitle) { description.append("<b>").append(nv.getValue()).append("</b>"); hastitle = true; } else if (nv.getName().equals("rft.atitle") && !hastitle) { description.append("<b>").append(nv.getValue()).append("</b>"); hastitle = true; } else if (nv.getName().equals("rft.au")) { description.append("<br />").append(nv.getValue()); } else if (nv.getName().equals("rft.date")) { description.append("<br />").append(nv.getValue()); } } } } } catch (URISyntaxException e) { description = null; } } boolean described = false; if (description != null && description.length() > 0) { sr.setInnerhtml(description.toString()); described = true; } else { description = new StringBuilder(); } int k = 0; boolean yearfound = false; boolean titlefound = false; boolean sigfound = false; for (String[] part : strings) { if (!described) { if (part[0].equals("a") && (k == 0 || !titlefound)) { if (k != 0) { description.append("<br />"); } description.append("<b>").append(part[2]).append("</b>"); titlefound = true; } else if (part[2].matches("\\D*[0-9]{4}\\D*") && part[2].length() <= 10) { yearfound = true; if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) { if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) { if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k > 1 && k < 4 && !sigfound && part[0].equals("text") && part[2].matches("^[A-Za-z0-9,\\- ]+$")) { description.append("<br />"); description.append(part[2]); } } if (part.length == 4) { if (part[0].equals("span") && part[3].equals("textgruen")) { sr.setStatus(SearchResult.Status.GREEN); } else if 
(part[0].equals("span") && part[3].equals("textrot")) { sr.setStatus(SearchResult.Status.RED); } } else if (part.length == 5) { if (part[4].contains("purple")) { sr.setStatus(SearchResult.Status.YELLOW); } } if (sr.getStatus() == null) { if ((part[2].contains("entliehen") && part[2].startsWith("Vormerkung ist leider nicht mglich")) || part[2].contains("nur in anderer Zweigstelle ausleihbar und nicht bestellbar")) { sr.setStatus(SearchResult.Status.RED); } else if (part[2].startsWith("entliehen") || part[2].contains("Ein Exemplar finden Sie in einer anderen Zweigstelle")) { sr.setStatus(SearchResult.Status.YELLOW); } else if ((part[2].startsWith("bestellbar") && !part[2].contains("nicht bestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vormerkbar") && !part[2].contains("nicht vormerkbar")) || (part[2].contains("heute zurckgebucht")) || (part[2].contains("ausleihbar") && !part[2].contains("nicht ausleihbar"))) { sr.setStatus(SearchResult.Status.GREEN); } if (sr.getType() != null) { if (sr.getType().equals(MediaType.EBOOK) || sr.getType().equals(MediaType.EVIDEO) || sr.getType().equals(MediaType.MP3)) // Especially Onleihe.de ebooks are often marked // green though they are not available. { sr.setStatus(SearchResult.Status.UNKNOWN); } } } k++; } if (!described) { sr.setInnerhtml(description.toString()); } sr.setNr(10 * (page - 1) + i); sr.setId(null); results.add(sr); } resultcount = results.size(); return new SearchRequestResult(results, results_total, page); }
From source file:ExtractorContentTest.java
private Catalog treatRows(Element table, Tree<String> structuralInformation, List<Header> rHeaders, boolean sortable) { int I = 0;/*from w w w . ja v a 2 s. c o m*/ Catalog product = new Catalog(structuralInformation, rHeaders); for (Element row : table.select("tr")) { Elements lines; if (sortable) { lines = row.select("th"); // first entry is a header in sortable table lines.addAll(row.select("td")); } else { lines = row.select("td"); } Product p = new Product("product_" + I, structuralInformation, rHeaders); int J = 0; for (Element line : lines) { p.add(J, line.text()); J++; } // necessarily a tr with a td if (!lines.isEmpty()) { if (sortable && (I == 0)) { // header (first entry) is not a product } else product.add(p); I++; } } return product; }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
/**
 * Builds a {@link DetailledItem} from a detail page.
 *
 * <p>Besides the passed HTML, two more tabs are fetched via {@code httpGet} (title tab and availability
 * tab). From these pages the method reads the ID, reservation links, cover, title, detail rows, download
 * links, the list of copies and the volume-search parameters.
 *
 * @param html HTML of the detail page
 * @return the populated item
 * @throws IOException declared by the signature (presumably raised by {@code httpGet} - confirm)
 */
protected DetailledItem parse_result(String html) throws IOException {
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url);

    // Title tab
    String html2 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showTitleActive", ENCODING);

    Document doc2 = Jsoup.parse(html2);
    doc2.setBaseUri(opac_url);

    // Availability tab
    String html3 = httpGet(opac_url + "/singleHit.do?methodToCall=activateTab&tab=showAvailabilityActive",
            ENCODING);

    Document doc3 = Jsoup.parse(html3);
    doc3.setBaseUri(opac_url);

    DetailledItem result = new DetailledItem();

    try {
        result.setId(doc.select("#bibtip_id").text().trim());
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    List<String> reservationlinks = new ArrayList<>();
    for (Element link : doc3.select("#vormerkung a, #tab-content a")) {
        String href = link.absUrl("href");
        Map<String, String> hrefq = getQueryParamsFirst(href);
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                // NOTE(review): this break leaves the whole loop, so links after the one carrying the
                // katkey are not examined for reservation - confirm that this is intended.
                break;
            }
        }

        // Reservation / order links
        if (hrefq.get("methodToCall") != null) {
            if (hrefq.get("methodToCall").equals("doVormerkung")
                    || hrefq.get("methodToCall").equals("doBestellung")) {
                // Only the query string is stored as reservation info.
                reservationlinks.add(href.split("\\?")[1]);
            }
        }
    }
    if (reservationlinks.size() == 1) {
        result.setReservable(true);
        result.setReservation_info(reservationlinks.get(0));
    } else if (reservationlinks.size() == 0) {
        result.setReservable(false);
    } else {
        // TODO: Multiple options - handle this case!
    }

    if (doc.select(".data td img").size() == 1) {
        result.setCover(doc.select(".data td img").first().attr("abs:src"));
        try {
            downloadCover(result);
        } catch (Exception e) {
            // Best effort: a failed cover download is ignored.
        }
    }

    if (doc.select(".aw_teaser_title").size() == 1) {
        result.setTitle(doc.select(".aw_teaser_title").first().text().trim());
    } else if (doc.select(".data td strong").size() > 0) {
        result.setTitle(doc.select(".data td strong").first().text().trim());
    } else {
        result.setTitle("");
    }
    if (doc.select(".aw_teaser_title_zusatz").size() > 0) {
        result.addDetail(new Detail("Titelzusatz", doc.select(".aw_teaser_title_zusatz").text().trim()));
    }

    String title = "";
    String text = "";
    boolean takeover = false;
    // NOTE(review): first() may return null when the selector matches nothing; unlike the second lookup
    // below, this one is dereferenced without a check.
    Element detailtrs = doc2.select(".box-container .data td").first();
    for (Node node : detailtrs.childNodes()) {
        if (node instanceof Element) {
            if (((Element) node).tagName().equals("strong")) {
                title = ((Element) node).text().trim();
                text = "";
            } else {
                if (((Element) node).tagName().equals("a")
                        && (((Element) node).text().trim().contains("hier klicken")
                                || title.equals("Link:"))) {
                    text = text + node.attr("href");
                    takeover = true;
                    break;
                }
            }
        } else if (node instanceof TextNode) {
            text = text + ((TextNode) node).text();
        }
    }
    // title/text from this first cell are only carried into the next pass when a link was found.
    if (!takeover) {
        text = "";
        title = "";
    }

    detailtrs = doc2.select("#tab-content .data td").first();
    if (detailtrs != null) {
        // Details are laid out as <strong>label</strong> followed by value nodes; a detail is emitted
        // each time the next label starts.
        for (Node node : detailtrs.childNodes()) {
            if (node instanceof Element) {
                if (((Element) node).tagName().equals("strong")) {
                    if (!text.equals("") && !title.equals("")) {
                        result.addDetail(new Detail(title.trim(), text.trim()));
                        if (title.equals("Titel:")) {
                            result.setTitle(text.trim());
                        }
                        text = "";
                    }

                    title = ((Element) node).text().trim();
                } else {
                    if (((Element) node).tagName().equals("a")
                            && (((Element) node).text().trim().contains("hier klicken")
                                    || title.equals("Link:"))) {
                        text = text + node.attr("href");
                    } else {
                        text = text + ((Element) node).text();
                    }
                }
            } else if (node instanceof TextNode) {
                text = text + ((TextNode) node).text();
            }
        }
    } else {
        // Alternative layout: a two-column ".fulltitle" table.
        if (doc2.select("#tab-content .fulltitle tr").size() > 0) {
            Elements rows = doc2.select("#tab-content .fulltitle tr");
            for (Element tr : rows) {
                if (tr.children().size() == 2) {
                    Element valcell = tr.child(1);
                    String value = valcell.text().trim();
                    if (valcell.select("a").size() == 1) {
                        value = valcell.select("a").first().absUrl("href");
                    }
                    result.addDetail(new Detail(tr.child(0).text().trim(), value));
                }
            }
        } else {
            result.addDetail(new Detail(stringProvider.getString(StringProvider.ERROR),
                    stringProvider.getString(StringProvider.COULD_NOT_LOAD_DETAIL)));
        }
    }
    // Emit the last pending label/value pair.
    if (!text.equals("") && !title.equals("")) {
        result.addDetail(new Detail(title.trim(), text.trim()));
        if (title.equals("Titel:")) {
            result.setTitle(text.trim());
        }
    }
    for (Element link : doc3.select("#tab-content a")) {
        Map<String, String> hrefq = getQueryParamsFirst(link.absUrl("href"));
        if (result.getId() == null) {
            // ID retrieval
            String key = hrefq.get("katkey");
            if (key != null) {
                result.setId(key);
                break;
            }
        }
    }
    for (Element link : doc3.select(".box-container a")) {
        if (link.text().trim().equals("Download")) {
            result.addDetail(
                    new Detail(stringProvider.getString(StringProvider.DOWNLOAD), link.absUrl("href")));
        }
    }

    Map<String, Integer> copy_columnmap = new HashMap<>();
    // Default values
    copy_columnmap.put("barcode", 1);
    copy_columnmap.put("branch", 3);
    copy_columnmap.put("status", 4);
    // Column positions are overridden by the header row (tr#bg2) when it is present.
    Elements copy_columns = doc.select("#tab-content .data tr#bg2 th");
    for (int i = 0; i < copy_columns.size(); i++) {
        Element th = copy_columns.get(i);
        String head = th.text().trim();
        if (head.contains("Status")) {
            copy_columnmap.put("status", i);
        }
        if (head.contains("Zweigstelle")) {
            copy_columnmap.put("branch", i);
        }
        if (head.contains("Mediennummer")) {
            copy_columnmap.put("barcode", i);
        }
        if (head.contains("Standort")) {
            copy_columnmap.put("location", i);
        }
        if (head.contains("Signatur")) {
            copy_columnmap.put("signature", i);
        }
    }

    // Groups: 1 = status word, 2 = return date, 3 = number of reservations.
    Pattern status_lent = Pattern.compile(
            "^(entliehen) bis ([0-9]{1,2}.[0-9]{1,2}.[0-9]{2," + "4}) \\(gesamte Vormerkungen: ([0-9]+)\\)$");
    // Used when status and barcode share one column: 1 = status, 2 = barcode.
    Pattern status_and_barcode = Pattern.compile("^(.*) ([0-9A-Za-z]+)$");

    Elements exemplartrs = doc.select("#tab-content .data tr").not("#bg2");
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    for (Element tr : exemplartrs) {
        // Any exception while reading a row is printed and the row is skipped.
        try {
            Copy copy = new Copy();

            Element status = tr.child(copy_columnmap.get("status"));
            Element barcode = tr.child(copy_columnmap.get("barcode"));
            String barcodetext = barcode.text().trim().replace(" Wegweiser", "");

            // STATUS
            String statustext;
            if (status.getElementsByTag("b").size() > 0) {
                statustext = status.getElementsByTag("b").text().trim();
            } else {
                statustext = status.text().trim();
            }
            if (copy_columnmap.get("status").equals(copy_columnmap.get("barcode"))) {
                Matcher matcher1 = status_and_barcode.matcher(statustext);
                if (matcher1.matches()) {
                    statustext = matcher1.group(1);
                    barcodetext = matcher1.group(2);
                }
            }

            Matcher matcher = status_lent.matcher(statustext);
            if (matcher.matches()) {
                copy.setStatus(matcher.group(1));
                copy.setReservations(matcher.group(3));
                copy.setReturnDate(fmt.parseLocalDate(matcher.group(2)));
            } else {
                copy.setStatus(statustext);
            }
            copy.setBarcode(barcodetext);
            if (status.select("a[href*=doVormerkung]").size() == 1) {
                copy.setResInfo(status.select("a[href*=doVormerkung]").attr("href").split("\\?")[1]);
            }

            String branchtext = tr.child(copy_columnmap.get("branch")).text().trim().replace(" Wegweiser", "");
            copy.setBranch(branchtext);

            if (copy_columnmap.containsKey("location")) {
                copy.setLocation(
                        tr.child(copy_columnmap.get("location")).text().trim().replace(" Wegweiser", ""));
            }

            if (copy_columnmap.containsKey("signature")) {
                copy.setShelfmark(
                        tr.child(copy_columnmap.get("signature")).text().trim().replace(" Wegweiser", ""));
            }

            result.addCopy(copy);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    try {
        // Look for a volumeSearch link; the scan stops at the first link that carries it. catKey and
        // dbIdentifier values seen up to and including that link are collected into the same map.
        Element isvolume = null;
        Map<String, String> volume = new HashMap<>();
        Elements links = doc.select(".data td a");
        int elcount = links.size();
        for (int eli = 0; eli < elcount; eli++) {
            List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(links.get(eli).attr("href")), "UTF-8");
            for (NameValuePair nv : anyurl) {
                if (nv.getName().equals("methodToCall") && nv.getValue().equals("volumeSearch")) {
                    isvolume = links.get(eli);
                } else if (nv.getName().equals("catKey")) {
                    volume.put("catKey", nv.getValue());
                } else if (nv.getName().equals("dbIdentifier")) {
                    volume.put("dbIdentifier", nv.getValue());
                }
            }
            if (isvolume != null) {
                volume.put("volume", "true");
                result.setVolumesearch(volume);
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return result;
}
From source file:ExtractorContentTest.java
private List<Header> collectHeaders(Element table) { List<Header> headers = new ArrayList<Header>(); List<Header> headersWithNestedHeaders = new ArrayList<Header>(); List<List<Header>> nestedHeaders = new ArrayList<List<Header>>(); int levelHeader = 0; // FIXME nested header > 1 for (Element row : table.select("tr")) { if (isEmpty(row)) // sometimes the first row, especially in sortable table, is empty (the second row is relevant for headers) continue; if (levelHeader == 0) { for (Element header : row.select("th")) { String hName = header.text(); Header headerV = new Header(hName); Elements colspan = header.getElementsByAttribute("colspan"); if (!colspan.isEmpty()) { headersWithNestedHeaders.add(headerV); int v = Integer.parseInt(colspan.first().attr("colspan")); headerV.setNumbersOfNestedHeaders(v); }//from ww w .ja v a2 s. c o m headers.add(headerV); } levelHeader++; } else if (levelHeader == 1) { // nested header List<Header> nHeaders = new ArrayList<Header>(); for (Element header : row.select("th")) { String hName = header.text(); Header headerV = new Header(hName); nHeaders.add(headerV); } nestedHeaders.add(nHeaders); levelHeader++; } } // FIXME table.select("thead"); // FIXME assign a "number" of appearance for headers // especially important for nested headers (colspan="3") List<Header> rHeaders = new ArrayList<Header>(); List<Header> nHeaders = new ArrayList<Header>(); if (nestedHeaders.size() > 0) nHeaders = nestedHeaders.get(0); // FIXME 0 at the moment but normally it can be refined int lastIndex = 0; for (Header header : headers) { // nested if (headersWithNestedHeaders.contains(header)) { // header has nested headers int nNestedHeaders = header.getNumbersOfNestedHeaders(); // number of hested headers // now associating an header to nested headers // nHeaders[lastIndex...lastIndex+nNestedHeaders] int v = 0; int u = 0; for (Header nH : nHeaders) { if (u++ < lastIndex) continue; rHeaders.add(nH); if (v < nNestedHeaders) { header.addNestedHeader(nH); 
nH.addParentHeader(header); v++; } } lastIndex += nNestedHeaders; } else { rHeaders.add(header); } } //System.err.println("rHeaders=" + rHeaders); return rHeaders; }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
static List<LentItem> parseMediaList(AccountData res, Document doc, JSONObject data) throws JSONException { List<LentItem> media = new ArrayList<>(); if (doc == null) { return media; }/*from w ww . java 2 s .c o m*/ // parse result list JSONObject copymap = data.getJSONObject("accounttable"); Pattern expire = Pattern.compile("Ausweisg.ltigkeit: ([0-9.]+)"); Pattern fees = Pattern.compile("([0-9,.]+) ."); for (Element td : doc.select(".td01x09n")) { String text = td.text().trim(); if (expire.matcher(text).matches()) { res.setValidUntil(expire.matcher(text).replaceAll("$1")); } else if (fees.matcher(text).matches()) { res.setPendingFees(text); } } DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); Elements rowElements = doc.select("form[name=medkl] table tr"); // rows: skip 1st row -> title row for (int i = 1; i < rowElements.size(); i++) { Element tr = rowElements.get(i); if (tr.child(0).tagName().equals("th")) { continue; } LentItem item = new LentItem(); Pattern itemIdPat = Pattern.compile("javascript:smAcc\\('[a-z]+','[a-z]+','([A-Za-z0-9]+)'\\)"); // columns: all elements of one media Iterator<?> keys = copymap.keys(); while (keys.hasNext()) { String key = (String) keys.next(); int index; try { index = copymap.has(key) ? 
copymap.getInt(key) : -1; } catch (JSONException e1) { index = -1; } if (index >= 0) { String value = tr.child(index).text().trim().replace("\u00A0", ""); switch (key) { case "author": value = findTitleAndAuthor(value)[1]; break; case "title": value = findTitleAndAuthor(value)[0]; break; case "returndate": try { value = fmt.parseLocalDate(value).toString(); } catch (IllegalArgumentException e1) { e1.printStackTrace(); } break; } if (tr.child(index).select("a").size() == 1) { Matcher matcher = itemIdPat.matcher(tr.child(index).select("a").attr("href")); if (matcher.find()) item.setId(matcher.group(1)); } if (value != null && value.length() != 0) item.set(key, value); } } if (tr.select("input[type=checkbox][value=YES]").size() > 0) { item.setProlongData(tr.select("input[type=checkbox][value=YES]").attr("name")); } media.add(item); } return media; }