List of usage examples for org.jsoup.select Elements first
public Element first()
From source file:com.jimplush.goose.ContentExtractor.java
/**
 * Resolves the canonical link of an article.
 *
 * <p>Looks for a {@code <link rel="canonical">} element in the document. When at least one
 * exists, the trimmed {@code href} of the first one is returned, or {@code string.empty} when
 * that attribute is null or empty. When the document declares no canonical link,
 * {@code baseUrl} is returned unchanged.
 *
 * @param doc     parsed article document
 * @param baseUrl value to fall back on when no canonical link element is present
 * @return the canonical href, {@code string.empty}, or {@code baseUrl} as described above
 */
private String getCanonicalLink(Document doc, String baseUrl) {
    Elements canonicalLinks = doc.select("link[rel=canonical]");
    if (canonicalLinks.isEmpty()) {
        return baseUrl;
    }

    String canonicalHref = canonicalLinks.first().attr("href");
    if (string.isNullOrEmpty(canonicalHref)) {
        return string.empty;
    }
    return canonicalHref.trim();
}
From source file:hu.tbognar76.apking.ApKing.java
public GoogleCategory getCategoryFromGooglePlayStore(String packageName) { GoogleCategory cc = new GoogleCategory(); cc.cat1 = "Unknown"; cc.cat2 = "Unknown"; String url = "https://play.google.com/store/apps/details?id=" + URI.create(packageName) + "&hl=en"; Document doc = null;/* ww w .j a v a 2 s. c om*/ try { doc = Jsoup.connect(url).get(); } catch (IOException e) { // TODO Auto-generated catch block // e.printStackTrace(); System.out.println("!! GooglePlay connect error with : " + url); return cc; } // <span itemprop="genre">letstlus</span> /* * Elements link = doc.select(".document-subtitle category"); String * linkHref = link.attr("href"); // "http://example.com/" String * linkText = link.text(); // "example"" */ Elements genres = doc.select("a[itemprop=genre]"); if (genres != null) { Element e = genres.first(); if (e != null) { cc.cat2 = e.text(); String hr = e.attr("href"); if (hr.indexOf("category/GAME") != -1 || hr.indexOf("category/FAMILY") != -1) { cc.cat1 = "Game"; } else { cc.cat1 = "Application"; } } else { System.out.println("!! GooglePlay parse error structure with : " + url); } } else { System.out.println("!! 
GooglePlay parse error with : " + url); } /* * for (Element e : genres) { // System.out.println(e.text()); if * (!out.equals("")) { out = out + " "; } out = out + e.text(); * * } */ // <div class="content" itemprop="softwareVersion"> 2.6.9.0 </div> // Elements versions = doc.select("div[itemprop=softwareVersion]"); // System.out.println(versions.first().text()); // <a class="document-subtitle category" // href="/store/apps/category/GAME_ADVENTURE"> <span // itemprop="genre">Kalandjtkok</span> </a> /* Elements maincat = doc.getElementsByClass("category"); if (maincat != null) { Element p = maincat.first(); if (p != null) { String href = maincat.attr("href"); if (href != null) { if ((href.lastIndexOf("GAME") != -1) || (href.lastIndexOf("FAMILY") != -1)) { cc.cat1 = "Game"; } else { cc.cat1 = "Application"; } } // cc.cat1 = maincat.attr("href"); } } */ // <img alt="PEGI 3" class="document-subtitle content-rating-badge" // src="//lpfw=h28"> // <span class="document-subtitle content-rating-title">PEGI 3</span> Elements pegi = doc.getElementsByClass("content-rating-title"); if (pegi != null) { Element p = pegi.first(); if (p != null) { // cc.cat1 = p.text(); } } return cc; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * attemps to grab titles from the html pages, lots of sites use different delimiters * for titles so we'll try and do our best guess. * * * @param doc/*from w ww . jav a 2s. co m*/ * @return */ private String getTitle(Document doc) { String title = string.empty; try { Elements titleElem = doc.getElementsByTag("title"); if (titleElem == null || titleElem.isEmpty()) return string.empty; String titleText = titleElem.first().text(); if (string.isNullOrEmpty(titleText)) return string.empty; boolean usedDelimeter = false; if (titleText.contains("|")) { titleText = doTitleSplits(titleText, PIPE_SPLITTER); usedDelimeter = true; } if (!usedDelimeter && titleText.contains("-")) { titleText = doTitleSplits(titleText, DASH_SPLITTER); usedDelimeter = true; } if (!usedDelimeter && titleText.contains("")) { titleText = doTitleSplits(titleText, ARROWS_SPLITTER); usedDelimeter = true; } if (!usedDelimeter && titleText.contains(":")) { titleText = doTitleSplits(titleText, COLON_SPLITTER); } // encode unicode charz title = StringEscapeUtils.escapeHtml(titleText); // todo this is a hack until I can fix this.. weird motely crue error with // http://money.cnn.com/2010/10/25/news/companies/motley_crue_bp.fortune/index.htm?section=money_latest title = MOTLEY_REPLACEMENT.replaceAll(title); if (logger.isDebugEnabled()) { logger.debug("Page title is: " + title); } } catch (NullPointerException e) { logger.error(e.toString()); } return title; }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
protected SearchRequestResult parse_search(String html, int page) throws OpacErrorException, NotReachableException { Document doc = Jsoup.parse(html); if (doc.select("h4").size() > 0) { if (doc.select("h4").text().trim().startsWith("0 gefundene Medien")) { // nothing found return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1); } else if (!doc.select("h4").text().trim().contains("gefundene Medien") && !doc.select("h4").text().trim().contains("Es wurden mehr als")) { // error throw new OpacErrorException(doc.select("h4").text().trim()); }/*ww w .ja v a2 s. com*/ } else if (doc.select("h1").size() > 0) { if (doc.select("h1").text().trim().contains("RUNTIME ERROR")) { // Server Error throw new NotReachableException("IOPAC RUNTIME ERROR"); } else { throw new OpacErrorException(stringProvider.getFormattedString( StringProvider.UNKNOWN_ERROR_WITH_DESCRIPTION, doc.select("h1").text().trim())); } } else { return null; } updateRechnr(doc); reusehtml = html; results_total = -1; if (doc.select("h4").text().trim().contains("Es wurden mehr als")) { results_total = 200; } else { String resultnumstr = doc.select("h4").first().text(); resultnumstr = resultnumstr.substring(0, resultnumstr.indexOf(" ")).trim(); results_total = Integer.parseInt(resultnumstr); } List<SearchResult> results = new ArrayList<>(); Elements tables = doc.select("table").first().select("tr:has(td)"); Map<String, Integer> colmap = new HashMap<>(); Element thead = doc.select("table").first().select("tr:has(th)").first(); int j = 0; for (Element th : thead.select("th")) { String text = th.text().trim().toLowerCase(Locale.GERMAN); if (text.contains("cover")) { colmap.put("cover", j); } else if (text.contains("titel")) { colmap.put("title", j); } else if (text.contains("verfasser")) { colmap.put("author", j); } else if (text.contains("mtyp")) { colmap.put("category", j); } else if (text.contains("jahr")) { colmap.put("year", j); } else if (text.contains("signatur")) { colmap.put("shelfmark", j); } 
else if (text.contains("info")) { colmap.put("info", j); } else if (text.contains("abteilung")) { colmap.put("department", j); } else if (text.contains("verliehen") || text.contains("verl.")) { colmap.put("returndate", j); } else if (text.contains("anz.res")) { colmap.put("reservations", j); } j++; } if (colmap.size() == 0) { colmap.put("cover", 0); colmap.put("title", 1); colmap.put("author", 2); colmap.put("publisher", 3); colmap.put("year", 4); colmap.put("department", 5); colmap.put("shelfmark", 6); colmap.put("returndate", 7); colmap.put("category", 8); } for (int i = 0; i < tables.size(); i++) { Element tr = tables.get(i); SearchResult sr = new SearchResult(); if (tr.select("td").get(colmap.get("cover")).select("img").size() > 0) { String imgUrl = tr.select("td").get(colmap.get("cover")).select("img").first().attr("src"); sr.setCover(imgUrl); } // Media Type if (colmap.get("category") != null) { String mType = tr.select("td").get(colmap.get("category")).text().trim().replace("\u00a0", ""); if (data.has("mediatypes")) { try { sr.setType(MediaType.valueOf( data.getJSONObject("mediatypes").getString(mType.toLowerCase(Locale.GERMAN)))); } catch (JSONException | IllegalArgumentException e) { sr.setType(defaulttypes.get(mType.toLowerCase(Locale.GERMAN))); } } else { sr.setType(defaulttypes.get(mType.toLowerCase(Locale.GERMAN))); } } // Title and additional info String title; String additionalInfo = ""; if (colmap.get("info") != null) { Element info = tr.select("td").get(colmap.get("info")); title = info.select("a[title=Details-Info]").text().trim(); String authorIn = info.text().substring(0, info.text().indexOf(title)); if (authorIn.contains(":")) { authorIn = authorIn.replaceFirst("^([^:]*):(.*)$", "$1"); additionalInfo += " - " + authorIn; } } else { title = tr.select("td").get(colmap.get("title")).text().trim().replace("\u00a0", ""); if (title.contains("(") && title.indexOf("(") > 0) { additionalInfo += title.substring(title.indexOf("(")); title = 
title.substring(0, title.indexOf("(") - 1).trim(); } // Author if (colmap.containsKey("author")) { String author = tr.select("td").get(colmap.get("author")).text().trim().replace("\u00a0", ""); additionalInfo += " - " + author; } } // Publisher if (colmap.containsKey("publisher")) { String publisher = tr.select("td").get(colmap.get("publisher")).text().trim().replace("\u00a0", ""); additionalInfo += " (" + publisher; } // Year if (colmap.containsKey("year")) { String year = tr.select("td").get(colmap.get("year")).text().trim().replace("\u00a0", ""); additionalInfo += ", " + year + ")"; } sr.setInnerhtml("<b>" + title + "</b><br>" + additionalInfo); // Status String status = tr.select("td").get(colmap.get("returndate")).text().trim().replace("\u00a0", ""); SimpleDateFormat df = new SimpleDateFormat("dd.MM.yyyy", Locale.GERMAN); try { df.parse(status); // this is a return date sr.setStatus(Status.RED); sr.setInnerhtml(sr.getInnerhtml() + "<br><i>" + stringProvider.getString(StringProvider.LENT_UNTIL) + " " + status + "</i>"); } catch (ParseException e) { // this is a different status text String lc = status.toLowerCase(Locale.GERMAN); if ((lc.equals("") || lc.toLowerCase(Locale.GERMAN).contains("onleihe") || lc.contains("verleihbar") || lc.contains("entleihbar") || lc.contains("ausleihbar")) && !lc.contains("nicht")) { sr.setStatus(Status.GREEN); } else { sr.setStatus(Status.YELLOW); sr.setInnerhtml(sr.getInnerhtml() + "<br><i>" + status + "</i>"); } } // In some libraries (for example search for "atelier" in Preetz) // the results are sorted differently than their numbers suggest, so // we need to detect the number ("recno") from the link String link = tr.select("a[href^=/cgi-bin/di.exe?page=]").attr("href"); Map<String, String> params = getQueryParamsFirst(link); if (params.containsKey("recno")) { int recno = Integer.valueOf(params.get("recno")); sr.setNr(recno - 1); } else { // the above should work, but fall back to this if it doesn't sr.setNr(10 * (page - 1) + 
i); } // In some libraries (for example Preetz) we can detect the media ID // here using another link present in the search results Elements idLinks = tr.select("a[href^=/cgi-bin/di.exe?cMedNr]"); if (idLinks.size() > 0) { Map<String, String> idParams = getQueryParamsFirst(idLinks.first().attr("href")); String id = idParams.get("cMedNr"); sr.setId(id); } else { sr.setId(null); } results.add(sr); } return new SearchRequestResult(results, results_total, page); }
From source file:com.vaushell.shaarlijavaapi.ShaarliClient.java
/**
 * Extracts a piece of text from {@code source} according to a named template.
 *
 * <p>The template may narrow the element down with a CSS path (first match wins), pick an
 * attribute instead of the element text, and finally keep only the first match of a regular
 * expression. If the regular expression does not match, the trimmed content is kept as is.
 *
 * @param source       element to extract from; must not be {@code null}
 * @param templateName key of the template in {@code templates}
 * @return the trimmed extracted text, or {@code null} when the CSS path matches nothing or
 *         the resulting text is empty
 * @throws IllegalArgumentException when {@code source} is {@code null} or no template is
 *                                  registered under {@code templateName}
 */
private String extract(final Element source, final String templateName) {
    if (source == null) {
        throw new IllegalArgumentException();
    }

    final ShaarliTemplates.Template template = templates.get(templateName);
    if (template == null) {
        throw new IllegalArgumentException("template '" + templateName + "' not found");
    }

    // Narrow down to the first element matching the CSS path, if the template has one.
    Element target = source;
    if (!template.cssPath.isEmpty()) {
        final Elements matches = source.select(template.cssPath);
        if (matches.isEmpty()) {
            return null;
        }
        target = matches.first();
    }

    final String raw = template.attribut.isEmpty() ? target.text() : target.attr(template.attribut);
    if (raw == null) {
        return null;
    }

    String content = raw.trim();
    if (!template.regex.isEmpty()) {
        final Matcher matcher = Pattern.compile(template.regex).matcher(content);
        if (matcher.find()) {
            content = matcher.group().trim();
        }
    }

    return content.isEmpty() ? null : content;
}
From source file:ExtractorContentTest.java
private void treatSection(Element section, List<Catalog> catalogs) { // 1. get section name // FIXME what is it does not exist? // FIXME can be "h3" Elements sect2 = section.getElementsByTag("h2"); String s2 = null;/*w ww .ja va 2 s. c o m*/ if (!sect2.isEmpty()) s2 = sect2.first().text(); // FIXME what about more than 1 ? String s3 = null; Elements sect3 = section.getElementsByTag("h3"); if (!sect3.isEmpty()) s3 = sect3.first().text(); String dt = null; Elements sectDT = section.getElementsByTag("p"); if (!sectDT.isEmpty()) { String contentDT = sectDT.first().text(); if (contentDT.startsWith(";")) dt = contentDT.replaceAll(";", ""); } // FIXME can be subsection // FIXME (1. optional step) some comments // 2. retrieve tabular Elements tables = section.getElementsByTag("table"); //if (!tables.isEmpty()) //System.err.println("\n****** " + s2 + " " + s3 + " *******\n"); for (Element table : tables) { // (0. optional step) act as subviewname Elements caption = table.select("caption"); String captionName = null; if (!caption.isEmpty()) captionName = caption.first().text(); /*** * Headers */ // List<Header> rHeaders = collectHeaders(table); boolean sortable = !table.select("[class=sortable wikitable]").isEmpty() || !table.select("[class=wikitable sortable]").isEmpty(); // FIXME: other cases Elements heads = table.select("thead"); if (sortable && (!heads.isEmpty())) { rHeaders = collectHeaders(heads.first()); } // 2 treat row Catalog product = null; Tree<String> structuralInformation = mkStructuralInformation(s2, s3, dt, captionName); if (sortable) { product = treatRows(table.select("tbody").first(), structuralInformation, rHeaders, sortable); } else product = treatRows(table, structuralInformation, rHeaders, sortable); catalogs.add(product); // } // set the "ID" / names // clean up for (Catalog catalog : catalogs) { for (Product p : catalog) { Header primaryHeader = p.getHeaders().get(0); p.setName(p.getValue(primaryHeader.getName())); } } }
From source file:ExtractorContentTest.java
private void treatTable(Element table, List<Catalog> catalogs) { // 1. get section name Elements sect2 = table.parents().select("h2"); // section.getElementsByTag("h2") ; String s2 = null;//from w ww.j av a 2s.c o m if (!sect2.isEmpty()) s2 = sect2.first().text(); // FIXME what about more than 1 ? String s3 = null; Elements sect3 = table.parents().select("h3"); if (!sect3.isEmpty()) s3 = sect3.first().text(); String dt = null; Elements sectDT = table.parents().select("p"); if (!sectDT.isEmpty()) { String contentDT = sectDT.first().text(); if (contentDT.startsWith(";")) dt = contentDT.replaceAll(";", ""); } Elements caption = table.select("caption"); String captionName = null; if (!caption.isEmpty()) captionName = caption.first().text(); // FIXME other forms of structural information /*** * Headers */ // List<Header> rHeaders = collectHeaders(table); boolean sortable = !table.select("[class=sortable wikitable]").isEmpty() || !table.select("[class=wikitable sortable]").isEmpty(); // || !table.select("[class=sortable wikitable jquery-tablesorter]").isEmpty() ; // FIXME: other cases Elements heads = table.select("thead"); if (sortable && (!heads.isEmpty())) { rHeaders = collectHeaders(heads.first()); } System.err.println("SORTABLE:" + sortable + " rHeaders=" + rHeaders); // 2 treat row Catalog product = null; Tree<String> structuralInformation = mkStructuralInformation(s2, s3, dt, captionName); if (sortable) { product = treatRows(table.select("tbody").first(), structuralInformation, rHeaders, sortable); } else product = treatRows(table, structuralInformation, rHeaders, sortable); catalogs.add(product); // // set the "ID" / names // clean up for (Catalog catalog : catalogs) { List<Product> toRemove = new ArrayList<Product>(); for (Product p : catalog) { Header primaryHeader = p.getHeaders().get(0); p.setName(p.getValue(primaryHeader.getName())); // some products are headers (each value equals to header name) List<Header> headers = p.getHeaders(); boolean isHeader = 
true; for (Header header : headers) { String hName = header.getName(); String pValue = p.getValue(hName); if (pValue == null) continue; if (!hName.contains(pValue)) { isHeader = false; } } if (isHeader) { toRemove.add(p); } } if (!toRemove.isEmpty() && !catalog.isEmpty()) catalog.removeAll(toRemove); } }
From source file:ExtractorContentTest.java
private List<Header> collectHeaders(Element table) { List<Header> headers = new ArrayList<Header>(); List<Header> headersWithNestedHeaders = new ArrayList<Header>(); List<List<Header>> nestedHeaders = new ArrayList<List<Header>>(); int levelHeader = 0; // FIXME nested header > 1 for (Element row : table.select("tr")) { if (isEmpty(row)) // sometimes the first row, especially in sortable table, is empty (the second row is relevant for headers) continue; if (levelHeader == 0) { for (Element header : row.select("th")) { String hName = header.text(); Header headerV = new Header(hName); Elements colspan = header.getElementsByAttribute("colspan"); if (!colspan.isEmpty()) { headersWithNestedHeaders.add(headerV); int v = Integer.parseInt(colspan.first().attr("colspan")); headerV.setNumbersOfNestedHeaders(v); }/*from w w w. ja v a 2s .c om*/ headers.add(headerV); } levelHeader++; } else if (levelHeader == 1) { // nested header List<Header> nHeaders = new ArrayList<Header>(); for (Element header : row.select("th")) { String hName = header.text(); Header headerV = new Header(hName); nHeaders.add(headerV); } nestedHeaders.add(nHeaders); levelHeader++; } } // FIXME table.select("thead"); // FIXME assign a "number" of appearance for headers // especially important for nested headers (colspan="3") List<Header> rHeaders = new ArrayList<Header>(); List<Header> nHeaders = new ArrayList<Header>(); if (nestedHeaders.size() > 0) nHeaders = nestedHeaders.get(0); // FIXME 0 at the moment but normally it can be refined int lastIndex = 0; for (Header header : headers) { // nested if (headersWithNestedHeaders.contains(header)) { // header has nested headers int nNestedHeaders = header.getNumbersOfNestedHeaders(); // number of hested headers // now associating an header to nested headers // nHeaders[lastIndex...lastIndex+nNestedHeaders] int v = 0; int u = 0; for (Header nH : nHeaders) { if (u++ < lastIndex) continue; rHeaders.add(nH); if (v < nNestedHeaders) { header.addNestedHeader(nH); 
nH.addParentHeader(header); v++; } } lastIndex += nNestedHeaders; } else { rHeaders.add(header); } } //System.err.println("rHeaders=" + rHeaders); return rHeaders; }
From source file:cc.metapro.openct.grades.GradePresenter.java
@Override public void loadCETGrade(final Map<String, String> queryMap) { Observable.create(new ObservableOnSubscribe<Map<String, String>>() { @Override//from w w w .jav a2s .c o m public void subscribe(ObservableEmitter<Map<String, String>> e) throws Exception { CETService service = ServiceCenter.createCETService(); String queryResult = service.queryCET(mContext.getString(R.string.url_chsi_referer), queryMap.get(mContext.getString(R.string.key_ticket_num)), queryMap.get(mContext.getString(R.string.key_full_name)), "t").execute().body(); Document document = Jsoup.parse(queryResult); Elements elements = document.select("table[class=cetTable]"); Element targetTable = elements.first(); Elements tds = targetTable.getElementsByTag("td"); String name = tds.get(0).text(); String school = tds.get(1).text(); String type = tds.get(2).text(); String num = tds.get(3).text(); String time = tds.get(4).text(); String grade = tds.get(5).text(); Map<String, String> results = new HashMap<>(6); results.put(mContext.getString(R.string.key_full_name), name); results.put(mContext.getString(R.string.key_school), school); results.put(mContext.getString(R.string.key_cet_type), type); results.put(mContext.getString(R.string.key_ticket_num), num); results.put(mContext.getString(R.string.key_cet_time), time); results.put(mContext.getString(R.string.key_cet_grade), grade); e.onNext(results); } }).subscribeOn(Schedulers.newThread()).observeOn(AndroidSchedulers.mainThread()) .doOnNext(new Consumer<Map<String, String>>() { @Override public void accept(Map<String, String> stringMap) throws Exception { mView.onLoadCETGrade(stringMap); } }).onErrorReturn(new Function<Throwable, Map<String, String>>() { @Override public Map<String, String> apply(Throwable throwable) throws Exception { Toast.makeText(mContext, R.string.fetch_cet_fail, Toast.LENGTH_SHORT).show(); return new HashMap<>(); } }).subscribe(); }