List of usage examples for the attr method of org.jsoup.nodes.Element
public String attr(String attributeKey)
From source file:com.jimplush.goose.ContentExtractor.java
/**
 * Reads the {@code gravityScore} attribute of a node as an integer.
 *
 * @param node the element to inspect; may be {@code null}
 * @return the parsed score, or 0 when the node is {@code null}, the attribute is
 *         null/empty, or its value is not a valid integer
 */
private int getScore(Element node) {
    if (node == null) {
        return 0;
    }
    int score = 0;
    try {
        String rawScore = node.attr("gravityScore");
        if (!string.isNullOrEmpty(rawScore)) {
            score = Integer.parseInt(rawScore);
        }
    } catch (NumberFormatException ignored) {
        // Non-numeric attribute value: keep the default score of 0.
    }
    return score;
}
From source file:com.jimplush.goose.ContentExtractor.java
private String debugNode(Element e) { StringBuilder sb = new StringBuilder(); sb.append("GravityScore: '"); sb.append(e.attr("gravityScore")); sb.append("' paraNodeCount: '"); sb.append(e.attr("gravityNodes")); sb.append("' nodeId: '"); sb.append(e.id());//from w ww.java 2s .c o m sb.append("' className: '"); sb.append(e.attr("class")); return sb.toString(); }
From source file:info.dolezel.fatrat.plugins.UloztoDownload.java
@Override public void processLink(String link) { //if (link.contains("/live/")) // link = link.replace("/live/", "/"); if (link.startsWith("http://uloz.to") || link.startsWith("https://uloz.to")) link = link.replace("https?://uloz.to", "https://www.uloz.to"); if (link.startsWith("http://m.uloz.to") || link.startsWith("https://m.uloz.to")) link = link.replace("https?://m.uloz.to", "https://www.uloz.to"); if (!logIn(link)) return;//w w w . ja v a 2s . c o m final String downloadLink = link; // I can't make 'link' final fetchPage(link, new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { try { if (headers.containsKey("location")) { String location = headers.get("location"); if (location.contains("smazano") || location.contains("nenalezeno")) setFailed("The file has been removed"); else processLink(location); return; } CharBuffer cb = charsetUtf8.decode(buf); if (cb.toString().contains("?disclaimer=1")) { processLink(downloadLink + "?disclaimer=1"); return; } final Document doc = Jsoup.parse(cb.toString()); final Element freeForm = doc.getElementById("frm-download-freeDownloadTab-freeDownloadForm"); final Element premiumLink = doc.getElementById("#quickDownloadButton"); boolean usePremium = usePremium(downloadLink); if (cb.toString().contains("Nem dostatek kreditu")) setMessage("Credit depleted, using FREE download"); else if (usePremium && premiumLink != null) { String msg = "Using premium download"; Elements aCredits = doc.getElementsByAttributeValue("href", "/kredit"); if (!aCredits.isEmpty()) msg += " (" + aCredits.get(0).ownText() + " left)"; setMessage(msg); startDownload("http://www.uloz.to" + premiumLink.attr("href")); return; } else if (loggedIn) setMessage("Login failed, using FREE download"); Elements aNames = doc.getElementsByClass("jsShowDownload"); if (!aNames.isEmpty()) reportFileName(aNames.get(0).ownText()); final PostQuery pq = new PostQuery(); final Map<String, String> hdr = new HashMap<String, 
String>(); Elements eHiddens = freeForm.select("input[type=hidden]"); hdr.put("X-Requested-With", "XMLHttpRequest"); hdr.put("Referer", downloadLink); hdr.put("Accept", "application/json, text/javascript, */*; q=0.01"); for (Element e : eHiddens) pq.add(e.attr("name"), e.attr("value")); fetchPage("https://uloz.to/reloadXapca.php?rnd=" + Math.abs(new Random().nextInt()), new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { CharBuffer cb = charsetUtf8.decode(buf); String captchaUrl; try { JSONObject json = new JSONObject(cb.toString()); captchaUrl = "https:" + json.getString("image"); pq.add("hash", json.getString("hash")); pq.add("timestamp", "" + json.getInt("timestamp")); pq.add("salt", "" + json.getInt("salt")); } catch (JSONException e) { setFailed("Error parsing captcha JSON"); return; } solveCaptcha(captchaUrl, new CaptchaListener() { @Override public void onFailed() { setFailed("Failed to decode the captcha code"); } @Override public void onSolved(String text) { String action = freeForm.attr("action"); pq.add("captcha_value", text); fetchPage("https://www.uloz.to" + action, new PageFetchListener() { @Override public void onCompleted(ByteBuffer buf, Map<String, String> headers) { try { CharBuffer cb = charsetUtf8.decode(buf); JSONObject obj = new JSONObject(cb.toString()); startDownload(obj.getString("url")); } catch (Exception e) { setFailed("" + e); } } @Override public void onFailed(String error) { setFailed(error); } }, pq.toString(), hdr); } }); } @Override public void onFailed(String error) { setFailed("Failed to load captcha AJAX page"); } }); } catch (Exception e) { e.printStackTrace(); setFailed(e.toString()); } } @Override public void onFailed(String error) { setFailed("Failed to load the initial page"); } }, null); }
From source file:codeu.chat.client.commandline.Chat.java
private List<String> findScript(String url) { List<String> elemLinks = new ArrayList<String>(); try {//from ww w . j a v a 2 s .c o m Document doc = Jsoup.connect(url).get(); // Make the request String elemLink, elemText; // Parse the search results Elements links = doc.select("a[href]"); for (Element link : links) { elemLink = link.attr("href"); elemText = link.text(); /* Check if any scripts for a movie in this Google search were found. If so, add them to the links list */ if ((elemLink.contains("script-o-rama") || elemLink.contains("springfieldspringfield")) && !(elemText.equals("Cached") || elemText.equals("Similar"))) { elemLinks.add(elemLink); } } } catch (IOException e) { e.printStackTrace(); } return elemLinks; // Return an empty string to indicate failure }
From source file:feedzilla.Feed.java
/**
 * Fills the fields of this feed item from the direct children of an entry element.
 *
 * <p>Children are handled by node name. Once a {@code source} child has been seen, any later
 * {@code title} and {@code link} children are stored in the {@code source_*} fields instead of
 * the item's own. Unrecognized node names are only logged.
 *
 * @param entry the element whose children describe the item
 */
private void parser(Element entry) {
    boolean afterSource = false;
    for (Element child : entry.children()) {
        String tag = child.nodeName();
        if (tag.equals("id")) {
            // The numeric id is the part after the first ':' of the text
            String[] idParts = child.text().split(":");
            this.id = Integer.parseInt(idParts[1]);
        } else if (tag.equals("title")) {
            if (afterSource) {
                this.source_title = child.text();
            } else {
                this.title = child.text();
            }
        } else if (tag.equals("summary")) {
            // Keep only the text in front of the first "<br"
            this.summary = child.text().split("<br")[0];
        } else if (tag.equals("published")) {
            this.published = child.text();
        } else if (tag.equals("updated")) {
            this.updated = child.text();
        } else if (tag.equals("author")) {
            this.author = child.text();
        } else if (tag.equals("link")) {
            if (afterSource) {
                this.source_link = child.attr("href");
            } else {
                this.link = child.attr("href");
            }
        } else if (tag.equals("rights")) {
            this.copyright = child.text();
        } else if (tag.equals("source")) {
            afterSource = true;
        } else {
            Log.debug("Unknow TAG: " + tag);
        }
    }
}
From source file:de.geeksfactory.opacclient.apis.IOpac.java
private SearchField createSearchField(Element descTd, Element inputTd) { String name = descTd.select("span, blockquote").text().replace(":", "").trim().replace("\u00a0", ""); if (inputTd.select("select").size() > 0 && !name.equals("Treffer/Seite") && !name.equals("Medientypen") && !name.equals("Medientyp") && !name.equals("Treffer pro Seite")) { Element select = inputTd.select("select").first(); DropdownSearchField field = new DropdownSearchField(); field.setDisplayName(name);// w w w . j a v a 2 s . c o m field.setId(select.attr("name")); for (Element option : select.select("option")) { field.addDropdownValue(option.attr("value"), option.text()); } return field; } else if (inputTd.select("input").size() > 0) { TextSearchField field = new TextSearchField(); Element input = inputTd.select("input").first(); field.setDisplayName(name); field.setId(input.attr("name")); field.setHint(""); return field; } else { return null; } }
From source file:org.shareok.data.sagedata.SageJournalIssueDateProcessor.java
public Map<String, Map<String, String>> updateSageJournalLinks(Map<String, Map<String, String>> journalMap) { Document doc = null;//w w w. j a va 2s. c om try { doc = Jsoup.connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199") .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); Elements trs = doc.select("form#browsePublicationsForm").get(0).select("table").get(0).select("tbody") .get(0).select("tr"); for (Element tr : trs) { Element link = tr.select("td").get(1).select("a").get(0); String journalName = link.text(); String journalLink = SageDataUtil.SAGE_HTTP_PREFIX + link.attr("href"); String[] linkInfo = journalLink.split("/"); String journalIssuesLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/" + linkInfo[linkInfo.length - 1]; if (null == journalMap.get(journalName)) { Map<String, String> infoMap = new HashMap<>(); infoMap.put("homeLink", journalLink); infoMap.put("issueLink", journalIssuesLink); journalMap.put(journalName, infoMap); } else { Map<String, String> infoMap = journalMap.get(journalName); if (null == infoMap.get("homeLink")) { infoMap.put("homeLink", journalLink); } if (null == infoMap.get("issueLink")) { infoMap.put("issueLink", journalIssuesLink); } } } } catch (Exception ex) { ex.printStackTrace(); } return journalMap; }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
protected SearchRequestResult parse_search(String html, int page) { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);//w w w .j a v a 2s . com Elements table = doc.select(".resulttab tr.result_trefferX, .resulttab tr.result_treffer"); List<SearchResult> results = new ArrayList<>(); for (int i = 0; i < table.size(); i++) { Element tr = table.get(i); SearchResult sr = new SearchResult(); int contentindex = 1; if (tr.select("td a img").size() > 0) { String[] fparts = tr.select("td a img").get(0).attr("src").split("/"); String fname = fparts[fparts.length - 1]; if (data.has("mediatypes")) { try { sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname))); } catch (JSONException | IllegalArgumentException e) { sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", ""))); } } else { sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", ""))); } } else { if (tr.children().size() == 3) { contentindex = 2; } } sr.setInnerhtml(tr.child(contentindex).child(0).html()); sr.setNr(i); Element link = tr.child(contentindex).select("a").first(); try { if (link != null && link.attr("href").contains("detmediennr")) { Map<String, String> params = getQueryParamsFirst(link.attr("abs:href")); String nr = params.get("detmediennr"); if (Integer.parseInt(nr) > i + 1) { // Seems to be an ID if (params.get("detDB") != null) { sr.setId("&detmediennr=" + nr + "&detDB=" + params.get("detDB")); } else { sr.setId("&detmediennr=" + nr); } } } } catch (Exception e) { } try { if (tr.child(1).childNode(0) instanceof Comment) { Comment c = (Comment) tr.child(1).childNode(0); String comment = c.getData().trim(); String id = comment.split(": ")[1]; sr.setId(id); } } catch (Exception e) { e.printStackTrace(); } results.add(sr); } int results_total = -1; if (doc.select(".result_gefunden").size() > 0) { try { results_total = Integer.parseInt( 
doc.select(".result_gefunden").text().trim().replaceAll(".*[^0-9]+([0-9]+).*", "$1")); } catch (NumberFormatException e) { e.printStackTrace(); results_total = -1; } } return new SearchRequestResult(results, results_total, page); }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
/**
 * Collects the search fields offered by the catalogue's search form.
 *
 * <p>The expert search page is fetched (falling back to {@code iopacie.htm} when it is not
 * reachable) and every form row with an input or select is turned into one or two fields via
 * {@code createSearchField}. If that yields nothing but a {@code sleStichwort} control exists,
 * a single free-search text field is added. Finally the media types are read from
 * {@code mtyp.js}, or from the options of the search form if that fails, and added as a
 * dropdown when any were found.
 *
 * @return the search fields found; possibly empty
 * @throws IOException propagated from fetching the search form page
 */
@Override
public List<SearchField> getSearchFields() throws IOException {
    List<SearchField> fields = new ArrayList<>();

    // Extract all search fields, except media types
    String html;
    try {
        html = httpGet(opac_url + dir + "/search_expert.htm", getDefaultEncoding());
    } catch (NotReachableException e) {
        html = httpGet(opac_url + dir + "/iopacie.htm", getDefaultEncoding());
    }
    Document doc = Jsoup.parse(html);

    Elements rows = doc.select("form tr:has(input:not([type=submit], [type=reset])), form tr:has(select)");
    for (Element row : rows) {
        Elements cells = row.children();
        int cellCount = cells.size();
        if (cellCount == 4) {
            // Two search fields next to each other in one row
            SearchField left = createSearchField(cells.get(0), cells.get(1));
            SearchField right = createSearchField(cells.get(2), cells.get(3));
            if (left != null) {
                fields.add(left);
            }
            if (right != null) {
                fields.add(right);
            }
        } else if (cellCount == 2 || (cellCount == 3 && cells.get(2).children().size() == 0)) {
            SearchField single = createSearchField(cells.get(0), cells.get(1));
            if (single != null) {
                fields.add(single);
            }
        }
    }

    if (fields.isEmpty() && doc.select("[name=sleStichwort]").size() > 0) {
        TextSearchField freeSearch = new TextSearchField();
        Element input = doc.select("input[name=sleStichwort]").first();
        freeSearch.setDisplayName(stringProvider.getString(StringProvider.FREE_SEARCH));
        freeSearch.setId(input.attr("name"));
        freeSearch.setHint("");
        fields.add(freeSearch);
    }

    // Extract available media types.
    // This requires parsing JavaScript. Using regular expressions for that is ugly,
    // but less so than bundling a JavaScript VM, and no other way is apparent.
    Pattern keyPattern = Pattern.compile("mtyp\\[[0-9]+\\]\\[\"typ\"\\] = \"([^\"]+)\";");
    Pattern valuePattern = Pattern.compile("mtyp\\[[0-9]+\\]\\[\"bez\"\\] = \"([^\"]+)\";");

    DropdownSearchField mtyp = new DropdownSearchField();
    try {
        String script;
        try {
            script = httpGet(opac_url + dir + "/mtyp.js", getDefaultEncoding());
        } catch (NotReachableException e) {
            script = httpGet(opac_url + "/mtyp.js", getDefaultEncoding());
        }

        for (String part : script.split("new Array\\(\\);")) {
            Matcher keyMatcher = keyPattern.matcher(part);
            String key = keyMatcher.find() ? keyMatcher.group(1) : "";

            Matcher valueMatcher = valuePattern.matcher(part);
            String value = valueMatcher.find() ? valueMatcher.group(1) : "";

            if (!value.isEmpty()) {
                mtyp.addDropdownValue(key, value);
            }
        }
    } catch (IOException e) {
        // The script could not be loaded: fall back to the options of the search form
        try {
            String formHtml = httpGet(opac_url + dir + "/frames/search_form.php?bReset=1?bReset=1",
                    getDefaultEncoding());
            Document formDoc = Jsoup.parse(formHtml);
            for (Element opt : formDoc.select("#imtyp option")) {
                mtyp.addDropdownValue(opt.attr("value"), opt.text());
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }

    if (mtyp.getDropdownValues() != null && !mtyp.getDropdownValues().isEmpty()) {
        mtyp.setDisplayName("Medientypen");
        mtyp.setId("Medientyp");
        fields.add(mtyp);
    }
    return fields;
}