List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
/**
 * Builds a search field from a single HTML form control.
 *
 * @param name  display name to assign to the created field
 * @param hint  hint text; only applied to text fields
 * @param input the form control to convert
 * @return a {@code DropdownSearchField} for a {@code <select>}, a {@code TextSearchField} for an
 *         {@code <input type="text">}, or {@code null} for any other control
 */
private SearchField createSearchField(String name, String hint, Element input) {
    final String tag = input.tagName();

    if ("select".equals(tag)) {
        DropdownSearchField dropdown = new DropdownSearchField();
        dropdown.setDisplayName(name);
        dropdown.setId(input.attr("name"));
        // Every <option> becomes one dropdown entry: value attribute -> visible label.
        for (Element choice : input.select("option")) {
            dropdown.addDropdownValue(choice.attr("value"), choice.text());
        }
        return dropdown;
    }

    if ("input".equals(tag) && "text".equals(input.attr("type"))) {
        TextSearchField textField = new TextSearchField();
        textField.setDisplayName(name);
        textField.setHint(hint);
        textField.setId(input.attr("name"));
        return textField;
    }

    // Anything else (checkboxes, hidden inputs, ...) is not converted.
    return null;
}
From source file:de.geeksfactory.opacclient.apis.IOpac.java
/**
 * Collects the search fields of the catalogue: the form controls found on the search page plus a
 * dropdown of media types.
 *
 * @return the list of search fields that could be extracted
 * @throws IOException if neither search page can be fetched
 */
@Override
public List<SearchField> getSearchFields() throws IOException {
    List<SearchField> result = new ArrayList<>();

    // Extract all search fields, except media types.
    // The expert form is tried first; if it is not reachable the alternative page is used.
    String page;
    try {
        page = httpGet(opac_url + dir + "/search_expert.htm", getDefaultEncoding());
    } catch (NotReachableException e) {
        page = httpGet(opac_url + dir + "/iopacie.htm", getDefaultEncoding());
    }
    Document doc = Jsoup.parse(page);

    Elements rows = doc.select("form tr:has(input:not([type=submit], [type=reset])), form tr:has(select)");
    for (Element row : rows) {
        Elements cells = row.children();
        int cellCount = cells.size();
        if (cellCount == 4) {
            // Two search fields next to each other in one row
            SearchField left = createSearchField(cells.get(0), cells.get(1));
            SearchField right = createSearchField(cells.get(2), cells.get(3));
            if (left != null) {
                result.add(left);
            }
            if (right != null) {
                result.add(right);
            }
        } else if (cellCount == 2 || (cellCount == 3 && cells.get(2).children().size() == 0)) {
            SearchField single = createSearchField(cells.get(0), cells.get(1));
            if (single != null) {
                result.add(single);
            }
        }
    }

    // No field found in the table layout: fall back to the single keyword input if present.
    if (result.isEmpty() && !doc.select("[name=sleStichwort]").isEmpty()) {
        TextSearchField freeSearch = new TextSearchField();
        Element keywordInput = doc.select("input[name=sleStichwort]").first();
        freeSearch.setDisplayName(stringProvider.getString(StringProvider.FREE_SEARCH));
        freeSearch.setId(keywordInput.attr("name"));
        freeSearch.setHint("");
        result.add(freeSearch);
    }

    // Extract available media types.
    // They are defined in a JavaScript file, so the assignments are picked out with regular
    // expressions instead of embedding a JavaScript engine.
    Pattern keyPattern = Pattern.compile("mtyp\\[[0-9]+\\]\\[\"typ\"\\] = \"([^\"]+)\";");
    Pattern labelPattern = Pattern.compile("mtyp\\[[0-9]+\\]\\[\"bez\"\\] = \"([^\"]+)\";");
    DropdownSearchField mtyp = new DropdownSearchField();
    try {
        String script;
        try {
            script = httpGet(opac_url + dir + "/mtyp.js", getDefaultEncoding());
        } catch (NotReachableException e) {
            script = httpGet(opac_url + "/mtyp.js", getDefaultEncoding());
        }

        // Each "new Array();" starts the definition of one media type.
        for (String chunk : script.split("new Array\\(\\);")) {
            Matcher keyMatcher = keyPattern.matcher(chunk);
            String key = keyMatcher.find() ? keyMatcher.group(1) : "";

            Matcher labelMatcher = labelPattern.matcher(chunk);
            String label = labelMatcher.find() ? labelMatcher.group(1) : "";

            if (!label.isEmpty()) {
                mtyp.addDropdownValue(key, label);
            }
        }
    } catch (IOException e) {
        // The script could not be loaded: read the options from the search form instead.
        try {
            String formHtml = httpGet(opac_url + dir + "/frames/search_form.php?bReset=1?bReset=1",
                    getDefaultEncoding());
            doc = Jsoup.parse(formHtml);
            for (Element option : doc.select("#imtyp option")) {
                mtyp.addDropdownValue(option.attr("value"), option.text());
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }

    // Only offer the dropdown when at least one media type was found.
    if (mtyp.getDropdownValues() != null && !mtyp.getDropdownValues().isEmpty()) {
        mtyp.setDisplayName("Medientypen");
        mtyp.setId("Medientyp");
        result.add(mtyp);
    }
    return result;
}
From source file:com.aurel.track.exchange.docx.exporter.PreprocessImage.java
/** * Removes the HTML5 figure tag and saves the figcaption in the <img> tag's "alt" attribute for later use * @param htmlContent//from ww w .j a va 2s . c o m * @return */ private Document removeFigureSaveFigcaption(String htmlContent) { Document doc = Jsoup.parseBodyFragment(htmlContent); //figure is a HTML5 tag not accepted by Tidy, so it should be replaced by the content <img>-tag, and the figcaption is saved in the "alt" attribute Elements figureElements = doc.select("figure"); Element figcaptionNode = null; if (figureElements != null) { for (Iterator<Element> iterator = figureElements.iterator(); iterator.hasNext();) { Element figureElement = iterator.next(); Elements figureChildren = figureElement.getAllElements(); Node imageNode = null; if (figureChildren != null) { for (Element figureChild : figureChildren) { if ("img".equals(figureChild.nodeName())) { imageNode = figureChild; } else { if ("figcaption".equals(figureChild.nodeName())) { figcaptionNode = figureChild; //set "figcaption" text as value for "alt" attribute if (imageNode != null) { imageNode.attr("alt", figcaptionNode.text()); } } } } } if (imageNode != null) { figureElement.replaceWith(imageNode); } } } return doc; }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
/**
 * Requests the "prolong all" page and converts the table it shows into a result.
 *
 * <p>If the session is older than {@code SESSION_LIFETIME}, nobody is logged in, or a different
 * account is logged in, {@code account(account)} is called first.
 *
 * @param account    the account whose items should be prolonged
 * @param useraction not read by this implementation
 * @param selection  not read by this implementation
 * @return an {@code ERROR} result carrying a message, or an {@code OK} result carrying one map
 *         per table row (column key to trimmed cell text)
 * @throws IOException if an HTTP request fails
 */
@Override
public ProlongAllResult prolongAll(Account account, int useraction, String selection) throws IOException {
    if (!initialised) {
        start();
    }

    // Both original branches performed the same re-login, so they share one condition now.
    // Short-circuit order keeps the null check in front of the getId() comparison.
    if (System.currentTimeMillis() - logged_in > SESSION_LIFETIME
            || logged_in_as == null
            || logged_in_as.getId() != account.getId()) {
        try {
            account(account);
        } catch (JSONException e) {
            e.printStackTrace();
            return new ProlongAllResult(MultiStepResult.Status.ERROR,
                    stringProvider.getString(StringProvider.CONNECTION_ERROR));
        } catch (OpacErrorException e) {
            return new ProlongAllResult(MultiStepResult.Status.ERROR, e.getMessage());
        }
    }

    String html = httpGet(opac_url + "/index.asp?target=alleverl", getDefaultEncoding());
    Document doc = Jsoup.parse(html);

    // A single "kontomeldung" element carries the error message of the catalogue.
    Elements messages = doc.getElementsByClass("kontomeldung");
    if (messages.size() == 1) {
        return new ProlongAllResult(MultiStepResult.Status.ERROR, messages.get(0).text());
    }

    if (doc.select(".kontozeile table").size() == 1) {
        Map<Integer, String> colmap = new HashMap<>();
        List<Map<String, String>> result = new ArrayList<>();
        for (Element tr : doc.select(".kontozeile table tr")) {
            if (tr.select(".tabHeaderKonto").size() > 0) {
                // Header row: remember which column index holds which kind of value.
                int i = 0;
                for (Element th : tr.select("th")) {
                    String heading = th.text();
                    if (heading.contains("Verfasser")) {
                        colmap.put(i, OpacApi.ProlongAllResult.KEY_LINE_AUTHOR);
                    } else if (heading.contains("Titel")) {
                        colmap.put(i, OpacApi.ProlongAllResult.KEY_LINE_TITLE);
                    } else if (heading.contains("Neue")) {
                        colmap.put(i, OpacApi.ProlongAllResult.KEY_LINE_NEW_RETURNDATE);
                    } else if (heading.contains("Frist")) {
                        colmap.put(i, OpacApi.ProlongAllResult.KEY_LINE_OLD_RETURNDATE);
                    } else if (heading.contains("Status")) {
                        colmap.put(i, OpacApi.ProlongAllResult.KEY_LINE_MESSAGE);
                    }
                    i++;
                }
            } else {
                Map<String, String> line = new HashMap<>();
                int cellCount = tr.children().size();
                for (Entry<Integer, String> entry : colmap.entrySet()) {
                    int column = entry.getKey();
                    // Rows with fewer cells than the header would make child() throw;
                    // the missing columns are simply left out of the line.
                    if (column < cellCount) {
                        line.put(entry.getValue(), tr.child(column).text().trim());
                    }
                }
                result.add(line);
            }
        }

        // The page asks for a confirmation: send it.
        if (doc.select("input#make_allvl").size() > 0) {
            List<NameValuePair> nameValuePairs = new ArrayList<>(2);
            nameValuePairs.add(new BasicNameValuePair("target", "make_allvl_flag"));
            nameValuePairs.add(new BasicNameValuePair("make_allvl", "Bestaetigung"));
            httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(nameValuePairs), getDefaultEncoding());
        }
        return new ProlongAllResult(MultiStepResult.Status.OK, result);
    }

    return new ProlongAllResult(MultiStepResult.Status.ERROR,
            stringProvider.getString(StringProvider.INTERNAL_ERROR));
}
From source file:de.geeksfactory.opacclient.apis.Zones.java
@Override public List<SearchField> getSearchFields() throws IOException { if (!initialised) start();/*from ww w. j a va 2 s.com*/ List<SearchField> fields = new ArrayList<>(); String html = httpGet(opac_url + "/APS_ZONES?fn=AdvancedSearch&Style=Portal3&SubStyle=&Lang=GER" + "&ResponseEncoding=utf-8", getDefaultEncoding()); Document doc = Jsoup.parse(html); // find text fields Elements txt_opts = doc.select("#formSelectTerm_1 option"); for (Element opt : txt_opts) { TextSearchField field = new TextSearchField(); field.setId(opt.attr("value")); field.setHint(""); field.setDisplayName(opt.text()); fields.add(field); } // find filters String filtersQuery = version18 ? ".inSearchLimits .floatingBox" : ".TabRechAv .limitBlock"; Elements filters = doc.select(filtersQuery); int i = 0; for (Element filter : filters) { DropdownSearchField dropdown = new DropdownSearchField(); dropdown.addDropdownValue("", "Alle"); // All dropdowns use "q.limits.limit" as URL param, but they must not have the same ID dropdown.setId("dropdown_" + i); if (version18) { dropdown.setDisplayName(filter.select("tr").get(0).text().trim()); Elements opts = filter.select("tr").get(1).select("table td:has(input)"); for (Element opt : opts) { dropdown.addDropdownValue(opt.select("input").attr("value"), opt.text().trim()); } } else { dropdown.setDisplayName(filter.parent().previousElementSibling().text().trim()); Elements opts = filter.select(".limitChoice label"); for (Element opt : opts) { dropdown.addDropdownValue(opt.attr("for"), opt.text().trim()); } } fields.add(dropdown); i++; } return fields; }
From source file:de.geeksfactory.opacclient.apis.TouchPoint.java
private void parseDropdown(Element dropdownElement, List<SearchField> fields) { Elements options = dropdownElement.select("option"); DropdownSearchField dropdown = new DropdownSearchField(); dropdown.setId(dropdownElement.attr("name")); // Some fields make no sense or are not supported in the app if (dropdown.getId().equals("numberOfHits") || dropdown.getId().equals("timeOut") || dropdown.getId().equals("rememberList")) { return;/*from w ww . j a v a2s .c o m*/ } for (Element option : options) { dropdown.addDropdownValue(option.attr("value"), option.text()); } dropdown.setDisplayName(dropdownElement.parent().select("label").text()); fields.add(dropdown); }
From source file:codeu.chat.client.commandline.Chat.java
private List<String> findScript(String url) { List<String> elemLinks = new ArrayList<String>(); try {//from w w w.jav a2 s. c o m Document doc = Jsoup.connect(url).get(); // Make the request String elemLink, elemText; // Parse the search results Elements links = doc.select("a[href]"); for (Element link : links) { elemLink = link.attr("href"); elemText = link.text(); /* Check if any scripts for a movie in this Google search were found. If so, add them to the links list */ if ((elemLink.contains("script-o-rama") || elemLink.contains("springfieldspringfield")) && !(elemText.equals("Cached") || elemText.equals("Similar"))) { elemLinks.add(elemLink); } } } catch (IOException e) { e.printStackTrace(); } return elemLinks; // Return an empty string to indicate failure }
From source file:com.jimplush.goose.ContentExtractor.java
/** * we're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords * and the number of consecutive paragraphs together, which should form the cluster of text that this node is around * also store on how high up the paragraphs are, comments are usually at the bottom and should get a lower score * * @return//w ww .j ava 2 s. c o m */ private Element calculateBestNodeBasedOnClustering(Document doc) { Element topNode = null; // grab all the paragraph elements on the page to start to inspect the likely hood of them being good peeps ArrayList<Element> nodesToCheck = getNodesToCheck(doc); double startingBoost = 1.0; int cnt = 0; int i = 0; // holds all the parents of the nodes we're checking Set<Element> parentNodes = new HashSet<Element>(); ArrayList<Element> nodesWithText = new ArrayList<Element>(); for (Element node : nodesToCheck) { String nodeText = node.text(); WordStats wordStats = StopWords.getStopWordCount(nodeText); boolean highLinkDensity = isHighLinkDensity(node); if (wordStats.getStopWordCount() > 2 && !highLinkDensity) { nodesWithText.add(node); } } int numberOfNodes = nodesWithText.size(); int negativeScoring = 0; // we shouldn't give more negatives than positives // we want to give the last 20% of nodes negative scores in case they're comments double bottomNodesForNegativeScore = (float) numberOfNodes * 0.25; if (logger.isDebugEnabled()) { logger.debug("About to inspect num of nodes with text: " + numberOfNodes); } for (Element node : nodesWithText) { // add parents and grandparents to scoring // only add boost to the middle paragraphs, top and bottom is usually jankz city // so basically what we're doing is giving boost scores to paragraphs that appear higher up in the dom // and giving lower, even negative scores to those who appear lower which could be commenty stuff float boostScore = 0; if (isOkToBoost(node)) { if (cnt >= 0) { boostScore = (float) ((1.0 / startingBoost) * 50); 
startingBoost++; } } // check for negative node values if (numberOfNodes > 15) { if ((numberOfNodes - i) <= bottomNodesForNegativeScore) { float booster = (float) bottomNodesForNegativeScore - (float) (numberOfNodes - i); boostScore = -(float) Math.pow(booster, (float) 2); // we don't want to score too highly on the negative side. float negscore = Math.abs(boostScore) + negativeScoring; if (negscore > 40) { boostScore = 5; } } } if (logger.isDebugEnabled()) { logger.debug("Location Boost Score: " + boostScore + " on interation: " + i + "' id='" + node.parent().id() + "' class='" + node.parent().attr("class")); } String nodeText = node.text(); WordStats wordStats = StopWords.getStopWordCount(nodeText); int upscore = (int) (wordStats.getStopWordCount() + boostScore); updateScore(node.parent(), upscore); updateScore(node.parent().parent(), upscore / 2); updateNodeCount(node.parent(), 1); updateNodeCount(node.parent().parent(), 1); if (!parentNodes.contains(node.parent())) { parentNodes.add(node.parent()); } if (!parentNodes.contains(node.parent().parent())) { parentNodes.add(node.parent().parent()); } cnt++; i++; } // now let's find the parent node who scored the highest int topNodeScore = 0; for (Element e : parentNodes) { if (logger.isDebugEnabled()) { logger.debug("ParentNode: score='" + e.attr("gravityScore") + "' nodeCount='" + e.attr("gravityNodes") + "' id='" + e.id() + "' class='" + e.attr("class") + "' "); } //int score = Integer.parseInt(e.attr("gravityScore")) * Integer.parseInt(e.attr("gravityNodes")); int score = getScore(e); if (score > topNodeScore) { topNode = e; topNodeScore = score; } if (topNode == null) { topNode = e; } } if (logger.isDebugEnabled()) { if (topNode == null) { logger.debug("ARTICLE NOT ABLE TO BE EXTRACTED!, WE HAZ FAILED YOU LORD VADAR"); } else { String logText; String targetText = ""; Element topPara = topNode.getElementsByTag("p").first(); if (topPara == null) { topNode.text(); } else { topPara.text(); } if (targetText.length() 
>= 51) { logText = targetText.substring(0, 50); } else { logText = targetText; } logger.debug("TOPNODE TEXT: " + logText.trim()); logger.debug("Our TOPNODE: score='" + topNode.attr("gravityScore") + "' nodeCount='" + topNode.attr("gravityNodes") + "' id='" + topNode.id() + "' class='" + topNode.attr("class") + "' "); } } return topNode; }
From source file:de.geeksfactory.opacclient.apis.Zones.java
/**
 * Parses the HTML of a detail page into a {@code DetailledItem}.
 *
 * <p>The method reads, in this order: the rows of the detail table, the
 * {@code a.SummaryActionLink} links, the {@code div.record-item-new} blocks, the optional
 * {@code span.z3988} element, the cover image and the {@code div[id^=stock_]} blocks that
 * describe the copies.
 *
 * @param id   the id that is stored in the result
 * @param html the HTML source of the detail page
 * @return the item filled with everything that could be extracted
 */
private DetailledItem parse_result(String id, String html) {
    Document doc = Jsoup.parse(html);
    DetailledItem result = new DetailledItem();
    result.setTitle("");
    boolean title_is_set = false;
    result.setId(id);

    // The row selector depends on the version flag.
    String detailTrsQuery = version18 ? ".inRoundBox1 table table tr"
            : ".DetailDataCell table table:not(.inRecordHeader) tr";
    Elements detailtrs1 = doc.select(detailTrsQuery);
    for (int i = 0; i < detailtrs1.size(); i++) {
        Element tr = detailtrs1.get(i);
        int s = tr.children().size();
        // First cell is the label, last cell the value.
        // NOTE(review): tr.child(0) is called before s is checked — presumably rows always
        // have at least one cell; confirm.
        if (tr.child(0).text().trim().equals("Titel") && !title_is_set) {
            // Only the first "Titel" row becomes the title.
            result.setTitle(tr.child(s - 1).text().trim());
            title_is_set = true;
        } else if (s > 1) {
            Element valchild = tr.child(s - 1);
            // Values that contain a nested table are skipped.
            if (valchild.select("table").isEmpty()) {
                String val = valchild.text().trim();
                if (val.length() > 0) {
                    result.addDetail(new Detail(tr.child(0).text().trim(), val));
                }
            }
        }
    }

    // A link whose text contains "Vormerken" marks the item as reservable;
    // its href is stored as reservation info.
    for (Element a : doc.select("a.SummaryActionLink")) {
        if (a.text().contains("Vormerken")) {
            result.setReservable(true);
            result.setReservation_info(a.attr("href"));
        }
    }

    // Each record-item block becomes one detail with an empty description:
    // text nodes and element texts are concatenated, <br> becomes a line break.
    Elements detaildiv = doc.select("div.record-item-new");
    if (!detaildiv.isEmpty()) {
        for (int i = 0; i < detaildiv.size(); i++) {
            Element dd = detaildiv.get(i);
            String text = "";
            for (Node node : dd.childNodes()) {
                if (node instanceof TextNode) {
                    String snip = ((TextNode) node).text();
                    if (snip.length() > 0) {
                        text += snip;
                    }
                } else if (node instanceof Element) {
                    if (((Element) node).tagName().equals("br")) {
                        text += "\n";
                    } else {
                        String snip = ((Element) node).text().trim();
                        if (snip.length() > 0) {
                            text += snip;
                        }
                    }
                }
            }
            result.addDetail(new Detail("", text));
        }
    }

    if (doc.select("span.z3988").size() > 0) {
        // Sometimes there is a <span class="Z3988"> item which provides
        // data in a standardized format.
        // Its title attribute is split into "&"-separated name=value pairs.
        String z3988data = doc.select("span.z3988").first().attr("title").trim();
        for (String pair : z3988data.split("&")) {
            String[] nv = pair.split("=", 2);
            if (nv.length == 2) {
                if (!nv[1].trim().equals("")) {
                    // Titles from here are only used when no title was found so far.
                    if (nv[0].equals("rft.btitle") && result.getTitle().length() == 0) {
                        result.setTitle(nv[1]);
                    } else if (nv[0].equals("rft.atitle") && result.getTitle().length() == 0) {
                        result.setTitle(nv[1]);
                    } else if (nv[0].equals("rft.au")) {
                        result.addDetail(new Detail("Author", nv[1]));
                    }
                }
            }
        }
    }

    // Cover
    if (doc.select(".BookCover, .LargeBookCover").size() > 0) {
        result.setCover(doc.select(".BookCover, .LargeBookCover").first().attr("src"));
    }

    // Copies: a "stock_head" block sets the text used as branch for the blocks that follow.
    Elements copydivs = doc.select("div[id^=stock_]");
    String pop = "";
    for (int i = 0; i < copydivs.size(); i++) {
        Element div = copydivs.get(i);
        if (div.attr("id").startsWith("stock_head")) {
            pop = div.text().trim();
            continue;
        }
        // NOTE(review): one Copy instance is created per block, but it is added to the result
        // at every <br> and written to afterwards — verify this is intended.
        Copy copy = new Copy();
        DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
        // This is getting very ugly - check if it is valid for libraries which are not Hamburg.
        // Seems to also work in Kiel (Zones 1.8, checked 10.10.2015)
        // j is the position of the current child node, counted from the last <br>
        // (or from the start of the block); the position decides what a node means.
        int j = 0;
        for (Node node : div.childNodes()) {
            try {
                if (node instanceof Element) {
                    if (((Element) node).tag().getName().equals("br")) {
                        // End of one copy: store it and restart counting
                        // (-1 becomes 0 through the increment below).
                        copy.setBranch(pop);
                        result.addCopy(copy);
                        j = -1;
                    } else if (((Element) node).tag().getName().equals("b") && j == 1) {
                        copy.setLocation(((Element) node).text());
                    } else if (((Element) node).tag().getName().equals("b") && j > 1) {
                        copy.setStatus(((Element) node).text());
                    }
                    j++;
                } else if (node instanceof TextNode) {
                    if (j == 0) {
                        copy.setDepartment(((TextNode) node).text());
                    }
                    if (j == 2) {
                        // Barcode is the first line of the untouched text.
                        copy.setBarcode(((TextNode) node).getWholeText().trim().split("\n")[0].trim());
                    }
                    if (j == 6) {
                        // The last ten characters are parsed as a dd.MM.yyyy date.
                        // A shorter text makes substring() throw; that is caught by the
                        // outer catch below.
                        String text = ((TextNode) node).text().trim();
                        String date = text.substring(text.length() - 10);
                        try {
                            copy.setReturnDate(fmt.parseLocalDate(date));
                        } catch (IllegalArgumentException e) {
                            e.printStackTrace();
                        }
                    }
                    j++;
                }
            } catch (Exception e) {
                // A failure in one node is printed and the remaining nodes are still processed.
                e.printStackTrace();
            }
        }
    }
    return result;
}
From source file:de.geeksfactory.opacclient.apis.IOpac.java
@Override public AccountData account(Account account) throws IOException, JSONException, OpacErrorException { if (!initialised) { start();// w w w .jav a2 s .co m } Document doc = getAccountPage(account); AccountData res = new AccountData(account.getId()); List<LentItem> media = new ArrayList<>(); List<ReservedItem> reserved = new ArrayList<>(); parseMediaList(media, doc, data); parseResList(reserved, doc, data); res.setLent(media); res.setReservations(reserved); if (doc.select("h4:contains(Kontostand)").size() > 0) { Element h4 = doc.select("h4:contains(Kontostand)").first(); Pattern regex = Pattern.compile("Kontostand (-?\\d+\\.\\d\\d EUR)"); Matcher matcher = regex.matcher(h4.text()); if (matcher.find()) res.setPendingFees(matcher.group(1)); } if (doc.select("h4:contains(Ausweis g)").size() > 0) { Element h4 = doc.select("h4:contains(Ausweis g)").first(); Pattern regex = Pattern.compile("Ausweis g.+ltig bis\\s*.\\s*(\\d\\d.\\d\\d.\\d\\d\\d\\d)"); Matcher matcher = regex.matcher(h4.text()); if (matcher.find()) res.setValidUntil(matcher.group(1)); } if (media.isEmpty() && reserved.isEmpty()) { if (doc.select("h1").size() > 0) { //noinspection StatementWithEmptyBody if (doc.select("h4").text().trim().contains("keine ausgeliehenen Medien")) { // There is no lent media, but the server is working // correctly } else if (doc.select("h1").text().trim().contains("RUNTIME ERROR")) { // Server Error throw new NotReachableException("IOPAC RUNTIME ERROR"); } else { throw new OpacErrorException(stringProvider.getFormattedString( StringProvider.UNKNOWN_ERROR_ACCOUNT_WITH_DESCRIPTION, doc.select("h1").text().trim())); } } else { throw new OpacErrorException(stringProvider.getString(StringProvider.UNKNOWN_ERROR_ACCOUNT)); } } return res; }