List of usage examples for org.jsoup.nodes.Element#child
public Element child(int index)
From source file:de.geeksfactory.opacclient.apis.Pica.java
protected SearchRequestResult parse_search(String html, int page) throws OpacErrorException { Document doc = Jsoup.parse(html); updateSearchSetValue(doc);/* w w w . jav a 2s. c o m*/ if (doc.select(".error").size() > 0) { String error = doc.select(".error").first().text().trim(); if (error.equals("Es wurde nichts gefunden.") || error.equals("Nothing has been found") || error.equals("Er is niets gevonden.") || error.equals("Rien n'a t trouv.")) { // nothing found return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1); } else { // error throw new OpacErrorException(error); } } reusehtml = html; int results_total; String resultnumstr = doc.select(".pages").first().text(); Pattern p = Pattern.compile("[0-9]+$"); Matcher m = p.matcher(resultnumstr); if (m.find()) { resultnumstr = m.group(); } if (resultnumstr.contains("(")) { results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1")); } else if (resultnumstr.contains(": ")) { results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1")); } else { results_total = Integer.parseInt(resultnumstr); } List<SearchResult> results = new ArrayList<>(); if (results_total == 1) { // Only one result DetailledItem singleResult = parse_result(html); SearchResult sr = new SearchResult(); sr.setType(getMediaTypeInSingleResult(html)); sr.setInnerhtml( "<b>" + singleResult.getTitle() + "</b><br>" + singleResult.getDetails().get(0).getContent()); results.add(sr); } Elements table = doc.select("table[summary=hitlist] tbody tr[valign=top]"); // identifier = null; Elements links = doc.select("table[summary=hitlist] a"); boolean haslink = false; for (int i = 0; i < links.size(); i++) { Element node = links.get(i); if (node.hasAttr("href") & node.attr("href").contains("SHW?") && !haslink) { haslink = true; try { List<NameValuePair> anyurl = URLEncodedUtils.parse(new URI(node.attr("href")), getDefaultEncoding()); for (NameValuePair nv : anyurl) { if (nv.getName().equals("identifier")) 
{ // identifier = nv.getValue(); break; } } } catch (Exception e) { e.printStackTrace(); } } } for (int i = 0; i < table.size(); i++) { Element tr = table.get(i); SearchResult sr = new SearchResult(); if (tr.select("td.hit img").size() > 0) { String[] fparts = tr.select("td img").get(0).attr("src").split("/"); String fname = fparts[fparts.length - 1]; if (data.has("mediatypes")) { try { sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname))); } catch (JSONException | IllegalArgumentException e) { sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", ""))); } } else { sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", ""))); } } Element middlething = tr.child(2); List<Node> children = middlething.childNodes(); int childrennum = children.size(); List<String[]> strings = new ArrayList<>(); for (int ch = 0; ch < childrennum; ch++) { Node node = children.get(ch); if (node instanceof TextNode) { String text = ((TextNode) node).text().trim(); if (text.length() > 3) { strings.add(new String[] { "text", "", text }); } } else if (node instanceof Element) { List<Node> subchildren = node.childNodes(); for (int j = 0; j < subchildren.size(); j++) { Node subnode = subchildren.get(j); if (subnode instanceof TextNode) { String text = ((TextNode) subnode).text().trim(); if (text.length() > 3) { strings.add(new String[] { ((Element) node).tag().getName(), "text", text, ((Element) node).className(), node.attr("style") }); } } else if (subnode instanceof Element) { String text = ((Element) subnode).text().trim(); if (text.length() > 3) { strings.add(new String[] { ((Element) node).tag().getName(), ((Element) subnode).tag().getName(), text, ((Element) node).className(), node.attr("style") }); } } } } } StringBuilder description = new StringBuilder(); int k = 0; for (String[] part : strings) { if (part[0].equals("a") && k == 0) { 
description.append("<b>").append(part[2]).append("</b>"); } else if (k < 3) { description.append("<br />").append(part[2]); } k++; } sr.setInnerhtml(description.toString()); sr.setNr(10 * (page - 1) + i); sr.setId(null); results.add(sr); } resultcount = results.size(); return new SearchRequestResult(results, results_total, page); }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
protected SearchRequestResult parse_search(String html, int page) { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url);//from w w w. j ava 2s . c o m Elements table = doc.select(".resulttab tr.result_trefferX, .resulttab tr.result_treffer"); List<SearchResult> results = new ArrayList<>(); for (int i = 0; i < table.size(); i++) { Element tr = table.get(i); SearchResult sr = new SearchResult(); int contentindex = 1; if (tr.select("td a img").size() > 0) { String[] fparts = tr.select("td a img").get(0).attr("src").split("/"); String fname = fparts[fparts.length - 1]; if (data.has("mediatypes")) { try { sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname))); } catch (JSONException | IllegalArgumentException e) { sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", ""))); } } else { sr.setType(defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", ""))); } } else { if (tr.children().size() == 3) { contentindex = 2; } } sr.setInnerhtml(tr.child(contentindex).child(0).html()); sr.setNr(i); Element link = tr.child(contentindex).select("a").first(); try { if (link != null && link.attr("href").contains("detmediennr")) { Map<String, String> params = getQueryParamsFirst(link.attr("abs:href")); String nr = params.get("detmediennr"); if (Integer.parseInt(nr) > i + 1) { // Seems to be an ID if (params.get("detDB") != null) { sr.setId("&detmediennr=" + nr + "&detDB=" + params.get("detDB")); } else { sr.setId("&detmediennr=" + nr); } } } } catch (Exception e) { } try { if (tr.child(1).childNode(0) instanceof Comment) { Comment c = (Comment) tr.child(1).childNode(0); String comment = c.getData().trim(); String id = comment.split(": ")[1]; sr.setId(id); } } catch (Exception e) { e.printStackTrace(); } results.add(sr); } int results_total = -1; if (doc.select(".result_gefunden").size() > 0) { try { results_total = Integer.parseInt( 
doc.select(".result_gefunden").text().trim().replaceAll(".*[^0-9]+([0-9]+).*", "$1")); } catch (NumberFormatException e) { e.printStackTrace(); results_total = -1; } } return new SearchRequestResult(results, results_total, page); }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
/**
 * Requests the "prolong all" page and reports its outcome.
 *
 * <p>Starts the API if needed and logs in again when the session is older than
 * {@code SESSION_LIFETIME}, nobody is logged in, or a different account is logged in. The page
 * {@code /index.asp?target=alleverl} is then fetched. A single {@code kontomeldung} element is
 * reported as an error; a single {@code .kontozeile table} is converted into one map per data
 * row, and the confirmation form is posted when {@code input#make_allvl} is present.
 *
 * @param account the account whose items are prolonged
 * @param useraction not read by this method
 * @param selection not read by this method
 * @return an OK result with the table rows, or an ERROR result with a message
 * @throws IOException propagated from the underlying HTTP calls
 */
@Override
public ProlongAllResult prolongAll(Account account, int useraction, String selection) throws IOException {
    if (!initialised) {
        start();
    }

    // Both original re-login branches did the same work, so one combined condition suffices.
    boolean needsLogin = System.currentTimeMillis() - logged_in > SESSION_LIFETIME
            || logged_in_as == null
            || logged_in_as.getId() != account.getId();
    if (needsLogin) {
        try {
            account(account);
        } catch (JSONException e) {
            e.printStackTrace();
            return new ProlongAllResult(MultiStepResult.Status.ERROR,
                    stringProvider.getString(StringProvider.CONNECTION_ERROR));
        } catch (OpacErrorException e) {
            return new ProlongAllResult(MultiStepResult.Status.ERROR, e.getMessage());
        }
    }

    String html = httpGet(opac_url + "/index.asp?target=alleverl", getDefaultEncoding());
    Document doc = Jsoup.parse(html);

    Elements messages = doc.getElementsByClass("kontomeldung");
    if (messages.size() == 1) {
        return new ProlongAllResult(MultiStepResult.Status.ERROR, messages.get(0).text());
    }

    if (doc.select(".kontozeile table").size() != 1) {
        return new ProlongAllResult(MultiStepResult.Status.ERROR,
                stringProvider.getString(StringProvider.INTERNAL_ERROR));
    }

    Map<Integer, String> columnKeys = new HashMap<>();
    List<Map<String, String>> rows = new ArrayList<>();
    for (Element tr : doc.select(".kontozeile table tr")) {
        if (tr.select(".tabHeaderKonto").size() > 0) {
            // Header row: remember which column index carries which kind of value.
            int column = 0;
            for (Element th : tr.select("th")) {
                String heading = th.text();
                if (heading.contains("Verfasser")) {
                    columnKeys.put(column, OpacApi.ProlongAllResult.KEY_LINE_AUTHOR);
                } else if (heading.contains("Titel")) {
                    columnKeys.put(column, OpacApi.ProlongAllResult.KEY_LINE_TITLE);
                } else if (heading.contains("Neue")) {
                    columnKeys.put(column, OpacApi.ProlongAllResult.KEY_LINE_NEW_RETURNDATE);
                } else if (heading.contains("Frist")) {
                    columnKeys.put(column, OpacApi.ProlongAllResult.KEY_LINE_OLD_RETURNDATE);
                } else if (heading.contains("Status")) {
                    columnKeys.put(column, OpacApi.ProlongAllResult.KEY_LINE_MESSAGE);
                }
                column++;
            }
        } else {
            // Data row: pick the mapped cells by index.
            Map<String, String> row = new HashMap<>();
            for (Entry<Integer, String> mapping : columnKeys.entrySet()) {
                row.put(mapping.getValue(), tr.child(mapping.getKey()).text().trim());
            }
            rows.add(row);
        }
    }

    if (doc.select("input#make_allvl").size() > 0) {
        List<NameValuePair> form = new ArrayList<>(2);
        form.add(new BasicNameValuePair("target", "make_allvl_flag"));
        form.add(new BasicNameValuePair("make_allvl", "Bestaetigung"));
        httpPost(opac_url + "/index.asp", new UrlEncodedFormEntity(form), getDefaultEncoding());
    }

    return new ProlongAllResult(MultiStepResult.Status.OK, rows);
}
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
private SearchRequestResult parse_search(String html, int page) { List<SearchResult> results = new ArrayList<>(); Document doc = Jsoup.parse(html); if (doc.select("h3").text().contains("Es wurde nichts gefunden")) { return new SearchRequestResult(results, 0, page); }/*from ww w . j av a2 s. c om*/ Elements trList = doc.select("form table tr[valign]"); // <tr // valign="top"> Elements elem; int rows_per_hit = 2; if (trList.size() == 1 || (trList.size() > 1 && trList.get(0).select("input[type=checkbox]").size() > 0 && trList.get(1).select("input[type=checkbox]").size() > 0)) { rows_per_hit = 1; } try { rows_per_hit = data.getInt("rows_per_hit"); } catch (JSONException e) { } // Overall search results // are very differently layouted, but have always the text: // "....Treffer Gesamt (nnn)" int results_total; Pattern pattern = Pattern.compile("Treffer Gesamt \\(([0-9]+)\\)"); Matcher matcher = pattern.matcher(html); if (matcher.find()) { results_total = Integer.parseInt(matcher.group(1)); } else { results_total = -1; } // limit to 20 entries int numOfEntries = trList.size() / rows_per_hit; // two rows per entry if (numOfEntries > numOfResultsPerPage) { numOfEntries = numOfResultsPerPage; } for (int i = 0; i < numOfEntries; i++) { Element tr = trList.get(i * rows_per_hit); SearchResult sr = new SearchResult(); // ID as href tag elem = tr.select("td a"); if (elem.size() > 0) { String hrefID = elem.get(0).attr("href"); sr.setId(hrefID); } else { // no ID as href found, look for the ID in the input form elem = tr.select("td input"); if (elem.size() > 0) { String nameID = elem.get(0).attr("name").trim(); String hrefID = "/" + opacDir + "/ftitle" + opacSuffix + "?LANG=de&FUNC=full&" + nameID + "=YES"; sr.setId(hrefID); } } // media type elem = tr.select("td img"); if (elem.size() > 0) { setMediaTypeFromImageFilename(sr, elem.get(0).attr("src")); } // description String desc = ""; try { // array "searchtable" list the column numbers of the // description JSONArray searchtable 
= data.getJSONArray("searchtable"); for (int j = 0; j < searchtable.length(); j++) { int colNum = searchtable.getInt(j); if (j > 0) { desc = desc + "<br />"; } String c = tr.child(colNum).html(); if (tr.child(colNum).childNodes().size() == 1 && tr.child(colNum).select("a[href*=ftitle.]").size() > 0) { c = tr.select("a[href*=ftitle.]").text(); } desc = desc + c; } } catch (Exception e) { e.printStackTrace(); } // remove links "<a ...>...</a> // needed for Friedrichshafen: "Warenkorb", "Vormerkung" // Herford: "Medienkorb" desc = desc.replaceAll("<a .*?</a>", ""); sr.setInnerhtml(desc); if (tr.select("font.p04x09b").size() > 0 && tr.select("font.p02x09b").size() == 0) { sr.setStatus(Status.GREEN); } else if (tr.select("font.p04x09b").size() == 0 && tr.select("font.p02x09b").size() > 0) { sr.setStatus(Status.RED); } else if (tr.select("font.p04x09b").size() > 0 && tr.select("font.p02x09b").size() > 0) { sr.setStatus(Status.YELLOW); } // number sr.setNr(i / rows_per_hit); results.add(sr); } // m_resultcount = results.size(); return new SearchRequestResult(results, results_total, page); }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
protected SearchRequestResult parse_search(String html, int page) throws OpacErrorException { Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url + "/searchfoo"); if (doc.select(".error").size() > 0) { throw new OpacErrorException(doc.select(".error").text().trim()); } else if (doc.select(".nohits").size() > 0) { throw new OpacErrorException(doc.select(".nohits").text().trim()); } else if (doc.select(".box-header h2, #nohits").text().contains("keine Treffer")) { return new SearchRequestResult(new ArrayList<SearchResult>(), 0, 1, 1); }/*w w w. jav a 2 s . co m*/ int results_total = -1; String resultnumstr = doc.select(".box-header h2").first().text(); if (resultnumstr.contains("(1/1)") || resultnumstr.contains(" 1/1")) { reusehtml = html; throw new OpacErrorException("is_a_redirect"); } else if (resultnumstr.contains("(")) { results_total = Integer.parseInt(resultnumstr.replaceAll(".*\\(([0-9]+)\\).*", "$1")); } else if (resultnumstr.contains(": ")) { results_total = Integer.parseInt(resultnumstr.replaceAll(".*: ([0-9]+)$", "$1")); } Elements table = doc.select("table.data tbody tr"); identifier = null; Elements links = doc.select("table.data a"); boolean haslink = false; for (int i = 0; i < links.size(); i++) { Element node = links.get(i); if (node.hasAttr("href") & node.attr("href").contains("singleHit.do") && !haslink) { haslink = true; try { List<NameValuePair> anyurl = URLEncodedUtils .parse(new URI(node.attr("href").replace(" ", "%20").replace("&", "&")), ENCODING); for (NameValuePair nv : anyurl) { if (nv.getName().equals("identifier")) { identifier = nv.getValue(); break; } } } catch (Exception e) { e.printStackTrace(); } } } List<SearchResult> results = new ArrayList<>(); for (int i = 0; i < table.size(); i++) { Element tr = table.get(i); SearchResult sr = new SearchResult(); if (tr.select("td img[title]").size() > 0) { String title = tr.select("td img").get(0).attr("title"); String[] fparts = tr.select("td img").get(0).attr("src").split("/"); String 
fname = fparts[fparts.length - 1]; MediaType default_by_fname = defaulttypes.get(fname.toLowerCase(Locale.GERMAN).replace(".jpg", "") .replace(".gif", "").replace(".png", "")); MediaType default_by_title = defaulttypes.get(title); MediaType default_name = default_by_title != null ? default_by_title : default_by_fname; if (data.has("mediatypes")) { try { sr.setType(MediaType.valueOf(data.getJSONObject("mediatypes").getString(fname))); } catch (JSONException | IllegalArgumentException e) { sr.setType(default_name); } } else { sr.setType(default_name); } } String alltext = tr.text(); if (alltext.contains("eAudio") || alltext.contains("eMusic")) { sr.setType(MediaType.MP3); } else if (alltext.contains("eVideo")) { sr.setType(MediaType.EVIDEO); } else if (alltext.contains("eBook")) { sr.setType(MediaType.EBOOK); } else if (alltext.contains("Munzinger")) { sr.setType(MediaType.EDOC); } if (tr.children().size() > 3 && tr.child(3).select("img[title*=cover]").size() == 1) { sr.setCover(tr.child(3).select("img[title*=cover]").attr("abs:src")); if (sr.getCover().contains("showCover.do")) { downloadCover(sr); } } Element middlething; if (tr.children().size() > 2 && tr.child(2).select("a").size() > 0) { middlething = tr.child(2); } else { middlething = tr.child(1); } List<Node> children = middlething.childNodes(); if (middlething.select("div").not("#hlrightblock,.bestellfunktionen").size() == 1) { Element indiv = middlething.select("div").not("#hlrightblock,.bestellfunktionen").first(); if (indiv.children().size() > 1) { children = indiv.childNodes(); } } else if (middlething.select("span.titleData").size() == 1) { children = middlething.select("span.titleData").first().childNodes(); } int childrennum = children.size(); List<String[]> strings = new ArrayList<>(); for (int ch = 0; ch < childrennum; ch++) { Node node = children.get(ch); if (node instanceof TextNode) { String text = ((TextNode) node).text().trim(); if (text.length() > 3) { strings.add(new String[] { "text", "", 
text }); } } else if (node instanceof Element) { List<Node> subchildren = node.childNodes(); for (int j = 0; j < subchildren.size(); j++) { Node subnode = subchildren.get(j); if (subnode instanceof TextNode) { String text = ((TextNode) subnode).text().trim(); if (text.length() > 3) { strings.add(new String[] { ((Element) node).tag().getName(), "text", text, ((Element) node).className(), node.attr("style") }); } } else if (subnode instanceof Element) { String text = ((Element) subnode).text().trim(); if (text.length() > 3) { strings.add(new String[] { ((Element) node).tag().getName(), ((Element) subnode).tag().getName(), text, ((Element) node).className(), node.attr("style") }); } } } } } StringBuilder description = null; if (tr.select("span.Z3988").size() == 1) { // Sometimes there is a <span class="Z3988"> item which provides // data in a standardized format. List<NameValuePair> z3988data; boolean hastitle = false; try { description = new StringBuilder(); z3988data = URLEncodedUtils .parse(new URI("http://dummy/?" 
+ tr.select("span.Z3988").attr("title")), "UTF-8"); for (NameValuePair nv : z3988data) { if (nv.getValue() != null) { if (!nv.getValue().trim().equals("")) { if (nv.getName().equals("rft.btitle") && !hastitle) { description.append("<b>").append(nv.getValue()).append("</b>"); hastitle = true; } else if (nv.getName().equals("rft.atitle") && !hastitle) { description.append("<b>").append(nv.getValue()).append("</b>"); hastitle = true; } else if (nv.getName().equals("rft.au")) { description.append("<br />").append(nv.getValue()); } else if (nv.getName().equals("rft.date")) { description.append("<br />").append(nv.getValue()); } } } } } catch (URISyntaxException e) { description = null; } } boolean described = false; if (description != null && description.length() > 0) { sr.setInnerhtml(description.toString()); described = true; } else { description = new StringBuilder(); } int k = 0; boolean yearfound = false; boolean titlefound = false; boolean sigfound = false; for (String[] part : strings) { if (!described) { if (part[0].equals("a") && (k == 0 || !titlefound)) { if (k != 0) { description.append("<br />"); } description.append("<b>").append(part[2]).append("</b>"); titlefound = true; } else if (part[2].matches("\\D*[0-9]{4}\\D*") && part[2].length() <= 10) { yearfound = true; if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) { if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k == 1 && !yearfound && part[2].matches("^\\s*\\([0-9]{4}\\)$")) { if (k != 0) { description.append("<br />"); } description.append(part[2]); } else if (k > 1 && k < 4 && !sigfound && part[0].equals("text") && part[2].matches("^[A-Za-z0-9,\\- ]+$")) { description.append("<br />"); description.append(part[2]); } } if (part.length == 4) { if (part[0].equals("span") && part[3].equals("textgruen")) { sr.setStatus(SearchResult.Status.GREEN); } else if 
(part[0].equals("span") && part[3].equals("textrot")) { sr.setStatus(SearchResult.Status.RED); } } else if (part.length == 5) { if (part[4].contains("purple")) { sr.setStatus(SearchResult.Status.YELLOW); } } if (sr.getStatus() == null) { if ((part[2].contains("entliehen") && part[2].startsWith("Vormerkung ist leider nicht mglich")) || part[2].contains("nur in anderer Zweigstelle ausleihbar und nicht bestellbar")) { sr.setStatus(SearchResult.Status.RED); } else if (part[2].startsWith("entliehen") || part[2].contains("Ein Exemplar finden Sie in einer anderen Zweigstelle")) { sr.setStatus(SearchResult.Status.YELLOW); } else if ((part[2].startsWith("bestellbar") && !part[2].contains("nicht bestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vorbestellbar") && !part[2].contains("nicht vorbestellbar")) || (part[2].startsWith("vormerkbar") && !part[2].contains("nicht vormerkbar")) || (part[2].contains("heute zurckgebucht")) || (part[2].contains("ausleihbar") && !part[2].contains("nicht ausleihbar"))) { sr.setStatus(SearchResult.Status.GREEN); } if (sr.getType() != null) { if (sr.getType().equals(MediaType.EBOOK) || sr.getType().equals(MediaType.EVIDEO) || sr.getType().equals(MediaType.MP3)) // Especially Onleihe.de ebooks are often marked // green though they are not available. { sr.setStatus(SearchResult.Status.UNKNOWN); } } } k++; } if (!described) { sr.setInnerhtml(description.toString()); } sr.setNr(10 * (page - 1) + i); sr.setId(null); results.add(sr); } resultcount = results.size(); return new SearchRequestResult(results, results_total, page); }
From source file:org.apdplat.superword.extract.ChineseSynonymAntonymExtractor.java
public static SynonymAntonym parseSynonymAntonym(String html, String word) { SynonymAntonym synonymAntonym = new SynonymAntonym(); synonymAntonym.setWord(new Word(word, "")); try {// w ww . j av a 2s . co m for (Element element : Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH)) { int size = element.children().size(); LOGGER.debug("element size:" + size); for (int i = 0; i < size / 2; i++) { String type = element.child(i * 2).text(); LOGGER.debug("type:" + type); if ("??".equals(type)) { String synonym = element.child(i * 2 + 1).text(); LOGGER.debug("synonym:" + synonym); for (String w : synonym.split("\\s+")) { w = w.replaceAll("\\s+", ""); if (w.length() < 2) { continue; } if (isNotChineseChar(w)) { LOGGER.debug("?" + w); continue; } if (w.equals(word)) { continue; } LOGGER.debug("word:" + w); synonymAntonym.addSynonym(new Word(w, "")); } } if ("???".equals(type)) { String antonym = element.child(i * 2 + 1).text(); LOGGER.debug("antonym:" + antonym); for (String w : antonym.split("\\s+")) { w = w.replaceAll("\\s+", ""); if (w.length() < 2) { continue; } if (isNotChineseChar(w)) { LOGGER.debug("?" + w); continue; } LOGGER.debug("word:" + w); synonymAntonym.addAntonym(new Word(w, "")); } } } } if (!synonymAntonym.getAntonym().isEmpty() || !synonymAntonym.getSynonym().isEmpty()) { LOGGER.info("??????" + synonymAntonym); } } catch (Exception e) { LOGGER.error("??????", e); } return synonymAntonym; }
From source file:org.apdplat.superword.extract.SentenceExtractor.java
public static Map<String, String> parse2(String html) { Map<String, String> sentences = new HashMap<>(); try {//from ww w . j av a 2 s .c o m Document document = Jsoup.parse(html); String title = document.select("html head title").text(); if (!title.startsWith("??")) { LOGGER.error("???" + title); return sentences; } for (Element element : document.select("html body div#main div.main_sl div.info div.info-body")) { String en = element.child(3).text().trim(); LOGGER.info("???:" + en); if (en.split("\\s+").length < 2) { LOGGER.debug("???"); continue; } String cn = element.child(4).text().trim() + element.child(5).text().trim(); LOGGER.info("???:" + cn); if (StringUtils.isNotBlank(en) && StringUtils.isNotBlank(cn)) { sentences.put(en, cn); //? TextAnalyzer.seg(en).forEach(w -> { Word word = new Word(w, ""); WORD_FREQUENCE.putIfAbsent(word, new AtomicInteger()); WORD_FREQUENCE.get(word).incrementAndGet(); }); } } } catch (Exception e) { LOGGER.error("???", e); } return sentences; }
From source file:org.arb.extractor.DomTreeWalker.java
/**
 * Records the {@code id} and {@code arb:id} attribute values of an element and of all its
 * descendants in {@code elementIdSet} and {@code arbIdSet}.
 *
 * @param element the root of the subtree to scan
 * @param codeUnit passed along unchanged to the recursive calls
 */
private void collectIdsOnElement(Element element, AbstractCodeUnit codeUnit) {
    if (element.hasAttr("id")) {
        elementIdSet.add(element.attr("id"));
    }
    if (element.hasAttr("arb:id")) {
        arbIdSet.add(element.attr("arb:id"));
    }

    // Depth-first descent through the child elements, in document order.
    for (Element child : element.children()) {
        collectIdsOnElement(child, codeUnit);
    }
}
From source file:org.arb.extractor.DomTreeWalker.java
/**
 * Extracts resources from an element and from all of its descendants.
 *
 * <p>An element contributes a replacement when it has non-empty own text and
 * {@code hasResource} reports {@code false} for it. A replacement without a resource id is
 * marked as needing a new id that uses the arb id.
 *
 * @param element the root of the subtree to scan
 * @param codeUnit receives every replacement that is found
 */
private void extractResourceOnElement(Element element, AbstractCodeUnit codeUnit) {
    String text = element.ownText();
    boolean extractable = !text.isEmpty() && !hasResource(element);
    if (extractable) {
        DomCodeReplacement found = new DomCodeReplacement();
        found.setElement(element);
        found.setResourceText(text);
        found.setResourceId(getElementResourceId(element));
        if (found.getResourceId() == null) {
            found.setNewId(true);
            found.setUseArbId(true);
        }
        codeUnit.addReplacement(found);
    }

    // Depth-first descent through the child elements, in document order.
    for (Element child : element.children()) {
        extractResourceOnElement(child, codeUnit);
    }
}
From source file:org.asqatasun.rules.elementselector.ImageElementSelector.java
/** * /* w w w. j a v a2 s . co m*/ * @param imageParent * @param image * @return whether the current image is an image link */ private boolean isImageLink(Element imageParent, Element image) { if (imageParent == null || !StringUtils.equals(imageParent.text(), image.text())) { return false; } if (imageParent.children().size() == 1) { return isImageLink(imageParent.child(0), image); } else if (imageParent.children().isEmpty() && imageParent.equals(image)) { return true; } return false; }