List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java
/**
 * Extracts the Drucksache identifier from the page heading.
 *
 * <p>Takes the text of the first element matching {@code #risname > h1}, cuts off as many
 * leading characters as {@code "Drucksache - "} is long (the prefix itself is not verified)
 * and hands the remainder to {@code removeNonBreakingSpacesAndTrim}.
 *
 * @param htmlDoc parsed HTML document containing the heading
 * @return the cleaned-up identifier text
 */
private String druckSacheId(Document htmlDoc) {
    final String prefix = "Drucksache - ";
    Element heading = htmlDoc.select("#risname > h1").first();
    String headingText = heading.text();
    String withoutPrefix = headingText.substring(prefix.length());
    return removeNonBreakingSpacesAndTrim(withoutPrefix);
}
From source file:org.ala.lucene.CreateWordPressIndex.java
/** * Index the WP pages by parsing with Jsoup and indexing into SOLR * * @return//from w w w.j ava 2 s . c om * @throws IOException */ protected int indexPages() throws Exception { int documentCount = 0; // Initialise SOLR SolrServer solrServer = solrUtils.getSolrServer(); logger.info("Deleting all WordPress documents in SOLR index..."); solrServer.deleteByQuery("idxtype:" + IndexedTypes.WORDPRESS); // delete WP pages solrServer.commit(); for (String pageUrl : this.pageUrls) { try { // Crawl and extract text from WP pages Document document = Jsoup.connect(pageUrl + CONTENT_ONLY_PARAM).get(); String title = document.select("head > title").text(); String id = document.select("head > meta[name=id]").attr("content"); String bodyText = document.body().text(); Elements postCategories = document.select("ul[class=post-categories]"); List<String> categoriesOut = new ArrayList<String>(); Boolean excludePost = false; if (!postCategories.isEmpty()) { // Is a WP post (not page) Elements categoriesIn = postCategories.select("li > a"); // get list of li elements for (Element cat : categoriesIn) { String thisCat = cat.text(); if (thisCat != null && excludedCategories.contains(thisCat)) { // "button".equals(thisCat) // exclude category "button" posts excludePost = true; } if (thisCat != null) { // add category to list categoriesOut.add(thisCat.replaceAll(" ", "_")); } } } if (excludePost) { logger.debug("Excluding post (id: " + id + ") with category: " + StringUtils.join(categoriesOut, "|")); continue; } documentCount++; // Index with SOLR logger.debug(documentCount + ". Indexing WP page - id: " + id + " | title: " + title + " | text: " + StringUtils.substring(bodyText, 0, 100) + "... 
"); SolrInputDocument doc = new SolrInputDocument(); doc.addField("idxtype", IndexedTypes.WORDPRESS); doc.addField("guid", WP_BASE_URI + id); // use page_id based URI instead of permalink in case permalink is too long for id field doc.addField("id", "wp" + id); // probably not needed but safer to leave in doc.addField("name", title, 1.2f); doc.addField("content", bodyText); doc.addField("australian_s", "recorded"); // so they appear in default QF search doc.addField("categories", categoriesOut); // add to index solrServer.add(doc); if (documentCount % 100 == 0) { logger.info("Committing to SOLR (count = " + documentCount + ")..."); solrServer.commit(); } } catch (IOException ex) { // catch it so we don't stop indexing other pages logger.warn("Problem accessing/reading WP page: " + ex.getMessage(), ex); } } logger.info("Final Committing to SOLR..."); solrServer.commit(); //logger.info("Optimising SOLR index..."); //solrServer.optimize(); // throws errors on my machine?? logger.info("Committed to SOLR. Final document count: " + documentCount); return documentCount; }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
static void handleTeacher(Substitution subst, Element cell, JSONObject data) { cell = getContentElement(cell);//from w w w .j ava 2s . c o m if (cell.select("s").size() > 0) { subst.setPreviousTeachers(splitTeachers(cell.select("s").text(), data)); if (cell.ownText().length() > 0) { subst.setTeachers( splitTeachers(cell.ownText().replaceFirst("^\\?", "").replaceFirst("", ""), data)); } } else { subst.setTeachers(splitTeachers(cell.text(), data)); } }
From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Principal Author) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("principal author")) { skip = true;// w w w. j a v a2s . c om } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("principal author")) { skip = false; continue; } } continue; } String[] splitted = editor.html().split(","); for (String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } if (editorList.size() == 0) return null; return editorList; }
From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java
private ImmutableMap<String, String> druckSachenProperties(Document htmlDoc) { ImmutableMap.Builder<String, String> mapBuilder = ImmutableMap.builder(); Elements keyElements = htmlDoc.getElementsByClass("kb1"); // td elements for (Element element : keyElements) { String key = removeNonBreakingSpacesAndTrim(element.text()); if (key.endsWith(":")) { key = key.substring(0, key.length() - 1); }//ww w .ja v a 2s . c o m if (element.nextElementSibling() != null && !element.nextElementSibling().hasAttr("kb1")) { String value = removeNonBreakingSpacesAndTrim(element.nextElementSibling().text()); if ((!key.isEmpty()) && (!value.isEmpty())) { mapBuilder.put(key, value); } } } return mapBuilder.build(); }
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/** * replace common tags with just text so we don't have any crazy formatting issues * so replace <br>, <i>, <strong>, etc.... with whatever text is inside them */// w w w. ja v a2s .c o m private void replaceTagsWithText() { Elements strongs = topNode.getElementsByTag("strong"); for (Element item : strongs) { TextNode tn = new TextNode(item.text(), topNode.baseUri()); item.replaceWith(tn); } Elements bolds = topNode.getElementsByTag("b"); for (Element item : bolds) { TextNode tn = new TextNode(item.text(), topNode.baseUri()); item.replaceWith(tn); } Elements italics = topNode.getElementsByTag("i"); for (Element item : italics) { TextNode tn = new TextNode(item.text(), topNode.baseUri()); item.replaceWith(tn); } }
From source file:com.johan.vertretungsplan.parser.UntisInfoHeadlessParser.java
@Override public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); Vertretungsplan v = new Vertretungsplan(); List<VertretungsplanTag> tage = new ArrayList<VertretungsplanTag>(); Document doc = Jsoup.parse(httpGet(url, schule.getData().getString("encoding"))); Elements days = doc.select("#vertretung > p > b, #vertretung > b"); for (Element day : days) { VertretungsplanTag tag = new VertretungsplanTag(); tag.setStand(""); tag.setDatum(day.text()); Element next = null;// w w w . j a v a 2 s .c o m if (day.parent().tagName().equals("p")) { next = day.parent().nextElementSibling().nextElementSibling(); } else next = day.parent().select("p").first().nextElementSibling(); if (next.className().equals("subst")) { //Vertretungstabelle if (next.text().contains("Vertretungen sind nicht freigegeben")) continue; parseVertretungsplanTable(next, data, tag); } else { //Nachrichten parseNachrichten(next, data, tag); next = next.nextElementSibling().nextElementSibling(); parseVertretungsplanTable(next, data, tag); } tage.add(tag); } v.setTage(tage); return v; }
From source file:cvegrabber.CVEController.java
/**
 * Returns the newest CVE entries listed on the NVD search-results page.
 *
 * <p>The page is fetched, up to ten CVE anchors are read from it and, for each id, the
 * description and references are looked up through {@code grabMitreData}. Every entry that
 * was found is logged.
 *
 * @return an array of length 10; slots beyond the number of entries found are {@code null},
 *         and all slots filled so far are returned as-is if fetching or parsing fails
 */
@RequestMapping(value = "/newest", produces = { "application/json" })
public CVE[] cve() {
    String url = "https://web.nvd.nist.gov/view/vuln/search-results?query=&search_type=all&cves=on";
    CVE[] cvearray = new CVE[10];
    try {
        Document doc = Jsoup.connect(url).get();
        Elements newest = doc.select(
                "a[id*=BodyPlaceHolder_cplPageContent_plcZones_lt_zoneCenter_VulnerabilitySearchResults_VulnResultsRepeater_CveDetailAnchor_]");

        int counter = 0;
        for (Element cveid : newest) {
            if (counter == cvearray.length) {
                break;
            }
            String id = cveid.text();
            cvearray[counter] = new CVE(id, grabMitreData(id, "description"),
                    grabMitreData(id, "references"));
            counter++;
        }

        // Only log the slots that were actually filled; the page may list fewer than ten
        // entries, and the remaining slots are null.
        for (int i = 0; i < counter; i++) {
            logger.info("CVEID: " + cvearray[i].getCVE() + " CVE Description: "
                    + cvearray[i].getDescription() + " CVE References: "
                    + cvearray[i].getReferences());
        }
    } catch (Exception ex) {
        // Best effort: report the failure (with its stack trace) and return what we have.
        logger.error("Unable to fetch latest cves. " + ex.getMessage(), ex);
    }
    return cvearray;
}
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
/**
 * Parses the account page into lent items, reservations, pending fees and card validity.
 *
 * <p>The first table inside {@code .kontozeile_center} holds the lent items and the second
 * one the reservations; in both, only rows with class {@code tabKonto} are read. The
 * {@code accounttable} and {@code reservationtable} objects in {@code data} map field names
 * to column indexes; negative or unreadable indexes are ignored. Fees and the card's validity
 * date are extracted from the text of the {@code .kontozeile_center} and
 * {@code div[align=center]} elements.
 *
 * @param acc  account whose id is stored in the result
 * @param doc  parsed account page
 * @param data library configuration containing the column mappings
 * @return the parsed account data, or {@code null} if the page contains no account table
 * @throws JSONException if a column mapping object is missing from {@code data}
 */
public static AccountData parse_account(Account acc, Document doc, JSONObject data) throws JSONException {
    JSONObject copymap = data.getJSONObject("accounttable");
    List<LentItem> media = new ArrayList<>();

    Elements tables = doc.select(".kontozeile_center table");
    if (tables.size() == 0) {
        return null;
    }

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    // --- lent items: first table ---
    Elements exemplartrs = tables.get(0).select("tr.tabKonto");
    for (Element tr : exemplartrs) {
        LentItem item = new LentItem();
        for (Iterator<?> keys = copymap.keys(); keys.hasNext();) {
            String key = (String) keys.next();
            int index;
            try {
                index = copymap.has(key) ? copymap.getInt(key) : -1;
            } catch (JSONException e1) {
                index = -1;
            }
            if (index < 0) {
                continue;
            }

            switch (key) {
                case "prolongurl":
                    if (tr.child(index).children().size() > 0) {
                        String href = tr.child(index).child(0).attr("href");
                        item.setProlongData(href);
                        item.setRenewable(href.contains("vermsg"));
                    }
                    break;
                case "returndate":
                    try {
                        item.setDeadline(fmt.parseLocalDate(tr.child(index).text()));
                    } catch (IllegalArgumentException e1) {
                        e1.printStackTrace();
                    }
                    break;
                default:
                    item.set(key, tr.child(index).text());
                    break;
            }
        }
        media.add(item);
    }
    assert (tables.get(0).select("tr").size() > 0);
    assert (exemplartrs.size() == media.size());

    // --- reservations: second table ---
    copymap = data.getJSONObject("reservationtable");
    List<ReservedItem> reservations = new ArrayList<>();
    exemplartrs = tables.get(1).select("tr.tabKonto");
    for (Element tr : exemplartrs) {
        ReservedItem item = new ReservedItem();
        for (Iterator<?> keys = copymap.keys(); keys.hasNext();) {
            String key = (String) keys.next();
            int index;
            try {
                index = copymap.has(key) ? copymap.getInt(key) : -1;
            } catch (JSONException e1) {
                index = -1;
            }
            if (index < 0) {
                continue;
            }

            switch (key) {
                case "cancelurl":
                    if (tr.child(index).children().size() > 0) {
                        item.setCancelData(tr.child(index).child(0).attr("href"));
                    }
                    break;
                case "availability":
                    try {
                        item.setReadyDate(fmt.parseLocalDate(tr.child(index).text()));
                    } catch (IllegalArgumentException e1) {
                        // Not a date: keep the cell text as the status instead.
                        item.setStatus(tr.child(index).text());
                    }
                    break;
                case "expirationdate":
                    try {
                        item.setExpirationDate(fmt.parseLocalDate(tr.child(index).text()));
                    } catch (IllegalArgumentException e1) {
                        // Not a date: keep the cell text as the status instead.
                        item.setStatus(tr.child(index).text());
                    }
                    break;
                default:
                    item.set(key, tr.child(index).text());
                    break;
            }
        }
        reservations.add(item);
    }
    assert (tables.get(1).select("tr").size() > 0);
    assert (exemplartrs.size() == reservations.size());

    // --- fees and card validity ---
    AccountData res = new AccountData(acc.getId());
    final String feesPattern =
            ".*Ausstehende Geb.+hren:[^0-9]+([0-9.,]+)[^0-9A-Z]*(|EUR|CHF|Fr.).*";
    for (Element row : doc.select(".kontozeile_center, div[align=center]")) {
        String text = row.text().trim();
        if (text.matches(feesPattern)) {
            // Reduce the row text to "<amount> <currency>".
            text = text.replaceAll(feesPattern, "$1 $2");
            res.setPendingFees(text);
        }
        if (text.matches("Ihr Ausweis ist g.ltig bis:.*")) {
            text = text.replaceAll("Ihr Ausweis ist g.ltig bis:[^A-Za-z0-9]+", "");
            res.setValidUntil(text);
        } else if (text.matches("Ausweis g.ltig bis:.*")) {
            text = text.replaceAll("Ausweis g.ltig bis:[^A-Za-z0-9]+", "");
            res.setValidUntil(text);
        }
    }

    res.setLent(media);
    res.setReservations(reservations);
    return res;
}
From source file:com.webbfontaine.valuewebb.timer.RatesUpdater.java
public HashMap<String, BigDecimal> ratesFromBank() { HashMap<String, BigDecimal> rates = new HashMap<String, BigDecimal>(); Document doc = getPage();//from w w w. jav a2 s. c om Elements tables = doc.getElementsByTag("table"); Element tableOfRates = null; Elements trs; int pairsCodeIndex = 0; int sellingIndex = 0; for (Element table : tables) { if (table.text().contains("Dollar")) { tableOfRates = table; break; } } if (tableOfRates != null) { trs = tableOfRates.getElementsByTag("tr"); } else { LOGGER.error("Error reading rates from URL"); return rates; } Elements columns = trs.get(0).getElementsByTag("th"); for (int i = 0; i < columns.size(); ++i) { if (columns.get(i).text().equalsIgnoreCase("Pairs Code")) { pairsCodeIndex = i; } if (columns.get(i).text().equalsIgnoreCase("Selling")) { sellingIndex = i; } } for (Element tr : trs) { Elements tds = tr.getElementsByTag("td"); if (tds.size() != 0) { String currPair = tds.get(pairsCodeIndex).text().trim(); String rateText = tds.get(sellingIndex).text().trim().replace(",", ""); BigDecimal rate = new BigDecimal(rateText); String curr; if (currPair.startsWith("GHS")) { curr = currPair.substring(3); rate = new BigDecimal(1).divide(rate, Constants.FRACTION_DIGITS_NUMBER_4, Utils.getRoundingMode()); } else { curr = currPair.substring(0, currPair.lastIndexOf("GHS")); } rates.put(curr, rate); } } return rates; }