List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:net.niyonkuru.koodroid.html.SubscribersHandler.java
@Override public ArrayList<ContentProviderOperation> parse(Document doc, ContentResolver resolver) throws HandlerException { final ArrayList<ContentProviderOperation> batch = new ArrayList<ContentProviderOperation>(); Element subscriberLi = doc.select("div#banSelector li:has(div)").first(); while (subscriberLi != null) { String text = subscriberLi.text(); /* this assumes the name and phone number are separated by a space */ int separator = text.lastIndexOf(' ') + 1; String subscriberId = text.substring(separator).replaceAll("\\D", ""); if (subscriberId.length() != 10) throw new HandlerException(getString(R.string.parser_error_unexpected_input)); final ContentProviderOperation.Builder builder; final Uri subscriberUri = Subscribers.buildSubscriberUri(subscriberId); if (subscriberExists(subscriberUri, resolver)) { builder = ContentProviderOperation.newUpdate(subscriberUri); builder.withValue(Subscribers.UPDATED, System.currentTimeMillis()); } else {// www . j av a2 s .c om builder = ContentProviderOperation.newInsert(Subscribers.CONTENT_URI); } builder.withValue(Subscribers.SUBSCRIBER_ID, subscriberId); String fullName = ""; String[] names = text.substring(0, separator).split("\\s"); for (String name : names) { fullName += ParserUtils.capitalize(name) + " "; } builder.withValue(Subscribers.SUBSCRIBER_FULL_NAME, fullName.trim()); if (subscriberLi.hasAttr("onClick")) { String switchUrl = subscriberLi.attr("onClick"); /* extract only the url */ switchUrl = switchUrl.substring(switchUrl.indexOf('/'), switchUrl.lastIndexOf('\'')); builder.withValue(Subscribers.SUBSCRIBER_SWITCHER, switchUrl); } else { /* this is the default subscriber as it doesn't have a switcher url */ ContentValues cv = new ContentValues(1); cv.put(Settings.SUBSCRIBER, subscriberId); resolver.insert(Settings.CONTENT_URI, cv); } builder.withValue(Subscribers.SUBSCRIBER_EMAIL, mParent); batch.add(builder.build()); subscriberLi = subscriberLi.nextElementSibling(); } if (batch.size() == 0) throw new 
HandlerException(getString(R.string.parser_error_unexpected_input)); JSONObject metadata = new JSONObject(); try { metadata.put("subscribers", batch.size()); metadata.put("language", getString(R.string.locale)); } catch (JSONException ignored) { } Crittercism.setMetadata(metadata); Crittercism.setUsername(mParent); return batch; }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
/**
 * Parses one page of the lent-media table ({@code .data tr}) and appends a {@link LentItem}
 * per data row to {@code media}.
 *
 * <p>The first row is skipped as the header. Parsing stops early when a row contains
 * "keine Daten". Errors while reading author, deadline, branch or renewal data of a single row
 * are printed and the (partially filled) item is still added.
 *
 * @param media  list the parsed items are appended to; may already contain items from
 *               previously parsed pages
 * @param doc    parsed account page; its base URI is set to {@code opac_url} so that
 *               {@code abs:href} lookups resolve
 * @param offset page offset that is prefixed to the prolong data of renewable items
 */
protected void parse_medialist(List<LentItem> media, Document doc, int offset) {
    Elements copytrs = doc.select(".data tr");
    doc.setBaseUri(opac_url);

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs == 1) {
        return;
    }
    assert (trs > 0);

    // The caller reuses the same list across pages, so sanity checks must look at the
    // number of items added by THIS call, not at the absolute list size.
    final int sizeBefore = media.size();

    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        LentItem item = new LentItem();

        if (tr.text().contains("keine Daten")) {
            return;
        }

        item.setTitle(tr.child(1).select("strong").text().trim());
        try {
            item.setAuthor(tr.child(1).html().split("<br[ /]*>")[1].trim());

            String[] col2split = tr.child(2).html().split("<br[ /]*>");
            String deadline = col2split[0].trim();
            if (deadline.contains("-")) {
                // "from - until": only the end of the range is the deadline
                deadline = deadline.split("-")[1].trim();
            }
            try {
                item.setDeadline(fmt.parseLocalDate(deadline).toString());
            } catch (IllegalArgumentException e1) {
                e1.printStackTrace();
            }

            if (col2split.length > 1) {
                item.setHomeBranch(col2split[1].trim());
            }

            if (tr.select("a").size() > 0) {
                for (Element link : tr.select("a")) {
                    String href = link.attr("abs:href");
                    Map<String, String> hrefq = getQueryParamsFirst(href);
                    // Null-safe comparison: links without a methodToCall parameter are
                    // skipped instead of aborting the scan of the remaining links via NPE.
                    if (hrefq != null && "renewalPossible".equals(hrefq.get("methodToCall"))) {
                        item.setProlongData(offset + "$" + href.split("\\?")[1]);
                        item.setRenewable(true);
                        break;
                    }
                }
            } else if (tr.select(".textrot, .textgruen, .textdunkelblau").size() > 0) {
                item.setProlongData("" + tr.select(".textrot, .textgruen, .textdunkelblau").text());
                item.setRenewable(false);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        media.add(item);
    }
    assert (media.size() - sizeBefore == trs - 1);
}
From source file:org.shareok.data.sagedata.SageJournalIssueDateProcessor.java
/**
 * Crawls the SAGE journals site and records, per journal, the publication date of every
 * volume/issue, then writes the combined result to the issue-date info file as JSON.
 *
 * <p>Steps, in the order the code performs them:
 * <ol>
 *   <li>Load the previously saved journal map via
 *       {@code getSavedSageJournalVolIssueDateInformation()} (an empty map is used when it
 *       returns {@code null}).</li>
 *   <li>Fetch the hard-coded publications listing URL ({@code pageSize=20&startPage=199}) and,
 *       for every table row, store the journal's {@code homeLink} and {@code issueLink}
 *       (built as {@code SAGE_HTTP_PREFIX + "/loi/" + <last path segment>}) unless they are
 *       already present.</li>
 *   <li>For every journal in the map (labelled loop {@code mainLoop}): skip it when
 *       {@code processedJournalsMap} maps the journal name to itself; otherwise fetch its
 *       {@code issueLink} page and walk {@code div.decade} / {@code div.years} /
 *       {@code div.volumes} / {@code div.js_issue}.</li>
 *   <li>Determine each issue date: journals listed in {@code NO_ARTICLE_PUB_DATE_JOURNALS_LIST}
 *       use the cover-date text prefixed with {@code "01 "}; otherwise the issue page and its
 *       first article link are fetched. If that link contains {@code "pdf/"} the cover-date
 *       text is used (season names are replaced by months, and {@code "/"} or {@code "-"}
 *       month ranges become {@code "date1::date2"}); else the article page's
 *       "Issue published:" text is used.</li>
 *   <li>When {@code DataHandlersUtil.datesCompare(issueDate, "2010-01-01")} is negative
 *       (presumably: the issue is older than 2010 - confirm against DataHandlersUtil), the
 *       data collected so far is stored and the loop continues with the next journal.</li>
 *   <li>Volume/issue pairs already present in the journal's saved data are not overwritten.</li>
 *   <li>The journal loop ends once the counter {@code kk} exceeds 100.</li>
 *   <li>Finally the whole map is serialized; an existing output file is renamed with a
 *       timestamp suffix before the new JSON is written, and the processed journal names are
 *       printed.</li>
 * </ol>
 *
 * <p>Every page fetch is followed by {@code Thread.sleep(2200)}. An {@code HttpStatusException}
 * on the issue-list page abandons that journal's key loop; on an issue or article page it
 * leaves {@code mainLoop} entirely. Any other exception is caught by the outermost handler and
 * logged, so this method does not propagate exceptions.
 *
 * @param processedJournalsMap journals to skip (only read, never modified); may be {@code null}
 */
@SuppressWarnings("empty-statement") public void retrieveSageJournalVolIssueDates(Map<String, String> processedJournalsMap) { List<String> processedJournals = new ArrayList<>(); // JSONObject jsonObj = getSavedSageJournalVolIssueDateInformation(); try {//from ww w .j a va 2s . co m Map<String, Map<String, String>> journalMap = getSavedSageJournalVolIssueDateInformation(); if (null == journalMap) { journalMap = new HashMap<>(); } Document doc = null; try { doc = Jsoup.connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199") .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); Elements trs = doc.select("form#browsePublicationsForm").get(0).select("table").get(0) .select("tbody").get(0).select("tr"); for (Element tr : trs) { Element link = tr.select("td").get(1).select("a").get(0); String journalName = link.text(); String journalLink = SageDataUtil.SAGE_HTTP_PREFIX + link.attr("href"); String[] linkInfo = journalLink.split("/"); String journalIssuesLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/" + linkInfo[linkInfo.length - 1]; if (null == journalMap.get(journalName)) { Map<String, String> infoMap = new HashMap<>(); infoMap.put("homeLink", journalLink); infoMap.put("issueLink", journalIssuesLink); journalMap.put(journalName, infoMap); } else { Map<String, String> infoMap = journalMap.get(journalName); if (null == infoMap.get("homeLink")) { infoMap.put("homeLink", journalLink); } if (null == infoMap.get("issueLink")) { infoMap.put("issueLink", journalIssuesLink); } } } int kk = 0; mainLoop: for (String journal : journalMap.keySet()) { System.out.println("Print out journal " + journal + " information :"); if (null != processedJournalsMap && (journal == null ? 
processedJournalsMap.get(journal) == null : journal.equals(processedJournalsMap.get(journal)))) { System.out.println("Journal : has already been processed!"); continue; } // if(journal.contains("Christian Education")){ // System.out.println("Journal name : International Journal of Health Services, cannot be processed!"); //// continue; // } // if(journal.contains("Plastic Surgery")){ // System.out.println("Journal name : International Journal of Health Services, cannot be processed!"); // continue; // } Map<String, String> journalInfoMap = journalMap.get(journal); for (String key : journalInfoMap.keySet()) { if (key.equals("issueLink")) { Document loiDdoc = null; try { loiDdoc = Jsoup.connect(journalInfoMap.get(key)).userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token").timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break; } Thread.sleep(2200); if (null != loiDdoc) { Map<String, Map<String, String>> dataMap; if (null != journalMap.get(journal).get("data")) { dataMap = DataUtil.getMapFromJson(journalMap.get(journal).get("data")); } else { dataMap = new HashMap<>(); } Elements decaseDivs = loiDdoc.select("div.decade"); if (null != decaseDivs && decaseDivs.size() > 0) { for (Element decade : decaseDivs) { Elements yearsDiv = decade.select("div.years").get(0).children(); if (null != yearsDiv && yearsDiv.size() > 0) { for (Element yearEle : yearsDiv) { Elements volumesDiv = yearEle.select("div.volumes").get(0) .children(); if (null != volumesDiv && volumesDiv.size() > 0) { for (Element volumeEle : volumesDiv) { String volume = volumeEle.select("a").get(0).text().trim() .split("Volume")[1].trim(); Elements issueInfoDivEles = volumeEle .select("div.js_issue"); if (null != issueInfoDivEles && issueInfoDivEles.size() > 0) { for (Element issueInfoDiv : issueInfoDivEles) { String issueText = issueInfoDiv.select("a").get(0) .text(); 
issueText = issueText.split(", ")[0] .split("Issue")[1].trim(); String oldIssueDate = ""; String issueDate = ""; if (NO_ARTICLE_PUB_DATE_JOURNALS_LIST .contains(journal)) { issueDate = "01 " + issueInfoDiv .select("span.loiIssueCoverDateText") .get(0).text().trim(); oldIssueDate = issueDate; // if(issueDate.contains("Winter")){ // issueDate = issueDate.replaceAll("Winter", "October"); // } // if(issueDate.contains("Fall") || issueDate.contains("Autumn")){ // issueDate = issueDate.replaceAll("Fall", "September"); // issueDate = issueDate.replaceAll("Autumn", "September"); // } // if(issueDate.contains("Summer")){ // issueDate = issueDate.replaceAll("Summer", "April"); // } // if(issueDate.contains("Spring")){ // issueDate = issueDate.replaceAll("Spring", "January"); // } // try{ // // for date string like "01 July-October 2016" // if(issueDate.contains("-")){ // String[] dateInfo = issueDate.split("-"); // issueDate = dateInfo[0] + " " + dateInfo[1].split(" ")[1]; // } // // for date string like "01 July/October 2016" // if(issueDate.contains("/")){ // String[] dataInfo = issueDate.split("/"); // issueDate = dataInfo[0] + " " + dataInfo[1].split(" ")[1]; // } // } // catch(ArrayIndexOutOfBoundsException ex){ // System.out.println("Journal name: "+journal); // System.out.println("Volume: "+volume+", issue: "+issueText); // System.out.println("This date string cannot be parsed: "+oldIssueDate); // ex.printStackTrace(); // continue; // } try { issueDate = "01 " + issueInfoDiv.select( "span.loiIssueCoverDateText").get(0) .text().trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } catch (ParseException ex) { // if(!journal.contains("OMEGA - Journal of Death and Dying")){ // continue; // } System.out.println( "Journal name: " + journal); System.out.println("Volume: " + volume + ", issue: " + issueText); System.out.println( "This date string cannot be parsed: " + oldIssueDate); ex.printStackTrace(); continue; } } 
else { try { Element issueLinkEle = issueInfoDiv .select("a").get(0); String issueLink = issueLinkEle .attr("href"); Document issueDoc = null; try { issueDoc = Jsoup.connect(issueLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Elements articleDivs = issueDoc .select("div.art_title, .linkable"); String articleLink = SageDataUtil.SAGE_HTTP_PREFIX + articleDivs.get(0) .select("a.ref, .nowrap") .get(0).attr("href"); if (articleLink.contains("pdf/")) { System.out.println("journal: " + journal + " volume=" + volume + " issue=" + issueText + " has ONLY PDF links!"); try { issueDate = issueInfoDiv.select( "span.loiIssueCoverDateText") .get(0).text().trim(); oldIssueDate = issueDate; if (issueDate.contains("Winter")) { issueDate = issueDate .replaceAll("Winter", "December"); } if (issueDate.contains("Fall") || issueDate.contains( "Autumn")) { issueDate = issueDate .replaceAll("Fall", "September"); issueDate = issueDate .replaceAll("Autumn", "September"); } if (issueDate.contains("Summer")) { issueDate = issueDate .replaceAll("Summer", "June"); } if (issueDate.contains("Spring")) { issueDate = issueDate .replaceAll("Spring", "March"); } if (issueDate.contains("/")) { String[] dataInfo = issueDate .split("/"); String dateInfo1 = dataInfo[0] .trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1 .split(" "); if (dateInfo1Arr.length == 2) { date = dateInfo1Arr[0]; month1 = dateInfo1Arr[1]; } else { date = "01"; month1 = dataInfo[0].trim(); } String month2 = dataInfo[1] .split("\\s+")[0]; String year = dataInfo[1] .split("\\s+")[1]; String date1 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month1 + " " + year); String date2 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month2 + " " + 
year); issueDate = date1 + "::" + date2; } // The Journal of Psychiatry & Law dd MMMM-MMMM yyyy pattern else if (issueDate.contains("-")) { if (journal.equals( "OMEGA - Journal of Death and Dying")) { Document articleDoc = null; try { articleDoc = Jsoup .connect( articleLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000) .get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Element pubDateDiv = articleDoc .select("div.published-dates") .get(0); issueDate = pubDateDiv .text() .split("Issue published:")[1] .trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } else { String[] dataInfo = issueDate .split("-"); String dateInfo1 = dataInfo[0] .trim(); String date; String month1; String[] dateInfo1Arr = dateInfo1 .split(" "); if (dateInfo1Arr.length == 2) { date = dateInfo1Arr[0] .trim(); month1 = dateInfo1Arr[1] .trim(); } else { date = "01"; month1 = dataInfo[0] .trim(); } String month2 = dataInfo[1] .split("\\s+")[0]; String year = dataInfo[1] .split("\\s+")[1]; String date1 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month1 + " " + year); String date2 = DataHandlersUtil .convertFullMonthDateStringFormat( date + " " + month2 + " " + year); issueDate = date1 + "::" + date2; } } else { issueDate = "01 " + issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } } catch (ParseException | ArrayIndexOutOfBoundsException ex) { System.out.println( "Journal name: " + journal); System.out.println("Volume: " + volume + ", issue: " + issueText); System.out.println( "This date string cannot be parsed: " + issueDate); ex.printStackTrace(); continue; } } else { Document articleDoc = null; try { articleDoc = Jsoup .connect(articleLink) .userAgent( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36") .cookie("auth", "token") .timeout(300000).get(); } catch (HttpStatusException ex) { ex.printStackTrace(); break mainLoop; } Thread.sleep(2200); Element pubDateDiv = articleDoc .select("div.published-dates") .get(0); issueDate = pubDateDiv.text() .split("Issue published:")[1] .trim(); oldIssueDate = issueDate; issueDate = DataHandlersUtil .convertFullMonthDateStringFormat( issueDate); } } catch (Exception ex) { logger.error( "Cannot get the issue date for journal =" + journal + " volume=" + volume + " issue=" + issueText + " date=" + oldIssueDate, ex); continue; } } if (DataHandlersUtil.datesCompare(issueDate, "2010-01-01") < 0) { if (dataMap.size() > 0) { ObjectMapper mapper = new ObjectMapper(); String json = mapper .writeValueAsString(dataMap); journalInfoMap.put("data", json); } processedJournals.add(journal); continue mainLoop; } try { if (null != dataMap && dataMap.size() > 0 && null != dataMap.get(volume) && null != dataMap.get(volume) .get(issueText)) { continue; } else { Map<String, String> issueMap = dataMap .get(volume); if (null == issueMap) { issueMap = new HashMap<>(); issueMap.put(issueText, issueDate); dataMap.put(volume, issueMap); } else { issueMap.put(issueText, issueDate); } System.out.println("This is vol. " + volume + " and issue " + issueText + " and date " + issueDate); } } catch (Exception ex) { System.out.println( "Cannot add the pub date info into data map for vol. 
" + volume + " and issue " + issueText + " and date " + issueDate); } } } } } } } } } if (dataMap.size() > 0) { ObjectMapper mapper = new ObjectMapper(); String json = mapper.writeValueAsString(dataMap); journalInfoMap.put("data", json); } } } } processedJournals.add(journal); if (kk > 100) { break; } kk++; } } catch (IOException ex) { ex.printStackTrace(); } ObjectMapper mapper = new ObjectMapper(); String json = mapper.writeValueAsString(journalMap); String sageJournalIssueDateInfoFilePath = ShareokdataManager.getSageJournalIssueDateInfoFilePath(); File sageFile = new File(sageJournalIssueDateInfoFilePath); if (sageFile.exists()) { String sageJournalIssueDateInfoFilePathOld = sageJournalIssueDateInfoFilePath.split("\\.")[0] + "_" + DataHandlersUtil.getCurrentTimeString() + ".json"; sageFile.renameTo(new File(sageJournalIssueDateInfoFilePathOld)); } DocumentProcessorUtil.outputStringToFile(json, ShareokdataManager.getSageJournalIssueDateInfoFilePath()); System.out.println("processed journals = " + mapper.writeValueAsString(processedJournals)); } catch (Exception ex) { logger.error("Cannot process the issue dates.", ex); } }
From source file:com.vaushell.shaarlijavaapi.ShaarliClient.java
/**
 * Extracts a single value from {@code source} as described by the named template.
 *
 * <p>The template's CSS path (when non-empty) selects the first matching descendant; the value
 * is that element's text, or the attribute named by the template when one is configured. The
 * value is trimmed and, when the template has a regex and it matches, narrowed to the trimmed
 * match.
 *
 * @param source       element to extract from, must not be {@code null}
 * @param templateName key of the template to apply
 * @return the extracted value, or {@code null} when nothing was selected or the value is empty
 * @throws IllegalArgumentException if {@code source} is {@code null} or the template is unknown
 */
private String extract(final Element source, final String templateName) {
    if (source == null) {
        throw new IllegalArgumentException();
    }

    final ShaarliTemplates.Template tpl = templates.get(templateName);
    if (tpl == null) {
        throw new IllegalArgumentException("template '" + templateName + "' not found");
    }

    // Start from the source itself and narrow down only when a CSS path is configured.
    Element target = source;
    if (!tpl.cssPath.isEmpty()) {
        final Elements matches = source.select(tpl.cssPath);
        if (matches.isEmpty()) {
            return null;
        }
        target = matches.first();
    }

    final String raw = tpl.attribut.isEmpty() ? target.text() : target.attr(tpl.attribut);
    if (raw == null) {
        return null;
    }

    String value = raw.trim();
    if (!tpl.regex.isEmpty()) {
        final Matcher matcher = Pattern.compile(tpl.regex).matcher(value);
        if (matcher.find()) {
            value = matcher.group().trim();
        }
    }

    return value.isEmpty() ? null : value;
}
From source file:de.geeksfactory.opacclient.apis.SISIS.java
/**
 * Loads the extended search page and derives the available search fields from it.
 *
 * <p>Every option of the {@code searchCategories[0]} select becomes a text search field; every
 * select inside {@code #tab-content} is handed to {@code parseDropdown}.
 *
 * @return the search fields found on the page, in page order
 * @throws IOException   if the search page cannot be fetched
 * @throws JSONException declared for callers; raised by the helpers this method delegates to
 */
public List<SearchField> getSearchFields() throws IOException, JSONException {
    if (!initialised) {
        start();
    }

    final String searchPage =
            httpGet(opac_url + "/search.do?methodToCall=switchSearchPage&SearchType=2", ENCODING);
    final Document page = Jsoup.parse(searchPage);
    final List<SearchField> result = new ArrayList<>();

    // Free-text categories
    for (Element category : page.select("select[name=searchCategories[0]] option")) {
        TextSearchField textField = new TextSearchField();
        textField.setDisplayName(category.text());
        textField.setId(category.attr("value"));
        textField.setHint("");
        result.add(textField);
    }

    // Dropdown restrictions
    final Elements selects = page.select("#tab-content select");
    for (Element select : selects) {
        parseDropdown(select, result);
    }

    return result;
}
From source file:ExtractorContentTest.java
@Test public void collectAllComparisonOf() throws IOException { List<Element> hrefs = new ArrayList<Element>(); _collectAllComparisonOf(//from www .j a v a 2s . com "/w/index.php?title=Special%3APrefixIndex&prefix=Comparison&namespace=0&hideredirects=1", hrefs); System.err.println("#hrefs=" + hrefs.size()); StringBuilder content = new StringBuilder(); content.append("Title ; URL\n"); // header for (Element href : hrefs) { String hText = href.attr("title"); String hURL = href.attr("href"); content.append("" + hText + " ; " + URL_BASE_NAME + hURL + "\n"); } //FileUtils.writeStringToFile(new File ("comparisonsData.csv"), content.toString()); }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
@Override public AccountData account(Account acc) throws IOException, JSONException, OpacErrorException { start(); // TODO: Is this necessary? int resultNum; if (!login(acc)) { return null; }/*w w w .j a v a 2s . c o m*/ // Geliehene Medien String html = httpGet(opac_url + "/userAccount.do?methodToCall=showAccount&typ=1", ENCODING); List<LentItem> medien = new ArrayList<>(); Document doc = Jsoup.parse(html); doc.setBaseUri(opac_url); parse_medialist(medien, doc, 1); if (doc.select(".box-right").size() > 0) { for (Element link : doc.select(".box-right").first().select("a")) { String href = link.attr("abs:href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (hrefq == null || hrefq.get("methodToCall") == null) { continue; } if (hrefq.get("methodToCall").equals("pos") && !"1".equals(hrefq.get("anzPos"))) { html = httpGet(href, ENCODING); parse_medialist(medien, Jsoup.parse(html), Integer.parseInt(hrefq.get("anzPos"))); } } } if (doc.select("#label1").size() > 0) { resultNum = 0; String rNum = doc.select("#label1").first().text().trim().replaceAll(".*\\(([0-9]*)\\).*", "$1"); if (rNum.length() > 0) { resultNum = Integer.parseInt(rNum); } assert (resultNum == medien.size()); } // Ordered media ("Bestellungen") html = httpGet(opac_url + "/userAccount.do?methodToCall=showAccount&typ=6", ENCODING); List<ReservedItem> reserved = new ArrayList<>(); doc = Jsoup.parse(html); doc.setBaseUri(opac_url); parse_reslist("6", reserved, doc, 1); Elements label6 = doc.select("#label6"); if (doc.select(".box-right").size() > 0) { for (Element link : doc.select(".box-right").first().select("a")) { String href = link.attr("abs:href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (hrefq == null || hrefq.get("methodToCall") == null) { break; } if (hrefq.get("methodToCall").equals("pos") && !"1".equals(hrefq.get("anzPos"))) { html = httpGet(href, ENCODING); parse_reslist("6", reserved, Jsoup.parse(html), Integer.parseInt(hrefq.get("anzPos"))); } } } // Prebooked 
media ("Vormerkungen") html = httpGet(opac_url + "/userAccount.do?methodToCall=showAccount&typ=7", ENCODING); doc = Jsoup.parse(html); doc.setBaseUri(opac_url); parse_reslist("7", reserved, doc, 1); if (doc.select(".box-right").size() > 0) { for (Element link : doc.select(".box-right").first().select("a")) { String href = link.attr("abs:href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (hrefq == null || hrefq.get("methodToCall") == null) { break; } if (hrefq.get("methodToCall").equals("pos") && !"1".equals(hrefq.get("anzPos"))) { html = httpGet(href, ENCODING); parse_reslist("7", reserved, Jsoup.parse(html), Integer.parseInt(hrefq.get("anzPos"))); } } } if (label6.size() > 0 && doc.select("#label7").size() > 0) { resultNum = 0; String rNum = label6.text().trim().replaceAll(".*\\(([0-9]*)\\).*", "$1"); if (rNum.length() > 0) { resultNum = Integer.parseInt(rNum); } rNum = doc.select("#label7").text().trim().replaceAll(".*\\(([0-9]*)\\).*", "$1"); if (rNum.length() > 0) { resultNum += Integer.parseInt(rNum); } assert (resultNum == reserved.size()); } AccountData res = new AccountData(acc.getId()); if (doc.select("#label8").size() > 0) { String text = doc.select("#label8").first().text().trim(); if (text.matches("Geb.+hren[^\\(]+\\(([0-9.,]+)[^0-9A-Z]*(|EUR|CHF|Fr)\\)")) { text = text.replaceAll("Geb.+hren[^\\(]+\\(([0-9.,]+)[^0-9A-Z]*(|EUR|CHF|Fr)\\)", "$1 $2"); res.setPendingFees(text); } } Pattern p = Pattern.compile("[^0-9.]*", Pattern.MULTILINE); if (doc.select(".box3").size() > 0) { for (Element box : doc.select(".box3")) { if (box.select("strong").size() == 1) { String text = box.select("strong").text(); if (text.equals("Jahresgebhren")) { text = box.text(); text = p.matcher(text).replaceAll(""); res.setValidUntil(text); } } } } res.setLent(medien); res.setReservations(reserved); return res; }
From source file:ExtractorContentTest.java
@Test public void testStatistics() throws Exception { List<Element> hrefs = new ArrayList<Element>(); _collectAllComparisonOf(/* w w w . ja v a2 s . co m*/ "/w/index.php?title=Special%3APrefixIndex&prefix=Comparison&namespace=0&hideredirects=1", hrefs); int j = 0; // j-th comparison int nRelevant = 0; for (Element href : hrefs) { String hURL = href.attr("href"); int n = "/wiki/".length(); String wikiPageName = hURL.substring(n); System.err.println("(" + j++ + ") " + wikiPageName); if (excludePCMs.contains(wikiPageName)) { System.err.println("Ignoring"); continue; } PCMStatistic stat = computeStatistic(wikiPageName); // we exploit here the stats by printing int nTable = stat.getNumbersOfTables(); System.err.println("numbers of tables:" + nTable); if (nTable > 0) nRelevant++; Collection<CatalogStat> catalogStats = stat.getCatalogStats(); int i = 1; for (CatalogStat catalogStat : catalogStats) { System.err.println("table(" + i++ + ")"); System.err.println("#headers=" + catalogStat.getNumbersOfHeaders()); System.err.println("#products=" + catalogStat.getNumbersOfProduct()); } System.err.println("\n\n\n"); } System.err.println("number of relevant PCMs: " + nRelevant); // String wikiPageName = "Comparison_of_Java_virtual_machines"; }
From source file:ExtractorContentTest.java
private void _collectAllComparisonOf(String url, List<Element> hrefs) throws IOException { Document doc = Jsoup.connect("" + URL_BASE_NAME + url).get(); Elements aHrefs = doc.select("a[href]"); Element urlNext = null;/*from w w w. ja va2 s .c o m*/ for (Element aHref : aHrefs) { Element h = aHref.getElementsByAttribute("href").first(); // val() ; String hText = h.attr("title"); String hURL = h.attr("href"); if (hText.contains("Comparison") && hURL.startsWith("/wiki/")) { hrefs.add(aHref); } String aText = aHref.text(); if (aText.contains("Next page") && hURL.startsWith("/w/index.php?")) urlNext = aHref; } if (urlNext != null) { _collectAllComparisonOf(urlNext.attr("href"), hrefs); } }
From source file:be.ibridge.kettle.jsoup.JsoupInput.java
private Object[] buildRow() throws KettleException { // Create new row... Object[] outputRowData = buildEmptyRow(); if (data.readrow != null) outputRowData = data.readrow.clone(); // Read fields... for (int i = 0; i < data.nrInputFields; i++) { // Get field JsoupInputField field = meta.getInputFields()[i]; // get jsoup array for field Elements jsoupa = data.resultList.get(i); String nodevalue = null;//from w w w. j a va2 s . com if (jsoupa != null) { Element jo = jsoupa.get(data.recordnr); if (jo != null) { // Do Element Type switch (field.getElementType()) { case JsoupInputField.ELEMENT_TYPE_NODE: // Do Result Type switch (field.getResultType()) { case JsoupInputField.RESULT_TYPE_TEXT: nodevalue = jo.text(); break; case JsoupInputField.RESULT_TYPE_TYPE_OUTER_HTML: nodevalue = jo.outerHtml(); break; case JsoupInputField.RESULT_TYPE_TYPE_INNER_HTML: nodevalue = jo.html(); break; default: nodevalue = jo.toString(); break; } break; case JsoupInputField.ELEMENT_TYPE_ATTRIBUT: nodevalue = jo.attr(field.getAttribute()); break; default: nodevalue = jo.toString(); break; } } } // Do trimming switch (field.getTrimType()) { case JsoupInputField.TYPE_TRIM_LEFT: nodevalue = Const.ltrim(nodevalue); break; case JsoupInputField.TYPE_TRIM_RIGHT: nodevalue = Const.rtrim(nodevalue); break; case JsoupInputField.TYPE_TRIM_BOTH: nodevalue = Const.trim(nodevalue); break; default: break; } if (meta.isInFields()) { // Add result field to input stream outputRowData = RowDataUtil.addValueData(outputRowData, data.totalpreviousfields + i, nodevalue); } // Do conversions // ValueMetaInterface targetValueMeta = data.outputRowMeta.getValueMeta(data.totalpreviousfields + i); ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(data.totalpreviousfields + i); outputRowData[data.totalpreviousfields + i] = targetValueMeta.convertData(sourceValueMeta, nodevalue); // Do we need to repeat this field if it is null? 
if (meta.getInputFields()[i].isRepeated()) { if (data.previousRow != null && Const.isEmpty(nodevalue)) { outputRowData[data.totalpreviousfields + i] = data.previousRow[data.totalpreviousfields + i]; } } } // End of loop over fields... int rowIndex = data.nrInputFields; // See if we need to add the filename to the row... if (meta.includeFilename() && !Const.isEmpty(meta.getFilenameField())) { outputRowData[rowIndex++] = data.filename; } // See if we need to add the row number to the row... if (meta.includeRowNumber() && !Const.isEmpty(meta.getRowNumberField())) { outputRowData[rowIndex++] = new Long(data.rownr); } // Possibly add short filename... if (meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0) { outputRowData[rowIndex++] = data.shortFilename; } // Add Extension if (meta.getExtensionField() != null && meta.getExtensionField().length() > 0) { outputRowData[rowIndex++] = data.extension; } // add path if (meta.getPathField() != null && meta.getPathField().length() > 0) { outputRowData[rowIndex++] = data.path; } // Add Size if (meta.getSizeField() != null && meta.getSizeField().length() > 0) { outputRowData[rowIndex++] = new Long(data.size); } // add Hidden if (meta.isHiddenField() != null && meta.isHiddenField().length() > 0) { outputRowData[rowIndex++] = new Boolean(data.path); } // Add modification date if (meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0) { outputRowData[rowIndex++] = data.lastModificationDateTime; } // Add Uri if (meta.getUriField() != null && meta.getUriField().length() > 0) { outputRowData[rowIndex++] = data.uriName; } // Add RootUri if (meta.getRootUriField() != null && meta.getRootUriField().length() > 0) { outputRowData[rowIndex++] = data.rootUriName; } data.recordnr++; RowMetaInterface irow = getInputRowMeta(); data.previousRow = irow == null ? 
outputRowData : (Object[]) irow.cloneRow(outputRowData); // copy it to make // surely the next step doesn't change it in between... return outputRowData; }