List of usage examples for org.jsoup.nodes.Element#select
public Elements select(String cssQuery)
From source file:mobi.jenkinsci.ci.client.JenkinsClient.java
private String getCommitIdFromRow(final Element row) { final Element fullChangeDesc = row.select("div[class=changeset-message]").first(); if (fullChangeDesc == null) { return null; }//from ww w . j a v a 2 s . com final Element message = fullChangeDesc.select("b").first(); final String messageText = message.childNode(0).toString(); final Matcher commitMatch = Pattern.compile("Commit ([^ ]+)").matcher(messageText); if (commitMatch.find()) { return commitMatch.group(1); } else { return null; } }
From source file:net.parser.JobParser.java
/**
 * Checks the heading of the first {@code .searchlist} element of the document.
 *
 * @return {@code true} when a first {@code .searchlist} element exists and the text of its
 *         {@code h2 span} descendants contains "Regionalni poslovi"; {@code false} otherwise
 */
public boolean regionalJobFirst() {
    Elements firstList = doc.select(".searchlist").eq(0);
    if (firstList.isEmpty()) {
        return false;
    }
    String heading = firstList.first().select("h2 span").text();
    return heading.contains("Regionalni poslovi");
}
From source file:gov.medicaid.screening.dao.impl.LicensedProviderCommonDAO.java
/** * Parse the ProviderProfile information. * /*from w ww . ja v a 2s . co m*/ * @param header * the header element * @param body * the body element * @throws ParsingException * if any error occurs when parsing * @return parsed ProviderProfile */ private ProviderProfile parseProfile(Element header, Element body) throws ParsingException { try { String name = header.select("a").html(); String str1 = body.select("td").get(0).html(); String str2 = body.select("td").get(1).html(); String licenseNumber = Util.getStringInBetween(str2, "License number:", "<br />"); String[] sp = str1.split("<br />"); String address = "", phone = "", county = "", city = "", state = "", zipcode = ""; address = sp[0].trim(); if (sp.length > 3) { phone = sp[2].trim(); } county = sp[sp.length - 1].trim(); sp = sp[1].trim().split(","); city = sp[0].trim(); sp = sp[1].trim().split(" "); state = sp[0].trim(); zipcode = sp[1].trim(); ProviderProfile profile = new ProviderProfile(); // name Business business = new Business(); profile.setBusiness(business); business.setName(name); // address List<Address> addresses = new ArrayList<Address>(); Address addressObj = new Address(); addresses.add(addressObj); profile.setAddresses(addresses); addressObj.setCity(city); addressObj.setCounty(county); addressObj.setState(state); addressObj.setLocation(address); addressObj.setZipcode(zipcode); // phone profile.setContactPhoneNumber(phone); // license List<License> licenses = new ArrayList<License>(); License licenseObj = new License(); licenses.add(licenseObj); profile.setLicenses(licenses); licenseObj.setLicenseNumber(licenseNumber); profile.setProviderType(getProviderType()); return profile; } catch (Throwable e) { throw new ParsingException("Failed to parse the html", e); } }
From source file:lolth.autohome.buy.AutohomeBuyInfoListTaskFetch.java
/**
 * Parses one listing page: each {@code li.price-item} element is turned into an
 * {@link AutohomeBuyInfoBean}, logged at debug level and stored with
 * {@code persistOnNotExist()}.
 *
 * <p>NOTE(review): the label literals compared against each data row are empty strings or a
 * single {@code "?"} in this copy. They look like non-ASCII labels lost to an encoding problem —
 * confirm against the upstream source before relying on the field mapping below.
 *
 * @param doc  the page to read items from
 * @param task the fetch task; its URL and extra value are copied onto every bean
 * @throws Exception if reading an item or persisting a bean fails
 */
@Override
protected void parsePage(Document doc, FetchTask task) throws Exception {
    Elements lis = doc.select("li.price-item");
    for (Element li : lis) {
        AutohomeBuyInfoBean bean = new AutohomeBuyInfoBean();
        bean.setUrl(task.getUrl());
        bean.setForumId(task.getExtra());

        // post id: the part of the share link's data-target after the last underscore
        Elements id = li.select("div.price-share a.share");
        if (!id.isEmpty()) {
            String idStr = id.first().attr("data-target");
            idStr = StringUtils.substringAfterLast(idStr, "_");
            if (StringUtils.isBlank(idStr)) {
                // a share link without a usable id: skip this item entirely
                continue;
            }
            bean.setId(idStr);
        }

        // user: profile link, id (last path segment of the link) and display name
        Elements user = li.select("div.user-name a");
        if (!user.isEmpty()) {
            String userUrl = user.first().absUrl("href");
            String userId = StringUtils.substringAfterLast(userUrl, "/");
            String userName = user.first().text();
            bean.setUserId(userId);
            bean.setUserUrl(userUrl);
            bean.setUserName(userName);
        }

        // post time: text of the span before the separator
        // NOTE(review): the "?" separator is probably a garbled character — confirm
        Elements postTime = li.select("div.user-name span");
        if (!postTime.isEmpty()) {
            bean.setPostTime(StringUtils.trim(StringUtils.substringBefore(postTime.first().text(), "?")));
        }

        // Detail rows: each row is matched against a label prefix and the remainder is stored.
        // NOTE(review): with an empty prefix, startsWith matches every non-null row text, so as
        // written most setters below run for every row — the original labels need restoring.
        Elements dataLis = li.select("div.price-item-bd li");
        for (Element dataLi : dataLis) {
            String data = dataLi.text();
            if (StringUtils.startsWith(data, "")) {
                bean.setCar(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setGuidePrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "?")) {
                bean.setTotalPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setPurchaseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "?")) {
                bean.setCommercialInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setVehicleUseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setCompulsoryInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setLicenseFee(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "?")) {
                bean.setPromotion(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setBuyTime(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                // purchase area: "province,city"; a single value is used for both fields
                String area = StringUtils.trim(StringUtils.substringAfter(data, ""));
                String[] pAndC = StringUtils.splitByWholeSeparator(area, ",", 2);
                if (pAndC.length == 1) {
                    bean.setBuyProvince(pAndC[0]);
                    bean.setBuyCity(pAndC[0]);
                }
                if (pAndC.length == 2) {
                    bean.setBuyProvince(pAndC[0]);
                    bean.setBuyCity(pAndC[1]);
                }
            }
            if (StringUtils.startsWith(data, "")) {
                // seller rating text
                Elements level = dataLi.select("span.level");
                if (!level.isEmpty()) {
                    bean.setSellerComment(level.first().text());
                }
                // seller link: URL, display name and id (last path segment of the URL)
                Elements seller = dataLi.select("a.title");
                if (!seller.isEmpty()) {
                    String sellerUrl = seller.first().absUrl("href");
                    String sellerName = seller.first().text();
                    String sellerId = StringUtils.substringAfterLast(sellerUrl, "/");
                    bean.setSellerId(sellerId);
                    bean.setSellerName(sellerName);
                    bean.setSellerUrl(sellerUrl);
                }
                // seller phone number
                Elements sellerPhone = dataLi.select("em.phone-num");
                if (!sellerPhone.isEmpty()) {
                    bean.setSellerPhone(sellerPhone.first().text());
                }
                // NOTE(review): seller address extraction was left disabled in the original:
                // Elements sellerAddress = dataLi.select("em.phone-num");
            }
            if (StringUtils.startsWith(data, "?")) {
                bean.setBuyFeeling(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
        }
        log.debug("Bean : {}", bean);
        bean.persistOnNotExist();
    }
}
From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesArticleExtractor.java
public Article extractArticle(String html) throws ParseException, IOException { Article result = new Article(); Document doc = Jsoup.parse(html, getBaseName()); Element element; try {/* w w w .j a v a2 s. co m*/ element = doc.select("article.rfd").iterator().next(); } catch (NoSuchElementException exception) { throw new IOException("Cannot find article.rfd element"); } // System.out.println(element); String dateText = element.select("p.pubdate").text().replaceAll("Updated[\\s]+", ""); // time try { DateFormat df = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } catch (ParseException e) { // June 24, 2015 DateFormat df = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH); Date date = df.parse(dateText); result.setTimestamp(date); } // title result.setTitle(TextCleaningUtils.normalizeWithParagraphs(element.select("h1").text())); // text StringBuilder sb = new StringBuilder(); for (Element p : element.select("div.nytint-post > p")) { sb.append(p.text()); sb.append("\n"); } result.setText(TextCleaningUtils.normalizeWithParagraphs(sb.toString())); // debate title result.setDebateTitle(TextCleaningUtils .normalizeWithParagraphs(doc.select("div.nytint-discussion-overview > h2").text())); // debate url result.setDebateUrl(doc.select("div.nytint-discussion-overview > h2 > a").iterator().next().attr("href")); // document url result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content")); // debate description result.setDebateDescription(TextCleaningUtils .normalizeWithParagraphs(((TextNode) doc.select("div.nytint-discussion-overview > p").iterator() .next().childNodes().iterator().next()).text())); // aurhor result.setAuthor(element.select("div.nytint-mugshots > img").iterator().next().attr("alt")); // topics for (Element a : element.select("p.nytint-tags > a")) { result.getTopics().add(a.attr("href")); } return result; }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
static List<ReservedItem> parseResList(Document doc, JSONObject data) throws JSONException { List<ReservedItem> reservations = new ArrayList<>(); if (doc == null) { // error message as html result return reservations; }//from w w w .j a v a 2 s . c om // parse result list JSONObject copymap; if (!data.has("reservationtable")) { // reservations not specifically supported, let's just try it // with default values but fail silently copymap = new JSONObject(); copymap.put("author", 3); copymap.put("availability", 6); copymap.put("branch", -1); copymap.put("cancelurl", -1); copymap.put("expirationdate", 5); copymap.put("title", 3); } else { copymap = data.getJSONObject("reservationtable"); } DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); Elements rowElements = doc.select("form[name=vorml] table tr"); // rows: skip 1st row -> title row for (int i = 1; i < rowElements.size(); i++) { Element tr = rowElements.get(i); if (tr.child(0).tagName().equals("th")) { continue; } ReservedItem item = new ReservedItem(); item.setCancelData(tr.select("input[type=checkbox]").attr("name")); // columns: all elements of one media Iterator<?> keys = copymap.keys(); while (keys.hasNext()) { String key = (String) keys.next(); int index = copymap.getInt(key); if (index >= 0) { String value = tr.child(index).text().trim(); switch (key) { case "author": value = findTitleAndAuthor(value)[1]; break; case "title": value = findTitleAndAuthor(value)[0]; break; case "availability": try { value = fmt.parseLocalDate(value).toString(); } catch (IllegalArgumentException e1) { key = "status"; } break; case "expirationdate": try { value = fmt.parseLocalDate(value).toString(); } catch (IllegalArgumentException e1) { key = "status"; } break; } if (value != null && value.length() != 0) { item.set(key, value); } } } reservations.add(item); } return reservations; }
From source file:net.devietti.ArchConfMapServlet.java
/** Fetch info for a list of conferences from WikiCFP */ private List<Conf> getConfInfo(List<String> confs) throws IOException { String query = StringUtils.join(confs, "+"); List<Conf> results = new LinkedList<Conf>(); /*//from w w w . jav a 2 s . com * NB: year=f returns hits for this year and future years. This is exactly what we want, since * we automatically discard conferences that have already happened. */ Document doc = getURL("http://www.wikicfp.com/cfp/servlet/tool.search?year=f&q=" + query); Elements rows = doc.select("div[class=contsec] table table tr"); for (Iterator<Element> iter = rows.iterator(); iter.hasNext();) { final Element firstRow = iter.next(); final Elements confName = firstRow.select("td a"); if (confName.isEmpty()) continue; final Conf conf = new Conf(); // make sure we match one of the conferences we're interested in String cn = confName.first().text().split(" ")[0]; int found = Arrays.binarySearch(CONFERENCE_NAMES, cn); if (found < 0) continue; // not found final String confFullName = firstRow.select("td").get(1).text(); // don't match other ICS conferences, eg Information, Communication, Society if (CONFERENCE_NAMES[found].equals("ICS")) { if (!confFullName.toLowerCase().contains("supercomputing")) { continue; } } // don't match other CC conferences, eg Creative Construction if (CONFERENCE_NAMES[found].equals("CC")) { if (!confFullName.toLowerCase().contains("compiler")) { continue; } } conf.name = confName.first().text(); /* * we found a hit! The conference information is split across two <tr> table elements. * Conference name and link to cfp are in the first <tr>, and dates, location and deadline * in the second. 
*/ final Element secondRow = iter.next(); String dates = secondRow.select("td").first().text(); String startDate = dates.substring(0, dates.indexOf('-')).trim(); conf.start = cfpDateFormat.parseDateTime(startDate); conf.end = cfpDateFormat.parseDateTime(dates.substring(dates.indexOf('-') + 1).trim()); conf.dates = cfpDateFormat.print(conf.start) + " - " + cfpDateFormat.print(conf.end); if (conf.start.year().equals(conf.end.year()) && conf.start.monthOfYear().equals(conf.end.monthOfYear())) { conf.dates = monthFormat.print(conf.start) + " " + dayFormat.print(conf.start) + "-" + dayFormat.print(conf.end) + " " + yearFormat.print(conf.start); } String deadline = secondRow.select("td").get(2).text().trim(); if (deadline.contains("(")) { // abstract deadline may be in parentheses deadline = deadline.substring(0, deadline.indexOf('(')).trim(); } conf.deadline = cfpDateFormat.parseDateTime(deadline); conf.url = "http://www.wikicfp.com" + confName.attr("href"); /* * extract the WikiCFP eventid from the link, so that, later on, the client can pull the * cfp page and get the direct conference site link. */ com.shopobot.util.URL url = new com.shopobot.util.URL(conf.url); String[] eid = url.getParameters("eventid"); if (0 == eid.length) continue; try { conf.eventid = Integer.valueOf(eid[0]); } catch (NumberFormatException e) { error("invalid event id " + eid); continue; } conf.location = secondRow.select("td").get(1).text(); results.add(conf); } return results; }
From source file:net.parser.JobParser.java
/**
 * Decides from the {@code .searchlist} elements of the document whether this is the last page.
 *
 * @return {@code true} when a second {@code .searchlist} element exists or when there is none at
 *         all; with exactly one, {@code true} only if the text of its {@code h2 span}
 *         descendants contains "Regionalni poslovi"
 */
public boolean checkIfLastPage() {
    Elements secondList = doc.select(".searchlist").eq(1);
    if (!secondList.isEmpty()) {
        return true;
    }

    Elements firstList = doc.select(".searchlist").eq(0);
    if (firstList.isEmpty()) {
        return true;
    }

    String heading = firstList.first().select("h2 span").text();
    return heading.contains("Regionalni poslovi");
}
From source file:mobi.jenkinsci.ci.client.JenkinsClient.java
private Issue getIssueFromRow(final Element row) throws MalformedURLException { final Element fullChangeMessage = row.select("div[class=changeset-message]").first(); if (fullChangeMessage == null) { return null; }//from www.j a va 2s . com final Element issueLink = fullChangeMessage.select("pre").first().select("a").first(); if (issueLink == null) { return null; } else { final Element issueIcon = issueLink.select("img").first(); return new Issue(getUrl(issueLink, "href"), issueLink.attr("tooltip"), getUrl(issueIcon, "src")); } }
From source file:de.geeksfactory.opacclient.apis.Littera.java
/**
 * Returns the text of the cell that follows a label cell in a detail table.
 *
 * @param detailTable the table element to search
 * @param pattern     regular expression matched against the own text of {@code td.label} cells;
 *                    it is inserted into the selector as-is
 * @return the text of the element following the first matching label, or {@code null} when no
 *         label matches or the label has no following sibling element
 */
private String getCellContent(Element detailTable, String pattern) {
    final Element label = detailTable.select("td.label:matchesOwn(" + pattern + ")").first();
    if (label == null) {
        return null;
    }
    // The label may be the last cell of its row, in which case there is no value cell.
    final Element valueCell = label.nextElementSibling();
    return valueCell == null ? null : valueCell.text();
}