List of usage examples for org.jsoup.nodes.Element.select
public Elements select(String cssQuery)
From source file:net.parser.JobParser.java
public List<Job> getJobIdAndLink(List<Job> jobs) { Job job = null;// w w w . j ava2 s .co m Employer employer = null; Elements elements = doc.select(".searchlist").eq(0).select(".job .details"); for (Element jobElement : elements) { Elements aElements = jobElement.select("a"); job = new Job(); employer = new Employer(); Element aJob = aElements.get(0); String linkJob = aJob.attr("href"); linkJob = linkJob.replaceFirst("www", "m"); job.setLink(linkJob); linkJob = linkJob.substring(linkJob.indexOf("/") + 1, linkJob.length()); linkJob = linkJob.substring(linkJob.indexOf("/") + 1, linkJob.length()); linkJob = linkJob.substring(linkJob.indexOf("/") + 1, linkJob.length()); linkJob = linkJob.substring(linkJob.indexOf("/") + 1, linkJob.length()); String id = linkJob.substring(0, linkJob.indexOf("/")); job.setId(Integer.parseInt(id)); Element aEmpoyer = null; try { aEmpoyer = aElements.get(1); String linkEmployer = aEmpoyer.attr("href"); employer.setLink(linkEmployer); linkEmployer = linkEmployer.substring(linkEmployer.indexOf("/") + 1, linkEmployer.length()); linkEmployer = linkEmployer.substring(linkEmployer.indexOf("/") + 1, linkEmployer.length()); linkEmployer = linkEmployer.substring(linkEmployer.indexOf("/") + 1, linkEmployer.length()); linkEmployer = linkEmployer.substring(linkEmployer.indexOf("/") + 1, linkEmployer.length()); id = linkEmployer.substring(0, linkEmployer.indexOf("/")); employer.setId(Integer.parseInt(id)); } catch (IndexOutOfBoundsException e) { employer.setId(0); } employer.setName(getEmployerName(jobElement, employer.getId())); job.setEmployer(employer); jobs.add(job); } return jobs; }
From source file:gov.medicaid.screening.dao.impl.BusinessLienDAOBean.java
/**
 * Builds a {@link ProviderProfile} from a business details page.
 *
 * <p>The business name is read from the {@code #recordReview h3} heading, the remaining
 * attributes from the {@code #filingSummary dl} label/value list, and one
 * {@link FilingHistory} entry is created for every row of {@code #filing table}.
 * Dates that {@code parseDate} returns as {@code null} are left unset.
 *
 * @param page the details page
 * @return the profile holding the parsed business
 * @throws ParsingException declared by the signature; presumably raised by the lookup and
 *         parsing helpers when the page does not have the expected shape (verify there)
 */
private ProviderProfile parseProfile(Document page) throws ParsingException {
    final ProviderProfile result = new ProviderProfile();
    final Business business = new Business();
    result.setBusiness(business);

    // business name
    business.setName(page.select("#searchItemDetail #recordReview h3").text());

    final Elements summary = page.select("#searchItemDetail #RecordDetailMaster #filingSummary dl");

    // business type
    final BusinessType type = new BusinessType();
    type.setName(getValuePairOfLabel(summary, "Business Type"));
    business.setType(type);

    // statute, file number, home jurisdiction
    business.setStatute(getValuePairOfLabel(summary, "MN Statute"));
    business.setFileNumber(getValuePairOfLabel(summary, "File Number"));
    business.setHomeJurisdiction(getValuePairOfLabel(summary, "Home Jurisdiction"));

    // filing date
    final Date filed = parseDate(getValuePairOfLabel(summary, "Filing Date"), DATE_FORMAT);
    if (filed != null) {
        business.setFilingDate(filed);
    }

    // status
    final BusinessStatus businessStatus = new BusinessStatus();
    businessStatus.setName(getValuePairOfLabel(summary, "Status"));
    business.setStatus(businessStatus);

    // renewal date
    // NOTE(review): this is the only label carrying a trailing colon - confirm it matches the page.
    final Date renewalDue = parseDate(getValuePairOfLabel(summary, "Renewal Due Date:"), DATE_FORMAT);
    if (renewalDue != null) {
        business.setRenewalDueDate(renewalDue);
    }

    // registered office address
    final String officeAddress = getValuePairOfLabel(summary, "Registered Office Address");
    business.setRegisteredOfficeAddress(parseAddress(officeAddress));

    // registered agents: a single agent whose agency is the raw label value
    final ProviderProfile registeredAgent = new ProviderProfile();
    registeredAgent.setAgency(getValuePairOfLabel(summary, "Registered Agent(s)"));
    final List<ProviderProfile> registeredAgents = new ArrayList<ProviderProfile>();
    registeredAgents.add(registeredAgent);
    business.setRegisteredAgents(registeredAgents);

    // filing history: one entry per table row
    final List<FilingHistory> history = new ArrayList<FilingHistory>();
    business.setFilingHistory(history);
    for (Element row : page.select("#filing table tr")) {
        final FilingHistory entry = new FilingHistory();
        final Date actionDate = parseDate(row.select("td.date").text(), DATE_FORMAT);
        if (actionDate != null) {
            entry.setDate(actionDate);
        }
        entry.setDescription(row.select("td.action").text());
        history.add(entry);
    }

    return result;
}
From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java
/** * ?/*from w w w .j a v a 2s . c om*/ * * @throws Exception */ private int getTotalPageNum(Document document) throws Exception { // ?table? Element pageTable = document.select("table").get(4); String pageHtml = pageTable.select("tr td").get(0).html(); // ?? String regex = ".+?(.+?).+?"; Object result = AnalyzeUtil.regex(pageHtml, regex); if (null == result) { return 0; } int totalPageNum = Integer.parseInt(result.toString()); return totalPageNum; }
From source file:net.acesinc.convergentui.ConvergentUIResponseFilter.java
@Override public Object run() { String origBody = contentManager.getDownstreamResponse(); if (origBody == null || origBody.isEmpty()) { return null; }//from www .j ava 2s .co m String composedBody = null; log.trace("Response from downstream server: " + origBody); Document doc = Jsoup.parse(origBody); if (hasReplaceableElements(doc)) { log.debug("We have replaceable elements. Let's get em!"); Elements elementsToUpdate = doc.select("div[data-loc]"); for (Element e : elementsToUpdate) { StringBuilder content = new StringBuilder(); String location = e.dataset().get("loc"); String fragmentName = e.dataset().get("fragment-name"); String cacheName = e.dataset().get("cache-name"); boolean useCaching = !Boolean.valueOf(e.dataset().get("disable-caching")); boolean failQuietly = Boolean.valueOf(e.dataset().get("fail-quietly")); URL url = null; try { url = new URL(location); String protocol = url.getProtocol(); String service = url.getHost(); log.debug("Fetching content at location [ " + location + " ] with cacheName = [ " + cacheName + " ]"); try { RequestContext context = RequestContext.getCurrentContext(); ContentResponse response = contentManager.getContentFromService(location, cacheName, useCaching, context); log.trace(response.toString()); if (!response.isError()) { Object resp = response.getContent(); if (String.class.isAssignableFrom(resp.getClass())) { String subContentResponse = (String) resp; //TODO You better trust the source of your downstream HTML! // String cleanedContent = Jsoup.clean(subContentResponse, Whitelist.basic()); //this totally stripped the html out... 
Document subDocument = Jsoup.parse(subContentResponse); if (fragmentName != null) { Elements fragments = subDocument .select("div[data-fragment-name=\"" + fragmentName + "\"]"); if (fragments != null && fragments.size() > 0) { if (fragments.size() == 1) { Element frag = fragments.first(); //need to see if there are images that we need to replace the urls on Elements images = frag.select("img"); for (Element i : images) { String src = i.attr("src"); if (src.startsWith("/") && !src.startsWith("//")) { i.attr("src", "/cui-req://" + protocol + "://" + service + src); } //else what do we do about relative urls? } content.append(frag.toString()); } else { for (Element frag : fragments) { content.append(frag.toString()).append("\n\n"); } } } else { log.debug("Found no matching fragments for [ " + fragmentName + " ]"); if (failQuietly) { content.append("<div class='cui-error'></div>"); } else { content.append( "<span class='cui-error'>Failed getting content from remote service. Possible reason in reponse below</span>"); content.append(subDocument.toString()); } } } else { //take the whole thing and cram it in there! content.append(subDocument.toString()); } } else { //not text... if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: content was not text</span>"); } else { content.append("<div class='cui-error'></div>"); } } } else { if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: " + response.getMessage() + "</span>"); } else { content.append("<div class='cui-error'></div>"); } } //now append it to the page if (!content.toString().isEmpty()) { e.html(content.toString()); } } catch (Throwable t) { if (!failQuietly) { e.html("<span class='cui-error'>Failed getting content from remote service. 
Reason: " + t.getMessage() + "</span>"); } log.warn("Failed replacing content", t); } } catch (MalformedURLException ex) { log.warn("location was invalid: [ " + location + " ]", ex); if (!failQuietly) { content.append( "<span class='cui-error'>Failed getting content from remote service. Reason: data-loc was an invalid location.</span>"); } else { content.append("<div class='cui-error'></div>"); } } } composedBody = doc.toString(); } else { log.debug("Document has no replaeable elements. Skipping"); } try { addResponseHeaders(); if (composedBody != null && !composedBody.isEmpty()) { writeResponse(composedBody, getMimeType(RequestContext.getCurrentContext())); } else { writeResponse(origBody, getMimeType(RequestContext.getCurrentContext())); } } catch (Exception ex) { log.error("Error sending response", ex); } return null; }
From source file:com.amastigote.xdu.query.module.EduSystem.java
private @Nullable JSONObject gradesQuery() throws IOException, JSONException { if (!checkIsLogin(ID)) { return null; }// ww w . j a v a 2 s .c om URL url = new URL(SYS_HOST + GRADE_QUERY_SUFFIX); HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection(); httpURLConnection.setRequestProperty("Cookie", "JSESSIONID=" + SYS_JSESSIONID); httpURLConnection.connect(); Document document = Jsoup.parse(httpURLConnection.getInputStream(), "gb2312", httpURLConnection.getURL().toString()); document = Jsoup.parse(document.toString().replaceAll(" ", "")); JSONObject jsonObject = new JSONObject(); Elements elements_content = document.select("td[class=pageAlign]"); Elements elements_titles = document.select("b"); for (int i = 0; i < elements_titles.size(); i++) { JSONObject jsonObject_semester = new JSONObject(); String semester_key = elements_titles.get(i).text().trim(); Element table_for_this_semester = elements_content.get(i); Elements elements_rows = table_for_this_semester.select("td[align=center]"); for (int j = 0; j < elements_rows.size() / 7; j++) { JSONObject jsonObject_course = new JSONObject(); String course_key = elements_rows.get(j * 7 + 2).text().trim(); jsonObject_course.put(GradeKey.ID, elements_rows.get(j * 7).text().trim()); jsonObject_course.put(GradeKey.CREDIT, elements_rows.get(j * 7 + 4).text().trim()); jsonObject_course.put(GradeKey.ATTR, elements_rows.get(j * 7 + 5).text().trim()); jsonObject_course.put(GradeKey.GRADE, elements_rows.get(j * 7 + 6).text().trim()); jsonObject_semester.put(course_key, jsonObject_course); } jsonObject.put(semester_key, jsonObject_semester); } return jsonObject; }
From source file:me.vertretungsplan.parser.SVPlanParser.java
@NotNull SubstitutionSchedule parseSVPlanSchedule(List<Document> docs) throws IOException, JSONException { SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); for (Document doc : docs) { if (doc.select(".svp").size() > 0) { for (Element svp : doc.select(".svp")) { parseSvPlanDay(v, svp, doc); }// w w w. j av a2 s .c om } else if (doc.select(".Trennlinie").size() > 0) { Element div = new Element(Tag.valueOf("div"), ""); for (Node node : doc.body().childNodesCopy()) { if (node instanceof Element && ((Element) node).hasClass("Trennlinie") && div.select("table").size() > 0) { parseSvPlanDay(v, div, doc); div = new Element(Tag.valueOf("div"), ""); } else { div.appendChild(node); } } parseSvPlanDay(v, div, doc); } else { parseSvPlanDay(v, doc, doc); } } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }
From source file:com.johan.vertretungsplan.parser.SVPlanParser.java
public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); // JSONArray urls = schule.getData().getJSONArray("urls"); String encoding = schule.getData().getString("encoding"); List<Document> docs = new ArrayList<Document>(); for (int i = 0; i < urls.length(); i++) { JSONObject url = urls.getJSONObject(i); loadUrl(url.getString("url"), encoding, docs); }/*w w w .j av a2 s . c o m*/ LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>(); for (Document doc : docs) { if (doc.select(".svp-tabelle").size() > 0) { VertretungsplanTag tag = new VertretungsplanTag(); String date = "Unbekanntes Datum"; if (doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").size() > 0) date = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen").text(); else if (doc.title().startsWith("Vertretungsplan fr ")) date = doc.title().substring("Vertretungsplan fr ".length()); tag.setDatum(date); if (doc.select(".svp-uploaddatum").size() > 0) tag.setStand(doc.select(".svp-uploaddatum").text().replace("Aktualisierung: ", "")); Elements rows = doc.select(".svp-tabelle tr"); String lastLesson = ""; for (Element row : rows) { if (row.hasClass("svp-header")) continue; Vertretung vertretung = new Vertretung(); List<String> affectedClasses = new ArrayList<String>(); for (Element column : row.select("td")) { if (!hasData(column.text())) { continue; } String type = column.className(); if (type.startsWith("svp-stunde")) { vertretung.setLesson(column.text()); lastLesson = column.text(); } else if (type.startsWith("svp-klasse")) affectedClasses = Arrays.asList(column.text().split(", ")); else if (type.startsWith("svp-esfehlt")) vertretung.setPreviousTeacher(column.text()); else if (type.startsWith("svp-esvertritt")) vertretung.setTeacher(column.text()); else if (type.startsWith("svp-fach")) vertretung.setSubject(column.text()); else if 
(type.startsWith("svp-bemerkung")) { vertretung.setDesc(column.text()); vertretung.setType(recognizeType(column.text())); } else if (type.startsWith("svp-raum")) vertretung.setRoom(column.text()); if (vertretung.getLesson() == null) vertretung.setLesson(lastLesson); } if (vertretung.getType() == null) { vertretung.setType("Vertretung"); } for (String klasse : affectedClasses) { KlassenVertretungsplan kv = tag.getKlassen().get(klasse); if (kv == null) kv = new KlassenVertretungsplan(klasse); kv.add(vertretung); tag.getKlassen().put(klasse, kv); } } List<String> nachrichten = new ArrayList<String>(); if (doc.select("h2:contains(Mitteilungen)").size() > 0) { Element h2 = doc.select("h2:contains(Mitteilungen)").first(); Element sibling = h2.nextElementSibling(); while (sibling != null && sibling.tagName().equals("p")) { for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText() .split("<br />\\s*<br />")) { if (hasData(nachricht)) nachrichten.add(nachricht); } sibling = sibling.nextElementSibling(); } } tag.setNachrichten(nachrichten); tage.put(date, tag); } else { throw new IOException("keine SVPlan-Tabelle gefunden"); } } Vertretungsplan v = new Vertretungsplan(); v.setTage(new ArrayList<VertretungsplanTag>(tage.values())); return v; }
From source file:de.geeksfactory.opacclient.apis.IOpac.java
/**
 * Parses the reservations table of an account page and appends one {@link ReservedItem}
 * per data row to {@code media}.
 *
 * <p>The table is the first {@code table} containing "Titel" that follows the
 * {@code a[name=RES]} anchor; its first row is treated as the header and skipped. Cell 0 is
 * the title, cell 1 the author. Cell 4 is stored as the ready date when it parses as
 * {@code dd.MM.yyyy}, otherwise as the status text. The {@code href} of the row's last link,
 * if any, becomes the cancel data. Nothing is added when the anchor or the table is missing
 * or the table has no data rows.
 *
 * @param media list the parsed items are appended to
 * @param doc   the account page; its base URI is set to {@code data.optString("baseurl")}
 * @param data  configuration object providing {@code baseurl}
 */
static void parseResList(List<ReservedItem> media, Document doc, JSONObject data) {
    if (doc.select("a[name=RES]").size() == 0) {
        return;
    }
    Element table = doc.select("a[name=RES] ~ table:contains(Titel)").first();
    if (table == null) {
        // Anchor present but no reservations table after it.
        return;
    }
    Elements copytrs = table.select("tr");
    doc.setBaseUri(data.optString("baseurl"));
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs < 2) {
        return;
    }

    // media may already hold items, so compare against the size on entry.
    final int sizeBefore = media.size();
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        ReservedItem item = new ReservedItem();

        item.setTitle(tr.child(0).text().trim().replace("\u00a0", ""));
        item.setAuthor(tr.child(1).text().trim().replace("\u00a0", ""));

        String readyOrStatus = tr.child(4).text().trim().replace("\u00a0", "");
        try {
            item.setReadyDate(fmt.parseLocalDate(readyOrStatus));
        } catch (IllegalArgumentException e) {
            // Not a date: the column carries a status text instead.
            item.setStatus(readyOrStatus);
        }

        Elements links = tr.select("a");
        if (links.size() > 0) {
            item.setCancelData(links.last().attr("href"));
        }

        media.add(item);
    }
    assert (media.size() - sizeBefore == trs - 1);
}
From source file:eu.masconsult.bgbanking.banks.sgexpress.SGExpressClient.java
@Override public List<RawBankAccount> getBankAccounts(String authTokenString) throws IOException, ParseException, AuthenticationException { AuthToken authToken = AuthToken.fromJson(authTokenString); String response = loadPageWithAuth(getHttpClient(), authToken, LIST_ACCOUNTS_XML_ID); Document doc = Jsoup.parse(response, BASE_URL); Element content = doc.getElementById("main"); if (content == null) { throw new ParseException("getBankAccounts: can't find #main"); }// w w w . j a v a 2 s . co m Elements tables = content.select("section.result table.data"); if (tables == null || tables.size() == 0) { throw new ParseException("getBankAccounts: can't find table section.result table.data"); } Elements rows = tables.first().getElementsByTag("tr"); if (rows == null || rows.size() == 0) { throw new ParseException("getBankAccounts: first table is empty"); } ArrayList<RawBankAccount> bankAccounts = new ArrayList<RawBankAccount>(rows.size()); String type = "undef"; for (Element row : rows) { if (row.getElementsByTag("th").size() > 0) { // header row type = row.child(0).text(); } else { RawBankAccount bankAccount = obtainBankAccountFromHtmlTableRow(type, row); if (bankAccount != null) { bankAccounts.add(bankAccount); } } } return bankAccounts; }
From source file:de.geeksfactory.opacclient.apis.Littera.java
@Override public DetailledItem getResultById(String id, String homebranch) throws IOException, OpacErrorException { if (!initialised) { start();//from w w w .jav a 2s . c o m } final String html = httpGet(getApiUrl() + "&view=detail&id=" + id, getDefaultEncoding()); final Document doc = Jsoup.parse(html); final Element detailData = doc.select(".detailData").first(); final Element detailTable = detailData.select("table.titel").first(); final Element availabilityTable = doc.select(".bibliothek table").first(); final DetailledItem result = new DetailledItem(); final Copy copy = new Copy(); result.addCopy(copy); result.setId(id); result.setCover(getCover(doc)); result.setTitle(detailData.select("h3").first().text()); result.setMediaType(MEDIA_TYPES.get(getCellContent(detailTable, "Medienart|Type of media"))); copy.setStatus(getCellContent(availabilityTable, "Verfgbar|Available")); copy.setReturnDate(parseCopyReturn(getCellContent(availabilityTable, "Exemplare verliehen|Copies lent"))); copy.setReservations(getCellContent(availabilityTable, "Reservierungen|Reservations")); for (final Element tr : detailTable.select("tr")) { final String desc = tr.child(0).text(); final String content = tr.child(1).text(); if (desc != null && !desc.trim().equals("")) { result.addDetail(new Detail(desc, content)); } else if (!result.getDetails().isEmpty()) { final Detail lastDetail = result.getDetails().get(result.getDetails().size() - 1); lastDetail.setHtml(true); lastDetail.setContent(lastDetail.getContent() + "\n" + content); } } return result; }