List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:de.geeksfactory.opacclient.apis.Littera.java
/**
 * Appends one ordering dropdown per sort row found on the page fetched from
 * {@code getApiUrl() + "&mode=a"}.
 *
 * <p>The rows {@code #sort_editor tr.sort_0} to {@code tr.sort_2} are looked up. For each
 * row that exists, a {@link DropdownSearchField} with meaning {@code ORDER} and id
 * {@code sort_<i>} is created. Its display name is the text of the row's first
 * {@code td}, and its values are an empty entry plus every {@code .crit option} of the
 * row. An option carrying the {@code selected} attribute is inserted at position 0.
 *
 * <p>Rows that are missing, or that have no {@code td} cell, are skipped rather than
 * causing a {@link NullPointerException}.
 *
 * @param fields list the created fields are appended to
 * @throws IOException   declared by the method; presumably raised by {@code httpGet}
 * @throws JSONException declared by the method; not thrown by the visible code
 */
protected void addSortingSearchFields(List<SearchField> fields) throws IOException, JSONException {
    final String html = httpGet(getApiUrl() + "&mode=a", getDefaultEncoding());
    final Document doc = Jsoup.parse(html);

    for (int i = 0; i < 3; i++) {
        final Element tr = doc.select("#sort_editor tr.sort_" + i).first();
        if (tr == null) {
            // first() returns null for an empty selection: this page has no such sort row.
            continue;
        }
        final Element labelCell = tr.select("td").first();
        if (labelCell == null) {
            // A row without cells carries no label to show; skip it.
            continue;
        }

        final DropdownSearchField field = new DropdownSearchField();
        field.setMeaning(SearchField.Meaning.ORDER);
        field.setId("sort_" + i);
        field.setDisplayName(labelCell.text());
        field.addDropdownValue("", "");

        for (final Element option : tr.select(".crit option")) {
            if (option.hasAttr("selected")) {
                // The preselected criterion goes to the front of the list.
                field.addDropdownValue(0, option.attr("value"), option.text());
            } else {
                field.addDropdownValue(option.attr("value"), option.text());
            }
        }
        fields.add(field);
    }
}
From source file:io.seldon.importer.articles.dynamicextractors.FirstElementAttrValueDateWithFormatDynamicExtractor.java
@Override public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception { String attrib_value = null;/* w w w . j a v a2 s .c o m*/ String dateFormatString = null; if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 3)) { String cssSelector = attributeDetail.extractor_args.get(0); dateFormatString = attributeDetail.extractor_args.get(1); Element element = articleDoc.select(cssSelector).first(); if (StringUtils.isNotBlank(cssSelector)) { int arg_count = 0; for (String value_name : attributeDetail.extractor_args) { if (arg_count > 1) { // skip the first one, its the cssSelector, and second thats the Date format if (element != null && element.attr(value_name) != null) { attrib_value = element.attr(value_name); if (StringUtils.isNotBlank(attrib_value)) { break; } } } arg_count++; } } } if ((attrib_value != null) && (dateFormatString != null)) { String pubtext = attrib_value; SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); DateFormat df = new SimpleDateFormat(dateFormatString, Locale.ENGLISH); Date result = null; try { result = df.parse(pubtext); } catch (ParseException e) { logger.info("Failed to parse date with format [" + dateFormatString + "] " + pubtext); } if (result != null) { String attrib_value_orig = attrib_value; attrib_value = dateFormatter.format(result); String msg = "Extracted date [" + attrib_value_orig + "] - > [" + attrib_value + "]"; logger.info(msg); } else { logger.error("Failed to parse date " + pubtext); attrib_value = null; } } return attrib_value; }
From source file:gov.medicaid.screening.dao.impl.BusinessLienDAOBean.java
/** * Parses the excluded provider profile details page. * * @param page the details page/*w ww. j a v a 2 s . c om*/ * @return the parsed license details * @throws ParsingException if the expected tags were not found */ private ProviderProfile parseProfile(Document page) throws ParsingException { ProviderProfile profile = new ProviderProfile(); // business String businessName = page.select("#searchItemDetail #recordReview h3").text(); Business business = new Business(); profile.setBusiness(business); business.setName(businessName); Elements detailMaster = page.select("#searchItemDetail #RecordDetailMaster #filingSummary dl"); // business type BusinessType businessType = new BusinessType(); business.setType(businessType); businessType.setName(getValuePairOfLabel(detailMaster, "Business Type")); // statute business.setStatute(getValuePairOfLabel(detailMaster, "MN Statute")); // file number business.setFileNumber(getValuePairOfLabel(detailMaster, "File Number")); // home jurisdiction business.setHomeJurisdiction(getValuePairOfLabel(detailMaster, "Home Jurisdiction")); // filing date Date filingDate = parseDate(getValuePairOfLabel(detailMaster, "Filing Date"), DATE_FORMAT); if (filingDate != null) { business.setFilingDate(filingDate); } // status BusinessStatus status = new BusinessStatus(); status.setName(getValuePairOfLabel(detailMaster, "Status")); business.setStatus(status); // renewal date Date renewalDate = parseDate(getValuePairOfLabel(detailMaster, "Renewal Due Date:"), DATE_FORMAT); if (renewalDate != null) { business.setRenewalDueDate(renewalDate); } // registered office address business.setRegisteredOfficeAddress( parseAddress(getValuePairOfLabel(detailMaster, "Registered Office Address"))); // registered agents ProviderProfile agent = new ProviderProfile(); List<ProviderProfile> agents = new ArrayList<ProviderProfile>(); business.setRegisteredAgents(agents); agents.add(agent); agent.setAgency(getValuePairOfLabel(detailMaster, "Registered Agent(s)")); 
List<FilingHistory> filingHistories = new ArrayList<FilingHistory>(); business.setFilingHistory(filingHistories); Elements fileHistories = page.select("#filing table tr"); for (Element fileHistory : fileHistories) { FilingHistory fh = new FilingHistory(); Date fileDate = parseDate(fileHistory.select("td.date").text(), DATE_FORMAT); if (fileDate != null) { fh.setDate(fileDate); } fh.setDescription(fileHistory.select("td.action").text()); filingHistories.add(fh); } return profile; }
From source file:com.elevenpaths.googleindexretriever.GoogleSearch.java
public Elements getResultSpam() throws EmptyQueryException, ManyResultsException, CaptchaException, UnsupportedEncodingException { if (this.query.isEmpty()) { throw new EmptyQueryException(); }//from w w w .j av a 2 s . c o m Document doc = getData(this.query); Elements data = doc.select(".st"); return data; }
From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java
/** * ?// w w w . j ava 2s . c om * * @param document * @return * @throws Exception */ private Object analyzeLicenseDetail(HttpSeed seed) throws Exception { Document doc = parse(seed.getHtml()); Elements eleTable = doc.select(".listmain table"); // TR Elements eleTrs = eleTable.get(0).select("tr"); // ? Object entity = AnalyzeUtil.getInstant(PREFIX_ENTITY_PATH + syjTableBean.getTableClass()); // tr?trtd? int rowNo = 1; for (int i = 0; i < eleTrs.size(); i++) { Element eleTr = eleTrs.get(i); // ??trtd??nowrapnowrap?true if (i != eleTrs.size() - 1 && (!eleTr.select("td").get(0).hasAttr("nowrap") || !"true".equals(eleTr.select("td").get(0).attr("nowrap")))) { continue; } // td? String tdVal = parseDetailTr(eleTr); // TABLE7411?? if (syjTableBean.getTableClass().equals("TABLE74") && rowNo == 11) { continue; } // entity AnalyzeUtil.executeMethod(entity, PREFIX_ATTRIBUTE + rowNo++, new Object[] { tdVal }, new Class[] { String.class }); } // ?ID, ?createEmpCode String regex = ".+?&Id=(.+?)"; Object obj = AnalyzeUtil.regex(seed.getUrl(), regex); if (null == obj) { // ID AnalyzeUtil.executeMethod(entity, "setContentId", new Object[] { 0l }, new Class[] { Long.class }); } else { // ID AnalyzeUtil.executeMethod(entity, "setContentId", new Object[] { Long.valueOf(obj.toString()) }, new Class[] { Long.class }); } // ? AnalyzeUtil.executeMethod(entity, "setCreateTime", new Object[] { new Timestamp(new Date().getTime()) }, new Class[] { Timestamp.class }); return entity; }
From source file:com.elevenpaths.googleindexretriever.GoogleSearch.java
/**
 * Returns the {@code src} of the document's image when the document contains exactly
 * one {@code img} element.
 *
 * @param doc the document to inspect
 * @return the {@code src} attribute of the single image, or an empty string when the
 *         document has no image or more than one
 */
public String getImageCaptcha(Document doc) {
    final Elements images = doc.select("img");
    return images.size() == 1 ? images.first().attr("src") : "";
}
From source file:com.elevenpaths.googleindexretriever.GoogleSearch.java
/**
 * Runs the current query and returns the combined text of the elements with class
 * {@code st}, provided there is at most one such element.
 *
 * @return the text of the matching element (empty when there is none)
 * @throws EmptyQueryException if the query is empty
 * @throws ManyResultsException if more than one element matches
 * @throws CaptchaException declared by the method; presumably raised by {@code getData}
 * @throws UnsupportedEncodingException declared by the method; presumably raised by {@code getData}
 */
public String getResults()
        throws EmptyQueryException, ManyResultsException, CaptchaException, UnsupportedEncodingException {
    if (this.query.isEmpty()) {
        throw new EmptyQueryException();
    }

    final Elements snippets = getData(this.query).select(".st");
    if (snippets.size() <= 1) {
        return snippets.text();
    }
    throw new ManyResultsException();
}
From source file:de.fzi.ALERT.actor.MessageObserver.NotificationObserver.JMSNotificationParser.java
private void parseIssueAlertIusse(Document doc) { // TODO Auto-generated method stub Message message = null;/*from www . j a va 2 s .c om*/ RssFeedGenerator newRssFeedGenerator = new RssFeedGenerator(); try { String content = "EventName: AlertIusse\n"; message = new Message(); Elements events = doc.select("ns1|eventData"); org.jsoup.nodes.Element event = events.get(0); Elements elements = event.getAllElements(); for (org.jsoup.nodes.Element element : elements) { content = content + element.tagName() + " : " + element.ownText() + "\n"; } System.out.println("content:" + content); newRssFeedGenerator.RssFeedXml("title", "Link", content); System.out.println("!!!"); } catch (Exception e) { System.out.println(e.getMessage()); } }
From source file:de.geeksfactory.opacclient.apis.WebOpacNet.java
@Override public List<SearchField> getSearchFields() throws IOException, JSONException { List<SearchField> fields = new ArrayList<>(); // Text fields String html = httpGet(opac_url + "/de/mobile/default.aspx", getDefaultEncoding()); Document doc = Jsoup.parse(html); Elements options = doc.select("#drpOptSearchT option"); for (Element option : options) { TextSearchField field = new TextSearchField(); field.setDisplayName(option.text()); field.setId(option.attr("value")); field.setData(new JSONObject("{\"filter\":false}")); field.setHint(""); fields.add(field);/*from www . j av a 2s. co m*/ } // Dropdowns String text = httpGet(opac_url + "/de/mobile/GetRestrictions.ashx", getDefaultEncoding()); JSONArray filters = new JSONObject(text).getJSONArray("restrcontainers"); for (int i = 0; i < filters.length(); i++) { JSONObject filter = filters.getJSONObject(i); if (filter.getString("querytyp").equals("EJ")) { // Querying by year also works for other years than the ones // listed // -> Make it a text field instead of a dropdown TextSearchField field = new TextSearchField(); field.setDisplayName(filter.getString("kopf")); field.setId(filter.getString("querytyp")); field.setData(new JSONObject("{\"filter\":true}")); field.setHint(""); fields.add(field); } else { DropdownSearchField field = new DropdownSearchField(); field.setId(filter.getString("querytyp")); field.setDisplayName(filter.getString("kopf")); JSONArray restrictions = filter.getJSONArray("restrictions"); field.addDropdownValue("", "Alle"); for (int j = 0; j < restrictions.length(); j++) { JSONObject restriction = restrictions.getJSONObject(j); field.addDropdownValue(restriction.getString("id"), restriction.getString("bez")); } field.setData(new JSONObject("{\"filter\":true}")); fields.add(field); } } return fields; }
From source file:mobi.jenkinsci.alm.assembla.client.AssemblaClient.java
private Document postLoginForm(final Document pinDoc) throws IOException { final List<NameValuePair> formNvps = new ArrayList<NameValuePair>(); final Element form = pinDoc.select("form[id=login-box]").first(); final String formAction = form.attr("action"); final HttpPost formPost = new HttpPost(getUrl(formAction).toString()); final Elements formFields = form.select("input"); for (final Element element : formFields) { final String fieldName = element.attr("name"); String fieldValue = element.attr("value"); final String fieldId = element.attr("id"); final String fieldType = element.attr("type"); if (fieldId.equalsIgnoreCase("user_login")) { fieldValue = username;//from ww w . ja va 2 s. c o m ; } else if (fieldId.equalsIgnoreCase("user_password")) { fieldValue = password; } if (fieldType.equals("submit")) { if (!fieldName.equalsIgnoreCase("commit")) { continue; } } LOG.debug(String.format("Processing form field: name='%s' value='%s' id='%s'", fieldName, fieldValue, fieldId)); formNvps.add(new BasicNameValuePair(fieldName, fieldValue)); } try { formPost.setEntity(new UrlEncodedFormEntity(formNvps, "UTF-8")); } catch (final UnsupportedEncodingException e) { // This would never happen throw new IllegalArgumentException("UTF-8 not recognised"); } HttpResponse response; LOG.debug("Login via posting form-data to " + formPost.getURI()); try { response = sendHttpPost(formPost); if (response.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_MOVED_TEMP) { throw new IOException("Form-based login to Assembla failed: " + response.getStatusLine()); } return Jsoup.parse(getData(response.getFirstHeader("Location").getValue(), false)); } finally { formPost.releaseConnection(); } }