List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:gov.medicaid.screening.dao.impl.SocialWorkLicenseDAOBean.java
/** * When there is exactly one match, the details page is displayed instead of the grid. This parses the details from * that page./* w w w. ja v a 2 s. co m*/ * * @param page the license details page * @return the parsed license */ private License parseLicenseDetail(Document page) { License license = new License(); ProviderProfile profile = new ProviderProfile(); license.setProfile(profile); String name = page.select("#_ctl7_lblName").text(); profile.setUser(parsePersonDetailName(name)); String licenseNumber = page.select("#_ctl7_lblLicNumber").text(); license.setLicenseNumber(licenseNumber); String licenseLevel = page.select("#_ctl7_lblLicLevel").text(); if (Util.isNotBlank(licenseLevel)) { LicenseLevel level = new LicenseLevel(); level.setName(licenseLevel); license.setLevel(level); } String city = page.select("#_ctl7_lblWorkCity").text(); license.setCity(city); String licenseStatus = page.select("#_ctl7_lblLicStatus").text(); if (Util.isNotBlank(licenseStatus)) { LicenseStatus status = new LicenseStatus(); status.setName(licenseStatus); license.setStatus(status); } // per reviewer include originalIssueDate,expireDate,correctiveAction,discipline String issueDate = page.select("#_ctl7_lblIssueDate").text(); if (Util.isNotBlank(issueDate)) { license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT)); } String expireDate = page.select("#_ctl7_lblExpDate").text(); if (Util.isNotBlank(expireDate)) { license.setExpireDate(parseDate(expireDate, DATE_FORMAT)); } String correctiveAction = page.select("#_ctl7_lblCorrectiveAction").text(); if (Util.isNotBlank(correctiveAction)) { license.setCorrectiveAction(!"No".equals(correctiveAction)); } String disciplineAction = page.select("#_ctl7_lblDisciplineAction").text(); if (Util.isNotBlank(disciplineAction)) { license.setDiscipline(!"No".equals(disciplineAction)); } return license; }
From source file:org.bungeni.ext.integration.bungeniportal.BungeniServiceAccess.java
/**
 * Returns an attribute value from the first element matched by a selector.
 *
 * @param doc the document to search
 * @param itemPath the CSS query identifying the element
 * @param attributeReturn the name of the attribute to read
 * @return the value of {@code attributeReturn} on the first matching element
 * @throws IndexOutOfBoundsException if {@code itemPath} matches no element
 */
private String getItemAttributeValue(Document doc, String itemPath, String attributeReturn) {
    return doc.select(itemPath).get(0).attr(attributeReturn);
}
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Method to get the number of links (total, internal) * @param link_html the url to parse//from w w w .j a va 2 s . co m * @return the number of links */ public int[] getnlinks(String link_html) { int[] nlinks = new int[2]; nlinks[0] = 0;//total number of links nlinks[1] = 0;//number of internal links try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); Elements links = doc.select("a[href]"); nlinks[0] = links.size(); //----we check if a link is internal or not (abs is used to get the whole link (abs stands for abs) for (Element link : links) { if (link.attr("abs:href").contains(link_html)) { nlinks[1]++; } } return nlinks; } catch (Exception ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); return nlinks; } }
From source file:gov.medicaid.screening.dao.impl.PodiatricMedicineLicenseDAOBean.java
/** * Retrieves all results from the source site. * * @param criteria the search criteria./*from w ww . j av a 2 s . c om*/ * @return the providers matched * @throws URISyntaxException if the URL could not be correctly constructed * @throws IOException for any I/O related errors * @throws ServiceException for any other errors encountered */ private SearchResult<License> getAllResults(String criteria) throws URISyntaxException, IOException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(); client.setRedirectStrategy(new LaxRedirectStrategy()); HttpGet getFrontPage = new HttpGet(new URIBuilder(getSearchURL()).build()); HttpResponse response = client.execute(getFrontPage); verifyAndAuditCall(getSearchURL(), response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); HttpPost getSearchPage = new HttpPost(new URIBuilder(getSearchURL()).build()); HttpEntity entity = postForm(getSearchURL(), client, getSearchPage, new String[][] { { "_ctl2:dropAgencyCode", "H7Q" }, { "_ctl2:btnLogin", "Login" }, { "__VIEWSTATE", page.select("#__aspnetForm input[name=__VIEWSTATE]").first().val() } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build()); entity = postForm(getSearchURL(), client, search, new String[][] { { "_ctl2:txtCriteria", criteria }, { "_ctl2:btnSearch", "Search" }, { "__VIEWSTATE", page.select("#__aspnetForm input[name=__VIEWSTATE]").first().val() } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); List<License> allLicenses = new ArrayList<License>(); Elements rows = page.select("table#_ctl2_dgrdResults tr.DataGrid"); for (Element row : rows) { License license = parseLicense(row.children()); if (license != null) { allLicenses.add(license); } } SearchResult<License> results = new SearchResult<License>(); results.setItems(allLicenses); return results; }
From source file:org.confab.VBulletinParser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table// ww w . ja v a 2 s .co m Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr"); assert !forum_table.isEmpty(); for (Element el_tr : forum_table) { Forum new_forum = new Forum(parent); // Get the table data for this row Elements el_tds = el_tr.select("td"); assert !el_tds.isEmpty() : el_tr.html(); // xbox360achievements has a lot of subforums and puts these in their own table // The <a>'s are picked up as children of the parent <td> so don't parse this sub- // tables row's seperatly if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) { //Utilities.debug("tr doesn't seem to have anything we want, skipping."); continue; } // Get the title URL Elements els_a = el_tds.get(1).select("a"); assert !els_a.isEmpty() : el_tds.html(); new_forum.url = els_a.first().attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text assert els_a.first() != null; new_forum.title = els_a.first().text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get num viewing the current forum Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first(); if (el_viewing != null) { new_forum.numViewing = el_viewing.text(); } else { new_forum.numViewing = "0"; } Utilities.debug("new_forum.numViewing : " + new_forum.numViewing); // Get the description/message of this topic Element el_description = el_tds.get(1).select("div.smallfont").first(); 
if (el_description != null) { new_forum.description = el_description.text(); } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } Utilities.debug("end parseForums"); return ret; }
From source file:me.vertretungsplan.parser.UntisInfoParser.java
/**
 * Logs in, reads the available weeks from the navigation bar and parses the schedule page(s)
 * of every week into one {@code SubstitutionSchedule}.
 *
 * <p>In per-class mode (single classes enabled, or schedule type "timetable") one page per
 * class is parsed for each week, and an HTTP 500 for a class is skipped. Otherwise one page
 * per week is parsed; HTTP errors are remembered, and the last one is rethrown only if no week
 * could be parsed at all.
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    // NOTE(review): the replaced literal is shown here as a single whitespace character; it
    // may originally have been the "&nbsp;" entity mangled by extraction. Confirm.
    Document navbarDoc = Jsoup.parse(getNavbarDoc().replace(" ", ""));
    Element weekSelect = navbarDoc.select("select[name=week]").first();

    SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    // Last-change stamp: the text after "Stand:" in the navbar description. If extracting it
    // throws, the same extraction is retried on the title frame; if that throws too, it is
    // left empty.
    String navbarInfo = navbarDoc.select(".description").text();
    String lastChange;
    try {
        lastChange = navbarInfo.substring(navbarInfo.indexOf("Stand:") + "Stand:".length()).trim();
    } catch (Exception e) {
        try {
            String titleHtml = httpGet(baseUrl + "/frames/title.htm", data.optString(PARAM_ENCODING, null));
            String titleInfo = Jsoup.parse(titleHtml).select(".description").text();
            lastChange = titleInfo.substring(titleInfo.indexOf("Stand:") + "Stand:".length()).trim();
        } catch (Exception e1) {
            lastChange = "";
        }
    }

    int successfulWeeks = 0;
    HttpResponseException lastException = null;

    for (Element option : weekSelect.children()) {
        String week = option.attr("value");
        String weekName = option.text();

        // "single_classes" is the legacy key, kept for backwards compatibility.
        boolean perClass = data.optBoolean(PARAM_SINGLE_CLASSES, data.optBoolean("single_classes", false))
                || data.optString(PARAM_SCHEDULE_TYPE, "substitution").equals("timetable");

        if (!perClass) {
            String url = getScheduleUrl(week, 0, data);
            try {
                parsePage(schedule, lastChange, null, url, weekName);
                successfulWeeks++;
            } catch (HttpResponseException e) {
                lastException = e;
            }
            continue;
        }

        int classNumber = 1;
        for (String klasse : getAllClasses()) {
            String url = getScheduleUrl(week, classNumber, data);
            try {
                parsePage(schedule, lastChange, klasse, url, weekName);
            } catch (HttpResponseException e) {
                // A 500 occurs in Hannover_MMBS; skip that class. Anything else is fatal.
                if (e.getStatusCode() != 500) {
                    throw e;
                }
            }
            classNumber++;
        }
        successfulWeeks++;
    }

    if (successfulWeeks == 0 && lastException != null) {
        throw lastException;
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    schedule.setWebsite(baseUrl + "/default.htm");
    return schedule;
}
From source file:com.amastigote.xdu.query.module.EduSystem.java
/**
 * Fetches the student's personal information page and extracts selected fields.
 *
 * <p>The page is requested with the current session cookie and decoded as gb2312. The values
 * are read by position from the {@code td[width=275]} cells, so the result depends on the
 * page keeping its current layout.
 *
 * <p>The response stream is closed and the connection disconnected before returning, whether
 * or not reading succeeds.
 *
 * @return a JSON object keyed by {@code StudentKey} constants, or {@code null} when
 *         {@code checkIsLogin(ID)} reports that the user is not logged in
 * @throws IOException if the page cannot be fetched or read
 * @throws JSONException if a value cannot be stored in the result object
 */
private @Nullable JSONObject personalInfoQuery() throws IOException, JSONException {
    if (!checkIsLogin(ID)) {
        return null;
    }

    URL url = new URL(SYS_HOST + "xjInfoAction.do?oper=xjxx");
    HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
    try {
        httpURLConnection.setRequestProperty("Cookie", "JSESSIONID=" + SYS_JSESSIONID);
        httpURLConnection.connect();

        Document document;
        java.io.InputStream responseStream = httpURLConnection.getInputStream();
        try {
            document = Jsoup.parse(responseStream, "gb2312", httpURLConnection.getURL().toString());
        } finally {
            responseStream.close();
        }

        // NOTE(review): the pattern is shown here as a single whitespace character; it may
        // originally have been the "&nbsp;" entity mangled by extraction. Confirm.
        document = Jsoup.parse(document.toString().replaceAll(" ", ""));

        Elements elements1 = document.select("td[width=275]");

        JSONObject jsonObject = new JSONObject();
        jsonObject.put(StudentKey.ID, elements1.get(0).text());
        jsonObject.put(StudentKey.NAME, elements1.get(1).text());
        jsonObject.put(StudentKey.GENDER, elements1.get(6).text());
        jsonObject.put(StudentKey.NATION, elements1.get(10).text());
        jsonObject.put(StudentKey.NATIVE_PLACE, elements1.get(11).text());
        jsonObject.put(StudentKey.DEPARTMENT, elements1.get(24).text());
        jsonObject.put(StudentKey.MAJOR, elements1.get(25).text());
        jsonObject.put(StudentKey.CLASS, elements1.get(28).text());
        return jsonObject;
    } finally {
        httpURLConnection.disconnect();
    }
}
From source file:com.mythesis.userbehaviouranalysis.WebParser.java
/** * Parse the url and get all the content * @param link the url to parse/*from w w w .j a v a 2 s . c o m*/ * @return The content parsed */ private String cleanhtml(String link) { try { Document doc = Jsoup.connect(link).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) { link = link.substring(0, link.length() - 1); } if (link.substring(0, 5).equalsIgnoreCase("https")) { link = link.substring(8); } else if (link.substring(0, 4).equalsIgnoreCase("http")) { link = link.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element el : links) { String str_check = el.attr("abs:href"); if (el.attr("abs:href").contains(link) && el.text().length() > 1) { anchortext = anchortext + el.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").contains(link)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } if (medi.getElementsByTag("img").attr("src").startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt"); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:org.confab.PhpBB3Parser.java
/** * Constructs and submits a POST with the appropriate parameters to login to a vbulletin. * @param rootURL Base or root URL for the site to log into * @param username User's login name// ww w . ja v a 2 s. c om * @param password User's password * @return User object initialised with a HttpContext */ public User login(String rootURL, String username, String password) { Utilities.debug("login"); User ret = new User(username, password); CookieStore cookieStore = new BasicCookieStore(); HttpContext localContext = new BasicHttpContext(); localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore); try { // set up the POST HttpPost httppost = new HttpPost(rootURL + "login.php"); List<NameValuePair> nvps = new ArrayList<NameValuePair>(); nvps.add(new BasicNameValuePair("do", "login")); nvps.add(new BasicNameValuePair("vb_login_username", username)); nvps.add(new BasicNameValuePair("vb_login_password", "")); nvps.add(new BasicNameValuePair("s", "")); nvps.add(new BasicNameValuePair("securitytoken", "guest")); nvps.add(new BasicNameValuePair("do", "login")); nvps.add(new BasicNameValuePair("vb_login_md5password", Utilities.md5(password))); nvps.add(new BasicNameValuePair("vb_login_md5password_utf", Utilities.md5(password))); httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8)); // execute the POST Utilities.debug("Executing POST"); HttpResponse response = httpclient.execute(httppost, localContext); Utilities.debug("POST response: " + response.getStatusLine()); assert response.getStatusLine().getStatusCode() == 200; //TODO: store the cookies //http://bit.ly/e7yY5i (CookieStore javadoc) Utilities.printCookieStore(cookieStore); // confirm we are logged in HttpGet httpget = new HttpGet(rootURL); response = httpclient.execute(httpget, localContext); HttpEntity entity = response.getEntity(); Document page = Jsoup.parse(EntityUtils.toString(entity)); EntityUtils.consume(entity); assert page != null; Utilities.debug("Checking that we are logged in.."); Element 
username_box = page.select("input[name=vb_login_username]").first(); assert username_box == null; Element password_box = page.select("input[name=vb_login_password]").first(); assert password_box == null; // parse the user's new securitytoken Element el_security_token = page.select("input[name=securitytoken]").first(); assert el_security_token != null; String security_token = el_security_token.attr("value"); assert security_token != null; String[] token_array = security_token.split("-"); assert token_array.length == 2; ret.vb_security_token = token_array[1]; assert ret.vb_security_token.length() == 40; Utilities.debug("securitytoken: " + ret.vb_security_token); Utilities.debug("Login seems ok"); ret.httpContext = localContext; } catch (IOException e) { System.out.println(e); } Utilities.debug("end login"); return ret; }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
static List<LentItem> parseMediaList(AccountData res, Document doc, JSONObject data) throws JSONException { List<LentItem> media = new ArrayList<>(); if (doc == null) { return media; }//from w ww .j a va2 s . c o m // parse result list JSONObject copymap = data.getJSONObject("accounttable"); Pattern expire = Pattern.compile("Ausweisg.ltigkeit: ([0-9.]+)"); Pattern fees = Pattern.compile("([0-9,.]+) ."); for (Element td : doc.select(".td01x09n")) { String text = td.text().trim(); if (expire.matcher(text).matches()) { res.setValidUntil(expire.matcher(text).replaceAll("$1")); } else if (fees.matcher(text).matches()) { res.setPendingFees(text); } } DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); Elements rowElements = doc.select("form[name=medkl] table tr"); // rows: skip 1st row -> title row for (int i = 1; i < rowElements.size(); i++) { Element tr = rowElements.get(i); if (tr.child(0).tagName().equals("th")) { continue; } LentItem item = new LentItem(); Pattern itemIdPat = Pattern.compile("javascript:smAcc\\('[a-z]+','[a-z]+','([A-Za-z0-9]+)'\\)"); // columns: all elements of one media Iterator<?> keys = copymap.keys(); while (keys.hasNext()) { String key = (String) keys.next(); int index; try { index = copymap.has(key) ? 
copymap.getInt(key) : -1; } catch (JSONException e1) { index = -1; } if (index >= 0) { String value = tr.child(index).text().trim().replace("\u00A0", ""); switch (key) { case "author": value = findTitleAndAuthor(value)[1]; break; case "title": value = findTitleAndAuthor(value)[0]; break; case "returndate": try { value = fmt.parseLocalDate(value).toString(); } catch (IllegalArgumentException e1) { e1.printStackTrace(); } break; } if (tr.child(index).select("a").size() == 1) { Matcher matcher = itemIdPat.matcher(tr.child(index).select("a").attr("href")); if (matcher.find()) item.setId(matcher.group(1)); } if (value != null && value.length() != 0) item.set(key, value); } } if (tr.select("input[type=checkbox][value=YES]").size() > 0) { item.setProlongData(tr.select("input[type=checkbox][value=YES]").attr("name")); } media.add(item); } return media; }