List of usage examples for org.jsoup.nodes Element select
public Elements select(String cssQuery)
From source file:com.pemikir.youtubeplus.youtube.YoutubeExtractor.java
private VideoInfoItem extractVideoInfoItem(Element li) { VideoInfoItem info = new VideoInfoItem(); info.webpage_url = li.select("a[class*=\"content-link\"]").first().attr("abs:href"); try {/*from w w w . ja v a 2 s . co m*/ Pattern p = Pattern.compile("v=([0-9a-zA-Z-]*)"); Matcher m = p.matcher(info.webpage_url); m.find(); info.id = m.group(1); } catch (Exception e) { e.printStackTrace(); } info.title = li.select("span[class=\"title\"]").first().text(); info.uploader = li.select("span[class=\"g-hovercard\"]").first().text(); info.duration = li.select("span[class=\"video-time\"]").first().text(); Element img = li.select("img").first(); info.thumbnail_url = img.attr("abs:src"); // Sometimes youtube sends links to gif files witch somehow seam to not exist // anymore. Items with such gif also offer a secondary image source. So we are going // to use that if we caught such an item. if (info.thumbnail_url.contains(".gif")) { info.thumbnail_url = img.attr("data-thumb"); } return info; }
From source file:gov.medicaid.screening.dao.impl.NursingLicenseDAOBean.java
/** * Parses the nursing license details page. * * @param page the details page//ww w .ja v a2 s .co m * @param licenseType if user has multiple licenses, this one will be used * @return the parsed license details * @throws ParsingException if the page does not contain the expected elements */ private License parseLicense(Document page, String licenseType) throws ParsingException { if (!page.select("span#lblFormTitle").text().equals("License Details")) { throw new ParsingException(ErrorCode.MITA50002.getDesc()); } License license = new License(); ProviderProfile profile = new ProviderProfile(); license.setProfile(profile); String fullName = page.select("#_ctl7_lblName").text(); User user = new User(); profile.setUser(user); String[] nameParts = fullName.split(" "); user.setLastName(nameParts[nameParts.length - 1]); if (nameParts.length > 1) { user.setFirstName(nameParts[0]); } // everything else goes to middle name (per site behavior) if (nameParts.length > 2) { StringBuffer sb = new StringBuffer(); for (int i = 1; i < nameParts.length - 1; i++) { if (sb.length() > 0) { sb.append(" "); } sb.append(nameParts[i]); } user.setMiddleName(sb.toString()); } String dateOfBirth = page.select("#_ctl7_lblDOB").text(); if (Util.isNotBlank(dateOfBirth)) { profile.setDob(parseDate(dateOfBirth, DATE_FORMAT)); } String gender = page.select("#_ctl7_lblGender").text(); if (Util.isNotBlank(gender)) { if ("Female".equals(gender)) { profile.setSex(Sex.FEMALE); } else { profile.setSex(Sex.MALE); } } Elements licenses = page.select("#_ctl7_dgLicense tr.Normal"); for (Element row : licenses) { String licenseNumber = row.select("td:eq(0)").text(); if (licenseType != null && !licenseNumber.startsWith(licenseType)) { // user has multiple licenses, the results will show this user twice (search by name) continue; } String[] licenseParts = licenseNumber.split(" "); LicenseType type = new LicenseType(); type.setName(TYPES.get(licenseParts[0]) == null ? licenseParts[0] : TYPES.get(licenseParts[0])); license.setType(type); license.setLicenseNumber(licenseParts[1]); String issueDate = row.select("td:eq(1)").text(); if (Util.isNotBlank(issueDate)) { license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT)); } String expirationDate = row.select("td:eq(2)").text(); if (Util.isNotBlank(expirationDate)) { license.setExpireDate(parseDate(expirationDate, DATE_FORMAT)); } } return license; }
From source file:de.geeksfactory.opacclient.apis.Littera.java
protected void addSimpleSearchField(List<SearchField> fields) throws IOException, JSONException { final String html = httpGet(getApiUrl() + "&mode=s", getDefaultEncoding()); final Document doc = Jsoup.parse(html); final Element simple = doc.select(".simple_search").first(); final TextSearchField field = new TextSearchField(); field.setFreeSearch(true);//from w ww.java 2s .c o m field.setDisplayName(simple.select("h4").first().text()); field.setId(simple.select("#keyboard").first().attr("name")); field.setHint(""); field.setData(new JSONObject()); field.getData().put("meaning", field.getId()); fields.add(field); }
From source file:mobi.jenkinsci.alm.assembla.client.AssemblaClient.java
public void login() throws IOException { Document pinDoc = Jsoup.parse(getData(String.format(AUTH, appId), false)); if (getLatestRedirectedUrl().getPath().startsWith(LOGIN)) { pinDoc = postLoginForm(pinDoc);// w ww . j a v a 2s .c om } final Element pinBox = pinDoc.select("div[class=box]").first(); if (pinBox == null) { throw new IOException("Missing PIN code from Assembla auth response"); } final Element pinLabel = pinBox.select("p").first(); final Element pinValue = pinBox.select("h1").first(); if (pinLabel == null || pinValue == null) { throw new IOException("Missing PIN code from Assembla auth response"); } final String pin = pinValue.childNode(0).toString(); final HttpPost authPost = new HttpPost( String.format(ASSEMBLA_SITE_APP_AUTH, appId, appSecret) + String.format(PIN_AUTH, pin)); final HttpResponse pinResponse = httpClient.execute(authPost); try { if (pinResponse.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_OK) { throw new IOException( "Post " + authPost.getURI() + " for a PIN failed: " + pinResponse.getStatusLine()); } accessToken = gson.fromJson( new JsonReader(new InputStreamReader(pinResponse.getEntity().getContent(), "UTF-8")), AssemblaAccessToken.class); } finally { authPost.releaseConnection(); } }
From source file:gov.medicaid.screening.dao.impl.NursingLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param criteria The search criteria.//from ww w.j ava 2 s. c o m * @param byName flag indicating it is a name search * @return the search result for licenses * * @throws URISyntaxException if an error occurs while building the URL. * @throws ClientProtocolException if client does not support protocol used. * @throws IOException if an error occurs while parsing response. * @throws ParseException if an error occurs while parsing response. * @throws ServiceException for any other problems encountered */ private SearchResult<License> getAllResults(NursingLicenseSearchCriteria criteria, boolean byName) throws URISyntaxException, ClientProtocolException, IOException, ParseException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager()); client.setRedirectStrategy(new LaxRedirectStrategy()); client.setCookieStore(loginAsPublicUser()); HttpGet getSearch = new HttpGet(new URIBuilder(getSearchURL()).build()); HttpResponse response = client.execute(getSearch); verifyAndAuditCall(getSearchURL(), response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build()); List<License> allLicenses = new ArrayList<License>(); // switch to search by name screen if (byName) { HttpEntity entity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTTARGET", "_ctl7_rbtnSearch_1" }, { "__EVENTARGUMENT", "" }, { "_ctl7:ddlbLicenseType", "R" }, { "_ctl7:rbtnSearch", "2" }, { "_ctl7:txtCheckDigit", "" }, { "_ctl7:txtLicenseNumber", "" }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); entity = getResultPage(criteria, client, page, search, "_ctl7:cmdSearch", getSearchURL()); page = Jsoup.parse(EntityUtils.toString(entity)); // get the data grid entries if (page.select("table#_ctl7_grdSearchResults").size() < 1) { throw new ParsingException(ErrorCode.MITA50002.getDesc()); } Elements rows = page.select(GRID_ROW_SELECTOR); while (rows.size() > 0) { for (Element row : rows) { String url = row.select("a").first().attr("href"); String licenseNo = row.select("td:eq(4)").text(); HttpGet getDetail = new HttpGet(Util.replaceLastURLPart(getSearchURL(), url)); response = client.execute(getDetail); verifyAndAuditCall(getSearchURL(), response); Document licenseDetails = Jsoup.parse(EntityUtils.toString(response.getEntity())); allLicenses.add(parseLicense(licenseDetails, licenseNo.substring(0, 1))); } rows.clear(); // check for next page Element currentPage = page.select("#_ctl7_grdSearchResults tr.TablePager span").first(); if (getLog() != null) { getLog().log(Level.DEBUG, "Current page is: " + currentPage.text()); } Element pageLink = currentPage.nextElementSibling(); if (pageLink != null && pageLink.hasAttr("href")) { if (getLog() != null) { getLog().log(Level.DEBUG, "There are more results, getting the next page."); } String target = parseEventTarget(pageLink.attr("href")); entity = getResultPage(criteria, client, page, search, target, getSearchURL()); page = Jsoup.parse(EntityUtils.toString(entity)); rows = page.select(GRID_ROW_SELECTOR); } } } else { // search by license number (site supports only exact match) HttpEntity entity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTTARGET", "_ctl7:cmdSearch" }, { "__EVENTARGUMENT", "" }, { "_ctl7:ddlbLicenseType", Util.defaultString(criteria.getLicenseType().getName()) }, { "_ctl7:rbtnSearch", "1" }, { "_ctl7:txtCheckDigit", Util.defaultString(criteria.getCheckDigit()) }, { "_ctl7:txtLicenseNumber", Util.defaultString(criteria.getIdentifier()) }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); if (page.select("span#lblFormTitle").text().equals("License Details")) { String prefLicenseType = criteria.getLicenseType().getName(); allLicenses.add(parseLicense(page, prefLicenseType)); } } SearchResult<License> searchResult = new SearchResult<License>(); searchResult.setItems(allLicenses); return searchResult; }
From source file:org.confab.VBulletinParser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table//from w w w .j a v a2 s . co m Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr"); assert !forum_table.isEmpty(); for (Element el_tr : forum_table) { Forum new_forum = new Forum(parent); // Get the table data for this row Elements el_tds = el_tr.select("td"); assert !el_tds.isEmpty() : el_tr.html(); // xbox360achievements has a lot of subforums and puts these in their own table // The <a>'s are picked up as children of the parent <td> so don't parse this sub- // tables row's seperatly if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) { //Utilities.debug("tr doesn't seem to have anything we want, skipping."); continue; } // Get the title URL Elements els_a = el_tds.get(1).select("a"); assert !els_a.isEmpty() : el_tds.html(); new_forum.url = els_a.first().attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text assert els_a.first() != null; new_forum.title = els_a.first().text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get num viewing the current forum Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first(); if (el_viewing != null) { new_forum.numViewing = el_viewing.text(); } else { new_forum.numViewing = "0"; } Utilities.debug("new_forum.numViewing : " + new_forum.numViewing); // Get the description/message of this topic Element el_description = el_tds.get(1).select("div.smallfont").first(); if (el_description != null) { new_forum.description = el_description.text(); } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } Utilities.debug("end parseForums"); return ret; }
From source file:eu.riscoss.dataproviders.providers.FossologyDataProvider.java
/** * Analyses a fossology html file/* w ww . java 2 s . c om*/ * @param target * @param licensesMap * @return * @throws IOException */ private HashMap<String, Integer> analyseOverviewReport(String target, HashMap<String, Collection<String>> licensesMap) throws IOException { //private static HashMap<String, Integer> analyseFossologyReport(String target, String licenseFile) throws IOException { // List<String> result = new ArrayList<String>(); Document document; if (target.startsWith("http")) { document = Jsoup.connect(target).get(); } else { File file = new File(target); document = Jsoup.parse(file, "UTF-8", "http://localhost"); } Element table = document.select("table[id=lichistogram]").first(); Elements rows = table.select("tr"); List<LicenseEntry> llist = new ArrayList<LicenseEntry>(); //list of licenses in the fossology file //for each license, parses the name (0) and the number of occurrences (2) and saves it as a LicenseEntry for (Element element : rows) { Elements col = element.select("td"); if (col.size() != 0) { int c = Integer.parseInt(col.get(0).ownText());//num of occurrences String lic = col.get(2).text(); llist.add(new LicenseEntry(c, lic)); //mlist.put(lic, c); } // System.out.println(col.get(1).ownText()); // Element count=col.get(0); } //get license type buckets HashMap<String, Integer> licenseBuckets = new HashMap<String, Integer>(); int total = 0; Set<String> licenseTypes = licensesMap.keySet(); //initialize with 0 to avoid missing types for (String licensetype : licenseTypes) { licenseBuckets.put(licensetype, 0); } boolean matched = false; int numUnknown = 0; for (LicenseEntry le : llist) { for (String licenseType : licenseTypes) {//cycles on license types from config file if (le.matchesOneOf(licensesMap.get(licenseType), licenseType)) { Integer currentcount = licenseBuckets.get(le.licensetype); if (currentcount == null) //for safety, but should be initialised currentcount = 0; licenseBuckets.put(le.licensetype, currentcount + le.count); matched = true; } } total += le.count; if (matched == false) { //unknown numUnknown += le.count; System.out.println("Unknown license: " + le.getName()); } } licenseBuckets.put("_unknown_", numUnknown); licenseBuckets.put("_sum_", total); licenseBuckets.put("_count_", llist.size()); System.out.println("\nLicense Buckets Fossology from HTML overview scanning:"); System.out.println(licenseBuckets); // for (String license : result) { // System.out.format("%s\n", license); // } return licenseBuckets; }
From source file:mobi.jenkinsci.alm.assembla.client.AssemblaClient.java
private Document postLoginForm(final Document pinDoc) throws IOException { final List<NameValuePair> formNvps = new ArrayList<NameValuePair>(); final Element form = pinDoc.select("form[id=login-box]").first(); final String formAction = form.attr("action"); final HttpPost formPost = new HttpPost(getUrl(formAction).toString()); final Elements formFields = form.select("input"); for (final Element element : formFields) { final String fieldName = element.attr("name"); String fieldValue = element.attr("value"); final String fieldId = element.attr("id"); final String fieldType = element.attr("type"); if (fieldId.equalsIgnoreCase("user_login")) { fieldValue = username;/*from ww w. j a v a2 s. c o m*/ ; } else if (fieldId.equalsIgnoreCase("user_password")) { fieldValue = password; } if (fieldType.equals("submit")) { if (!fieldName.equalsIgnoreCase("commit")) { continue; } } LOG.debug(String.format("Processing form field: name='%s' value='%s' id='%s'", fieldName, fieldValue, fieldId)); formNvps.add(new BasicNameValuePair(fieldName, fieldValue)); } try { formPost.setEntity(new UrlEncodedFormEntity(formNvps, "UTF-8")); } catch (final UnsupportedEncodingException e) { // This would never happen throw new IllegalArgumentException("UTF-8 not recognised"); } HttpResponse response; LOG.debug("Login via posting form-data to " + formPost.getURI()); try { response = sendHttpPost(formPost); if (response.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_MOVED_TEMP) { throw new IOException("Form-based login to Assembla failed: " + response.getStatusLine()); } return Jsoup.parse(getData(response.getFirstHeader("Location").getValue(), false)); } finally { formPost.releaseConnection(); } }
From source file:gov.medicaid.screening.dao.impl.OIGDAOBean.java
/** * Performs a search for all possible results. * * @param criteria The search criteria./*from www . j av a2 s.c o m*/ * @return the search result for provider profiles * * @throws URISyntaxException if an error occurs while building the URL. * @throws ClientProtocolException if client does not support protocol used. * @throws IOException if an error occurs while parsing response. * @throws ParseException if an error occurs while parsing response. * @throws ServiceException for any other problems encountered */ private SearchResult<ProviderProfile> getAllResults(OIGSearchCriteria criteria) throws URISyntaxException, ClientProtocolException, IOException, ParseException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager()); client.setRedirectStrategy(new LaxRedirectStrategy()); HttpGet getSearch = new HttpGet(new URIBuilder(getSearchURL()).build()); HttpResponse response = client.execute(getSearch); verifyAndAuditCall(getSearchURL(), response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build()); List<ProviderProfile> allProfiles = new ArrayList<ProviderProfile>(); boolean entitySearch = (Util.isBlank(criteria.getLastName()) && Util.isBlank(criteria.getFirstName())); HttpEntity entity = null; if (!entitySearch) { entity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "" }, { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() }, { "ctl00$cpExclusions$ibSearchSP.x", "0" }, { "ctl00$cpExclusions$ibSearchSP.y", "0" }, { "ctl00$cpExclusions$txtSPLastName", Util.defaultString(criteria.getLastName()) }, { "ctl00$cpExclusions$txtSPFirstName", Util.defaultString(criteria.getFirstName()) } }, false); } else { HttpEntity searchEntity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "ctl00$cpExclusions$Linkbutton1" }, { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() }, { "ctl00$cpExclusions$txtSPLastName", "" }, { "ctl00$cpExclusions$txtSPFirstName", "" } }, false); page = Jsoup.parse(EntityUtils.toString(searchEntity)); entity = postForm(getSearchURL(), client, search, new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "" }, { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() }, { "ctl00$cpExclusions$ibSearchSP.x", "0" }, { "ctl00$cpExclusions$ibSearchSP.y", "0" }, { "ctl00$cpExclusions$txtSBName", Util.defaultString(criteria.getBusinessName()) } }, false); } page = Jsoup.parse(EntityUtils.toString(entity)); Elements rows; int ssnColumnIndex; if (!entitySearch) { rows = page.select("table#ctl00_cpExclusions_gvEmployees tr:gt(0)"); ssnColumnIndex = 7; } else { rows = page.select("table#ctl00_cpExclusions_gvBusiness tr:gt(0)"); ssnColumnIndex = 5; } for (Element row : rows) { String href; if (row.select("td:eq(" + ssnColumnIndex + ")").text().equals("N/A")) { href = row.select("td:eq(0) a").first().attr("href"); } else { href = row.select("td:eq(" + ssnColumnIndex + ") a").first().attr("href"); } href = href.replaceFirst("javascript:__doPostBack\\('", ""); href = href.replaceFirst("',''\\)", ""); ProviderProfile profile = parseProfile(getDetails(client, href, page)); String entityId = href.substring(0, href.lastIndexOf('$')); entityId = entityId.substring(entityId.lastIndexOf('$') + 4); profile.setId(Long.parseLong(entityId) - 2); allProfiles.add(profile); } SearchResult<ProviderProfile> searchResult = new SearchResult<ProviderProfile>(); searchResult.setItems(allProfiles); return searchResult; }