List of usage examples for the select method of org.jsoup.nodes.Document
public Elements select(String cssQuery)
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
private void addFilmRegionList(Document doc, Film film) { Elements keyElements = doc.select(".fm-minfo dt"); Elements valueElements = doc.select(".fm-minfo dd"); if (CollectionUtils.isNotEmpty(keyElements) && CollectionUtils.isNotEmpty(valueElements)) { int keyI = 0; for (; keyI < keyElements.size(); keyI++) { Element keyElement = keyElements.get(keyI); Element valueElement = valueElements.get(keyI); if (null != keyElement && null != valueElement) { String key = StringUtils.trimToEmpty(keyElement.text().toString()); if (StringUtils.isNotBlank(key)) { String value = StringUtils.trimToEmpty(valueElement.text().toString()); if (StringUtils.equalsIgnoreCase(key, "") && StringUtils.isNotBlank(value)) { List<String> regionList = SLASH_SPLITTER.splitToList(value); if (CollectionUtils.isNotEmpty(regionList)) { regionList.forEach(region -> { EnumRegion queryRegion = new EnumRegion(); queryRegion.setUrlRegion(region); EnumRegion enumRegion = enumRegionMapper .queryEnumRegionByEnumRegion(queryRegion); if (null != enumRegion) { FilmRegion filmRegion = new FilmRegion(); filmRegion.setFilmCode(film.getCode()); filmRegion.setRegionId(enumRegion.getId()); Date now = new Date(); filmRegion.setCreateTime(now); filmRegion.setUpdateTime(now); filmRegionMapper.insertSelective(filmRegion); }//from w w w .j ava2 s . c om }); } break; } } } } } }
From source file:qhindex.controller.SearchAuthorWorksController.java
public void searchCitationsForWork(AuthorWork awork, ArrayList<CitingWork> results) throws IOException { int maxResultsPerPage = 10; int indexCitationComponentStart = awork.getCitationsUrl().indexOf('?') + 1; String citationsUrlComponent = awork.getCitationsUrl().substring(indexCitationComponentStart); boolean continueSearch = true; int page = 0; while (continueSearch) { Debug.info("Retrieving citations for work - page " + page); String citationsUrlPage = "https://scholar.google.com.au/scholar?start=" + (page * maxResultsPerPage) + "&" + citationsUrlComponent; Document authorDoc = requestWebDocFromScholar(citationsUrlPage); AppHelper.waitBeforeNewRequest(); Elements citationsElems = null; if (authorDoc != null) { citationsElems = authorDoc.select("div.gs_ri"); for (Element citationElements : citationsElems) { CitingWork citingWork = extractCitationWork(citationElements); results.add(citingWork); }// w w w. ja va 2 s . c o m } page += 1; // End search if cannot retrieve the citations web page from scholar or it does not have // any citation elements because there are no more citations. if (citationsElems == null || citationsElems.size() < maxResultsPerPage) { continueSearch = false; } } }
From source file:com.liato.bankdroid.banking.banks.AbsIkanoPartner.java
@Override protected LoginPackage preLogin() throws BankException, ClientProtocolException, IOException { urlopen = new Urllib(context, CertificateReader.getCertificates(context, R.raw.cert_ikanopartner)); response = urlopen.open("https://partner.ikanobank.se/web/engines/page.aspx?structid=" + structId); Document d = Jsoup.parse(response); Element viewstate = d.getElementById("__VIEWSTATE"); if (viewstate == null || TextUtils.isEmpty(viewstate.val())) { throw new BankException(res.getText(R.string.unable_to_find).toString() + " ViewState."); }//from w w w.j a v a 2 s .c o m Element eventvalidation = d.getElementById("__EVENTVALIDATION"); if (eventvalidation == null || TextUtils.isEmpty(eventvalidation.val())) { throw new BankException(res.getText(R.string.unable_to_find).toString() + " EventValidation."); } Element userField = d.select("#LoginSpan input[type=text]").first(); Element passField = d.select("#LoginSpan input[type=password]").first(); Element submitField = d.select("#LoginCustomerDiv input[type=submit]").first(); if (userField == null || passField == null || submitField == null) { throw new BankException(res.getText(R.string.unable_to_find).toString() + " login fields."); } List<NameValuePair> postData = new ArrayList<NameValuePair>(); postData.add(new BasicNameValuePair("__VIEWSTATE", viewstate.val())); postData.add(new BasicNameValuePair("__EVENTVALIDATION", eventvalidation.val())); postData.add(new BasicNameValuePair(userField.attr("name"), username)); postData.add(new BasicNameValuePair(passField.attr("name"), password)); postData.add(new BasicNameValuePair(submitField.attr("name"), submitField.val())); return new LoginPackage(urlopen, postData, response, "https://partner.ikanobank.se/web/engines/page.aspx?structid=" + structId); }
From source file:gov.medicaid.screening.dao.impl.MedicalPracticeLicenseDAOBean.java
/** * Parse the License information./* w ww . j av a 2s. c o m*/ * * @param city the license provider city * @param details the details page * @return the parsed license */ private License parseLicense(String city, Document details) { License license = new License(); license.setCity(city); ProviderProfile profile = new ProviderProfile(); license.setProfile(profile); String fullName = details.select("#_ctl7_lblName").text(); User user = new User(); String[] nameParts = fullName.split(","); if (nameParts.length > 0) { user.setLastName(nameParts[0].trim()); } if (nameParts.length > 1) { user.setFirstName(nameParts[1].trim()); } profile.setUser(user); String licenseType = details.select("#_ctl7_ProfileInfoLicense_lblLicType").text(); LicenseType licType = new LicenseType(); licType.setName(licenseType); license.setType(licType); String licenseNo = details.select("#_ctl7_ProfileInfoLicense_lblLicNbr").text(); license.setLicenseNumber(licenseNo); String licensureAddress1 = details.select("#_ctl7_ProfileInfoPublic_lblAddress").text(); String licensureAddress2 = details.select("#_ctl7_ProfileInfoPublic_lblAddress2").text(); String licensureCityState = details.select("#_ctl7_ProfileInfoPublic_lblCity").text(); Address address = new Address(); address.setLocation(licensureAddress1 + " " + licensureAddress2); setCityStateZip(address, licensureCityState); String email = details.select("#_ctl7_ProfileInfoPublic_lblEmail").text(); profile.setContactEmail(email); String birthYear = details.select("#_ctl7_ProfileInfoPublic_lblBirthYear").text(); if (Util.isNotBlank(birthYear)) { profile.setDob(new GregorianCalendar(Integer.parseInt(birthYear), Calendar.JANUARY, 1).getTime()); } String gender = details.select("#_ctl7_ProfileInfoPublic_lblGender").text(); if ("Male".equals(gender)) { profile.setSex(Sex.MALE); } else if ("Female".equals(gender)) { profile.setSex(Sex.FEMALE); } String expirationDate = details.select("#_ctl7_ProfileInfoLicense_lblExpDate").text(); String 
originalIssueDate = details.select("#_ctl7_ProfileInfoLicense_lblGrantDate").text(); Date issueDate = parseDate(originalIssueDate, DATE_FORMAT); if (issueDate != null) { license.setOriginalIssueDate(issueDate); } Date expireDate = parseDate(expirationDate, DATE_FORMAT); if (expireDate != null) { license.setExpireDate(expireDate); } String licenseStatus = details.select("#_ctl7_ProfileInfoLicense_lblLicStatus").text(); LicenseStatus status = new LicenseStatus(); status.setName(licenseStatus); license.setStatus(status); String disciplinaryAction = details.select("#_ctl7_ProfileInfoLicense_lblDiscAction").text(); String correctiveAction = details.select("#_ctl7_ProfileInfoLicense_lblCorrAction").text(); license.setDiscipline(!"No".equals(disciplinaryAction.trim())); license.setCorrectiveAction(!"No".equals(correctiveAction.trim())); String medSchool = details.select("#_ctl7_ProfileInfoEducation_lblName").text(); MedicalSchool medicalSchool = new MedicalSchool(); medicalSchool.setName(medSchool); license.setMedicalSchool(medicalSchool); String degree = details.select("#_ctl7_ProfileInfoEducation_lblDegree").text(); if ("PhD".equals(degree.trim())) { profile.setDegree(Degree.DOCTORATE); } else if (!Util.isBlank(degree)) { profile.setDegree(Degree.MASTER); } PrivatePractice privatePractice = new PrivatePractice(); profile.setPrivatePractice(privatePractice); String primaryAddressName = details.select("#_ctl7_ProfileInfoPractices_lblPrimaryName").text(); String primaryAddress1 = details.select("#_ctl7_ProfileInfoPractices_lblPrimaryAddress").text(); String primaryCityState = details.select("#_ctl7_ProfileInfoPractices_lblPrimaryAddress2").text(); String primaryPhone = details.select("#_ctl7_ProfileInfoPractices_lblPrimaryPhone").text(); if (Util.isNotBlank(primaryAddressName) || Util.isNotBlank(primaryAddress1) || Util.isNotBlank(primaryCityState)) { Address primary = new Address(); address.setLocation(primaryAddressName + " " + primaryAddress1); setCityStateZip(primary, 
primaryCityState); privatePractice.setOfficeAddress(primary); } privatePractice.setOfficePhoneNumber(primaryPhone); String secondaryAddressName = details.select("#_ctl7_ProfileInfoPractices_lblSecondaryName").text(); String secondaryAddress1 = details.select("#_ctl7_ProfileInfoPractices_lblSecondaryAddress").text(); String secondaryCityState = details.select("#_ctl7_ProfileInfoPractices_lblSecondaryAddress2").text(); String secondaryPhone = details.select("#_ctl7_ProfileInfoPractices_lblSecondaryPhone").text(); if (Util.isNotBlank(secondaryAddressName) || Util.isNotBlank(secondaryAddress1) || Util.isNotBlank(secondaryCityState)) { Address secondary = new Address(); address.setLocation(secondaryAddressName + " " + secondaryAddress1); setCityStateZip(secondary, secondaryCityState); privatePractice.setSecondaryAddress(secondary); } privatePractice.setSecondaryPhoneNumber(secondaryPhone); Elements specialties = details.select("#_ctl7_ProfileInfoSpecialty_dgSpecialty tr:gt(0)"); List<Specialty> sps = new ArrayList<Specialty>(); for (Element element : specialties) { Specialty sp = new Specialty(); SpecialtyType spt = new SpecialtyType(); spt.setName(element.select("td:eq(0)").text()); sp.setType(spt); sp.setName(element.select("td:eq(1)").text()); sps.add(sp); } profile.setSpecialties(sps); return license; }
From source file:eu.riscoss.rdc.RDCFossology.java
/** * Analyses a fossology html file/*w ww . j ava 2 s . co m*/ * @param target * @param licensesMap * @return * @throws IOException */ private HashMap<String, Integer> analyseOverviewReport(String target, HashMap<String, Collection<String>> licensesMap) throws IOException { //private static HashMap<String, Integer> analyseFossologyReport(String target, String licenseFile) throws IOException { // List<String> result = new ArrayList<String>(); Document document; if (target.startsWith("http")) { document = Jsoup.connect(target).get(); } else { File file = new File(target); document = Jsoup.parse(file, "UTF-8", "http://localhost"); } Element table = document.select("table[id=lichistogram]").first(); Elements rows = table.select("tr"); List<LicenseEntry> llist = new ArrayList<LicenseEntry>(); //list of licenses in the fossology file //for each license, parses the name (0) and the number of occurrences (2) and saves it as a LicenseEntry for (Element element : rows) { Elements col = element.select("td"); if (col.size() != 0) { int c = Integer.parseInt(col.get(0).ownText());//num of occurrences String lic = col.get(2).text(); llist.add(new LicenseEntry(c, lic)); } } //get license type buckets HashMap<String, Integer> licenseBuckets = new HashMap<String, Integer>(); int total = 0; Set<String> licenseTypes = licensesMap.keySet(); //initialize with 0 to avoid missing types for (String licensetype : licenseTypes) { licenseBuckets.put(licensetype, 0); } boolean matched = false; int numUnknown = 0; for (LicenseEntry le : llist) { for (String licenseType : licenseTypes) {//cycles on license types from config file if (le.matchesOneOf(licensesMap.get(licenseType), licenseType)) { Integer currentcount = licenseBuckets.get(le.licensetype); if (currentcount == null) //for safety, but should be initialised currentcount = 0; licenseBuckets.put(le.licensetype, currentcount + le.count); matched = true; } } total += le.count; if (matched == false) { //unknown numUnknown += le.count; 
System.err.println("Unknown license: " + le.getName()); } } licenseBuckets.put("_unknown_", numUnknown); licenseBuckets.put("_sum_", total); licenseBuckets.put("_count_", llist.size()); return licenseBuckets; }
From source file:blackman.matt.board.Post.java
/** * Formats the HTML on the post text to accurately display it on the post. * * @param post The unformatted text of the post. * @return A formatted version of the post. *///from w w w .j a va 2 s . c o m private String formatPostBody(String post) { Document formattedText = Jsoup.parse(post); Pattern p = Pattern.compile("^/.*/index\\.html"); // Red Text Elements redTexts = formattedText.getElementsByClass("heading"); for (Element text : redTexts) { text.wrap("<font color=\"#AF0A0F\"><strong></strong></font>"); } // Green text Elements greenTexts = formattedText.getElementsByClass("quote"); for (Element text : greenTexts) { text.wrap("<font color=\"#789922\"></font>"); } // Board Links Elements boardLinks = formattedText.select("a"); for (Element link : boardLinks) { String url = link.attr("href"); Matcher m = p.matcher(url); if (m.matches()) { link.attr("href", "http://8chan.co" + url); } } // Reply links Elements replyLinks = formattedText.select("a[onclick^=highlightReply"); for (Element reply : replyLinks) { repliedTo.add(reply.attr("href").split("#")[1]); boardLinks.attr("href", "http://8chan.co" + reply.attr("href")); } // Post too long text removal Elements tooLongs = formattedText.getElementsByClass("toolong"); for (Element text : tooLongs) { text.text(""); } return formattedText.toString(); }
From source file:gov.medicaid.screening.dao.impl.NurseAnesthetistsLicenseDAOBean.java
/** * Performs the call to the source site, exact match is expected given the parameters. * * @param criteria the search criteria//ww w . ja v a2 s .c om * @return the matched result, null if not found * @throws IOException if an I/O error is encountered * @throws URISyntaxException if the site URL cannot properly be created * @throws ServiceException for any other exceptions encountered */ private ProviderProfile getProviderProfile(NurseAnesthetistsSearchCriteria criteria) throws ServiceException, IOException, URISyntaxException { DefaultHttpClient client = new DefaultHttpClient(); client.setRedirectStrategy(new LaxRedirectStrategy()); String searchURL = getSearchURL(); HttpGet getSearch = new HttpGet(new URIBuilder(searchURL).build()); HttpResponse response = client.execute(getSearch); verifyAndAuditCall(searchURL, response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); HttpPost search = new HttpPost(new URIBuilder(searchURL).build()); String searchType = "Lookup Certification Status"; String last4 = criteria.getSsn().substring(criteria.getSsn().length() - 4); HttpEntity entity = postForm(searchURL, client, search, new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "" }, { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() }, { "ctl00$PageContent$CertRecert$btnLookup", searchType }, { "ctl00$PageContent$CertRecert$txtAANANumber", "" + criteria.getAanaNumber() }, { "ctl00$PageContent$CertRecert$txtSSNLast4", "" + last4 } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); Elements message = page.select("#ctl00_PageContent_ucCredentialsControl_lblErrorMessage"); if (message.size() > 0) { if (message.text().startsWith("No individual with a social security number")) { // no match, return null return null; } } if (criteria.isRecertification()) { searchType = "Lookup Recertification Status"; entity = postForm(searchURL, 
client, search, new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "" }, { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() }, { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() }, { "ctl00$PageContent$CertRecert$btnSwapDisplayMode", searchType } }, true); page = Jsoup.parse(EntityUtils.toString(entity)); } return parseProvider(page); }
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
/**
 * Copies the labelled facts of a film detail page onto {@code film}.
 *
 * <p>Walks the {@code .fm-minfo dt} / {@code .fm-minfo dd} elements pairwise and, depending on
 * the key text, sets the director, year and release date, length, or alias of the film.
 *
 * @param doc  the parsed detail page
 * @param film the film to populate
 */
private void setFilmRelated(Document doc, Film film) {
    Elements keyElements = doc.select(".fm-minfo dt");
    Elements valueElements = doc.select(".fm-minfo dd");
    if (CollectionUtils.isEmpty(keyElements) || CollectionUtils.isEmpty(valueElements)) {
        return;
    }

    // The two selections are independent, so their sizes may differ; pair only as far as both
    // lists reach instead of indexing the value list with the key list's size.
    int pairCount = Math.min(keyElements.size(), valueElements.size());
    for (int i = 0; i < pairCount; i++) {
        Element keyElement = keyElements.get(i);
        Element valueElement = valueElements.get(i);
        if (null == keyElement || null == valueElement) {
            continue;
        }

        String key = StringUtils.trimToEmpty(keyElement.text());
        if (StringUtils.isBlank(key)) {
            continue;
        }
        String value = StringUtils.trimToEmpty(valueElement.text());

        // NOTE(review): the label literals below are empty or "??". It looks like non-ASCII
        // label text was lost from the source; as written, the "" comparisons can never
        // succeed for a non-blank key. Restore the intended labels.
        if (StringUtils.equalsIgnoreCase(key, "")) {
            // Director: link to the director record when one can be created or found.
            Director director = createOrQueryDirector(value);
            if (null != director) {
                film.setDirectorId(director.getId());
            }
            film.setDirector(value);
        }

        if (StringUtils.equalsIgnoreCase(key, "")) {
            // Intentionally empty in the original.
        }

        if (StringUtils.equalsIgnoreCase(key, "")) {
            // Intentionally empty in the original.
        }

        if (StringUtils.equalsIgnoreCase(key, "")) {
            // Year and release date are both derived from the same value.
            String urlYear = getFilmUrlYear(doc, value);
            if (StringUtils.isNotBlank(urlYear)) {
                EnumYear enumYear = queryEnumYear(urlYear);
                if (null != enumYear) {
                    film.setYearId(enumYear.getId());
                }
            }
            Date releaseDate = getFilmReleaseDate(value);
            if (null != releaseDate) {
                film.setReleaseDate(releaseDate);
            }
        }

        if (StringUtils.equalsIgnoreCase(key, "")) {
            int length = getFilmLength(value);
            film.setLength(length);
        }

        if (StringUtils.equalsIgnoreCase(key, "??")) {
            if (StringUtils.isNotBlank(value)) {
                film.setAlias(value);
            }
        }
    }
}
From source file:gov.medicaid.screening.dao.impl.DieteticsAndNutritionPracticeLicenseDAOBean.java
/** * Performs a search for all possible results. * * @param identifier The value to be searched. * @return the search result for licenses * @throws URISyntaxException When an error occurs while building the URL. * @throws ClientProtocolException When client does not support protocol used. * @throws IOException When an error occurs while parsing response. * @throws ParseException When an error occurs while parsing response. * @throws PersistenceException for database related errors * @throws ServiceException for any other errors *//* ww w . j a va2 s . c o m*/ private SearchResult<License> getAllResults(String identifier) throws URISyntaxException, ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(); URIBuilder builder = new URIBuilder(getSearchURL()); String hostId = builder.build().toString(); HttpGet httpget = new HttpGet(builder.build()); HttpResponse landing = client.execute(httpget); Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity())); HttpPost httppost = new HttpPost(builder.build()); HttpEntity entity = postForm(hostId, client, httppost, new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); // licenses list List<License> licenseList = new ArrayList<License>(); while (entity != null) { String result = EntityUtils.toString(entity); document = Jsoup.parse(result); Elements trs = document.select(GRID_ROW_SELECTOR); if (trs != null) { for (Element element : trs) { licenseList.add(parseLicense(element.children())); } } // done, check if there are additional results entity = null; Elements elements = document.getElementsByTag("a"); for (Element element : elements) { if (element.text().equals("Next >>")) { entity = postForm(hostId, client, httppost, new 
String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier }, { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" }, { "__EVENTARGUMENT", "" }, { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } }, true); break; } } } SearchResult<License> result = new SearchResult<License>(); result.setItems(licenseList); return result; }
From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java
private Map<String, ConfluenceLink> buildTableOfContentsLinkMap() { final Map<String, ConfluenceLink> titleLinkMap = new HashMap<>(); final Document document = SWAGGER_DOCUMENT.get(); final Elements tocElements = document.select(".toc"); final Elements tocCategoryElements = tocElements.select(".sectlevel1").first().children(); final Elements tocFilteredCategoryElements = new Elements(); for (final Element tocCategoryElement : tocCategoryElements) { final Element categoryLinkElement = tocCategoryElement.children().first(); tocFilteredCategoryElements.add(categoryLinkElement); }// w w w. java 2 s .c o m final Elements tocIndividualElements = tocElements.select(".sectlevel2"); addLinksByType(titleLinkMap, tocFilteredCategoryElements, PageType.CATEGORY, null); int categoryCount = 1; for (final Element tocIndividualElement : tocIndividualElements) { final Elements tocIndividualElementLinks = tocIndividualElement.select("a"); addLinksByType(titleLinkMap, tocIndividualElementLinks, INDIVIDUAL, categoryCount); categoryCount++; } return titleLinkMap; }