List of usage examples for org.jsoup.nodes.Element#select
public Elements select(String cssQuery)
From source file:org.bungeni.ext.integration.bungeniportal.BungeniServiceAccess.java
/**
 * Collects the pre-selected values of the named form fields.
 *
 * <p>For every field name, each element matching {@code [name=<fieldName>]} is searched for
 * elements matching {@code [selected=selected]}; the {@code value} attribute of each such
 * element is paired with the field name.
 *
 * @param doc the parsed document to search
 * @param fieldNames names of the form fields to look up
 * @return one name/value pair per selected element found; empty if none is found
 */
private List<BasicNameValuePair> getFormFieldSelectDefaultValues(Document doc, List<String> fieldNames) {
    List<BasicNameValuePair> selectedPairs = new ArrayList<BasicNameValuePair>(0);
    for (String fieldName : fieldNames) {
        Elements fields = doc.select("[name=" + fieldName + "]");
        for (Element field : fields) {
            for (Element selected : field.select("[selected=selected]")) {
                selectedPairs.add(new BasicNameValuePair(fieldName, selected.attr("value")));
            }
        }
    }
    return selectedPairs;
}
From source file:gov.medicaid.screening.dao.impl.OIGDAOBean.java
/** * Parses the excluded provider profile details page. * * @param page the details page//www. ja v a 2s . c o m * @return the parsed license details * @throws ParsingException if the expected tags were not found */ private ProviderProfile parseProfile(Document page) throws ParsingException { ProviderProfile profile = new ProviderProfile(); // name User user = new User(); profile.setUser(user); user.setLastName(page.select("th:containsOwn(Last Name) + td").text()); user.setFirstName(page.select("th:containsOwn(First Name) + td").text()); // business String businessName = page.select("th:containsOwn(Entity) + td").text(); if (!"N/A".equals(businessName)) { Business business = new Business(); profile.setBusiness(business); business.setName(businessName); } // DOB Date dob = parseDate(page.select("th:has(acronym:containsOwn(DOB)) + td").text(), DATE_FORMAT); if (dob != null) { profile.setDob(dob); } // exclusion type ExclusionType exclusionType = new ExclusionType(); profile.setExclusionType(exclusionType); exclusionType.setName(page.select("th:containsOwn(Excl. Type) + td").text()); // specialty List<Specialty> specialties = new ArrayList<Specialty>(); Specialty specialty = new Specialty(); specialties.add(specialty); specialty.setName(page.select("th:containsOwn(Specialty) + td").text()); profile.setSpecialties(specialties); // address Elements addrElement = page.select("th:containsOwn(Address) + td"); String addr = addrElement.text(); Element addrNextRow = addrElement.parents().first().nextElementSibling(); if ("".equals(addrNextRow.select("th").text())) { addr += " " + addrNextRow.select("td").text(); } Address address = new Address(); address.setLocation(addr); profile.setAddresses(Arrays.asList(new Address[] { address })); Date date = parseDate(page.select("th:containsOwn(Excl. Date) + td").text(), DATE_FORMAT); if (date != null) { profile.setRequestEffectiveDate(date); } return profile; }
From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java
/** * ???// ww w . ja v a 2 s .c o m */ @Override public Collection<HttpSeed> findPageSeed(Collection<HttpSeed> seeds) throws Exception { if (CollectionUtils.isEmpty(seeds)) { return null; } Collection<HttpSeed> seedGroups = new ArrayList<HttpSeed>(); for (HttpSeed seed : seeds) { Document doc = parse(seed.getHtml()); // ?URL Elements page_form_elements = doc.select("#pageForm"); if (page_form_elements.isEmpty()) { return null; } Element page_form_e = page_form_elements.get(0); // URL String url = DOMAIN + page_form_e.attr("action"); Elements param_elements = page_form_e.select("input"); // int totalPageNum = this.getTotalPageNum(doc); for (int pageNo = 1; pageNo <= totalPageNum; pageNo++) { // ? Map<String, String> params = new HashMap<String, String>(); for (Element param_e : param_elements) { params.put(param_e.attr("name"), param_e.attr("value")); } // params.put("curstart", String.valueOf(pageNo)); HttpSeed httpSeed = this.initListHttpSeed(url, params); seedGroups.add(httpSeed); } } return seedGroups; }
From source file:com.crawler.app.run.JellyfishCrawlerSiteVNW.java
/** * This function is called when a page is fetched and ready to be processed * by your program.// ww w . j a v a 2s . c om */ @Override public void visit(Page page) { String url = page.getWebURL().getURL(); //logger.info("URL: ", url); String host = "127.0.0.1"; String port = "3306"; String dbName = "crawler"; String dbUser = "root"; String dbPwd = ""; MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd); System.out.println("\n URL visit: " + url); String href = url.toLowerCase(); if (href.startsWith("http://www.vietnamworks.com/") && (href.endsWith("jd") || href.endsWith("jv") || href.endsWith("jv/"))) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); String title = htmlParseData.getTitle(); Document doc = Jsoup.parse(html, "UTF-8"); //doc.outputSettings().escapeMode(EscapeMode.xhtml); Element body = doc.body(); //get meta description content //String description = doc.select("meta[name=description]").get(0).attr("content"); //System.out.println("Meta description : " + description); //Element e = doc.getElementById("detail_copyB"); Element detail = body.select("section[id=content]").first(); //String aTitlePost = getTagValues(e.toString(), "<h3>", "</h3>"); String jobUrl = url;//detail.select("h3[class=title] a").first().attr("abs:href"); String jobName = detail.select("div[class=job-header-info] h1").html(); String companyName = detail.select("span[class=company-name text-lg block] strong").html(); String companyAddress = detail.select("span[class=company-address block]").html(); String jobLocation = detail.select("p[class=work-location] span[itemprop=address] a").html(); String companyContact = detail.select("div[class=col-xs-12 col-md-8 col-lg-8 pull-left] p strong") .html();// div[class=company-info] span[class=company-address block] p System.out.println("\n Title : " + jobName); System.out.println("\n 
Contact : " + companyContact); try { /* Integer siteID = 2; String companyPhone = "", companyWebsite = ""; MysqlCrawler.getInstance().insertJFHRContents( siteID , jobUrl , jobName , jobLocation , companyName , companyAddress , companyPhone , companyContact , companyWebsite); */ //System.exit(1); } catch (Exception ex) { //System.out.println("\n Fail I : " + i); System.out.println("\n Ex : " + ex); } //String eCrawl2 = listTD.get(0); //String eCrawl3 = listTD.get(1); /* System.out.println("\n Cate : " + bCate); System.out.println("\n Title : " + aTitlePost); System.out.println("\n Date : " + hDatePost);*/ //System.out.println("\n E : " + listTD.toString() + " --- " + eCrawl2 + "----" + eCrawl3); //System.out.println("\n Count : " + doc.toString()); //System.out.println("\n Total Div: --" + listDetail.size()); //System.exit(1); //String content = htmlParseData.getBodyText(); //Set<WebURL> links = htmlParseData.getOutgoingUrls(); //logger.debug("Text length: {}", text.length()); //System.out.println("Text length: {}" + text); //System.out.println("\n Title: {}" + title); //logger.debug("Html: {}", html); //System.out.println("Html: {}" + html); //logger.debug("Number of outgoing links: {}", links.size()); //System.out.println("Number of outgoing links: {}" + links.size()); //final String str = "<tag>apple</tag><b>hello</b><tag>orange</tag><tag>pear</tag>"; //System.out.println("\n Matcher: {}" + Arrays.toString(getTagValues(html).toArray())); // Prints [apple, orange, pear] //MysqlCrawler.getInstance().insertURL(url, title, ""); } } /* Header[] responseHeaders = page.getFetchResponseHeaders(); if (responseHeaders != null) { logger.debug("Response headers:"); for (Header header : responseHeaders) { logger.debug("\t{}: {}", header.getName(), header.getValue()); } } */ logger.debug("============="); }
From source file:de.geeksfactory.opacclient.apis.Littera.java
/**
 * Runs a search and parses one page of results.
 *
 * <p>Calls {@code start()} first when the instance is not yet initialised, fetches the URL
 * built by {@code buildSearchUrl}, and reads the entries of {@code .result_view ul.list}.
 * List items carrying the class {@code zugangsmonat} are skipped. The total result count is
 * parsed from {@code .result_view .navigation} and is 0 when that element is absent.
 *
 * @param query the search criteria
 * @param pageIndex the result page to fetch; also stored on every result
 * @return the parsed results of this page together with the total count; the result list is
 *         empty when the page contains no result list element
 * @throws IOException declared by the signature; presumably raised by {@code httpGet} -
 *         TODO confirm
 * @throws OpacErrorException declared by the signature; not raised directly in this method
 * @throws JSONException declared by the signature; not raised directly in this method
 */
protected SearchRequestResult executeSearch(List<SearchQuery> query, int pageIndex)
        throws IOException, OpacErrorException, JSONException {
    if (!initialised) {
        start();
    }

    final String searchUrl;
    try {
        searchUrl = buildSearchUrl(query, pageIndex);
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }

    final String html = httpGet(searchUrl, getDefaultEncoding());
    final Document doc = Jsoup.parse(html);

    final Element navigation = doc.select(".result_view .navigation").first();
    final int totalResults = navigation != null ? parseTotalResults(navigation.text()) : 0;

    final List<SearchResult> results = new ArrayList<>();
    final Element ul = doc.select(".result_view ul.list").first();
    if (ul == null) {
        // first() returns null when nothing matches. A page without a result list is treated
        // like the missing navigation above: no results rather than a NullPointerException.
        return new SearchRequestResult(results, totalResults, pageIndex);
    }

    for (final Element li : ul.children()) {
        if (li.hasClass("zugangsmonat")) {
            continue;
        }
        final SearchResult result = new SearchResult();
        final Element title = li.select(".titelinfo a").first();
        result.setId(getQueryParamsFirst(title.attr("href")).get("id"));
        result.setInnerhtml(title.text() + "<br>" + title.parent().nextElementSibling().text());
        result.setNr(results.size());
        result.setPage(pageIndex);
        result.setType(MEDIA_TYPES.get(li.select(".statusinfo .ma").text()));
        result.setCover(getCover(li));

        // The status is derived from the file name of the status icon.
        final String statusImg = li.select(".status img").attr("src");
        result.setStatus(statusImg.contains("-yes") ? SearchResult.Status.GREEN
                : statusImg.contains("-no") ? SearchResult.Status.RED : null);
        results.add(result);
    }
    return new SearchRequestResult(results, totalResults, pageIndex);
}
From source file:gov.medicaid.screening.dao.impl.ChiropracticLicenseDAOBean.java
/** * Parses the Chiropractic license details page. * /*w w w.j a v a2 s . co m*/ * @param page * the details page * @param licenseType * if user has multiple licenses, this one will be used * @return the parsed license details * @throws ParsingException * if the expected tags were not found */ private License parseLicense(Document page, String licenseType) throws ParsingException { License license = new License(); ProviderProfile profile = new ProviderProfile(); license.setProfile(profile); User user = new User(); profile.setUser(user); Elements tables = page.select("table"); for (Element cell : tables.get(0).select("td")) { if (cell.text().equals("First Name")) { user.setFirstName(cell.nextElementSibling().text()); } else if (cell.text().equals("Middle Name")) { user.setMiddleName(cell.nextElementSibling().text()); } else if (cell.text().equals("Last Name")) { user.setLastName(cell.nextElementSibling().text()); } else if (cell.text().equals("Gender")) { String gender = cell.nextElementSibling().text(); if (Util.isNotBlank(gender)) { if ("M".equalsIgnoreCase(gender)) { profile.setSex(Sex.MALE); } else { profile.setSex(Sex.FEMALE); } } } } List<Address> addresses = new ArrayList<Address>(); Address address = new Address(); addresses.add(address); profile.setAddresses(addresses); StringBuffer locBuffer = new StringBuffer(); for (Element cell : tables.get(1).select("td")) { if (cell.text().equals("Address Line1")) { locBuffer.insert(0, cell.nextElementSibling().text() + " "); } else if (cell.text().equals("Address Line2")) { locBuffer.append(cell.nextElementSibling().text()); } else if (cell.text().equals("City")) { address.setCity(cell.nextElementSibling().text()); } else if (cell.text().equals("State")) { address.setState(cell.nextElementSibling().text()); } else if (cell.text().equals("ZIP")) { address.setZipcode(cell.nextElementSibling().text()); } else if (cell.text().equals("Phone Number")) { profile.setContactPhoneNumber(cell.nextElementSibling().text()); } } 
address.setLocation(locBuffer.toString().trim()); for (Element row : tables.get(2).select("tr")) { String lType = row.select("td:eq(0)").text(); if (licenseType != null && !lType.startsWith(licenseType)) { // user has multiple licenses, the results will show this user twice (search by name) continue; } LicenseType type = new LicenseType(); type.setName(row.select("td:eq(0)").text()); license.setType(type); license.setLicenseNumber(row.select("td:eq(1)").text()); LicenseStatus status = new LicenseStatus(); status.setName(row.select("td:eq(2)").text()); license.setStatus(status); String issueDate = row.select("td:eq(3)").text(); if (Util.isNotBlank(issueDate)) { license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT)); } String renewalDate = row.select("td:eq(4)").text(); if (Util.isNotBlank(renewalDate)) { license.setRenewalDate(parseDate(renewalDate, DATE_FORMAT)); } String expirationDate = row.select("td:eq(5)").text(); if (Util.isNotBlank(expirationDate)) { license.setExpireDate(parseDate(expirationDate, DATE_FORMAT)); } } return license; }
From source file:net.kevxu.purdueassist.course.CatalogDetail.java
/**
 * Builds a catalog entry from a course detail page.
 *
 * <p>The page must contain an element whose {@code summary} attribute is
 * "This table lists the course detail for the selected term."; the fields are then cut out of
 * the text of that table by searching for fixed section labels ("Levels:",
 * "Schedule Types:", "Offered By:", ...).
 *
 * <p>Any {@link StringIndexOutOfBoundsException} raised while slicing the text is swallowed.
 * Fields set before the failure are kept, the remaining ones are left unset, and the partly
 * filled entry is returned.
 *
 * @param document the parsed detail page
 * @return the entry for {@code subject} / {@code cnbr}, filled as far as parsing succeeded
 * @throws HtmlParseException if a schedule-type link text cannot be mapped by {@code Type.valueOf}
 * @throws CourseNotFoundException if the page does not contain the detail table
 * @throws IOException declared by the signature; not raised directly in this method
 */
private CatalogDetailEntry parseDocument(Document document)
        throws HtmlParseException, CourseNotFoundException, IOException {
    CatalogDetailEntry entry = new CatalogDetailEntry(subject, cnbr);
    Elements tableElements = document.getElementsByAttributeValue("summary",
            "This table lists the course detail for the selected term.");
    if (tableElements.isEmpty() != true) {
        // get name
        try {
            Element body = tableElements.first().select("tbody").first();
            String nameBlock = body.select("tr td.nttitle").first().text();
            // The title cell is split on "<subject> <cnbr>"; the name is the last piece with
            // its first three characters dropped (presumably a " - " separator - TODO confirm).
            String[] temp = nameBlock.split(subject.name() + " " + String.valueOf(cnbr));
            String name = temp[temp.length - 1].substring(3);
            entry.setName(name);

            // get description
            // From here on "body" is the first .ntdefault element and "text" its full text.
            body = body.select(".ntdefault").first();
            String text = body.text();
            int split = text.indexOf("Levels:");
            String description = text.substring(0, split);
            // Drops the first 20 characters of the block.
            // NOTE(review): presumably a fixed heading precedes the description - confirm.
            description = description.substring(20);
            entry.setDescription(description);

            // get levels
            int begin = split;
            int end = text.indexOf("Schedule Types:");
            // begin + 8 skips "Levels:" (7 characters) plus one more character.
            String levels = text.substring(begin + 8, end);
            temp = levels.split("[ ,]");
            List<String> lvs = new ArrayList<String>();
            // Splitting on single spaces and commas leaves empty tokens; skip them.
            for (String s : temp)
                if (!s.equals("")) {
                    lvs.add(s);
                }
            entry.setLevels(lvs);

            // get type and prerequisites
            List<Type> types = new ArrayList<Type>();
            List<String> preq = new ArrayList<String>();
            Elements parsing_A = body.select("a");
            for (Element e : parsing_A) {
                // Links are classified by their href: "schd_in" without a '%' is a schedule
                // type, "sel_attr=" is stored as a prerequisite.
                if (e.attr("href").contains("schd_in") && !(e.attr("href").contains("%"))) {
                    try {
                        types.add(Type.valueOf(e.text().replace(" ", "")));
                    } catch (Exception exception) {
                        throw new HtmlParseException();
                    }
                } else if (e.attr("href").contains("sel_attr=")) {
                    preq.add(e.text());
                }
            }
            if (types.size() > 0)
                entry.setType(types);
            if (preq.size() > 0)
                entry.setPrerequisites(preq);

            // get offered by
            // The section ends at "Department:" or, failing that, at "Course Attributes:".
            begin = text.indexOf("Offered By:");
            end = text.indexOf("Department:");
            if (end < 0)
                end = text.indexOf("Course Attributes:");
            if (end > 0) {
                entry.setOfferedBy(text.substring(begin + 12, end - 1));
            }

            // get department
            begin = text.indexOf("Department:");
            if (begin > 0) {
                end = text.indexOf("Course Attributes:");
                // NOTE(review): end is not checked here; if "Course Attributes:" is missing the
                // substring call throws and the outer catch skips campuses and restrictions.
                entry.setDepartment((text.substring(begin + 12, end - 1)));
            }

            // get campus
            begin = text.indexOf("May be offered at any of the following campuses:");
            String campuses;
            // The campus list ends at the first of these labels that is present, tried in
            // this order.
            end = text.indexOf("Repeatable for Additional Credit:");
            if (end < 0)
                end = text.indexOf("Learning Objectives:");
            if (end < 0)
                end = text.indexOf("Restrictions:");
            if (end < 0)
                end = text.indexOf("Corequisites:");
            if (end < 0)
                end = text.indexOf("Prerequisites:");
            if (end < 0) {
                campuses = text
                        .substring(begin + "May be offered at any of the following campuses:".length() + 5);
            } else {
                campuses = text.substring(
                        begin + "May be offered at any of the following campuses:".length() + 5, end - 1);
            }
            // NOTE(review): the separator literal is a single space in this copy of the code;
            // the upstream source may use a wider whitespace run - confirm before relying on it.
            temp = campuses.replace(" ", "#").split("#");
            List<String> camps = new ArrayList<String>();
            // Tokens of length 0 or 1 are discarded.
            for (String s : temp) {
                if (s.length() > 1) {
                    camps.add(s);
                }
            }
            entry.setCampuses(camps);

            // get restrictions
            // The section runs to "Corequisites:" or "Prerequisites:" if present, otherwise to
            // the end of the text.
            begin = text.indexOf("Restrictions:");
            end = text.indexOf("Corequisites:");
            if (end < 0)
                end = text.indexOf("Prerequisites:");
            if (begin > 0 && end < 0) {
                entry.setRestrictions(
                        text.substring(begin + "Restrictions:".length()).replace(" ", "\n"));
            } else if (begin > 0) {
                entry.setRestrictions(
                        text.substring(begin + "Restrictions:".length(), end).replace(" ", "\n"));
            }
        } catch (StringIndexOutOfBoundsException e) {
            // no type, not available
            // Deliberately ignored: the entry is returned with whatever was set so far.
        }
    } else {
        throw new CourseNotFoundException();
    }
    return entry;
}
From source file:com.gumtreescraper.scraper.GumtreeScraper.java
public void scrapeWithJSoup(List<Gumtree> gumtrees, String url) throws IOException { // openSite(url); // waitForPageToLoad(); String nextPageUrl = url;// w w w . j ava 2s. c om boolean needContinue = true; do { try { Document doc = Jsoup.connect(nextPageUrl).timeout(getTimeout() * 1000).userAgent("Mozilla") // .userAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36") .get(); Elements adElements = doc.select("#srchrslt-adtable > li"); int size = adElements.size(); for (int i = 0; i < size; i++) { Element ad = adElements.get(i); if (!isOwner(ad)) { continue; } Element linkElement = ad.select("h6.rs-ad-title > a").first(); if (linkElement == null) { System.out.print(ad); continue; } String adUrl = linkElement.attr("href"); Gumtree gumtree = new Gumtree(); gumtree.setUrl(BASE_URL + adUrl); gumtrees.add(gumtree); if (i == size - 1) { // last element Elements adDateElements = ad.select("div.rs-ad-date"); if (adDateElements.isEmpty()) { continue; } if (!needToScrapeNextPage(adDateElements.first().text().trim())) { needContinue = false; } } } Elements nextElements = doc.select("a.rs-paginator-btn.next"); if (nextElements.isEmpty()) { break; } nextPageUrl = BASE_URL + nextElements.first().attr("href"); System.out.println("next page: " + nextPageUrl); } catch (Exception oex) { System.out.println(oex); } } while (true && needContinue); }
From source file:me.vertretungsplan.parser.IndiwareParser.java
SubstitutionScheduleDay parseIndiwareDay(Element doc, boolean html) throws IOException { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); DataSource ds;//from ww w . j ava2s . c o m if (html) { ds = new HTMLDataSource(doc); } else { ds = new XMLDataSource(doc); } Matcher matcher = datePattern.matcher(ds.titel().text()); if (!matcher.find()) throw new IOException("malformed date: " + ds.titel().text()); String date = matcher.group(); day.setDate( DateTimeFormat.forPattern("EEEE, dd. MMMM yyyy").withLocale(Locale.GERMAN).parseLocalDate(date)); String lastChange = ds.datum().text(); day.setLastChange(DateTimeFormat.forPattern("dd.MM.yyyy, HH:mm").withLocale(Locale.GERMAN) .parseLocalDateTime(lastChange)); if (ds.kopfinfos().size() > 0) { for (Element kopfinfo : ds.kopfinfos()) { String title = html ? kopfinfo.select("th").text() : kopfinfoTitle(kopfinfo.tagName()) + ":"; StringBuilder message = new StringBuilder(); if (title != null && !title.isEmpty()) { message.append("<b>").append(title).append("</b>").append(" "); } message.append(html ? kopfinfo.select("td").text() : kopfinfo.text()); day.addMessage(message.toString()); } } if (ds.fuss() != null) { StringBuilder message = new StringBuilder(); boolean first = true; for (Element fusszeile : ds.fusszeilen()) { if (first) { first = false; } else { message.append("\n"); } message.append(fusszeile.text()); } day.addMessage(message.toString()); } List<String> columnTypes = null; if (html) { columnTypes = new ArrayList<>(); for (Element th : ((HTMLDataSource) ds).headers()) { columnTypes.add(th.className().replace("thplan", "").replace("thlplan", "")); } } for (Element aktion : ds.aktionen()) { Substitution substitution = new Substitution(); String type = "Vertretung"; String course = null; int i = 0; for (Element info : aktion.children()) { String value = info.text().replace("\u00a0", ""); if (value.equals("---")) { i++; continue; } final String columnType = html ? 
columnTypes.get(i) : info.tagName(); switch (columnType) { case "klasse": Set<String> classes = new HashSet<>(); for (String klasse : value.split(",")) { Matcher courseMatcher = coursePattern.matcher(klasse); if (courseMatcher.matches()) { classes.add(courseMatcher.group(1)); course = courseMatcher.group(2); } else { classes.add(klasse); } } substitution.setClasses(classes); break; case "stunde": substitution.setLesson(value); break; case "fach": String subject = subjectAndCourse(course, value); if (columnTypes != null && columnTypes.contains("vfach")) { substitution.setPreviousSubject(subject); } else { substitution.setSubject(subject); } break; case "vfach": substitution.setSubject(subjectAndCourse(course, value)); case "lehrer": Matcher bracesMatcher = bracesPattern.matcher(value); if (bracesMatcher.matches()) value = bracesMatcher.group(1); substitution.setTeacher(value); break; case "raum": if (columnTypes != null && columnTypes.contains("vraum")) { substitution.setPreviousRoom(value); } else { substitution.setRoom(value); } break; case "vraum": substitution.setRoom(value); case "info": Matcher substitutionMatcher = substitutionPattern.matcher(value); Matcher cancelMatcher = cancelPattern.matcher(value); Matcher delayMatcher = delayPattern.matcher(value); Matcher selfMatcher = selfPattern.matcher(value); if (substitutionMatcher.matches()) { substitution.setPreviousSubject(substitutionMatcher.group(1)); substitution.setPreviousTeacher(substitutionMatcher.group(2)); if (!substitutionMatcher.group(3).isEmpty()) { substitution.setDesc(substitutionMatcher.group(3)); } } else if (cancelMatcher.matches()) { type = "Entfall"; substitution.setPreviousSubject(cancelMatcher.group(1)); substitution.setPreviousTeacher(cancelMatcher.group(2)); } else if (delayMatcher.matches()) { type = "Verlegung"; substitution.setPreviousSubject(delayMatcher.group(1)); substitution.setPreviousTeacher(delayMatcher.group(2)); substitution.setDesc(delayMatcher.group(3)); } else if 
(selfMatcher.matches()) { type = "selbst."; if (!selfMatcher.group(1).isEmpty()) substitution.setDesc(selfMatcher.group(1)); } else if (value.equals("fllt aus") || value.equals("Klausur") || value.equals("Aufg.")) { type = value; } else { substitution.setDesc(value); } break; } i++; } substitution.setType(type); substitution.setColor(colorProvider.getColor(substitution.getType())); if (course != null && substitution.getSubject() == null) { substitution.setSubject(course); } day.addSubstitution(substitution); } return day; }
From source file:qhindex.controller.SearchAuthorWorksController.java
/**
 * Builds an {@code AuthorWork} from one row of an author's publication list.
 *
 * <p>The title and work URL come from the first {@code td.gsc_a_t > a} link. When the title
 * cell holds more than one {@code div}, the first is stored as the authors and the second as
 * the publisher text, which is also passed to {@code handlePublicationMedium}. When a
 * {@code td.gsc_a_c > a} link exists, its {@code href} is stored as the citations URL and its
 * text is parsed as the citation count. Empty text counts as 0; text that fails to parse also
 * counts as 0 and a line is appended to {@code resultsMsg}.
 *
 * @param authorWorkElements the row element describing one work
 * @return the populated work
 * @throws IOException declared by the signature; presumably raised by
 *         {@code handlePublicationMedium} - TODO confirm
 */
private AuthorWork extractAuthorWorkData(Element authorWorkElements) throws IOException {
    AuthorWork work = new AuthorWork();

    Element titleLink = authorWorkElements.select("td.gsc_a_t > a").get(0);
    work.setTitle(titleLink.text());
    String workUrl = titleLink.attr("href");

    Elements details = authorWorkElements.select("td.gsc_a_t > div");
    if (details.size() > 1) {
        String publisherInGoogle = details.get(1).text();
        work.setPublisherInGoogle(publisherInGoogle);
        work.setPublisher(handlePublicationMedium(publisherInGoogle, workUrl));
        work.setAuthors(details.get(0).text());
    }

    Elements citationLinks = authorWorkElements.select("td.gsc_a_c > a");
    if (citationLinks.isEmpty()) {
        return work;
    }

    Element citationLink = citationLinks.get(0);
    work.setCitationsUrl(citationLink.attr("href"));

    int citations = 0;
    try {
        String citationText = citationLink.text();
        if (!citationText.isEmpty()) {
            citations = Integer.parseInt(citationText);
        }
    } catch (Exception ex) {
        Debug.print("Exception while extracting author work data: " + ex.toString());
        resultsMsg += "Exception while extracting author work data.\n";
    }
    work.setCitations(citations);

    return work;
}