List of usage examples for org.jsoup.nodes Document select
public Elements select(String cssQuery)
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
/**
 * Creates a {@code FilmReview} for the given film from the parsed page and inserts it
 * through {@code filmReviewMapper}.
 *
 * <p>Nothing is created or inserted when the page has no {@code .fm-movie-users li} elements.
 *
 * @param doc  parsed film page
 * @param film film whose code is copied onto the review
 */
private void addFilmReview(Document doc, Film film) {
    Elements userCountItems = doc.select(".fm-movie-users li");
    if (!CollectionUtils.isNotEmpty(userCountItems)) {
        return;
    }

    FilmReview review = new FilmReview();
    review.setFilmCode(film.getCode());

    setFilmReviewCount(userCountItems, review);
    setFilmReviewDoubanGrade(doc, review);
    setFilmReviewImdbGrade(doc, review);

    // One timestamp is shared so the create and update times are identical on insert.
    Date timestamp = new Date();
    review.setCreateTime(timestamp);
    review.setUpdateTime(timestamp);

    filmReviewMapper.insertSelective(review);
}
From source file:gov.medicaid.screening.dao.impl.OIGDAOBean.java
/** * Parses the excluded provider profile details page. * * @param page the details page//from w w w . ja v a2 s . c o m * @return the parsed license details * @throws ParsingException if the expected tags were not found */ private ProviderProfile parseProfile(Document page) throws ParsingException { ProviderProfile profile = new ProviderProfile(); // name User user = new User(); profile.setUser(user); user.setLastName(page.select("th:containsOwn(Last Name) + td").text()); user.setFirstName(page.select("th:containsOwn(First Name) + td").text()); // business String businessName = page.select("th:containsOwn(Entity) + td").text(); if (!"N/A".equals(businessName)) { Business business = new Business(); profile.setBusiness(business); business.setName(businessName); } // DOB Date dob = parseDate(page.select("th:has(acronym:containsOwn(DOB)) + td").text(), DATE_FORMAT); if (dob != null) { profile.setDob(dob); } // exclusion type ExclusionType exclusionType = new ExclusionType(); profile.setExclusionType(exclusionType); exclusionType.setName(page.select("th:containsOwn(Excl. Type) + td").text()); // specialty List<Specialty> specialties = new ArrayList<Specialty>(); Specialty specialty = new Specialty(); specialties.add(specialty); specialty.setName(page.select("th:containsOwn(Specialty) + td").text()); profile.setSpecialties(specialties); // address Elements addrElement = page.select("th:containsOwn(Address) + td"); String addr = addrElement.text(); Element addrNextRow = addrElement.parents().first().nextElementSibling(); if ("".equals(addrNextRow.select("th").text())) { addr += " " + addrNextRow.select("td").text(); } Address address = new Address(); address.setLocation(addr); profile.setAddresses(Arrays.asList(new Address[] { address })); Date date = parseDate(page.select("th:containsOwn(Excl. Date) + td").text(), DATE_FORMAT); if (date != null) { profile.setRequestEffectiveDate(date); } return profile; }
From source file:abelymiguel.miralaprima.GetPrima.java
private HashMap<String, Float> getPrimaDataBloom(String country_code, String providerUrl, String indexName) { HashMap<String, Float> respuestaJson = new HashMap<String, Float>(); HashMap<String, Object> primaJson; Float prima_value;/*from w w w. j a va 2s . c o m*/ Float prima_delta; Float prima_percent; Document doc; try { doc = Jsoup.connect(providerUrl + indexName).get(); Element riskPremium = doc.select(".price").last(); // System.out.println("Prima: " + riskPremium.text()); prima_value = Float.valueOf(riskPremium.text().replace(".", "")).floatValue(); Elements riskPremiumsUp = doc.select(".trending_up"); Elements riskPremiumsDown = doc.select(".trending_down"); // System.out.println("Trending: " + riskPremiumsUp.text()); // System.out.println("Trending: " + riskPremiumsDown.text()); if (!riskPremiumsUp.text().equals("")) { String delta = riskPremiumsUp.text(); prima_delta = Float.valueOf(delta.substring(0, delta.indexOf(" ")).replace(",", "")).floatValue(); // System.out.println("Delta: " + prima_delta); String percent = riskPremiumsUp.text(); prima_percent = Float.valueOf(percent.substring(percent.indexOf(" ") + 1, percent.length() - 1)) .floatValue(); // System.out.println("Percent: " + prima_percent); } else if (!riskPremiumsDown.text().equals("")) { String delta = riskPremiumsDown.text(); prima_delta = Float.valueOf(delta.substring(0, delta.indexOf(" ")).replace(",", "")).floatValue(); prima_delta = prima_delta * -1; // System.out.println("Delta: " + prima_delta); String percent = riskPremiumsDown.text(); prima_percent = Float.valueOf(percent.substring(percent.indexOf(" ") + 1, percent.length() - 1)) .floatValue(); prima_percent = prima_percent * -1; // System.out.println("Percent: " + prima_percent); } else { prima_delta = 0f; prima_percent = 0f; } respuestaJson.put("prima_value", prima_value); respuestaJson.put("prima_delta", prima_delta); respuestaJson.put("prima_percent", prima_percent); if (isSameDay(country_code)) { this.updatePrimaInDB(prima_value, 
prima_delta, prima_percent, this.getLatestPrimaIdFromDB(country_code)); } else { this.storePrimaInDB(prima_value, prima_delta, prima_percent, country_code); } } catch (Exception ex) { Logger.getLogger(GetPrima.class.getName()).log(Level.SEVERE, null, ex); primaJson = getLatestPrimaFromDB(country_code); respuestaJson.put("prima_value", (Float) primaJson.get("prima_value")); respuestaJson.put("prima_delta", (Float) primaJson.get("prima_delta")); respuestaJson.put("prima_percent", (Float) primaJson.get("prima_percent")); } return respuestaJson; }
From source file:fr.eolya.extraction.tika.TikaWrapper.java
/**
 * Looks up the first element matching {@code meta[name=<metaName>]} and returns its
 * {@code content} attribute.
 *
 * @param doc      document to search
 * @param metaName value of the {@code name} attribute to match
 * @return the {@code content} attribute of the first match, or {@code null} when nothing matches
 */
private String getMetaContent(Document doc, String metaName) {
    Elements matches = doc.select("meta[name=" + metaName + "]");
    boolean found = matches != null && matches.first() != null;
    return found ? matches.first().attr("content") : null;
}
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.helpers.boilerplateremoval.impl.JusTextBoilerplateRemoval.java
/** * remove unwanted parts from a jsoup doc * * @param jsoupDoc/* w w w. j a v a 2 s . c o m*/ * @return */ public Document cleanDom(Document jsoupDoc) { String[] tagsToRemove = { "head", "script", ".hidden", "embedded" }; for (String tag : tagsToRemove) { Elements selectedTags = jsoupDoc.select(tag); for (Element element : selectedTags) { element.remove(); } } //remove comments (might be slow) for (Element element : jsoupDoc.getAllElements()) { for (Node n : element.childNodes()) { NodeHelper.removeComments(n); } } return jsoupDoc; }
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
private void setFilmDescription(Document doc, Film film) { String description = StringUtils.trimToEmpty(doc.select(".fm-summary").text().toString()); film.setDescription(description);//from w w w.jav a2 s. c om }
From source file:org.bungeni.ext.integration.bungeniportal.BungeniServiceAccess.java
private List<BasicNameValuePair> getFormFieldSelectDefaultValues(Document doc, List<String> fieldNames) { List<BasicNameValuePair> nvp = new ArrayList<BasicNameValuePair>(0); for (String fieldName : fieldNames) { Elements inputItems = doc.select("[name=" + fieldName + "]"); for (int i = 0; i < inputItems.size(); i++) { Element inputItem = inputItems.get(i); Elements selItems = inputItem.select("[selected=selected]"); for (int j = 0; j < selItems.size(); j++) { Element selItem = selItems.get(j); nvp.add(new BasicNameValuePair(fieldName, selItem.attr("value"))); }//from w ww.j a v a 2 s . co m } } return nvp; }
From source file:com.thesmartweb.swebrank.WebParser.java
/** * Parse the url and get all the content * @param link_html the url to parse//from w w w .j a v a 2s . c o m * @return The content parsed */ public String cleanhtml(String link_html) { try { Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get(); String title = doc.title(); String mainbody = doc.body().text(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); //fix link html to remove https:// or http:// and simple / if (link_html.substring(link_html.length() - 1, link_html.length()).equalsIgnoreCase("/")) { link_html = link_html.substring(0, link_html.length() - 1); } if (link_html.substring(0, 5).equalsIgnoreCase("https")) { link_html = link_html.substring(8); } else if (link_html.substring(0, 4).equalsIgnoreCase("http")) { link_html = link_html.substring(7); } String anchortext = ""; String alttext = ""; //-----get the anchor text of internal links for (Element link : links) { String str_check = link.attr("abs:href").toString(); if (link.attr("abs:href").contains(link_html) && link.text().length() > 1) { anchortext = anchortext + link.text() + " "; } } //-------get alt text to internal images links for (Element medi : media) { if (medi.getElementsByTag("img").attr("src").toString().contains(link_html)) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } if (medi.getElementsByTag("img").attr("src").toString().startsWith("/")) { alttext = alttext + " " + medi.getElementsByTag("img").attr("alt").toString(); } } String content = mainbody + title + anchortext + alttext; return content; } catch (IOException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (NullPointerException ex) { Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } catch (Exception ex) { 
Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex); String check = null; return check; } }
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
private void setFilmReviewImdbGrade(Document doc, FilmReview filmReview) { Elements imdbElements = doc.select(".fm-title .fm-orange"); if (CollectionUtils.isNotEmpty(imdbElements) && imdbElements.size() == 1) { Element imdbElement = imdbElements.get(0); if (null != imdbElement) { String imdbGradeStr = imdbElement.text(); if (StringUtils.isNotBlank(imdbGradeStr) && StringUtils.contains(imdbGradeStr, "IMDB")) { String gradeStr = StringUtils .trimToEmpty(imdbGradeStr.substring(imdbGradeStr.indexOf("IMDB") + "IMDB".length())); if (StringUtils.isNotBlank(gradeStr)) { float grade = getGrade(gradeStr); filmReview.setGradeDouban(grade); }// ww w .j a v a 2s .c o m } } } }
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
/**
 * Reads the Douban grade from the film page and stores it on the review.
 *
 * <p>The grade is taken from the single {@code .fm-title .fm-green} element: the text after
 * the marker is trimmed and converted with {@code getGrade}. Nothing is set when the element
 * is missing, is not unique, or has only blank text.
 *
 * @param doc        parsed film page
 * @param filmReview review that receives the Douban grade
 */
private void setFilmReviewDoubanGrade(Document doc, FilmReview filmReview) {
    // NOTE(review): the marker literal is the empty string, so the "contains" check always
    // passes and the whole element text is used. Possibly a non-ASCII label was lost from
    // this literal -- confirm against the page markup.
    final String marker = "";

    Elements matches = doc.select(".fm-title .fm-green");
    if (!CollectionUtils.isNotEmpty(matches) || matches.size() != 1) {
        return;
    }
    Element gradeElement = matches.get(0);
    if (null == gradeElement) {
        return;
    }

    String rawText = gradeElement.text();
    if (!StringUtils.isNotBlank(rawText) || !StringUtils.contains(rawText, marker)) {
        return;
    }

    String gradeStr = StringUtils.trimToEmpty(
            rawText.substring(rawText.indexOf(marker) + marker.length()));
    if (StringUtils.isNotBlank(gradeStr)) {
        float grade = getGrade(gradeStr);
        filmReview.setGradeDouban(grade);
    }
}