Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes Document select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java

/**
 * Builds a {@code FilmReview} for the given film and inserts it through
 * {@code filmReviewMapper}. Does nothing when the document contains no
 * {@code .fm-movie-users li} elements.
 *
 * @param doc  parsed page the review data is read from
 * @param film film whose code is copied onto the review
 */
private void addFilmReview(Document doc, Film film) {
    Elements userCountItems = doc.select(".fm-movie-users li");
    if (!CollectionUtils.isNotEmpty(userCountItems)) {
        return;
    }

    FilmReview review = new FilmReview();
    review.setFilmCode(film.getCode());
    setFilmReviewCount(userCountItems, review);
    setFilmReviewDoubanGrade(doc, review);
    setFilmReviewImdbGrade(doc, review);

    // Create and update time share the same instant.
    Date timestamp = new Date();
    review.setCreateTime(timestamp);
    review.setUpdateTime(timestamp);

    filmReviewMapper.insertSelective(review);
}

From source file:gov.medicaid.screening.dao.impl.OIGDAOBean.java

/**
 * Parses the excluded provider profile details page.
 * <p>
 * Each value is read from the {@code td} that immediately follows a {@code th}
 * with a known label. A label that is missing yields an empty string for that
 * field; a missing address row no longer causes a {@code NullPointerException}.
 *
 * @param page the details page
 * @return the parsed license details
 * @throws ParsingException if the expected tags were not found
 *         (NOTE(review): presumably raised by {@code parseDate}; nothing in this
 *         method throws it directly - confirm)
 */
private ProviderProfile parseProfile(Document page) throws ParsingException {
    ProviderProfile profile = new ProviderProfile();

    // name
    User user = new User();
    profile.setUser(user);
    user.setLastName(page.select("th:containsOwn(Last Name) + td").text());
    user.setFirstName(page.select("th:containsOwn(First Name) + td").text());

    // business ("N/A" means the profile has no business entity)
    String businessName = page.select("th:containsOwn(Entity) + td").text();
    if (!"N/A".equals(businessName)) {
        Business business = new Business();
        profile.setBusiness(business);
        business.setName(businessName);
    }

    // DOB
    Date dob = parseDate(page.select("th:has(acronym:containsOwn(DOB)) + td").text(), DATE_FORMAT);
    if (dob != null) {
        profile.setDob(dob);
    }

    // exclusion type
    ExclusionType exclusionType = new ExclusionType();
    profile.setExclusionType(exclusionType);
    exclusionType.setName(page.select("th:containsOwn(Excl. Type) + td").text());

    // specialty
    List<Specialty> specialties = new ArrayList<Specialty>();
    Specialty specialty = new Specialty();
    specialties.add(specialty);
    specialty.setName(page.select("th:containsOwn(Specialty) + td").text());
    profile.setSpecialties(specialties);

    // address
    Elements addrElement = page.select("th:containsOwn(Address) + td");
    String addr = addrElement.text();
    // first() is null when no address cell matched, and nextElementSibling() is
    // null when the address row is the last one; guard both before dereferencing.
    Element addrRow = addrElement.parents().first();
    Element addrNextRow = (addrRow == null) ? null : addrRow.nextElementSibling();
    // A following row without header text is treated as a continuation of the address.
    if (addrNextRow != null && "".equals(addrNextRow.select("th").text())) {
        addr += " " + addrNextRow.select("td").text();
    }
    Address address = new Address();
    address.setLocation(addr);
    profile.setAddresses(Arrays.asList(new Address[] { address }));

    // exclusion date
    Date date = parseDate(page.select("th:containsOwn(Excl. Date) + td").text(), DATE_FORMAT);
    if (date != null) {
        profile.setRequestEffectiveDate(date);
    }

    return profile;
}

From source file:abelymiguel.miralaprima.GetPrima.java

/**
 * Fetches {@code providerUrl + indexName} with jsoup and extracts three numbers
 * from the page: the value (last {@code .price} element), and the delta and
 * percent taken from the {@code .trending_up} or {@code .trending_down} text.
 * Values read from {@code .trending_down} are negated; when neither trending
 * text is present, delta and percent are zero.
 * <p>
 * After parsing, the values are passed to {@code updatePrimaInDB} when
 * {@code isSameDay(country_code)} is true, otherwise to {@code storePrimaInDB}.
 * If anything in that sequence throws, the exception is logged at SEVERE and the
 * result is filled from {@code getLatestPrimaFromDB(country_code)} instead.
 *
 * @param country_code country identifier handed to the persistence helpers
 * @param providerUrl  first part of the URL to fetch
 * @param indexName    second part of the URL to fetch
 * @return map with the keys {@code prima_value}, {@code prima_delta} and {@code prima_percent}
 */
private HashMap<String, Float> getPrimaDataBloom(String country_code, String providerUrl, String indexName) {
    HashMap<String, Float> result = new HashMap<String, Float>();

    try {
        Document page = Jsoup.connect(providerUrl + indexName).get();

        // Dots are stripped from the price text before it is parsed.
        String priceText = page.select(".price").last().text();
        Float primaValue = Float.parseFloat(priceText.replace(".", ""));

        String upText = page.select(".trending_up").text();
        String downText = page.select(".trending_down").text();

        Float primaDelta;
        Float primaPercent;
        if (upText.isEmpty() && downText.isEmpty()) {
            primaDelta = 0f;
            primaPercent = 0f;
        } else {
            // The "up" text wins when both are present.
            boolean rising = !upText.isEmpty();
            String trend = rising ? upText : downText;
            int gap = trend.indexOf(" ");

            // Text before the first space is the delta (commas stripped); the rest,
            // minus its final character, is the percent.
            float delta = Float.parseFloat(trend.substring(0, gap).replace(",", ""));
            float percent = Float.parseFloat(trend.substring(gap + 1, trend.length() - 1));

            primaDelta = rising ? delta : delta * -1;
            primaPercent = rising ? percent : percent * -1;
        }

        result.put("prima_value", primaValue);
        result.put("prima_delta", primaDelta);
        result.put("prima_percent", primaPercent);

        if (isSameDay(country_code)) {
            this.updatePrimaInDB(primaValue, primaDelta, primaPercent,
                    this.getLatestPrimaIdFromDB(country_code));
        } else {
            this.storePrimaInDB(primaValue, primaDelta, primaPercent, country_code);
        }
    } catch (Exception ex) {
        Logger.getLogger(GetPrima.class.getName()).log(Level.SEVERE, null, ex);
        // Fall back to the values returned by getLatestPrimaFromDB.
        HashMap<String, Object> stored = getLatestPrimaFromDB(country_code);
        for (String key : new String[] { "prima_value", "prima_delta", "prima_percent" }) {
            result.put(key, (Float) stored.get(key));
        }
    }

    return result;
}

From source file:fr.eolya.extraction.tika.TikaWrapper.java

/**
 * Returns the {@code content} attribute of the first element matched by
 * {@code meta[name=<metaName>]}.
 *
 * @param doc      document to search
 * @param metaName value inserted into the {@code name} attribute selector
 * @return the attribute value, or {@code null} when nothing matches
 */
private String getMetaContent(Document doc, String metaName) {
    Elements matches = doc.select("meta[name=" + metaName + "]");
    boolean found = matches != null && matches.first() != null;
    return found ? matches.first().attr("content") : null;
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.helpers.boilerplateremoval.impl.JusTextBoilerplateRemoval.java

/**
 * remove unwanted parts from a jsoup doc
 *
 * @param jsoupDoc/*  w w  w.  j  a v  a 2  s . c  o  m*/
 * @return
 */
public Document cleanDom(Document jsoupDoc) {
    String[] tagsToRemove = { "head", "script", ".hidden", "embedded" };

    for (String tag : tagsToRemove) {
        Elements selectedTags = jsoupDoc.select(tag);
        for (Element element : selectedTags) {
            element.remove();
        }
    }
    //remove comments (might be slow)
    for (Element element : jsoupDoc.getAllElements()) {
        for (Node n : element.childNodes()) {
            NodeHelper.removeComments(n);
        }
    }
    return jsoupDoc;

}

From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java

/**
 * Copies the trimmed text of the {@code .fm-summary} elements onto the film as
 * its description.
 *
 * @param doc  parsed page to read the summary from
 * @param film film that receives the description
 */
private void setFilmDescription(Document doc, Film film) {
    String summaryText = doc.select(".fm-summary").text();
    film.setDescription(StringUtils.trimToEmpty(summaryText));
}

From source file:org.bungeni.ext.integration.bungeniportal.BungeniServiceAccess.java

/**
 * Collects one name/value pair per {@code [selected=selected]} element found
 * under the elements whose {@code name} attribute equals a given field name.
 *
 * @param doc        document containing the form
 * @param fieldNames names of the fields to look up
 * @return pairs of field name and the {@code value} attribute of each selected
 *         element; empty when nothing matches
 */
private List<BasicNameValuePair> getFormFieldSelectDefaultValues(Document doc, List<String> fieldNames) {
    List<BasicNameValuePair> defaults = new ArrayList<BasicNameValuePair>(0);
    for (String fieldName : fieldNames) {
        for (Element field : doc.select("[name=" + fieldName + "]")) {
            for (Element selected : field.select("[selected=selected]")) {
                defaults.add(new BasicNameValuePair(fieldName, selected.attr("value")));
            }
        }
    }
    return defaults;
}

From source file:com.thesmartweb.swebrank.WebParser.java

/**
 * Downloads the page at the given url and returns its text content.
 * <p>
 * The result is the concatenation of the body text, the page title, the text of
 * links whose absolute href contains the url (scheme and trailing slash
 * removed), and the alt text of images whose src contains that url or starts
 * with {@code /}.
 *
 * @param link_html the url to parse
 * @return the content parsed, or {@code null} if the page could not be fetched
 *         or processed (the exception is logged at SEVERE)
 */
public String cleanhtml(String link_html) {
    try {
        Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get();
        String title = doc.title();
        String mainbody = doc.body().text();
        Elements links = doc.select("a[href]");
        Elements media = doc.select("[src]");

        // Strip a trailing "/" and a leading "https://" or "http://" so the url
        // can be matched against hrefs and srcs regardless of scheme.
        String site = link_html;
        if (site.endsWith("/")) {
            site = site.substring(0, site.length() - 1);
        }
        if (site.regionMatches(true, 0, "https", 0, 5)) {
            site = site.substring(8);
        } else if (site.regionMatches(true, 0, "http", 0, 4)) {
            site = site.substring(7);
        }

        // anchor text of internal links
        StringBuilder anchortext = new StringBuilder();
        for (Element link : links) {
            if (link.attr("abs:href").contains(site) && link.text().length() > 1) {
                anchortext.append(link.text()).append(' ');
            }
        }

        // alt text of internal images
        StringBuilder alttext = new StringBuilder();
        for (Element medi : media) {
            Elements images = medi.getElementsByTag("img");
            String src = images.attr("src");
            String alt = images.attr("alt");
            if (src.contains(site)) {
                alttext.append(' ').append(alt);
            }
            // Intentionally a second, independent check: a src that matches both
            // conditions contributes its alt text twice, as before.
            if (src.startsWith("/")) {
                alttext.append(' ').append(alt);
            }
        }

        return mainbody + title + anchortext + alttext;
    } catch (Exception ex) {
        // Any failure (I/O, missing body, malformed url) is logged and reported as null.
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}

From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java

/**
 * Reads the grade that follows the text {@code IMDB} in the single
 * {@code .fm-title .fm-orange} element and stores it on the review. Nothing is
 * stored unless exactly one such element exists and it contains the marker
 * followed by non-blank text.
 * <p>
 * NOTE(review): the IMDB grade is written through {@code setGradeDouban}. This
 * looks like a copy-paste slip; confirm which setter on {@code FilmReview} is
 * intended.
 *
 * @param doc        parsed page to read the grade from
 * @param filmReview review that receives the grade
 */
private void setFilmReviewImdbGrade(Document doc, FilmReview filmReview) {
    final String marker = "IMDB";

    Elements orangeLabels = doc.select(".fm-title .fm-orange");
    if (!CollectionUtils.isNotEmpty(orangeLabels) || orangeLabels.size() != 1) {
        return;
    }

    Element label = orangeLabels.get(0);
    if (label == null) {
        return;
    }

    String labelText = label.text();
    if (!StringUtils.isNotBlank(labelText) || !StringUtils.contains(labelText, marker)) {
        return;
    }

    // Everything after the marker is treated as the grade.
    String afterMarker = labelText.substring(labelText.indexOf(marker) + marker.length());
    String gradeText = StringUtils.trimToEmpty(afterMarker);
    if (StringUtils.isNotBlank(gradeText)) {
        filmReview.setGradeDouban(getGrade(gradeText));
    }
}

From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java

/**
 * Reads the grade from the single {@code .fm-title .fm-green} element and
 * stores it on the review through {@code setGradeDouban}. Nothing is stored
 * unless exactly one such element exists and its text is non-blank.
 * <p>
 * NOTE(review): the marker literal is the empty string, so {@code indexOf}
 * returns 0 and the whole element text is passed to {@code getGrade}. The
 * original label text may have been lost from this source; confirm.
 *
 * @param doc        parsed page to read the grade from
 * @param filmReview review that receives the grade
 */
private void setFilmReviewDoubanGrade(Document doc, FilmReview filmReview) {
    final String marker = "";

    Elements greenLabels = doc.select(".fm-title .fm-green");
    if (!CollectionUtils.isNotEmpty(greenLabels) || greenLabels.size() != 1) {
        return;
    }

    Element label = greenLabels.get(0);
    if (label == null) {
        return;
    }

    String labelText = label.text();
    if (!StringUtils.isNotBlank(labelText) || !StringUtils.contains(labelText, marker)) {
        return;
    }

    // Everything after the marker is treated as the grade.
    String afterMarker = labelText.substring(labelText.indexOf(marker) + marker.length());
    String gradeText = StringUtils.trimToEmpty(afterMarker);
    if (StringUtils.isNotBlank(gradeText)) {
        filmReview.setGradeDouban(getGrade(gradeText));
    }
}