Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java

/**
 * Reads the {@code .fm-minfo} key/value list of a film page and inserts one
 * {@link FilmRegion} row for every listed region that is known to the region mapper.
 *
 * <p>The {@code dt} (key) and {@code dd} (value) elements are paired by index. Only
 * indexes present in both lists are visited, so a page with fewer {@code dd} than
 * {@code dt} elements does not raise an {@link IndexOutOfBoundsException}. Processing
 * stops after the first key/value pair that matches the region key.
 *
 * @param doc  the parsed film page
 * @param film the film whose code is written to each inserted row
 */
private void addFilmRegionList(Document doc, Film film) {
    Elements keyElements = doc.select(".fm-minfo dt");
    Elements valueElements = doc.select(".fm-minfo dd");
    if (CollectionUtils.isEmpty(keyElements) || CollectionUtils.isEmpty(valueElements)) {
        return;
    }

    // Pair keys and values only as far as both lists reach.
    int pairCount = Math.min(keyElements.size(), valueElements.size());
    for (int i = 0; i < pairCount; i++) {
        Element keyElement = keyElements.get(i);
        Element valueElement = valueElements.get(i);
        if (null == keyElement || null == valueElement) {
            continue;
        }

        String key = StringUtils.trimToEmpty(keyElement.text());
        if (StringUtils.isBlank(key)) {
            continue;
        }

        String value = StringUtils.trimToEmpty(valueElement.text());
        // NOTE(review): the key literal below is empty in this copy of the file, so it can
        // never equal a non-blank key. Presumably the original label was non-ASCII text
        // lost in transcoding - restore it before relying on this method.
        if (!StringUtils.equalsIgnoreCase(key, "") || StringUtils.isBlank(value)) {
            continue;
        }

        List<String> regionList = SLASH_SPLITTER.splitToList(value);
        if (CollectionUtils.isNotEmpty(regionList)) {
            for (String region : regionList) {
                EnumRegion queryRegion = new EnumRegion();
                queryRegion.setUrlRegion(region);
                EnumRegion enumRegion = enumRegionMapper.queryEnumRegionByEnumRegion(queryRegion);
                if (null == enumRegion) {
                    // Regions the mapper does not return are skipped.
                    continue;
                }

                FilmRegion filmRegion = new FilmRegion();
                filmRegion.setFilmCode(film.getCode());
                filmRegion.setRegionId(enumRegion.getId());

                Date now = new Date();
                filmRegion.setCreateTime(now);
                filmRegion.setUpdateTime(now);
                filmRegionMapper.insertSelective(filmRegion);
            }
        }

        // Only the first matching key/value pair is processed.
        break;
    }
}

From source file:qhindex.controller.SearchAuthorWorksController.java

/**
 * Collects the works citing {@code awork} by walking the paged citation listing.
 *
 * <p>The query component of the work's citations URL (everything after the first
 * {@code '?'}) is re-issued with an increasing {@code start} offset. Paging stops when a
 * page cannot be retrieved or yields fewer citation elements than a full page.
 *
 * @param awork   the work whose citations URL supplies the query component
 * @param results list that receives one {@link CitingWork} per citation element found
 * @throws IOException declared by this method; propagated from the page request
 */
public void searchCitationsForWork(AuthorWork awork, ArrayList<CitingWork> results) throws IOException {
    final int pageSize = 10;
    final String citationsUrl = awork.getCitationsUrl();
    final String query = citationsUrl.substring(citationsUrl.indexOf('?') + 1);

    for (int page = 0;; page++) {
        Debug.info("Retrieving citations for work - page " + page);

        String pageUrl = "https://scholar.google.com.au/scholar?start=" + (page * pageSize) + "&" + query;
        Document pageDoc = requestWebDocFromScholar(pageUrl);
        AppHelper.waitBeforeNewRequest();

        Elements hits = null;
        if (pageDoc != null) {
            hits = pageDoc.select("div.gs_ri");
            for (Element hit : hits) {
                results.add(extractCitationWork(hit));
            }
        }

        // Stop when the page could not be fetched, or when it is not full, which means
        // there are no further citations to request.
        if (hits == null || hits.size() < pageSize) {
            break;
        }
    }
}

From source file:com.liato.bankdroid.banking.banks.AbsIkanoPartner.java

/**
 * Fetches the login page and builds the form data needed to submit the login.
 *
 * <p>The hidden {@code __VIEWSTATE} and {@code __EVENTVALIDATION} values and the names
 * of the user, password and submit inputs are read from the fetched page.
 *
 * @return the login package holding the opener, the form data, the fetched page and the
 *         login URL
 * @throws BankException if the view state, the event validation value or any of the
 *                       login fields cannot be found on the page
 * @throws ClientProtocolException declared by this method; propagated from the request
 * @throws IOException declared by this method; propagated from the request
 */
@Override
protected LoginPackage preLogin() throws BankException, ClientProtocolException, IOException {
    final String loginUrl = "https://partner.ikanobank.se/web/engines/page.aspx?structid=" + structId;

    urlopen = new Urllib(context, CertificateReader.getCertificates(context, R.raw.cert_ikanopartner));
    response = urlopen.open(loginUrl);

    Document loginPage = Jsoup.parse(response);

    Element viewState = loginPage.getElementById("__VIEWSTATE");
    if (viewState == null || TextUtils.isEmpty(viewState.val())) {
        throw new BankException(res.getText(R.string.unable_to_find).toString() + " ViewState.");
    }

    Element eventValidation = loginPage.getElementById("__EVENTVALIDATION");
    if (eventValidation == null || TextUtils.isEmpty(eventValidation.val())) {
        throw new BankException(res.getText(R.string.unable_to_find).toString() + " EventValidation.");
    }

    Elements userInputs = loginPage.select("#LoginSpan input[type=text]");
    Elements passInputs = loginPage.select("#LoginSpan input[type=password]");
    Elements submitInputs = loginPage.select("#LoginCustomerDiv input[type=submit]");
    Element userInput = userInputs.first();
    Element passInput = passInputs.first();
    Element submitInput = submitInputs.first();

    boolean allFieldsPresent = userInput != null && passInput != null && submitInput != null;
    if (!allFieldsPresent) {
        throw new BankException(res.getText(R.string.unable_to_find).toString() + " login fields.");
    }

    // Field names are taken from the page rather than hard-coded.
    List<NameValuePair> form = new ArrayList<NameValuePair>();
    form.add(new BasicNameValuePair("__VIEWSTATE", viewState.val()));
    form.add(new BasicNameValuePair("__EVENTVALIDATION", eventValidation.val()));
    form.add(new BasicNameValuePair(userInput.attr("name"), username));
    form.add(new BasicNameValuePair(passInput.attr("name"), password));
    form.add(new BasicNameValuePair(submitInput.attr("name"), submitInput.val()));

    return new LoginPackage(urlopen, form, response, loginUrl);
}

From source file:gov.medicaid.screening.dao.impl.MedicalPracticeLicenseDAOBean.java

/**
 * Parse the License information./* w ww . j  av  a  2s.  c o  m*/
 *
 * @param city the license provider city
 * @param details the details page
 * @return the parsed license
 */
private License parseLicense(String city, Document details) {
    License license = new License();
    license.setCity(city);

    ProviderProfile profile = new ProviderProfile();
    license.setProfile(profile);

    String fullName = details.select("#_ctl7_lblName").text();
    User user = new User();
    String[] nameParts = fullName.split(",");
    if (nameParts.length > 0) {
        user.setLastName(nameParts[0].trim());
    }
    if (nameParts.length > 1) {
        user.setFirstName(nameParts[1].trim());
    }
    profile.setUser(user);

    String licenseType = details.select("#_ctl7_ProfileInfoLicense_lblLicType").text();
    LicenseType licType = new LicenseType();
    licType.setName(licenseType);
    license.setType(licType);

    String licenseNo = details.select("#_ctl7_ProfileInfoLicense_lblLicNbr").text();
    license.setLicenseNumber(licenseNo);

    String licensureAddress1 = details.select("#_ctl7_ProfileInfoPublic_lblAddress").text();
    String licensureAddress2 = details.select("#_ctl7_ProfileInfoPublic_lblAddress2").text();
    String licensureCityState = details.select("#_ctl7_ProfileInfoPublic_lblCity").text();
    Address address = new Address();
    address.setLocation(licensureAddress1 + " " + licensureAddress2);
    setCityStateZip(address, licensureCityState);

    String email = details.select("#_ctl7_ProfileInfoPublic_lblEmail").text();
    profile.setContactEmail(email);
    String birthYear = details.select("#_ctl7_ProfileInfoPublic_lblBirthYear").text();
    if (Util.isNotBlank(birthYear)) {
        profile.setDob(new GregorianCalendar(Integer.parseInt(birthYear), Calendar.JANUARY, 1).getTime());
    }

    String gender = details.select("#_ctl7_ProfileInfoPublic_lblGender").text();
    if ("Male".equals(gender)) {
        profile.setSex(Sex.MALE);
    } else if ("Female".equals(gender)) {
        profile.setSex(Sex.FEMALE);
    }

    String expirationDate = details.select("#_ctl7_ProfileInfoLicense_lblExpDate").text();
    String originalIssueDate = details.select("#_ctl7_ProfileInfoLicense_lblGrantDate").text();

    Date issueDate = parseDate(originalIssueDate, DATE_FORMAT);
    if (issueDate != null) {
        license.setOriginalIssueDate(issueDate);
    }

    Date expireDate = parseDate(expirationDate, DATE_FORMAT);
    if (expireDate != null) {
        license.setExpireDate(expireDate);
    }

    String licenseStatus = details.select("#_ctl7_ProfileInfoLicense_lblLicStatus").text();
    LicenseStatus status = new LicenseStatus();
    status.setName(licenseStatus);
    license.setStatus(status);

    String disciplinaryAction = details.select("#_ctl7_ProfileInfoLicense_lblDiscAction").text();
    String correctiveAction = details.select("#_ctl7_ProfileInfoLicense_lblCorrAction").text();

    license.setDiscipline(!"No".equals(disciplinaryAction.trim()));
    license.setCorrectiveAction(!"No".equals(correctiveAction.trim()));

    String medSchool = details.select("#_ctl7_ProfileInfoEducation_lblName").text();
    MedicalSchool medicalSchool = new MedicalSchool();
    medicalSchool.setName(medSchool);
    license.setMedicalSchool(medicalSchool);

    String degree = details.select("#_ctl7_ProfileInfoEducation_lblDegree").text();
    if ("PhD".equals(degree.trim())) {
        profile.setDegree(Degree.DOCTORATE);
    } else if (!Util.isBlank(degree)) {
        profile.setDegree(Degree.MASTER);
    }

    PrivatePractice privatePractice = new PrivatePractice();
    profile.setPrivatePractice(privatePractice);

    String primaryAddressName = details.select("#_ctl7_ProfileInfoPractices_lblPrimaryName").text();
    String primaryAddress1 = details.select("#_ctl7_ProfileInfoPractices_lblPrimaryAddress").text();
    String primaryCityState = details.select("#_ctl7_ProfileInfoPractices_lblPrimaryAddress2").text();
    String primaryPhone = details.select("#_ctl7_ProfileInfoPractices_lblPrimaryPhone").text();
    if (Util.isNotBlank(primaryAddressName) || Util.isNotBlank(primaryAddress1)
            || Util.isNotBlank(primaryCityState)) {
        Address primary = new Address();
        address.setLocation(primaryAddressName + " " + primaryAddress1);
        setCityStateZip(primary, primaryCityState);
        privatePractice.setOfficeAddress(primary);
    }
    privatePractice.setOfficePhoneNumber(primaryPhone);

    String secondaryAddressName = details.select("#_ctl7_ProfileInfoPractices_lblSecondaryName").text();
    String secondaryAddress1 = details.select("#_ctl7_ProfileInfoPractices_lblSecondaryAddress").text();
    String secondaryCityState = details.select("#_ctl7_ProfileInfoPractices_lblSecondaryAddress2").text();
    String secondaryPhone = details.select("#_ctl7_ProfileInfoPractices_lblSecondaryPhone").text();

    if (Util.isNotBlank(secondaryAddressName) || Util.isNotBlank(secondaryAddress1)
            || Util.isNotBlank(secondaryCityState)) {
        Address secondary = new Address();
        address.setLocation(secondaryAddressName + " " + secondaryAddress1);
        setCityStateZip(secondary, secondaryCityState);
        privatePractice.setSecondaryAddress(secondary);
    }

    privatePractice.setSecondaryPhoneNumber(secondaryPhone);

    Elements specialties = details.select("#_ctl7_ProfileInfoSpecialty_dgSpecialty tr:gt(0)");
    List<Specialty> sps = new ArrayList<Specialty>();
    for (Element element : specialties) {
        Specialty sp = new Specialty();
        SpecialtyType spt = new SpecialtyType();
        spt.setName(element.select("td:eq(0)").text());
        sp.setType(spt);
        sp.setName(element.select("td:eq(1)").text());
        sps.add(sp);
    }
    profile.setSpecialties(sps);
    return license;
}

From source file:eu.riscoss.rdc.RDCFossology.java

/**
 * Analyses a fossology html file/*w ww  .  j ava  2 s . co  m*/
 * @param target
 * @param licensesMap
 * @return
 * @throws IOException
 */
private HashMap<String, Integer> analyseOverviewReport(String target,
        HashMap<String, Collection<String>> licensesMap) throws IOException {
    //private static HashMap<String, Integer> analyseFossologyReport(String target, String licenseFile) throws IOException {
    //        List<String> result = new ArrayList<String>();
    Document document;

    if (target.startsWith("http")) {
        document = Jsoup.connect(target).get();
    } else {
        File file = new File(target);
        document = Jsoup.parse(file, "UTF-8", "http://localhost");
    }

    Element table = document.select("table[id=lichistogram]").first();
    Elements rows = table.select("tr");

    List<LicenseEntry> llist = new ArrayList<LicenseEntry>(); //list of licenses in the fossology file

    //for each license, parses the name (0) and the number of occurrences (2) and saves it as a LicenseEntry
    for (Element element : rows) {
        Elements col = element.select("td");

        if (col.size() != 0) {
            int c = Integer.parseInt(col.get(0).ownText());//num of occurrences
            String lic = col.get(2).text();
            llist.add(new LicenseEntry(c, lic));
        }
    }

    //get license type buckets

    HashMap<String, Integer> licenseBuckets = new HashMap<String, Integer>();
    int total = 0;

    Set<String> licenseTypes = licensesMap.keySet();
    //initialize with 0 to avoid missing types
    for (String licensetype : licenseTypes) {
        licenseBuckets.put(licensetype, 0);
    }

    boolean matched = false;
    int numUnknown = 0;
    for (LicenseEntry le : llist) {
        for (String licenseType : licenseTypes) {//cycles on license types from config file
            if (le.matchesOneOf(licensesMap.get(licenseType), licenseType)) {
                Integer currentcount = licenseBuckets.get(le.licensetype);
                if (currentcount == null) //for safety, but should be initialised
                    currentcount = 0;
                licenseBuckets.put(le.licensetype, currentcount + le.count);
                matched = true;
            }
        }
        total += le.count;
        if (matched == false) { //unknown
            numUnknown += le.count;
            System.err.println("Unknown license: " + le.getName());
        }
    }

    licenseBuckets.put("_unknown_", numUnknown);
    licenseBuckets.put("_sum_", total);
    licenseBuckets.put("_count_", llist.size());

    return licenseBuckets;
}

From source file:blackman.matt.board.Post.java

/**
 * Formats the HTML on the post text to accurately display it on the post.
 *
 * <p>Elements of class {@code heading} and {@code quote} are wrapped in coloured
 * {@code font} tags, site-relative board and reply links are made absolute, and the text
 * of {@code toolong} elements is cleared. As a side effect, the fragment of every reply
 * link's href is added to {@code repliedTo}.
 *
 * @param post The unformatted text of the post.
 * @return A formatted version of the post.
 */
private String formatPostBody(String post) {
    Document formattedText = Jsoup.parse(post);
    Pattern p = Pattern.compile("^/.*/index\\.html");

    // Red Text
    Elements redTexts = formattedText.getElementsByClass("heading");
    for (Element text : redTexts) {
        text.wrap("<font color=\"#AF0A0F\"><strong></strong></font>");
    }

    // Green text
    Elements greenTexts = formattedText.getElementsByClass("quote");
    for (Element text : greenTexts) {
        text.wrap("<font color=\"#789922\"></font>");
    }

    // Board Links
    Elements boardLinks = formattedText.select("a");
    for (Element link : boardLinks) {
        String url = link.attr("href");
        Matcher m = p.matcher(url);
        if (m.matches()) {
            link.attr("href", "http://8chan.co" + url);
        }
    }

    // Reply links (the attribute selector needs its closing bracket to be well-formed)
    Elements replyLinks = formattedText.select("a[onclick^=highlightReply]");
    for (Element reply : replyLinks) {
        String href = reply.attr("href");

        // The part after the first '#' identifies the post being replied to;
        // an href without a fragment contributes nothing.
        String[] hrefParts = href.split("#");
        if (hrefParts.length > 1) {
            repliedTo.add(hrefParts[1]);
        }

        // Rewrite only this reply link; the other anchors keep their own targets.
        reply.attr("href", "http://8chan.co" + href);
    }

    // Post too long text removal
    Elements tooLongs = formattedText.getElementsByClass("toolong");
    for (Element text : tooLongs) {
        text.text("");
    }

    return formattedText.toString();
}

From source file:gov.medicaid.screening.dao.impl.NurseAnesthetistsLicenseDAOBean.java

/**
 * Performs the call to the source site, exact match is expected given the parameters.
 *
 * <p>The search page is fetched first to obtain the hidden {@code __EVENTVALIDATION} and
 * {@code __VIEWSTATE} values, which are then posted back together with the AANA number
 * and the last four characters of the SSN. When the criteria ask for recertification, a
 * second post switches the display mode before the page is parsed. The HTTP client
 * created for the lookup is shut down before the method returns or throws.
 *
 * @param criteria the search criteria
 * @return the matched result, null if not found
 * @throws IOException if an I/O error is encountered
 * @throws URISyntaxException if the site URL cannot properly be created
 * @throws ServiceException for any other exceptions encountered
 */
private ProviderProfile getProviderProfile(NurseAnesthetistsSearchCriteria criteria)
        throws ServiceException, IOException, URISyntaxException {
    DefaultHttpClient client = new DefaultHttpClient();
    try {
        client.setRedirectStrategy(new LaxRedirectStrategy());

        String searchURL = getSearchURL();
        HttpGet getSearch = new HttpGet(new URIBuilder(searchURL).build());
        HttpResponse response = client.execute(getSearch);
        verifyAndAuditCall(searchURL, response);

        Document page = Jsoup.parse(EntityUtils.toString(response.getEntity()));
        HttpPost search = new HttpPost(new URIBuilder(searchURL).build());

        String searchType = "Lookup Certification Status";

        String last4 = criteria.getSsn().substring(criteria.getSsn().length() - 4);
        HttpEntity entity = postForm(searchURL, client, search,
                new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "" },
                        { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() },
                        { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() },
                        { "ctl00$PageContent$CertRecert$btnLookup", searchType },
                        { "ctl00$PageContent$CertRecert$txtAANANumber", "" + criteria.getAanaNumber() },
                        { "ctl00$PageContent$CertRecert$txtSSNLast4", "" + last4 } },
                true);

        page = Jsoup.parse(EntityUtils.toString(entity));
        Elements message = page.select("#ctl00_PageContent_ucCredentialsControl_lblErrorMessage");
        if (message.size() > 0) {
            if (message.text().startsWith("No individual with a social security number")) {
                // no match, return null
                return null;
            }
        }

        if (criteria.isRecertification()) {
            // Post back the state of the result page to switch to the recertification view.
            searchType = "Lookup Recertification Status";
            entity = postForm(searchURL, client, search,
                    new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "" },
                            { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() },
                            { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() },
                            { "ctl00$PageContent$CertRecert$btnSwapDisplayMode", searchType } },
                    true);
            page = Jsoup.parse(EntityUtils.toString(entity));
        }
        return parseProvider(page);
    } finally {
        // The client is local to this call; shut its connection manager down so the
        // underlying connections are released on every exit path.
        client.getConnectionManager().shutdown();
    }
}

From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java

/**
 * Reads the {@code .fm-minfo} key/value list of a film page and copies the recognised
 * values (director, year and release date, length, alias) onto {@code film}.
 *
 * <p>The {@code dt} (key) and {@code dd} (value) elements are paired by index. Only
 * indexes present in both lists are visited, so a page with fewer {@code dd} than
 * {@code dt} elements does not raise an {@link IndexOutOfBoundsException}.
 *
 * @param doc  the parsed film page
 * @param film the film to populate
 */
private void setFilmRelated(Document doc, Film film) {
    Elements keyElements = doc.select(".fm-minfo dt");
    Elements valueElements = doc.select(".fm-minfo dd");
    if (CollectionUtils.isEmpty(keyElements) || CollectionUtils.isEmpty(valueElements)) {
        return;
    }

    // Pair keys and values only as far as both lists reach.
    int pairCount = Math.min(keyElements.size(), valueElements.size());
    for (int i = 0; i < pairCount; i++) {
        Element keyElement = keyElements.get(i);
        Element valueElement = valueElements.get(i);
        if (null == keyElement || null == valueElement) {
            continue;
        }

        String key = StringUtils.trimToEmpty(keyElement.text());
        if (StringUtils.isBlank(key)) {
            continue;
        }

        String value = StringUtils.trimToEmpty(valueElement.text());

        // NOTE(review): the key literals compared below are empty (or "??") in this copy of
        // the file, and an empty literal can never equal a non-blank key. Presumably the
        // original labels were non-ASCII text lost in transcoding - restore them before
        // relying on this method.

        // Director: link the director record when one is available, always keep the name.
        if (StringUtils.equalsIgnoreCase(key, "")) {
            Director director = createOrQueryDirector(value);
            if (null != director) {
                film.setDirectorId(director.getId());
            }
            film.setDirector(value);
        }

        // Release information: year reference and release date.
        if (StringUtils.equalsIgnoreCase(key, "")) {
            String urlYear = getFilmUrlYear(doc, value);
            if (StringUtils.isNotBlank(urlYear)) {
                EnumYear enumYear = queryEnumYear(urlYear);
                if (null != enumYear) {
                    film.setYearId(enumYear.getId());
                }
            }

            Date releaseDate = getFilmReleaseDate(value);
            if (null != releaseDate) {
                film.setReleaseDate(releaseDate);
            }
        }

        // Length.
        if (StringUtils.equalsIgnoreCase(key, "")) {
            int length = getFilmLength(value);
            film.setLength(length);
        }

        // Alias: only stored when a value is present.
        if (StringUtils.equalsIgnoreCase(key, "??")) {
            if (StringUtils.isNotBlank(value)) {
                film.setAlias(value);
            }
        }
    }
}

From source file:gov.medicaid.screening.dao.impl.DieteticsAndNutritionPracticeLicenseDAOBean.java

/**
 * Performs a search for all possible results.
 *
 * <p>The landing page is fetched to obtain the {@code __VIEWSTATE} value, the search form
 * is posted, and every result page is parsed. While a result page contains a link whose
 * text is {@code "Next >>"}, the paging post-back is issued and the next page is
 * processed. The HTTP client created for the search is shut down before the method
 * returns or throws.
 *
 * @param identifier The value to be searched.
 * @return the search result for licenses
 * @throws URISyntaxException When an error occurs while building the URL.
 * @throws ClientProtocolException When client does not support protocol used.
 * @throws IOException When an error occurs while parsing response.
 * @throws ParseException When an error occurs while parsing response.
 * @throws PersistenceException for database related errors
 * @throws ServiceException for any other errors
 */
private SearchResult<License> getAllResults(String identifier) throws URISyntaxException,
        ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient();
    try {
        URIBuilder builder = new URIBuilder(getSearchURL());
        String hostId = builder.build().toString();

        HttpGet httpget = new HttpGet(builder.build());
        HttpResponse landing = client.execute(httpget);
        Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity()));

        HttpPost httppost = new HttpPost(builder.build());
        HttpEntity entity = postForm(hostId, client, httppost,
                new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                        { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" },
                        { "__EVENTARGUMENT", "" },
                        { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                true);

        // licenses list
        List<License> licenseList = new ArrayList<License>();
        while (entity != null) {
            String result = EntityUtils.toString(entity);
            document = Jsoup.parse(result);

            // select() yields an empty list rather than null when nothing matches.
            Elements trs = document.select(GRID_ROW_SELECTOR);
            for (Element element : trs) {
                licenseList.add(parseLicense(element.children()));
            }

            // done, check if there are additional results
            entity = null;
            Elements elements = document.getElementsByTag("a");
            for (Element element : elements) {
                if (element.text().equals("Next >>")) {
                    // Post back with the current page's view state to request the next page.
                    entity = postForm(hostId, client, httppost,
                            new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                                    { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" },
                                    { "__EVENTARGUMENT", "" },
                                    { "__VIEWSTATE",
                                            document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                            true);
                    break;
                }
            }
        }

        SearchResult<License> result = new SearchResult<License>();
        result.setItems(licenseList);
        return result;
    } finally {
        // The client is local to this call; shut its connection manager down so the
        // underlying connections are released on every exit path.
        client.getConnectionManager().shutdown();
    }
}

From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java

/**
 * Builds the title-to-link map from the table of contents of the current swagger document.
 *
 * <p>Category links come from the first child of every entry under the first
 * {@code .sectlevel1} element. Individual links come from the anchors inside each
 * {@code .sectlevel2} element, registered with a 1-based counter that follows the order
 * of those elements.
 *
 * @return the populated title-to-link map
 */
private Map<String, ConfluenceLink> buildTableOfContentsLinkMap() {
    final Map<String, ConfluenceLink> linksByTitle = new HashMap<>();

    final Elements toc = SWAGGER_DOCUMENT.get().select(".toc");

    // The first child of each top-level entry is collected as its category link.
    final Elements categoryLinks = new Elements();
    final Element topLevel = toc.select(".sectlevel1").first();
    for (final Element categoryEntry : topLevel.children()) {
        categoryLinks.add(categoryEntry.children().first());
    }

    final Elements sections = toc.select(".sectlevel2");

    addLinksByType(linksByTitle, categoryLinks, PageType.CATEGORY, null);

    // Sections are numbered from 1 in document order.
    for (int index = 0; index < sections.size(); index++) {
        final Elements sectionLinks = sections.get(index).select("a");
        addLinksByType(linksByTitle, sectionLinks, INDIVIDUAL, index + 1);
    }

    return linksByTitle;
}