Example usage for org.jsoup.nodes Element select

List of usage examples for org.jsoup.nodes Element select

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element.select method.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:com.pemikir.youtubeplus.youtube.YoutubeExtractor.java

/**
 * Builds a {@code VideoInfoItem} from one list element of a video listing.
 *
 * <p>The watch URL, video id, title, uploader, duration and thumbnail URL are read
 * from descendants of {@code li} via CSS selectors.
 *
 * @param li the list element describing a single video
 * @return the populated item; {@code id} is left unset when the URL carries no
 *         {@code v=} parameter
 */
private VideoInfoItem extractVideoInfoItem(Element li) {
    VideoInfoItem info = new VideoInfoItem();
    info.webpage_url = li.select("a[class*=\"content-link\"]").first().attr("abs:href");

    // Video ids may contain '_' as well as '-', so both must be part of the
    // character class or the id would be cut off at the first underscore.
    Matcher m = Pattern.compile("v=([0-9a-zA-Z_-]*)").matcher(info.webpage_url);
    // Only read the group after a successful find(); calling group() without a
    // match throws IllegalStateException.
    if (m.find()) {
        info.id = m.group(1);
    }

    info.title = li.select("span[class=\"title\"]").first().text();

    info.uploader = li.select("span[class=\"g-hovercard\"]").first().text();

    info.duration = li.select("span[class=\"video-time\"]").first().text();

    Element img = li.select("img").first();
    info.thumbnail_url = img.attr("abs:src");
    // Sometimes YouTube sends links to gif files which somehow seem to not exist
    // anymore. Items with such a gif also offer a secondary image source, so we
    // use that one if we caught such an item.
    if (info.thumbnail_url.contains(".gif")) {
        info.thumbnail_url = img.attr("data-thumb");
    }

    return info;
}

From source file:gov.medicaid.screening.dao.impl.NursingLicenseDAOBean.java

/**
 * Parses the nursing license details page.
 *
 * <p>The name label is split on single spaces: the last part becomes the last name,
 * the first part (if there is more than one) the first name, and everything in
 * between the middle name. Each row of the license grid is expected to start with a
 * cell of the form {@code "<type> <number>"}.
 *
 * @param page the details page
 * @param licenseType if user has multiple licenses, this one will be used
 * @return the parsed license details
 * @throws ParsingException if the page does not contain the expected elements
 */
private License parseLicense(Document page, String licenseType) throws ParsingException {
    if (!page.select("span#lblFormTitle").text().equals("License Details")) {
        throw new ParsingException(ErrorCode.MITA50002.getDesc());
    }

    License license = new License();
    ProviderProfile profile = new ProviderProfile();
    license.setProfile(profile);

    String fullName = page.select("#_ctl7_lblName").text();

    User user = new User();
    profile.setUser(user);
    String[] nameParts = fullName.split(" ");
    user.setLastName(nameParts[nameParts.length - 1]);
    if (nameParts.length > 1) {
        user.setFirstName(nameParts[0]);
    }
    // everything else goes to middle name (per site behavior)
    if (nameParts.length > 2) {
        StringBuilder middleName = new StringBuilder();
        for (int i = 1; i < nameParts.length - 1; i++) {
            if (middleName.length() > 0) {
                middleName.append(" ");
            }
            middleName.append(nameParts[i]);
        }
        user.setMiddleName(middleName.toString());
    }

    String dateOfBirth = page.select("#_ctl7_lblDOB").text();
    if (Util.isNotBlank(dateOfBirth)) {
        profile.setDob(parseDate(dateOfBirth, DATE_FORMAT));
    }

    String gender = page.select("#_ctl7_lblGender").text();
    if (Util.isNotBlank(gender)) {
        // any non-blank value other than "Female" is treated as male
        if ("Female".equals(gender)) {
            profile.setSex(Sex.FEMALE);
        } else {
            profile.setSex(Sex.MALE);
        }
    }

    Elements licenses = page.select("#_ctl7_dgLicense tr.Normal");
    for (Element row : licenses) {
        String licenseNumber = row.select("td:eq(0)").text();
        if (licenseType != null && !licenseNumber.startsWith(licenseType)) {
            // user has multiple licenses, the results will show this user twice (search by name)
            continue;
        }

        String[] licenseParts = licenseNumber.split(" ");
        if (licenseParts.length < 2) {
            // The cell is not "<type> <number>"; report it as a parsing problem
            // instead of failing with an ArrayIndexOutOfBoundsException below.
            throw new ParsingException(ErrorCode.MITA50002.getDesc());
        }
        LicenseType type = new LicenseType();
        // fall back to the raw code when it has no known display name
        type.setName(TYPES.get(licenseParts[0]) == null ? licenseParts[0] : TYPES.get(licenseParts[0]));
        license.setType(type);
        license.setLicenseNumber(licenseParts[1]);

        String issueDate = row.select("td:eq(1)").text();
        if (Util.isNotBlank(issueDate)) {
            license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT));
        }

        String expirationDate = row.select("td:eq(2)").text();
        if (Util.isNotBlank(expirationDate)) {
            license.setExpireDate(parseDate(expirationDate, DATE_FORMAT));
        }
    }
    return license;
}

From source file:de.geeksfactory.opacclient.apis.Littera.java

/**
 * Downloads the simple-search page and appends its free-text search field to
 * {@code fields}.
 *
 * <p>The display name is taken from the {@code h4} heading inside the
 * {@code .simple_search} element and the field id from the {@code name} attribute
 * of the {@code #keyboard} element. The id is also stored under the
 * {@code "meaning"} key of the field's data object.
 *
 * @param fields the list the new field is added to
 * @throws IOException declared by the method; presumably raised by the HTTP request
 * @throws JSONException declared by the method; presumably raised when filling the data object
 */
protected void addSimpleSearchField(List<SearchField> fields) throws IOException, JSONException {
    final Document searchPage = Jsoup.parse(httpGet(getApiUrl() + "&mode=s", getDefaultEncoding()));
    final Element searchBox = searchPage.select(".simple_search").first();
    final String heading = searchBox.select("h4").first().text();
    final String inputName = searchBox.select("#keyboard").first().attr("name");

    final TextSearchField freeText = new TextSearchField();
    freeText.setFreeSearch(true);
    freeText.setDisplayName(heading);
    freeText.setId(inputName);
    freeText.setHint("");
    freeText.setData(new JSONObject());
    freeText.getData().put("meaning", freeText.getId());

    fields.add(freeText);
}

From source file:mobi.jenkinsci.alm.assembla.client.AssemblaClient.java

/**
 * Obtains an access token through the PIN-based authorization pages and stores it
 * in {@code accessToken}.
 *
 * <p>If the authorization request was redirected to the login path, the login form
 * is submitted first. The PIN is then read from the {@code h1} inside the
 * {@code div[class=box]} element and posted to the authorization endpoint.
 *
 * @throws IOException if the PIN cannot be found in the response or the PIN post
 *         does not answer with HTTP 200
 */
public void login() throws IOException {
    Document authPage = Jsoup.parse(getData(String.format(AUTH, appId), false));
    final boolean redirectedToLogin = getLatestRedirectedUrl().getPath().startsWith(LOGIN);
    if (redirectedToLogin) {
        authPage = postLoginForm(authPage);
    }

    final Element box = authPage.select("div[class=box]").first();
    if (box == null) {
        throw new IOException("Missing PIN code from Assembla auth response");
    }

    final Element label = box.select("p").first();
    final Element value = box.select("h1").first();
    if (!(label != null && value != null)) {
        throw new IOException("Missing PIN code from Assembla auth response");
    }

    // the PIN is the first child node of the heading
    final String pin = value.childNode(0).toString();
    final String tokenUrl = String.format(ASSEMBLA_SITE_APP_AUTH, appId, appSecret)
            + String.format(PIN_AUTH, pin);
    final HttpPost pinRequest = new HttpPost(tokenUrl);
    final HttpResponse tokenResponse = httpClient.execute(pinRequest);
    try {
        if (tokenResponse.getStatusLine().getStatusCode() == HttpURLConnection.HTTP_OK) {
            final InputStreamReader body = new InputStreamReader(tokenResponse.getEntity().getContent(),
                    "UTF-8");
            accessToken = gson.fromJson(new JsonReader(body), AssemblaAccessToken.class);
        } else {
            throw new IOException(
                    "Post " + pinRequest.getURI() + " for a PIN failed: " + tokenResponse.getStatusLine());
        }
    } finally {
        pinRequest.releaseConnection();
    }
}

From source file:gov.medicaid.screening.dao.impl.NursingLicenseDAOBean.java

/**
 * Performs a search for all possible results.
 *
 * <p>The search page is fetched first so that its {@code __VIEWSTATE} value can be
 * sent along with the following form posts. A name search walks every result page
 * and loads the details page of each row; a license-number search expects the
 * response itself to be the details page.
 *
 * @param criteria The search criteria.
 * @param byName flag indicating it is a name search
 * @return the search result for licenses
 *
 * @throws URISyntaxException if an error occurs while building the URL.
 * @throws ClientProtocolException if client does not support protocol used.
 * @throws IOException if an error occurs while parsing response.
 * @throws ParseException if an error occurs while parsing response.
 * @throws ServiceException for any other problems encountered
 */
private SearchResult<License> getAllResults(NursingLicenseSearchCriteria criteria, boolean byName)
        throws URISyntaxException, ClientProtocolException, IOException, ParseException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager());
    client.setRedirectStrategy(new LaxRedirectStrategy());
    client.setCookieStore(loginAsPublicUser());

    // initial GET of the search form; its hidden __VIEWSTATE field is needed below
    HttpGet getSearch = new HttpGet(new URIBuilder(getSearchURL()).build());
    HttpResponse response = client.execute(getSearch);
    verifyAndAuditCall(getSearchURL(), response);

    Document page = Jsoup.parse(EntityUtils.toString(response.getEntity()));

    // the same HttpPost instance is handed to every postForm/getResultPage call
    HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build());

    List<License> allLicenses = new ArrayList<License>();

    // switch to search by name screen
    if (byName) {
        HttpEntity entity = postForm(getSearchURL(), client, search,
                new String[][] { { "__EVENTTARGET", "_ctl7_rbtnSearch_1" }, { "__EVENTARGUMENT", "" },
                        { "_ctl7:ddlbLicenseType", "R" }, { "_ctl7:rbtnSearch", "2" },
                        { "_ctl7:txtCheckDigit", "" }, { "_ctl7:txtLicenseNumber", "" },
                        { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() } },
                true);

        // the name-search screen is parsed and then submitted with the actual criteria
        page = Jsoup.parse(EntityUtils.toString(entity));
        entity = getResultPage(criteria, client, page, search, "_ctl7:cmdSearch", getSearchURL());
        page = Jsoup.parse(EntityUtils.toString(entity));

        // get the data grid entries
        if (page.select("table#_ctl7_grdSearchResults").size() < 1) {
            throw new ParsingException(ErrorCode.MITA50002.getDesc());
        }

        Elements rows = page.select(GRID_ROW_SELECTOR);
        while (rows.size() > 0) {
            for (Element row : rows) {
                // each row links to a details page; the fifth cell holds the license number
                String url = row.select("a").first().attr("href");
                String licenseNo = row.select("td:eq(4)").text();
                HttpGet getDetail = new HttpGet(Util.replaceLastURLPart(getSearchURL(), url));
                response = client.execute(getDetail);
                verifyAndAuditCall(getSearchURL(), response);
                Document licenseDetails = Jsoup.parse(EntityUtils.toString(response.getEntity()));
                // the first character of the license number is passed as the license type filter
                // NOTE(review): substring(0, 1) throws if the cell text is empty - confirm it cannot be
                allLicenses.add(parseLicense(licenseDetails, licenseNo.substring(0, 1)));
            }
            // empty the list so the loop ends unless a next page refills it below
            // NOTE(review): on recent jsoup versions Elements.clear() may also detach
            // these rows from the document - confirm against the jsoup version in use
            rows.clear();

            // check for next page
            // NOTE(review): currentPage is dereferenced without a null check; a result
            // grid without a pager span would fail here - confirm the pager is always present
            Element currentPage = page.select("#_ctl7_grdSearchResults tr.TablePager span").first();
            if (getLog() != null) {
                getLog().log(Level.DEBUG, "Current page is: " + currentPage.text());
            }
            // the element following the current-page span is treated as the next-page link
            Element pageLink = currentPage.nextElementSibling();
            if (pageLink != null && pageLink.hasAttr("href")) {
                if (getLog() != null) {
                    getLog().log(Level.DEBUG, "There are more results, getting the next page.");
                }

                String target = parseEventTarget(pageLink.attr("href"));
                entity = getResultPage(criteria, client, page, search, target, getSearchURL());
                page = Jsoup.parse(EntityUtils.toString(entity));
                rows = page.select(GRID_ROW_SELECTOR);
            }
        }

    } else { // search by license number (site supports only exact match)

        HttpEntity entity = postForm(getSearchURL(), client, search,
                new String[][] { { "__EVENTTARGET", "_ctl7:cmdSearch" }, { "__EVENTARGUMENT", "" },
                        { "_ctl7:ddlbLicenseType", Util.defaultString(criteria.getLicenseType().getName()) },
                        { "_ctl7:rbtnSearch", "1" },
                        { "_ctl7:txtCheckDigit", Util.defaultString(criteria.getCheckDigit()) },
                        { "_ctl7:txtLicenseNumber", Util.defaultString(criteria.getIdentifier()) },
                        { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() } },
                true);

        page = Jsoup.parse(EntityUtils.toString(entity));
        // a match is only recorded when the response is the details page itself
        if (page.select("span#lblFormTitle").text().equals("License Details")) {
            String prefLicenseType = criteria.getLicenseType().getName();
            allLicenses.add(parseLicense(page, prefLicenseType));
        }
    }

    SearchResult<License> searchResult = new SearchResult<License>();
    searchResult.setItems(allLicenses);
    return searchResult;
}

From source file:org.confab.VBulletinParser.java

/**
 * Parses the forum rows of a forum index page into {@code Forum} objects.
 *
 * <p>For every usable table row the first link in the second cell supplies the
 * forum URL and title; any further links in that cell are added as sub-forums.
 * Rows that contain a {@code td.thead} cell or fewer than three cells are skipped.
 * The list of links is only read, never modified, so {@code root} is left intact.
 *
 * @param root   the parsed index page
 * @param parent the bulletin board the forums belong to
 * @return the forums found, in the order their rows were selected
 */
public List<Forum> parseForums(Document root, BulletinBoard parent) {
    Utilities.debug("parseForums");

    List<Forum> ret = new ArrayList<Forum>();

    // get table
    Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr");
    assert !forum_table.isEmpty();

    for (Element el_tr : forum_table) {
        Forum new_forum = new Forum(parent);

        // Get the table data for this row
        Elements el_tds = el_tr.select("td");
        assert !el_tds.isEmpty() : el_tr.html();

        // xbox360achievements has a lot of subforums and puts these in their own table.
        // The <a>'s are picked up as children of the parent <td>, so don't parse the
        // rows of this sub-table separately.
        if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) {
            //Utilities.debug("tr doesn't seem to have anything we want, skipping.");
            continue;
        }

        // Get the title URL
        Elements els_a = el_tds.get(1).select("a");
        assert !els_a.isEmpty() : el_tds.html();
        Element el_title = els_a.first();
        assert el_title != null;
        new_forum.url = el_title.attr("href");
        assert new_forum.url != null;
        Utilities.debug("new_forum.url : " + new_forum.url);

        // Get the title text
        new_forum.title = el_title.text();
        assert new_forum.title != null;
        Utilities.debug("new_forum.title : " + new_forum.title);

        // Check for any subforums in the remaining a elements. Iterate over a view
        // that skips the title link instead of removing it from els_a: recent jsoup
        // releases apply Elements.remove(...) to the document as well, which would
        // silently strip the link from the caller's DOM.
        for (Element el_a : els_a.subList(1, els_a.size())) {
            Forum sub_forum = new Forum(parent);
            sub_forum.url = el_a.attr("href");
            assert sub_forum.url != null;
            sub_forum.title = el_a.text();
            assert sub_forum.title != null;
            new_forum.subForums.add(sub_forum);
            Utilities.debug("added subForum: " + sub_forum.title);
        }

        // Get num viewing the current forum
        Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first();
        if (el_viewing != null) {
            new_forum.numViewing = el_viewing.text();
        } else {
            new_forum.numViewing = "0";
        }
        Utilities.debug("new_forum.numViewing : " + new_forum.numViewing);

        // Get the description/message of this topic
        Element el_description = el_tds.get(1).select("div.smallfont").first();
        if (el_description != null) {
            new_forum.description = el_description.text();
        } else {
            new_forum.description = "";
        }
        Utilities.debug("new_forum.description : " + new_forum.description);

        Utilities.debug("new_forum.parent.url : " + new_forum.parent.url);

        ret.add(new_forum);
        Utilities.debug("-----");
    }
    Utilities.debug("end parseForums");
    return ret;
}

From source file:eu.riscoss.dataproviders.providers.FossologyDataProvider.java

/**
 * Analyses a fossology html file/* w ww . java 2 s .  c  om*/
 * @param target
 * @param licensesMap
 * @return
 * @throws IOException
 */
private HashMap<String, Integer> analyseOverviewReport(String target,
        HashMap<String, Collection<String>> licensesMap) throws IOException {
    //private static HashMap<String, Integer> analyseFossologyReport(String target, String licenseFile) throws IOException {
    //        List<String> result = new ArrayList<String>();
    Document document;

    if (target.startsWith("http")) {
        document = Jsoup.connect(target).get();
    } else {
        File file = new File(target);
        document = Jsoup.parse(file, "UTF-8", "http://localhost");
    }

    Element table = document.select("table[id=lichistogram]").first();
    Elements rows = table.select("tr");

    List<LicenseEntry> llist = new ArrayList<LicenseEntry>(); //list of licenses in the fossology file

    //for each license, parses the name (0) and the number of occurrences (2) and saves it as a LicenseEntry
    for (Element element : rows) {
        Elements col = element.select("td");

        if (col.size() != 0) {
            int c = Integer.parseInt(col.get(0).ownText());//num of occurrences
            String lic = col.get(2).text();
            llist.add(new LicenseEntry(c, lic));
            //mlist.put(lic, c);
        }
        //           System.out.println(col.get(1).ownText());
        //           Element count=col.get(0);
    }

    //get license type buckets

    HashMap<String, Integer> licenseBuckets = new HashMap<String, Integer>();
    int total = 0;

    Set<String> licenseTypes = licensesMap.keySet();
    //initialize with 0 to avoid missing types
    for (String licensetype : licenseTypes) {
        licenseBuckets.put(licensetype, 0);
    }

    boolean matched = false;
    int numUnknown = 0;
    for (LicenseEntry le : llist) {
        for (String licenseType : licenseTypes) {//cycles on license types from config file
            if (le.matchesOneOf(licensesMap.get(licenseType), licenseType)) {
                Integer currentcount = licenseBuckets.get(le.licensetype);
                if (currentcount == null) //for safety, but should be initialised
                    currentcount = 0;
                licenseBuckets.put(le.licensetype, currentcount + le.count);
                matched = true;
            }
        }
        total += le.count;
        if (matched == false) { //unknown
            numUnknown += le.count;
            System.out.println("Unknown license: " + le.getName());
        }
    }

    licenseBuckets.put("_unknown_", numUnknown);
    licenseBuckets.put("_sum_", total);
    licenseBuckets.put("_count_", llist.size());

    System.out.println("\nLicense Buckets Fossology from HTML overview scanning:");
    System.out.println(licenseBuckets);

    //        for (String license : result) {
    //            System.out.format("%s\n", license);
    //        }
    return licenseBuckets;
}

From source file:mobi.jenkinsci.alm.assembla.client.AssemblaClient.java

/**
 * Fills in and submits the login form found in {@code pinDoc} and returns the page
 * the server redirects to afterwards.
 *
 * <p>All {@code input} fields of the {@code form[id=login-box]} element are posted
 * back; the fields with id {@code user_login} and {@code user_password} receive the
 * configured credentials. Submit buttons other than the one named {@code commit}
 * are left out.
 *
 * @param pinDoc the page containing the login form
 * @return the document parsed from the redirect target of the login post
 * @throws IOException if the form is missing, the login post does not answer with
 *         a temporary redirect, or the redirect has no {@code Location} header
 */
private Document postLoginForm(final Document pinDoc) throws IOException {
    final List<NameValuePair> formNvps = new ArrayList<NameValuePair>();
    final Element form = pinDoc.select("form[id=login-box]").first();
    if (form == null) {
        // first() returns null when nothing matched
        throw new IOException("Missing login form in Assembla auth response");
    }
    final String formAction = form.attr("action");
    final HttpPost formPost = new HttpPost(getUrl(formAction).toString());
    final Elements formFields = form.select("input");
    for (final Element element : formFields) {
        final String fieldName = element.attr("name");
        String fieldValue = element.attr("value");
        final String fieldId = element.attr("id");
        final String fieldType = element.attr("type");

        final boolean passwordField = fieldId.equalsIgnoreCase("user_password");
        if (fieldId.equalsIgnoreCase("user_login")) {
            fieldValue = username;
        } else if (passwordField) {
            fieldValue = password;
        }

        // only the "commit" submit button is part of the posted data
        if (fieldType.equals("submit") && !fieldName.equalsIgnoreCase("commit")) {
            continue;
        }

        // never write the password to the log; mask it in the debug output only
        final String loggedValue = (passwordField || fieldType.equalsIgnoreCase("password")) ? "********"
                : fieldValue;
        LOG.debug(String.format("Processing form field: name='%s' value='%s' id='%s'", fieldName, loggedValue,
                fieldId));
        formNvps.add(new BasicNameValuePair(fieldName, fieldValue));
    }
    try {
        formPost.setEntity(new UrlEncodedFormEntity(formNvps, "UTF-8"));
    } catch (final UnsupportedEncodingException e) {
        // This would never happen
        throw new IllegalArgumentException("UTF-8 not recognised", e);
    }

    HttpResponse response;
    LOG.debug("Login via posting form-data to " + formPost.getURI());
    try {
        response = sendHttpPost(formPost);
        if (response.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_MOVED_TEMP) {
            throw new IOException("Form-based login to Assembla failed: " + response.getStatusLine());
        }
        if (response.getFirstHeader("Location") == null) {
            throw new IOException("Form-based login to Assembla returned a redirect without Location header");
        }
        return Jsoup.parse(getData(response.getFirstHeader("Location").getValue(), false));
    } finally {
        formPost.releaseConnection();
    }
}

From source file:gov.medicaid.screening.dao.impl.OIGDAOBean.java

/**
 * Performs a search for all possible results.
 *
 * <p>When both first and last name in the criteria are blank the search is run as a
 * business (entity) search, otherwise as a person search. Every result row is then
 * resolved to a profile through its details postback.
 *
 * @param criteria The search criteria.
 * @return the search result for provider profiles
 *
 * @throws URISyntaxException if an error occurs while building the URL.
 * @throws ClientProtocolException if client does not support protocol used.
 * @throws IOException if an error occurs while parsing response.
 * @throws ParseException if an error occurs while parsing response.
 * @throws ServiceException for any other problems encountered
 */
private SearchResult<ProviderProfile> getAllResults(OIGSearchCriteria criteria)
        throws URISyntaxException, ClientProtocolException, IOException, ParseException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager());
    client.setRedirectStrategy(new LaxRedirectStrategy());

    // initial GET of the search form; its hidden __EVENTVALIDATION and __VIEWSTATE
    // fields are sent back with the posts below
    HttpGet getSearch = new HttpGet(new URIBuilder(getSearchURL()).build());
    HttpResponse response = client.execute(getSearch);

    verifyAndAuditCall(getSearchURL(), response);

    Document page = Jsoup.parse(EntityUtils.toString(response.getEntity()));
    HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build());
    List<ProviderProfile> allProfiles = new ArrayList<ProviderProfile>();

    // no person name given at all -> search by business name instead
    boolean entitySearch = (Util.isBlank(criteria.getLastName()) && Util.isBlank(criteria.getFirstName()));

    HttpEntity entity = null;
    if (!entitySearch) {
        entity = postForm(getSearchURL(), client, search,
                new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "" },
                        { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() },
                        { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() },
                        { "ctl00$cpExclusions$ibSearchSP.x", "0" }, { "ctl00$cpExclusions$ibSearchSP.y", "0" },
                        { "ctl00$cpExclusions$txtSPLastName", Util.defaultString(criteria.getLastName()) },
                        { "ctl00$cpExclusions$txtSPFirstName", Util.defaultString(criteria.getFirstName()) } },
                false);
    } else {
        // first post triggers the Linkbutton1 postback; presumably this switches the
        // form to the business search screen - TODO confirm against the site
        HttpEntity searchEntity = postForm(getSearchURL(), client, search, new String[][] {
                { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "ctl00$cpExclusions$Linkbutton1" },
                { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() },
                { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() },
                { "ctl00$cpExclusions$txtSPLastName", "" }, { "ctl00$cpExclusions$txtSPFirstName", "" } },
                false);

        // the hidden state fields must be re-read from the page returned by that post
        page = Jsoup.parse(EntityUtils.toString(searchEntity));

        entity = postForm(getSearchURL(), client, search,
                new String[][] { { "__EVENTARGUMENT", "" }, { "__EVENTTARGET", "" },
                        { "__EVENTVALIDATION", page.select("input[name=__EVENTVALIDATION]").first().val() },
                        { "__VIEWSTATE", page.select("input[name=__VIEWSTATE]").first().val() },
                        { "ctl00$cpExclusions$ibSearchSP.x", "0" }, { "ctl00$cpExclusions$ibSearchSP.y", "0" },
                        { "ctl00$cpExclusions$txtSBName", Util.defaultString(criteria.getBusinessName()) } },
                false);
    }

    page = Jsoup.parse(EntityUtils.toString(entity));

    // person and business results live in different tables and use a different
    // column index for the cell checked below; tr:gt(0) skips the first row
    Elements rows;
    int ssnColumnIndex;
    if (!entitySearch) {
        rows = page.select("table#ctl00_cpExclusions_gvEmployees tr:gt(0)");
        ssnColumnIndex = 7;
    } else {
        rows = page.select("table#ctl00_cpExclusions_gvBusiness tr:gt(0)");
        ssnColumnIndex = 5;
    }

    for (Element row : rows) {
        // use the link in the first cell when the checked cell reads "N/A",
        // otherwise the link inside that cell
        String href;
        if (row.select("td:eq(" + ssnColumnIndex + ")").text().equals("N/A")) {
            href = row.select("td:eq(0) a").first().attr("href");
        } else {
            href = row.select("td:eq(" + ssnColumnIndex + ") a").first().attr("href");
        }

        // strip the javascript:__doPostBack('...','') wrapper, keeping only the event target
        href = href.replaceFirst("javascript:__doPostBack\\('", "");
        href = href.replaceFirst("',''\\)", "");

        ProviderProfile profile = parseProfile(getDetails(client, href, page));
        // drop the last '$'-separated segment of the target, then take what follows the
        // first three characters of the new last segment (presumably a "ctlNN" control
        // id - verify); that number minus 2 is used as the profile id
        String entityId = href.substring(0, href.lastIndexOf('$'));
        entityId = entityId.substring(entityId.lastIndexOf('$') + 4);
        profile.setId(Long.parseLong(entityId) - 2);
        allProfiles.add(profile);
    }

    SearchResult<ProviderProfile> searchResult = new SearchResult<ProviderProfile>();
    searchResult.setItems(allProfiles);
    return searchResult;
}