Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of the select method of org.jsoup.nodes.Document.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:gov.medicaid.screening.dao.impl.SocialWorkLicenseDAOBean.java

/**
 * When there is exactly one match, the details page is displayed instead of the grid. This parses the details from
 * that page./*  w w w.  ja v  a 2 s.  co m*/
 *
 * @param page the license details page
 * @return the parsed license
 */
private License parseLicenseDetail(Document page) {
    License license = new License();
    ProviderProfile profile = new ProviderProfile();
    license.setProfile(profile);

    String name = page.select("#_ctl7_lblName").text();

    profile.setUser(parsePersonDetailName(name));

    String licenseNumber = page.select("#_ctl7_lblLicNumber").text();
    license.setLicenseNumber(licenseNumber);

    String licenseLevel = page.select("#_ctl7_lblLicLevel").text();
    if (Util.isNotBlank(licenseLevel)) {
        LicenseLevel level = new LicenseLevel();
        level.setName(licenseLevel);
        license.setLevel(level);
    }

    String city = page.select("#_ctl7_lblWorkCity").text();
    license.setCity(city);

    String licenseStatus = page.select("#_ctl7_lblLicStatus").text();
    if (Util.isNotBlank(licenseStatus)) {
        LicenseStatus status = new LicenseStatus();
        status.setName(licenseStatus);
        license.setStatus(status);
    }

    // per reviewer include originalIssueDate,expireDate,correctiveAction,discipline
    String issueDate = page.select("#_ctl7_lblIssueDate").text();
    if (Util.isNotBlank(issueDate)) {
        license.setOriginalIssueDate(parseDate(issueDate, DATE_FORMAT));
    }

    String expireDate = page.select("#_ctl7_lblExpDate").text();
    if (Util.isNotBlank(expireDate)) {
        license.setExpireDate(parseDate(expireDate, DATE_FORMAT));
    }

    String correctiveAction = page.select("#_ctl7_lblCorrectiveAction").text();
    if (Util.isNotBlank(correctiveAction)) {
        license.setCorrectiveAction(!"No".equals(correctiveAction));
    }

    String disciplineAction = page.select("#_ctl7_lblDisciplineAction").text();
    if (Util.isNotBlank(disciplineAction)) {
        license.setDiscipline(!"No".equals(disciplineAction));
    }
    return license;
}

From source file:org.bungeni.ext.integration.bungeniportal.BungeniServiceAccess.java

/**
 * Reads one attribute from the first element matched by a CSS query.
 *
 * @param doc the document to search
 * @param itemPath the CSS query handed to {@code select}
 * @param attributeReturn the name of the attribute whose value is returned
 * @return the value of {@code attributeReturn} on the first matching element
 */
private String getItemAttributeValue(Document doc, String itemPath, String attributeReturn) {
    // NOTE(review): get(0) presumably throws when the query matches nothing - confirm callers expect that.
    return doc.select(itemPath).get(0).attr(attributeReturn);
}

From source file:com.thesmartweb.swebrank.WebParser.java

/**
 * Counts the links found on a web page.
 *
 * <p>Any exception raised while fetching or parsing is logged at SEVERE level and the counts gathered so far
 * (zeros if the fetch itself failed) are returned.
 *
 * @param link_html the url to parse
 * @return a two-element array: index 0 holds the total number of links, index 1 the number of internal links
 */
public int[] getnlinks(String link_html) {
    // Java zero-initialises the array: [0] = total links, [1] = internal links.
    int[] counts = new int[2];
    try {
        Elements anchors = Jsoup.connect(link_html).timeout(10 * 1000).get().select("a[href]");
        counts[0] = anchors.size();
        // A link counts as internal when its whole ("abs:") href contains the page url itself.
        for (Element anchor : anchors) {
            String wholeHref = anchor.attr("abs:href");
            if (wholeHref.contains(link_html)) {
                counts[1]++;
            }
        }
    } catch (Exception ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
    }
    return counts;
}

From source file:gov.medicaid.screening.dao.impl.PodiatricMedicineLicenseDAOBean.java

/**
 * Retrieves all results from the source site.
 *
 * @param criteria the search criteria./*from w  ww  . j  av a 2 s . c  om*/
 * @return the providers matched
 * @throws URISyntaxException if the URL could not be correctly constructed
 * @throws IOException for any I/O related errors
 * @throws ServiceException for any other errors encountered
 */
private SearchResult<License> getAllResults(String criteria)
        throws URISyntaxException, IOException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient();
    client.setRedirectStrategy(new LaxRedirectStrategy());

    HttpGet getFrontPage = new HttpGet(new URIBuilder(getSearchURL()).build());
    HttpResponse response = client.execute(getFrontPage);

    verifyAndAuditCall(getSearchURL(), response);

    Document page = Jsoup.parse(EntityUtils.toString(response.getEntity()));

    HttpPost getSearchPage = new HttpPost(new URIBuilder(getSearchURL()).build());
    HttpEntity entity = postForm(getSearchURL(), client, getSearchPage,
            new String[][] { { "_ctl2:dropAgencyCode", "H7Q" }, { "_ctl2:btnLogin", "Login" },
                    { "__VIEWSTATE", page.select("#__aspnetForm input[name=__VIEWSTATE]").first().val() } },
            true);

    page = Jsoup.parse(EntityUtils.toString(entity));

    HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build());
    entity = postForm(getSearchURL(), client, search,
            new String[][] { { "_ctl2:txtCriteria", criteria }, { "_ctl2:btnSearch", "Search" },
                    { "__VIEWSTATE", page.select("#__aspnetForm input[name=__VIEWSTATE]").first().val() } },
            true);

    page = Jsoup.parse(EntityUtils.toString(entity));

    List<License> allLicenses = new ArrayList<License>();
    Elements rows = page.select("table#_ctl2_dgrdResults tr.DataGrid");
    for (Element row : rows) {
        License license = parseLicense(row.children());
        if (license != null) {
            allLicenses.add(license);
        }
    }
    SearchResult<License> results = new SearchResult<License>();
    results.setItems(allLicenses);
    return results;
}

From source file:org.confab.VBulletinParser.java

/**
 * Extracts the forums listed on a board index page.
 *
 * <p>Every {@code tr} below a {@code tbody} whose id contains {@code collapseobj_forumbit_} is inspected. Rows
 * that contain a {@code td.thead} cell or have fewer than three cells are skipped. For the remaining rows the
 * first link of the second cell supplies the forum url and title, any further links in that cell become
 * sub-forums, and the viewing count and description are read when present.
 *
 * <p>Structural expectations are checked with {@code assert} only, so they are enforced solely when assertions
 * are enabled.
 *
 * @param root the parsed index page
 * @param parent the board handed to every created {@code Forum}
 * @return the forums found, in document order
 */
public List<Forum> parseForums(Document root, BulletinBoard parent) {
    Utilities.debug("parseForums");

    List<Forum> forums = new ArrayList<Forum>();

    // All rows of the forum listing tables
    Elements rows = root.select("tbody[id*=collapseobj_forumbit_] tr");
    assert !rows.isEmpty();

    for (Element row : rows) {
        Forum forum = new Forum(parent);

        // Cells of the current row
        Elements cells = row.select("td");
        assert !cells.isEmpty() : row.html();

        // xbox360achievements has many subforums and places them in a table of their own. Their links are
        // already picked up as children of the parent cell, so the rows of such sub-tables (and header rows)
        // are not parsed separately.
        boolean hasHeaderCell = !cells.select("td.thead").isEmpty();
        if (hasHeaderCell || cells.size() < 3) {
            continue;
        }

        // The second cell carries the title link, optional sub-forum links and the description.
        Element titleCell = cells.get(1);
        Elements anchors = titleCell.select("a");
        assert !anchors.isEmpty() : cells.html();

        // Title url
        Element titleLink = anchors.first();
        forum.url = titleLink.attr("href");
        assert forum.url != null;
        Utilities.debug("new_forum.url : " + forum.url);

        // Title text
        assert titleLink != null;
        forum.title = titleLink.text();
        assert forum.title != null;
        Utilities.debug("new_forum.title : " + forum.title);

        // Whatever links remain after dropping the title link are sub-forums
        anchors.remove(titleLink);
        for (Element anchor : anchors) {
            Forum child = new Forum(parent);
            child.url = anchor.attr("href");
            assert child.url != null;
            child.title = anchor.text();
            assert child.title != null;
            forum.subForums.add(child);
            Utilities.debug("added subForum: " + child.title);
        }

        // Number of users viewing the forum, "0" when the row does not state it
        Element viewing = row.select(":matchesOwn((\\d+ Viewing))").first();
        forum.numViewing = (viewing == null) ? "0" : viewing.text();
        Utilities.debug("new_forum.numViewing : " + forum.numViewing);

        // Description of the forum, empty when absent
        Element description = titleCell.select("div.smallfont").first();
        forum.description = (description == null) ? "" : description.text();
        Utilities.debug("new_forum.description : " + forum.description);

        Utilities.debug("new_forum.parent.url : " + forum.parent.url);

        forums.add(forum);
        Utilities.debug("-----");
    }
    Utilities.debug("end parseForums");
    return forums;
}

From source file:me.vertretungsplan.parser.UntisInfoParser.java

/**
 * Builds the substitution schedule by logging in, reading the available weeks from the navigation bar and
 * parsing the schedule page(s) of every week.
 *
 * <p>The "last change" text is whatever follows the {@code "Stand:"} marker in the navigation bar's
 * {@code .description} element. If the marker is missing there, {@code /frames/title.htm} is tried instead, and
 * if that fails as well the text is empty.
 *
 * <p>Depending on the configuration, each week is parsed either class by class (HTTP 500 responses for single
 * classes are skipped) or as one page. In the latter mode an {@code HttpResponseException} for a week is
 * remembered and only rethrown when no week at all could be parsed.
 *
 * @return the parsed schedule
 * @throws IOException if the week selector is missing from the navigation bar or a request fails
 * @throws JSONException if raised while reading the configuration data
 * @throws CredentialInvalidException if raised by the login step
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    Document navbarDoc = Jsoup.parse(getNavbarDoc().replace("&nbsp;", ""));
    Element select = navbarDoc.select("select[name=week]").first();
    if (select == null) {
        // Previously this surfaced later as a bare NullPointerException when iterating the weeks.
        throw new IOException("navigation bar does not contain a select[name=week] element");
    }

    SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData);

    String lastChange = extractLastChange(navbarDoc.select(".description").text());
    if (lastChange == null) {
        // Marker not present in the navbar: fall back to the title frame.
        try {
            String infoHtml = httpGet(baseUrl + "/frames/title.htm", data.optString(PARAM_ENCODING, null));
            Document infoDoc = Jsoup.parse(infoHtml);
            lastChange = extractLastChange(infoDoc.select(".description").text());
        } catch (Exception e) {
            // Best effort only; the schedule is still usable without this information.
            lastChange = null;
        }
        if (lastChange == null) {
            lastChange = "";
        }
    }

    int successfulWeeks = 0;
    HttpResponseException lastException = null;
    for (Element option : select.children()) {
        String week = option.attr("value");
        String weekName = option.text();
        if (data.optBoolean(PARAM_SINGLE_CLASSES, data.optBoolean("single_classes", false)) // backwards compatibility
                || data.optString(PARAM_SCHEDULE_TYPE, "substitution").equals("timetable")) {
            // One page per class; class numbers are 1-based and follow the order of getAllClasses().
            int classNumber = 1;
            for (String klasse : getAllClasses()) {
                String url = getScheduleUrl(week, classNumber, data);
                try {
                    parsePage(v, lastChange, klasse, url, weekName);
                } catch (HttpResponseException e) {
                    if (e.getStatusCode() != 500) {
                        throw e;
                    }
                    // status 500 occurs in Hannover_MMBS; skip this class and carry on with the next one
                }
                classNumber++;
            }
            successfulWeeks++;
        } else {
            String url = getScheduleUrl(week, 0, data);
            try {
                parsePage(v, lastChange, null, url, weekName);
                successfulWeeks++;
            } catch (HttpResponseException e) {
                lastException = e;
            }
        }
    }
    if (successfulWeeks == 0 && lastException != null) {
        throw lastException;
    }
    v.setClasses(getAllClasses());
    v.setTeachers(getAllTeachers());
    v.setWebsite(baseUrl + "/default.htm");
    return v;
}

/**
 * Returns the trimmed text following the {@code "Stand:"} marker, or {@code null} when the marker is absent.
 * A plain {@code indexOf(...) + length} would turn a missing marker (-1) into offset 5 and return unrelated text.
 */
private static String extractLastChange(String info) {
    final String marker = "Stand:";
    int markerStart = info.indexOf(marker);
    if (markerStart < 0) {
        return null;
    }
    return info.substring(markerStart + marker.length()).trim();
}

From source file:com.amastigote.xdu.query.module.EduSystem.java

/**
 * Fetches the personal information page and extracts the student's basic data from it.
 *
 * <p>The page is requested with the current session cookie, decoded as gb2312, stripped of {@code &nbsp;}
 * entities and re-parsed. Values are then read by position from the {@code td[width=275]} cells.
 * NOTE(review): the cell positions (0, 1, 6, 10, 11, 24, 25, 28) are presumably tied to the current page
 * layout - a page with fewer such cells will raise an index error.
 *
 * @return the extracted data keyed by the {@code StudentKey} constants, or {@code null} when
 *         {@code checkIsLogin(ID)} reports that the user is not logged in
 * @throws IOException if the request or reading the response fails
 * @throws JSONException if a value cannot be stored in the result object
 */
private @Nullable JSONObject personalInfoQuery() throws IOException, JSONException {
    if (!checkIsLogin(ID)) {
        return null;
    }

    URL url = new URL(SYS_HOST + "xjInfoAction.do?oper=xjxx");
    HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
    try {
        httpURLConnection.setRequestProperty("Cookie", "JSESSIONID=" + SYS_JSESSIONID);
        httpURLConnection.connect();

        Document document = Jsoup.parse(httpURLConnection.getInputStream(), "gb2312",
                httpURLConnection.getURL().toString());
        // Drop the non-breaking-space entities so that text() yields clean values.
        document = Jsoup.parse(document.toString().replace("&nbsp;", ""));

        Elements elements1 = document.select("td[width=275]");

        JSONObject jsonObject = new JSONObject();
        jsonObject.put(StudentKey.ID, elements1.get(0).text());
        jsonObject.put(StudentKey.NAME, elements1.get(1).text());
        jsonObject.put(StudentKey.GENDER, elements1.get(6).text());
        jsonObject.put(StudentKey.NATION, elements1.get(10).text());
        jsonObject.put(StudentKey.NATIVE_PLACE, elements1.get(11).text());
        jsonObject.put(StudentKey.DEPARTMENT, elements1.get(24).text());
        jsonObject.put(StudentKey.MAJOR, elements1.get(25).text());
        jsonObject.put(StudentKey.CLASS, elements1.get(28).text());

        return jsonObject;
    } finally {
        // Release the connection on success and on failure; the original never disconnected it.
        httpURLConnection.disconnect();
    }
}

From source file:com.mythesis.userbehaviouranalysis.WebParser.java

/**
 * Parse the url and get all the content
 * @param link the url to parse/*from  w w  w  .j  a v a 2  s .  c  o m*/
 * @return The content parsed
 */
private String cleanhtml(String link) {
    try {
        Document doc = Jsoup.connect(link).timeout(10 * 1000).get();
        String title = doc.title();
        String mainbody = doc.body().text();
        Elements links = doc.select("a[href]");
        Elements media = doc.select("[src]");
        //fix link html to remove https:// or http:// and simple /
        if (link.substring(link.length() - 1, link.length()).equalsIgnoreCase("/")) {
            link = link.substring(0, link.length() - 1);
        }
        if (link.substring(0, 5).equalsIgnoreCase("https")) {
            link = link.substring(8);
        } else if (link.substring(0, 4).equalsIgnoreCase("http")) {
            link = link.substring(7);
        }
        String anchortext = "";
        String alttext = "";
        //-----get the anchor text of internal links
        for (Element el : links) {
            String str_check = el.attr("abs:href");
            if (el.attr("abs:href").contains(link) && el.text().length() > 1) {
                anchortext = anchortext + el.text() + " ";
            }
        }
        //-------get alt text to internal images links
        for (Element medi : media) {
            if (medi.getElementsByTag("img").attr("src").contains(link)) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt");
            }
            if (medi.getElementsByTag("img").attr("src").startsWith("/")) {
                alttext = alttext + " " + medi.getElementsByTag("img").attr("alt");
            }
        }
        String content = mainbody + title + anchortext + alttext;

        return content;

    } catch (IOException ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (NullPointerException ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    } catch (Exception ex) {
        Logger.getLogger(WebParser.class.getName()).log(Level.SEVERE, null, ex);
        String check = null;
        return check;
    }

}

From source file:org.confab.PhpBB3Parser.java

/**
 * Constructs and submits a POST with the appropriate parameters to log in to a vbulletin, then loads the root
 * page to confirm the login and to read the user's security token.
 *
 * <p>The password is sent only as its MD5 digest (the plain password field is submitted empty). Expectations
 * about the responses are checked with {@code assert}, so they are enforced only when assertions are enabled.
 * If an {@code IOException} occurs it is printed and the returned user has no HTTP context assigned.
 *
 * @param  rootURL     Base or root URL for the site to log into
 * @param  username    User's login name
 * @param  password    User's password
 * @return             User object initialised with a HttpContext
 */
public User login(String rootURL, String username, String password) {
    Utilities.debug("login");

    User ret = new User(username, password);

    CookieStore cookieStore = new BasicCookieStore();
    HttpContext localContext = new BasicHttpContext();
    localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);

    try {
        // set up the POST
        HttpPost httppost = new HttpPost(rootURL + "login.php");
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();
        nvps.add(new BasicNameValuePair("do", "login"));
        nvps.add(new BasicNameValuePair("vb_login_username", username));
        nvps.add(new BasicNameValuePair("vb_login_password", ""));
        nvps.add(new BasicNameValuePair("s", ""));
        nvps.add(new BasicNameValuePair("securitytoken", "guest"));
        nvps.add(new BasicNameValuePair("do", "login"));
        nvps.add(new BasicNameValuePair("vb_login_md5password", Utilities.md5(password)));
        nvps.add(new BasicNameValuePair("vb_login_md5password_utf", Utilities.md5(password)));
        httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));

        // execute the POST
        Utilities.debug("Executing POST");
        HttpResponse response = httpclient.execute(httppost, localContext);
        Utilities.debug("POST response: " + response.getStatusLine());
        assert response.getStatusLine().getStatusCode() == 200;
        // The body of the login response is not needed, but it must be consumed so that the connection is
        // released before the same client issues the next request.
        EntityUtils.consume(response.getEntity());

        //TODO: store the cookies
        //http://bit.ly/e7yY5i (CookieStore javadoc)

        Utilities.printCookieStore(cookieStore);

        // confirm we are logged in
        HttpGet httpget = new HttpGet(rootURL);
        response = httpclient.execute(httpget, localContext);
        HttpEntity entity = response.getEntity();
        Document page = Jsoup.parse(EntityUtils.toString(entity));
        EntityUtils.consume(entity);
        assert page != null;

        // A logged-in page is expected not to show the login form fields any more.
        Utilities.debug("Checking that we are logged in..");
        Element username_box = page.select("input[name=vb_login_username]").first();
        assert username_box == null;
        Element password_box = page.select("input[name=vb_login_password]").first();
        assert password_box == null;

        // parse the user's new securitytoken; the part after the "-" is stored
        Element el_security_token = page.select("input[name=securitytoken]").first();
        assert el_security_token != null;
        String security_token = el_security_token.attr("value");
        assert security_token != null;
        String[] token_array = security_token.split("-");
        assert token_array.length == 2;
        ret.vb_security_token = token_array[1];
        assert ret.vb_security_token.length() == 40;
        Utilities.debug("securitytoken: " + ret.vb_security_token);

        Utilities.debug("Login seems ok");
        ret.httpContext = localContext;
    } catch (IOException e) {
        System.out.println(e);
    }

    Utilities.debug("end login");
    return ret;
}

From source file:de.geeksfactory.opacclient.apis.BiBer1992.java

/**
 * Parses the list of lent media from an account page and records account-level information on {@code res}.
 *
 * <p>Cells with class {@code td01x09n} are scanned first: a text matching the card validity pattern sets the
 * "valid until" value, a text matching the fees pattern sets the pending fees. Afterwards every row of the
 * {@code medkl} form table except the first one and rows starting with a {@code th} becomes a {@code LentItem}.
 * The {@code accounttable} object of {@code data} maps item keys to column indexes; negative or unreadable
 * indexes are ignored.
 *
 * @param res the account data receiving validity date and pending fees
 * @param doc the account page, may be {@code null}
 * @param data the library configuration containing the {@code accounttable} mapping
 * @return the lent items, empty when {@code doc} is {@code null}
 * @throws JSONException if {@code data} has no {@code accounttable} object
 */
static List<LentItem> parseMediaList(AccountData res, Document doc, JSONObject data) throws JSONException {
    List<LentItem> media = new ArrayList<>();
    if (doc == null) {
        return media;
    }

    // parse result list
    JSONObject copymap = data.getJSONObject("accounttable");

    Pattern expire = Pattern.compile("Ausweisg.ltigkeit: ([0-9.]+)");
    Pattern fees = Pattern.compile("([0-9,.]+) .");
    for (Element td : doc.select(".td01x09n")) {
        String text = td.text().trim();
        Matcher expireMatcher = expire.matcher(text);
        if (expireMatcher.matches()) {
            // The whole text matched, so group 1 is exactly the date portion.
            res.setValidUntil(expireMatcher.group(1));
        } else if (fees.matcher(text).matches()) {
            res.setPendingFees(text);
        }
    }
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    // Compiled once for all rows instead of once per row.
    Pattern itemIdPat = Pattern.compile("javascript:smAcc\\('[a-z]+','[a-z]+','([A-Za-z0-9]+)'\\)");
    Elements rowElements = doc.select("form[name=medkl] table tr");

    // rows: skip 1st row -> title row
    for (int i = 1; i < rowElements.size(); i++) {
        Element tr = rowElements.get(i);
        if (tr.child(0).tagName().equals("th")) {
            continue;
        }
        LentItem item = new LentItem();

        // columns: all elements of one media
        Iterator<?> keys = copymap.keys();
        while (keys.hasNext()) {
            String key = (String) keys.next();
            int index;
            try {
                index = copymap.has(key) ? copymap.getInt(key) : -1;
            } catch (JSONException e1) {
                index = -1;
            }
            if (index < 0) {
                continue;
            }

            Element cell = tr.child(index);
            String value = cell.text().trim().replace("\u00A0", "");

            switch (key) {
            case "author":
                value = findTitleAndAuthor(value)[1];
                break;
            case "title":
                value = findTitleAndAuthor(value)[0];
                break;
            case "returndate":
                try {
                    value = fmt.parseLocalDate(value).toString();
                } catch (IllegalArgumentException e1) {
                    // An unparseable date keeps its raw text.
                    e1.printStackTrace();
                }
                break;
            }

            // A cell with exactly one link may carry the item id inside its javascript href.
            Elements cellLinks = cell.select("a");
            if (cellLinks.size() == 1) {
                Matcher matcher = itemIdPat.matcher(cellLinks.attr("href"));
                if (matcher.find()) {
                    item.setId(matcher.group(1));
                }
            }

            if (value != null && value.length() != 0) {
                item.set(key, value);
            }
        }

        Elements prolongBoxes = tr.select("input[type=checkbox][value=YES]");
        if (prolongBoxes.size() > 0) {
            item.setProlongData(prolongBoxes.attr("name"));
        }

        media.add(item);
    }
    return media;
}