Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes Document select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:com.msds.km.service.Impl.YunmaiAPIDrivingLicenseRecognitionServcieiImpl.java

/**
 * Extracts the recognition result from the result page and converts it into a
 * {@link DrivingLicense}.
 *
 * <p>The page must contain exactly one {@code fieldset} inside
 * {@code div[class=left result]}. Its inner HTML (minus a leading empty
 * {@code <legend>}) is XML-unescaped, wrapped in a {@code <drivingLicense>} root
 * element and handed to {@code stream.fromXML}.
 *
 * @param html the HTML of the result page
 * @return the object produced by {@code stream.fromXML}, cast to {@link DrivingLicense}
 * @throws RecognitionException if {@code html} is null or empty, or if the page does not
 *         contain exactly one result fieldset
 */
protected DrivingLicense parseDrivingLicense(String html) {
    // Treat a missing page like an empty one instead of failing with a NullPointerException.
    if (html == null || html.isEmpty()) {
        throw new RecognitionException("the html content is empty");
    }

    // Jsoup.parse always returns a Document, so no null check is needed here.
    Document document = Jsoup.parse(html);

    Elements fieldsets = document.select("div[class=left result] fieldset");
    if (fieldsets.size() != 1) {
        throw new RecognitionException(
                "the document should has result filedset, the content of the web page may be changed.");
    }
    Element regResult = fieldsets.first();
    String result = regResult.html().trim();

    // Drop the empty legend that precedes the actual result markup.
    String removedStr = "<legend></legend>";
    if (result.startsWith(removedStr)) {
        result = result.substring(removedStr.length());
    }

    // The fieldset content is escaped XML; turn it back into real markup.
    result = StringEscapeUtils.unescapeXml(result);

    // Wrap the fragments in a single root element so they form one XML document.
    result = "<drivingLicense>" + result + "</drivingLicense>";

    // NOTE(review): this XML originates from a remote page; confirm that `stream`
    // (presumably an XStream instance) restricts the types it may instantiate.
    return (DrivingLicense) stream.fromXML(result);
}

From source file:net.liuxuan.Tools.signup.SignupQjvpn.java

/**
 * Requests the login page, copies the name/value pair of every hidden form input into
 * {@code params}, and prints diagnostics (the inputs, the response status line and the
 * cookies currently held by {@code cookieStore}) to standard out.
 *
 * @throws IOException propagated from the HTTP client and entity handling calls
 */
public void getLoginForm() throws IOException {
    final String separator = "--------------";

    final HttpGet loginPageRequest = new HttpGet("http://www.qjvpn.com/user/login.php");
    final CloseableHttpResponse response1 = httpclient.execute(loginPageRequest);
    try {
        final HttpEntity body = response1.getEntity();
        final String content = EntityUtils.toString(body);
        System.out.println(separator);
        System.out.println(separator);

        // Hidden inputs carry the values the server expects to be posted back with the login.
        final Document doc = Jsoup.parse(content);
        for (Element hiddenInput : doc.select("input[type=hidden]")) {
            final String name = hiddenInput.attr("name");
            final String value = hiddenInput.attr("value");
            params.add(new BasicNameValuePair(name, value));
            System.out.println(hiddenInput.toString());
            System.out.println(hiddenInput.attr("name"));
            System.out.println(hiddenInput.attr("value"));
        }

        System.out.println(separator);
        System.out.println(separator);
        System.out.println(separator);
        System.out.println(separator);
        System.out.println("Login form get: " + response1.getStatusLine());
        EntityUtils.consume(body);

        System.out.println("Initial set of cookies:");
        final List<Cookie> cookies = cookieStore.getCookies();
        if (cookies.isEmpty()) {
            System.out.println("None");
        } else {
            for (Cookie cookie : cookies) {
                System.out.println("- " + cookie.toString());
            }
        }
    } finally {
        // Always release the connection, even when parsing or printing fails.
        response1.close();
    }
}

From source file:de.geeksfactory.opacclient.apis.Zones.java

/**
 * Builds one {@link LentItem} for every loan table in the given page.
 *
 * <p>Each table row is read as a label cell ({@code .LoanBrowseFieldNameCell}) and a data
 * cell ({@code .LoanBrowseFieldDataCell}); the label decides which item property the data
 * is written to. The prolong data is taken from the href of the renew button or, if there
 * is none, of the "cannot renew" link.
 *
 * @param doc the parsed page
 * @return the items in document order; empty if no loan table matches
 */
static List<LentItem> parseMediaList(Document doc) {
    final DateTimeFormatter dueDateFormat = DateTimeFormat.forPattern("dd/MM/yyyy").withLocale(Locale.GERMAN);
    final Pattern renewHref = Pattern.compile("javascript:renewItem\\('[0-9]+','(.*)'\\)");
    final Pattern cannotRenewHref = Pattern
            .compile("javascript:CannotRenewLoan\\('[0-9]+','(.*)','[0-9]+'\\)");

    final List<LentItem> items = new ArrayList<>();

    for (Element table : doc
            .select(".LoansBrowseItemDetailsCellStripe table, .LoansBrowseItemDetailsCell table")) {
        final LentItem item = new LentItem();

        for (Element row : table.select("tr")) {
            final String label = row.select(".LoanBrowseFieldNameCell").text().trim();
            String data = row.select(".LoanBrowseFieldDataCell").text().trim();

            // The labels are mutually exclusive, so at most one branch applies per row.
            if (label.equals("Titel")) {
                item.setTitle(data);
                // The title link, when present, carries the item id in its BACNO parameter.
                final Element titleLink = row.select(".LoanBrowseFieldDataCell a[href]").first();
                if (titleLink != null) {
                    final Map<String, String> query = getQueryParamsFirst(titleLink.attr("href"));
                    if (query.containsKey("BACNO")) {
                        item.setId(query.get("BACNO"));
                    }
                }
            } else if (label.equals("Verfasser")) {
                item.setAuthor(data);
            } else if (label.equals("Mediennummer")) {
                item.setBarcode(data);
            } else if (label.equals("ausgeliehen in")) {
                item.setHomeBranch(data);
            } else if (label.matches("F.+lligkeits.*datum")) {
                // Only the part before the first space is parsed as the date.
                data = data.split(" ")[0];
                try {
                    item.setDeadline(dueDateFormat.parseLocalDate(data));
                } catch (IllegalArgumentException e) {
                    e.printStackTrace();
                }
            }
        }

        final List<Element> renewButtons = table.select(".button[Title~=Zum]");
        if (renewButtons.size() == 1) {
            final Matcher renewMatch = renewHref.matcher(renewButtons.get(0).attr("href"));
            if (renewMatch.matches()) {
                item.setProlongData(renewMatch.group(1));
            }
        } else {
            final List<Element> blockedLinks = table.select(".CannotRenewLink");
            if (blockedLinks.size() == 1) {
                final Matcher blockedMatch = cannotRenewHref.matcher(blockedLinks.get(0).attr("href").trim());
                if (blockedMatch.matches()) {
                    item.setProlongData("cannotrenew|" + blockedMatch.group(1));
                }
                item.setRenewable(false);
            }
        }

        items.add(item);
    }
    return items;
}

From source file:gov.medicaid.screening.dao.impl.HealthOccupationsProgramCredentialDAOBean.java

/**
 * Runs the credential search with the given criteria and collects every result row.
 *
 * <p>The search page is requested first (and verified/audited), then the search form is
 * posted to {@code credential_search.do}. Every row of {@code table.formTable} after the
 * first one is converted with {@code parseProfile}.
 *
 * @param criteria The search criteria.
 * @return the search result for licenses
 *
 * @throws URISyntaxException if an error occurs while building the URL.
 * @throws ClientProtocolException if client does not support protocol used.
 * @throws IOException if an error occurs while parsing response.
 * @throws ParseException if an error occurs while parsing response.
 * @throws ServiceException for any other problems encountered
 */
private SearchResult<ProviderProfile> getAllResults(HealthOccupationsProgramCredentialSearchCriteria criteria)
        throws URISyntaxException, ClientProtocolException, IOException, ParseException, ServiceException {
    DefaultHttpClient httpClient = new DefaultHttpClient(getLaxSSLConnectionManager());
    httpClient.setRedirectStrategy(new LaxRedirectStrategy());

    // Load the search page first; the response body itself is not needed.
    HttpGet searchPageRequest = new HttpGet(new URIBuilder(getSearchURL()).build());
    HttpResponse searchPageResponse = httpClient.execute(searchPageRequest);
    verifyAndAuditCall(getSearchURL(), searchPageResponse);
    EntityUtils.consume(searchPageResponse.getEntity());

    String fullSearchURL = Util.replaceLastURLPart(getSearchURL(), "credential_search.do");
    HttpPost searchRequest = new HttpPost(new URIBuilder(fullSearchURL).build());

    String[][] formFields = new String[][] {
            { "city", Util.defaultString(criteria.getCity()) },
            { "credentialNumber", Util.defaultLongString(criteria.getCredentialNumber()) },
            { "firstName", Util.defaultString(criteria.getFirstName()) },
            { "lastName", Util.defaultString(criteria.getLastName()) },
            { "county", getDefaultValue(criteria.getCounty()) },
            { "credentialStatus", getDefaultValue(criteria.getStatus()) },
            { "credentialType", getDefaultValue(criteria.getOccupationType()) },
            { "discipline", getDefaultValue(criteria.getDiscipline()) },
            { "state", getDefaultValue(criteria.getState()) },
            { "p_action", "search" } };
    HttpEntity resultEntity = postForm(fullSearchURL, httpClient, searchRequest, formFields, false);

    Document resultPage = Jsoup.parse(EntityUtils.toString(resultEntity));

    // tr:gt(0) skips the first row of the result table.
    List<ProviderProfile> profiles = new ArrayList<ProviderProfile>();
    for (Element resultRow : resultPage.select("table.formTable tr:gt(0)")) {
        profiles.add(parseProfile(resultRow.children()));
    }

    SearchResult<ProviderProfile> outcome = new SearchResult<ProviderProfile>();
    outcome.setItems(profiles);
    return outcome;
}

From source file:me.vertretungsplan.parser.ESchoolParser.java

/**
 * Converts an eSchool page into a {@link SubstitutionSchedule}.
 *
 * <p>The last-change timestamp is read from the second cell of the first table inside
 * {@code #Content}, if present. Each {@code table#DATA} is paired with the
 * {@code center b} title at the same position; the title supplies the day's date.
 *
 * @param doc the parsed page
 * @return the schedule, with classes and teachers filled in
 * @throws IOException if the page contains no {@code table#DATA}, or if the number of
 *         titles differs from the number of tables
 * @throws JSONException declared by the signature; presumably raised by the helpers
 *         called here
 */
@NotNull
SubstitutionSchedule parseESchoolSchedule(Document doc) throws IOException, JSONException {
    SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    // eq(0) yields an empty list when there is no info table, so a missing table or cell
    // simply skips the optional timestamp instead of crashing.
    Elements infoCells = doc.select("#Content table").eq(0).select("td");
    if (infoCells.size() > 1) {
        String infoString = infoCells.get(1).ownText();
        // Dots are escaped so that only real date separators match.
        Pattern pattern = Pattern
                .compile("Letzte Aktualisierung:\u00a0(\\d{2}\\.\\d{2}\\.\\d{4} - \\d{2}:\\d{2})");
        Matcher matcher = pattern.matcher(infoString);
        if (matcher.find()) {
            LocalDateTime lastChange = DateTimeFormat.forPattern("dd.MM.yyyy - HH:mm")
                    .parseLocalDateTime(matcher.group(1));
            schedule.setLastChange(lastChange);
        }
    }

    Elements titles = doc.select("center b");
    Elements tables = doc.select("table#DATA");

    if (tables.isEmpty()) {
        throw new IOException("No table#DATA element found on the page");
    }

    // The umlauts are written as Unicode escapes: the literals had lost them, so the
    // "no data available" marker could never match the page text.
    if (!tables.get(0).text().contains("Keine Daten verf\u00fcgbar")) {
        if (titles.size() != tables.size())
            throw new IOException("Anzahl \u00dcberschriften != Anzahl Tabellen");

        for (int i = 0; i < titles.size(); i++) {
            SubstitutionScheduleDay day = new SubstitutionScheduleDay();
            day.setDate(ParserUtils.parseDate(titles.get(i).text()));
            parseTable(tables.get(i), day);
            schedule.addDay(day);
        }
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}

From source file:me.vertretungsplan.parser.DSBLightParser.java

/**
 * Loads one monitor page, parses it if it looks like an Untis page, and follows its
 * {@code meta refresh} redirect to the next page until the redirect points back to
 * {@code startUrl}.
 *
 * @param url      the page to load
 * @param referer  passed through to {@code httpGet}
 * @param schedule the schedule the parsed days are added to
 * @param startUrl the URL at which following redirects stops
 * @throws IOException                declared by the signature; presumably from loading the page
 * @throws JSONException              declared by the signature; presumably from reading {@code data}
 * @throws CredentialInvalidException declared by the signature; presumably from the page request
 */
private void parseDay(String url, Map<String, String> referer, SubstitutionSchedule schedule, String startUrl)
        throws IOException, JSONException, CredentialInvalidException {
    String html = httpGet(url, data.optString(PARAM_ENCODING, null), referer);
    Document doc = Jsoup.parse(html);
    if (doc.title().toLowerCase().contains("untis") || doc.html().toLowerCase().contains("untis")
            || doc.select(".mon_list").size() > 0) {
        parseMultipleMonitorDays(schedule, doc, data);
        if (doc.select("meta[http-equiv=refresh]").size() > 0) {
            Element meta = doc.select("meta[http-equiv=refresh]").first();
            String content = meta.attr("content");

            // Find "url=" case-insensitively but cut the target out of the ORIGINAL text,
            // so the case of the redirect URL is preserved.
            final String key = "url=";
            int keyIndex = -1;
            for (int i = 0; i + key.length() <= content.length(); i++) {
                if (content.regionMatches(true, i, key, 0, key.length())) {
                    keyIndex = i;
                    break;
                }
            }

            // A refresh tag without a target (plain reload) has nothing to follow.
            if (keyIndex >= 0) {
                String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1)
                        + content.substring(keyIndex + key.length());
                // NOTE(review): only a redirect back to startUrl ends the recursion; a
                // cycle that never reaches startUrl would still recurse without bound.
                if (!redirectUrl.equals(startUrl)) {
                    parseDay(redirectUrl, referer, schedule, startUrl);
                }
            }
        }
    }
}

From source file:com.illustrationfinder.process.post.HtmlPostProcessor.java

/**
 * Derives keywords for the page behind {@code this.url}.
 *
 * <p>Keywords are first taken from the page title (lower-cased, punctuation and short
 * words removed). If that yields fewer than {@code MINIMUM_KEYWORDS_COUNT} entries, the
 * most frequent words of the extracted article text are appended until the minimum is
 * reached or the text has no more candidate words.
 *
 * @return the keywords; {@code null} if no URL is set or if loading or extracting the
 *         page fails
 */
@Override
public List<String> generateKeywords() {
    // TODO If two words are always close to each other, they should be considered as an expression and managed like one word
    if (this.url == null)
        return null;

    try {
        // Retrieve the document and store it temporary
        try (final InputStream stream = this.url.openStream()) {
            final String rawText = IOUtils.toString(stream);

            // Retrieve useful HTML data
            final Document document = Jsoup.parse(rawText);
            String htmlTitle = document.title();

            // Extract the content of the raw text
            String content = ArticleExtractor.getInstance().getText(rawText);

            // Now we apply a simple algorithm to get keywords
            //  1) We remove all punctuation marks from the title
            //  2) We remove all words with less than 4 characters
            //  3) We remove excessive spacing and tabulations
            htmlTitle = htmlTitle.toLowerCase();
            htmlTitle = htmlTitle.replaceAll(PUNCTUATION_REGEX, "");
            htmlTitle = htmlTitle.replaceAll(WORD_WITH_LESS_THAN_4_CHARACTERS_REGEX, "");
            htmlTitle = htmlTitle.replaceAll(EXCESSIVE_SPACING_REGEX, " ");

            final List<String> keywords = new ArrayList<>();
            for (String tmp : htmlTitle.split(" ")) {
                if (tmp.length() >= MINIMUM_WORD_LENGTH) {
                    keywords.add(tmp);
                }
            }

            // If there is enough keywords, we return
            if (keywords.size() >= MINIMUM_KEYWORDS_COUNT) {
                return keywords;
            }

            // Otherwise, we look for more keywords from the text by taking the more frequent words
            content = content.toLowerCase();
            content = content.replaceAll(PUNCTUATION_REGEX, "");
            content = content.replaceAll(WORD_WITH_LESS_THAN_4_CHARACTERS_REGEX, "");
            content = content.replaceAll(EXCESSIVE_SPACING_REGEX, " ");

            // Count word frequencies
            final Map<String, Integer> frequencies = new HashMap<>();
            for (final String word : content.split(" ")) {
                final Integer seen = frequencies.get(word);
                frequencies.put(word, seen == null ? 1 : seen + 1);
            }

            // Group the words by frequency; the sorted map keeps the highest count last
            final SortedMap<Integer, HashSet<String>> sortedWords = new TreeMap<>();
            for (Map.Entry<String, Integer> entry : frequencies.entrySet()) {
                HashSet<String> sameCount = sortedWords.get(entry.getValue());
                if (sameCount == null) {
                    sameCount = new HashSet<>();
                    sortedWords.put(entry.getValue(), sameCount);
                }
                sameCount.add(entry.getKey());
            }

            // Add the most frequent words until we reach the minimum keywords count.
            // Stop as well when the text runs out of words: lastKey() on an empty map
            // would throw NoSuchElementException.
            while (keywords.size() < MINIMUM_KEYWORDS_COUNT && !sortedWords.isEmpty()) {
                final Integer highestCount = sortedWords.lastKey();
                final HashSet<String> set = sortedWords.get(highestCount);
                final String keyword = set.iterator().next();

                set.remove(keyword);
                if (set.isEmpty()) {
                    sortedWords.remove(highestCount);
                }

                // NOTE(review): this uses '>' while the title filter above uses '>=' for
                // the same constant; kept as-is, confirm which one is intended.
                if (keyword.length() > MINIMUM_WORD_LENGTH) {
                    keywords.add(keyword);
                }
            }

            return keywords;
        }
    } catch (BoilerpipeProcessingException e) {
        // TODO
        e.printStackTrace();
    } catch (IOException e) {
        // TODO
        e.printStackTrace();
    }

    return null;
}

From source file:web.analyzer.utils.Utils.java

/**
 * Collects every anchor with an {@code href} from the document and classifies it as
 * "internal" or "external" by comparing its host, ignoring case, with {@code hostName}.
 * Links whose absolute URL does not pass {@code isValidUrl} are skipped.
 *
 * @param doc      the parsed page
 * @param hostName the host that counts as internal
 * @return the classified links together with the internal and external totals
 * @throws IOException if an accepted href cannot be turned into a {@link URL}
 */
public LinkResult getLinks(Document doc, String hostName) throws IOException {
    final List<Link> collected = new ArrayList<Link>();
    int internalCount = 0;
    int externalCount = 0;

    for (Element anchor : doc.select("a[href]")) {
        // "abs:href" resolves the link against the document's base URI.
        final String absoluteHref = anchor.attr("abs:href");
        if (!isValidUrl(absoluteHref)) {
            continue;
        }

        final String linkHost = new URL(absoluteHref).getHost();
        final boolean internal = linkHost.equalsIgnoreCase(hostName);
        if (internal) {
            internalCount++;
        } else {
            externalCount++;
        }
        collected.add(new Link(absoluteHref, internal ? "internal" : "external"));
    }

    return new LinkResult(collected, internalCount, externalCount);
}

From source file:com.amastigote.xdu.query.module.WaterAndElectricity.java

/**
 * Loads the meter info page and returns the text of every non-empty table cell, each
 * with its leading part cut off.
 *
 * @return the trimmed cell texts in document order
 * @throws IOException propagated from {@code getPage}
 */
private List<String> query_metInfo() throws IOException {
    final List<String> cellValues = new ArrayList<>();

    for (Element cell : getPage("", METINFO_SUFFIX).select("td")) {
        final String text = cell.text();
        if ("".equals(text)) {
            continue;
        }
        // NOTE(review): the search string below is empty, so indexOf returns 0 and exactly
        // one leading character is dropped. It looks like a delimiter character was lost
        // from this literal (encoding damage) - confirm against the upstream source.
        cellValues.add(text.substring(text.indexOf("") + 1));
    }

    // NOTE(review): the original comment describing the layout of the returned list was
    // unreadable (mis-encoded); it appeared to describe groups of three entries per meter.
    return cellValues;
}

From source file:me.vertretungsplan.parser.UntisMonitorParser.java

/**
 * Logs in, loads every configured monitor URL and assembles the substitution schedule.
 *
 * <p>For each loaded document, days are parsed either from the parts matching the
 * embedded-content selector (if configured) or from the whole document when it looks like
 * an Untis page. If a last-change selector is configured and matches, its text supplies
 * the schedule's last-change information.
 *
 * @return the assembled schedule, with website, classes and teachers filled in
 * @throws IOException                declared by the signature; presumably from loading pages
 * @throws JSONException              if the configuration lacks an expected entry
 * @throws CredentialInvalidException declared by the signature; presumably from the login
 */
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    loginResponse = new LoginHandler(scheduleData, credential, cookieProvider)
            .handleLoginWithResponse(executor, cookieStore);

    final SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);
    final JSONObject config = scheduleData.getData();

    // Resolve every configured URL (including date placeholders) into loaded documents.
    final JSONArray urls = config.getJSONArray(PARAM_URLS);
    final String encoding = config.optString(PARAM_ENCODING, null);
    final List<Document> docs = new ArrayList<>();
    for (int i = 0; i < urls.length(); i++) {
        final JSONObject urlEntry = urls.getJSONObject(i);
        final String urlTemplate = urlEntry.getString(SUBPARAM_URL);
        for (String dateUrl : ParserUtils.handleUrlWithDateFormat(urlTemplate)) {
            loadUrl(dateUrl, encoding, urlEntry.getBoolean(SUBPARAM_FOLLOWING), docs);
        }
    }

    // Matches timestamps such as "dd.MM.yyyy HH:mm", with an optional comma after the date.
    final Pattern timestampPattern = Pattern.compile("\\d\\d\\.\\d\\d\\.\\d\\d\\d\\d,? \\d\\d:\\d\\d");

    for (Document doc : docs) {
        if (config.has(PARAM_EMBEDDED_CONTENT_SELECTOR)) {
            for (Element part : doc.select(config.getString(PARAM_EMBEDDED_CONTENT_SELECTOR))) {
                schedule.addDay(parseMonitorDay(part, config));
            }
        } else if (doc.title().contains("Untis") || doc.html().contains("<!--<title>Untis")) {
            schedule.addDay(parseMonitorDay(doc, config));
        }
        // else Error

        if (config.has(PARAM_LAST_CHANGE_SELECTOR)) {
            final Element lastChangeElement = doc.select(config.getString(PARAM_LAST_CHANGE_SELECTOR)).first();
            if (lastChangeElement != null) {
                final String text = lastChangeElement.text();
                final Matcher timestamp = timestampPattern.matcher(text);
                // Fall back to the full text when it contains no recognizable timestamp.
                final String lastChange = timestamp.find() ? timestamp.group() : text;
                schedule.setLastChangeString(lastChange);
                schedule.setLastChange(ParserUtils.parseDateTime(lastChange));
            }
        }
    }

    if (config.has(PARAM_WEBSITE)) {
        schedule.setWebsite(config.getString(PARAM_WEBSITE));
    } else if (urls.length() == 1) {
        schedule.setWebsite(urls.getJSONObject(0).getString("url"));
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());

    return schedule;
}