Example usage of org.jsoup.select.Elements.first()

List of usage examples for org.jsoup.select Elements first

Introduction

On this page you can find usage examples for the first() method of org.jsoup.select.Elements.

Prototype

public Element first() 

Source Link

Document

Get the first matched element.

Usage

From source file:org.sbs.goodcrawler.extractor.selector.UrlElementCssSelector.java

/**
 * Resolves the target URL from the current document, fetches that page and runs the
 * nested selectors against it.
 *
 * <p>When {@code newDoc} is false and either {@code content} is already set or
 * {@code url} is non-blank, the current {@code content} is returned without any work.
 * Otherwise, if a parent document is present, {@code url} is taken from the first element
 * matching {@code value} (its text or the configured attribute, depending on
 * {@code $Attr}).
 *
 * @return the collected content map; {@code null} when nothing matched, when the page
 *         could not be fetched or parsed, when a required nested selector produced no
 *         result, or when no URL could be resolved
 * @throws ExtractException if an {@link IOException} occurs while fetching the page
 */
@SuppressWarnings("unchecked")
@Override
public HashMap<String, Object> getContent() throws ExtractException {
    // Nothing new to process: hand back whatever is currently held.
    if (!newDoc && (content != null || StringUtils.isNotBlank(this.url))) {
        return content;
    }

    // Derive the URL from the first element matched in the parent document.
    if (super.document != null) {
        Elements matches = super.document.select(value);
        if (matches.isEmpty()) {
            return null;
        }
        switch ($Attr) {
        case text:
            this.url = matches.first().text();
            break;
        default:
            this.url = matches.first().attr(attr);
            break;
        }
    }

    // Without a URL there is nothing to fetch; this is the only path that resets newDoc.
    if (!StringUtils.isNotBlank(this.url)) {
        newDoc = false;
        return null;
    }

    Document fetchedDoc = null;
    PageFetchResult fetchResult = null;
    try {
        WebURL target = new WebURL();
        target.setURL(this.url);
        fetchResult = FetchForeman.fetcher.fetchHeader(target);

        int status = fetchResult.getStatusCode();
        if (status == CustomFetchStatus.PageTooBig || status != HttpStatus.SC_OK) {
            return null;
        }

        Page page = new Page(target);
        if (!fetchResult.fetchContent(page) || !parser.parse(page, target.getURL())) {
            return null;
        }

        String html = new String(page.getContentData(), page.getContentCharset());
        fetchedDoc = Jsoup.parse(html, urlUtils.getBaseUrl(page.getWebURL().getURL()));
    } catch (IOException e) {
        e.printStackTrace();
        throw new ExtractException(e.getMessage());
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }

    content = Maps.newHashMap();
    if (selectors != null) {
        for (ElementCssSelector<?> selector : selectors) {
            Map<String, Object> extracted;
            if (selector instanceof FileElementCssSelector) {
                // File selectors additionally receive the results gathered so far.
                extracted = ((FileElementCssSelector) selector).setResult(content)
                        .setDocument(fetchedDoc).getContentMap();
            } else {
                extracted = selector.setDocument(fetchedDoc).getContentMap();
            }

            boolean nothingExtracted = extracted == null || extracted.size() == 0;
            if (nothingExtracted && selector.isRequired()) {
                return null;
            }
            if (!nothingExtracted) {
                content.putAll(extracted);
            }
        }
    }
    return content;
}

From source file:eu.masconsult.bgbanking.banks.dskbank.DskClient.java

/**
 * Logs in with the given credentials and builds the token used by later requests.
 *
 * <p>The credentials are posted to {@code BASE_URL}; the returned page is parsed with jsoup
 * and the user id and session id are read from the query string of the {@code mainForm}
 * action URL.
 *
 * @param username login name, sent as {@code PARAM_USERNAME}
 * @param password password, sent as {@code PARAM_PASSWORD}
 * @return the URL-encoded user-id/session-id pair, or {@code null} when the ids are missing
 *         and the page contains an element with class {@code redtext} (treated as bad
 *         credentials)
 * @throws IOException on transport failure; a {@code ClientProtocolException} is rewrapped
 *         with its cause preserved
 * @throws ParseException on a non-200 status, a response without a body, a missing
 *         {@code mainForm}, or missing ids without a captcha challenge
 * @throws CaptchaException when the ids are missing and the page holds exactly one
 *         {@code captcha_hkey} input; the exception carries the captcha URI
 */
@Override
public String authenticate(String username, String password)
        throws IOException, ParseException, CaptchaException {
    final ArrayList<NameValuePair> params = new ArrayList<NameValuePair>();
    params.add(new BasicNameValuePair(PARAM_USERNAME, username));
    params.add(new BasicNameValuePair(PARAM_PASSWORD, password));
    final HttpEntity entity;
    try {
        entity = new UrlEncodedFormEntity(params);
    } catch (final UnsupportedEncodingException e) {
        // this should never happen.
        throw new IllegalStateException(e);
    }
    String uri = BASE_URL + "?"
            + URLEncodedUtils.format(Arrays.asList(new BasicNameValuePair(XML_ID, AUTH_XML_ID)), ENCODING);
    Log.i(TAG, "Authenticating to: " + uri);
    final HttpPost post = new HttpPost(uri);
    post.addHeader(entity.getContentType());
    post.setHeader("Accept", "*/*");
    post.setEntity(entity);
    try {
        final HttpResponse resp = getHttpClient().execute(post);

        if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            throw new ParseException("login: unhandled http status " + resp.getStatusLine().getStatusCode()
                    + " " + resp.getStatusLine().getReasonPhrase());
        }

        // A 200 response may still come without a body; fail with a parse error instead of
        // letting EntityUtils reject the null entity.
        final HttpEntity responseEntity = resp.getEntity();
        if (responseEntity == null) {
            throw new ParseException("login: empty response");
        }

        String response = EntityUtils.toString(responseEntity);
        Log.v(TAG, "response = " + response);

        Document doc = Jsoup.parse(response, BASE_URL);
        Element mainForm = doc.getElementById("mainForm");
        if (mainForm == null) {
            throw new ParseException("login: missing mainForm");
        }

        String action = BASE_URL + mainForm.attr("action");
        Log.v(TAG, "action=" + action);
        UrlQuerySanitizer sanitizer = new UrlQuerySanitizer(action);
        String user_id = sanitizer.getValue(PARAM_USER_ID);
        String session_id = sanitizer.getValue(PARAM_SESSION_ID);

        if (user_id == null || "".equals(user_id) || session_id == null || "".equals(session_id)) {
            if (doc.getElementsByClass("redtext").size() > 0) {
                // bad authentication
                return null;
            }

            // TODO handle captcha
            Elements captcha = doc.select("input[name=captcha_hkey]");
            if (captcha.size() == 1) {
                String captchaHash = captcha.first().attr("value");
                String captchaUri = BASE_URL + "?"
                        + URLEncodedUtils
                                .format(Arrays.asList(new BasicNameValuePair(XML_ID, CAPTCHA_XML_ID),
                                        new BasicNameValuePair("captcha_key", captchaHash)), ENCODING);
                throw new CaptchaException(captchaUri);
            }
            throw new ParseException("no user_id or session_id: " + action);
        }

        return URLEncodedUtils.format(Arrays.asList(new BasicNameValuePair(PARAM_USER_ID, user_id),
                new BasicNameValuePair(PARAM_SESSION_ID, session_id)), ENCODING);
    } catch (ClientProtocolException e) {
        // Keep the original exception as the cause so the protocol failure stays diagnosable.
        // initCause is used because it is available on every Java/Android level.
        IOException wrapped = new IOException(e.getMessage());
        wrapped.initCause(e);
        throw wrapped;
    }
}

From source file:com.sinelead.car.club.NewsFragment.java

/**
 * Requests the mobile home page through {@code HttpCache} and, once the response arrives,
 * replaces {@code bannerList} with the links found inside the first {@code ul.slides}
 * element and notifies the pager adapter.
 *
 * <p>If the request fails or the page contains no {@code ul.slides} element, the current
 * banner list is left untouched.
 */
public void parseNewsUrl() {
    HttpCache httpCache = new HttpCache(context);
    httpCache.httpGet("http://m.xincheping.com/", new HttpCacheListener() {

        protected void onPreGet() {
            // Hook for work before the request (e.g. showing a progress bar); per the
            // original notes this runs on the UI thread. Nothing to do here.
        }

        protected void onPostGet(HttpResponse httpResponse, boolean isInCache) {
            // Per the original notes this runs on the UI thread after the request.
            if (httpResponse == null) {
                // get data fail
                return;
            }

            // get data success
            String html = httpResponse.getResponseBody();
            Document doc = Jsoup.parse(html);
            Elements uls = doc.select("ul.slides");

            // Elements.first() yields null for an empty selection; bail out instead of
            // dereferencing it when the page layout has no slide list.
            if (uls.isEmpty()) {
                return;
            }

            bannerList = uls.first().getElementsByTag("a");

            if (imagePagerAdapter != null) {
                imagePagerAdapter.setBannerList(bannerList);
                imagePagerAdapter.notifyDataSetChanged();
            }
        }
    });
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Copies the release date from the first {@code div#video_date td.text} element onto the
 * video.
 *
 * <p>The text is parsed as {@code yyyy-MM-dd}. When no element matches, or the text cannot
 * be parsed, the video is left unchanged.
 *
 * @param doc   parsed page to read from
 * @param video target whose release date is set
 */
private void setVideoReleaseDate(Document doc, Video video) {
    Elements dateCells = doc.select("div#video_date td.text");
    if (!CollectionUtils.isNotEmpty(dateCells)) {
        return;
    }

    String rawDate = dateCells.first().text();
    SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    try {
        video.setReleaseDate(dayFormat.parse(rawDate));
    } catch (ParseException ignored) {
        // Unparseable dates are skipped on purpose; the video keeps its current value.
    }
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Copies the duration in minutes from the first {@code div#video_length span.text} element
 * onto the video.
 *
 * <p>When no element matches, or its text is empty or not a valid integer, the video is left
 * unchanged. This mirrors the best-effort handling of the sibling score and release-date
 * setters instead of letting a {@link NumberFormatException} abort the crawl of the page.
 *
 * @param doc   parsed page to read from
 * @param video target whose duration is set
 */
private void setVideoDuration(Document doc, Video video) {
    Elements dmElements = doc.select("div#video_length span.text");
    if (!CollectionUtils.isNotEmpty(dmElements)) {
        return;
    }

    String durationMinutes = dmElements.first().text().trim();
    if (durationMinutes.length() == 0) {
        return;
    }

    try {
        video.setDurationMinutes(Integer.valueOf(durationMinutes));
    } catch (NumberFormatException ignored) {
        // Scraped text is not guaranteed to be numeric; skip the field rather than fail.
    }
}

From source file:org.confab.PhpBB3Parser.java

/**
 * Extracts the forums listed on a phpBB3 index page.
 *
 * <p>Every {@code li.row} inside a {@code ul[class=topiclist forums]} list becomes one
 * {@link Forum}. The first {@code a.forumtitle} link in the row supplies the forum's url and
 * title; any further {@code a.forumtitle} links in the same row are added as sub-forums,
 * each with its own url and title. The description is the text of the title link's parent
 * element.
 *
 * @param root   parsed index page
 * @param parent bulletin board passed to every created {@link Forum}
 * @return the forums in document order; empty when no forum list is found
 */
public List<Forum> parseForums(Document root, BulletinBoard parent) {
    Utilities.debug("parseForums");

    List<Forum> ret = new ArrayList<Forum>();

    // get table
    Elements forum_tables = root.select("ul[class=topiclist forums]");
    assert !forum_tables.isEmpty() : root.html();

    for (Element forum_table : forum_tables) {
        Elements els_li = forum_table.select("li.row");
        assert !els_li.isEmpty();
        for (Element el_li : els_li) {
            // Get the forum url
            Elements els_a = el_li.select("a.forumtitle");
            Element el_a = els_a.first();
            assert el_a != null;
            if (el_a == null) {
                // With assertions disabled, skip rows without a title link instead of
                // failing with a NullPointerException.
                Utilities.debug("row without a.forumtitle skipped");
                continue;
            }

            Forum new_forum = new Forum(parent);
            new_forum.url = el_a.attr("href");
            assert new_forum.url != null;
            Utilities.debug("new_forum.url : " + new_forum.url);

            // Get the title text
            new_forum.title = el_a.text();
            assert new_forum.title != null;
            Utilities.debug("new_forum.title : " + new_forum.title);

            // Remaining a elements are sub-forums. Iterate from index 1 rather than removing
            // the first entry, so the selection (and the document) is not mutated.
            for (int i = 1; i < els_a.size(); i++) {
                Element sub_a = els_a.get(i);
                Forum sub_forum = new Forum(parent);
                // Read from the sub-forum link itself, not from the main forum link.
                sub_forum.url = sub_a.attr("href");
                assert sub_forum.url != null;
                sub_forum.title = sub_a.text();
                assert sub_forum.title != null;
                new_forum.subForums.add(sub_forum);
                Utilities.debug("added subForum: " + sub_forum.title);
            }

            // Get the description/message of this topic
            String el_description = el_a.parent().text();
            if (el_description != null) {
                new_forum.description = el_description;
            } else {
                new_forum.description = "";
            }
            Utilities.debug("new_forum.description : " + new_forum.description);

            Utilities.debug("new_forum.parent.url : " + new_forum.parent.url);

            ret.add(new_forum);
            Utilities.debug("-----");
        }
    }
    Utilities.debug("end parseForums");
    return ret;
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Copies director, producer and distributor names from the page onto the video.
 *
 * <p>Each value is the text of the first element matching its selector
 * ({@code div#video_director}, {@code div#video_maker}, {@code div#video_label}, each
 * followed by {@code td.text}). A field whose selector matches nothing is not set.
 *
 * @param doc   parsed page to read from
 * @param video target that receives the names
 */
private void setVideoPeople(Document doc, Video video) {
    Elements directorCells = doc.select("div#video_director td.text");
    if (CollectionUtils.isNotEmpty(directorCells)) {
        video.setDirector(directorCells.first().text());
    }

    Elements makerCells = doc.select("div#video_maker td.text");
    if (CollectionUtils.isNotEmpty(makerCells)) {
        video.setProducer(makerCells.first().text());
    }

    Elements labelCells = doc.select("div#video_label td.text");
    if (CollectionUtils.isNotEmpty(labelCells)) {
        video.setDistributor(labelCells.first().text());
    }
}

From source file:com.github.binlee1990.spider.video.spider.PersonCrawler.java

/**
 * Copies the review score from the first {@code div#video_review td.text span.score}
 * element onto the video.
 *
 * <p>Parentheses are stripped from the text before it is converted to a {@code Float}.
 * When no element matches, the remaining text is blank, or the conversion or setter
 * throws, the video is left unchanged.
 *
 * @param doc   parsed page to read from
 * @param video target whose score is set
 */
private void setVideoScore(Document doc, Video video) {
    Elements scoreSpans = doc.select("div#video_review td.text span.score");
    if (!CollectionUtils.isNotEmpty(scoreSpans)) {
        return;
    }

    String withoutOpening = StringUtils.replace(scoreSpans.first().text(), "(", "");
    String bareScore = StringUtils.replace(withoutOpening, ")", "");
    if (!StringUtils.isNotBlank(bareScore)) {
        return;
    }

    try {
        video.setScore(Float.valueOf(bareScore));
    } catch (Exception ignored) {
        // Best effort: any failure while converting or storing the score is skipped.
    }
}

From source file:lolthx.autohome.buy.AutohomePriceListFetch.java

/**
 * Parses a price-list page and, for every {@code li.price-item} element, fills an
 * {@code AutohomePriceInfoBean} and calls {@code saveOnNotExist()} on it.
 *
 * <p>Items whose post time fails {@code isTime(postTime, start, end)} or whose id is blank
 * are skipped. Any exception raised while handling one item is printed and that item is
 * skipped; the loop continues with the next item.
 *
 * <p>NOTE(review): most string literals used as label prefixes and separators below are
 * empty ({@code ""}) or {@code "?"}. They look like non-ASCII text that was lost in an
 * encoding round-trip; confirm against the original source before relying on the
 * label-matching logic as shown.
 *
 * <p>NOTE(review): a single bean instance is created before the loop and reused for every
 * item, so a field set for one item stays in place for later items unless it is overwritten.
 * Confirm whether that is intended.
 *
 * @param result page HTML; a blank value makes the method return immediately
 * @param task   supplies the date window, url, project name and the {@code extra} string
 *               that is split on {@code ":"} into forum id and keyword
 * @throws Exception declared by the overridden method
 */
@Override
public void parse(String result, Task task) throws Exception {
    if (StringUtils.isBlank(result)) {
        return;
    }

    // Date window passed to isTime() for each item's post time.
    Date start = task.getStartDate();
    Date end = task.getEndDate();

    Document doc = Jsoup.parse(result);
    Elements lis = doc.select("li.price-item");

    // Shared across all iterations (see the review note in the method documentation).
    AutohomePriceInfoBean bean = new AutohomePriceInfoBean();

    for (Element li : lis) {

        try {
            // Post time: text of the first span under div.user-name, cut before a separator.
            Elements postTimeEl = li.select("div.user-name span");
            String postTime = "";
            if (!postTimeEl.isEmpty()) {
                postTime = StringUtils.trim(
                        StringUtils.substringBefore(postTimeEl.first().text(), "?").replaceAll("", ""));

                // Skip items rejected by the isTime() check.
                if (!isTime(postTime, start, end)) {
                    continue;
                }
            }
            bean.setPostTime(postTime);
            bean.setUrl(task.getUrl());
            // task.getExtra() is split on ":" into forum id (before) and keyword (after).
            bean.setForumId(StringUtils.substringBefore(task.getExtra(), ":"));
            bean.setProjectName(task.getProjectName());
            bean.setKeyword(StringUtils.substringAfter(task.getExtra(), ":"));

            // post id: the part of the share link's data-target after the last "_"
            Elements id = li.select("div.price-share a.share");
            if (!id.isEmpty()) {
                String idStr = id.first().attr("data-target");
                idStr = StringUtils.substringAfterLast(idStr, "_");
                if (StringUtils.isBlank(idStr)) {
                    continue;
                }

                bean.setId(idStr);
            }

            // user: link under div.user-name; the id is the last path segment of its href
            Elements user = li.select("div.user-name a");
            if (!user.isEmpty()) {
                String userUrl = user.first().absUrl("href");
                String userId = StringUtils.substringAfterLast(userUrl, "/");
                String userName = user.first().text();

                bean.setUserId(userId);
                bean.setUserUrl(userUrl);
                bean.setUserName(userName);
            }

            // Detail rows: each li's text is matched against a label prefix and the
            // remainder is stored in the corresponding bean field.
            Elements dataLis = li.select("div.price-item-bd li");
            for (Element dataLi : dataLis) {
                String data = dataLi.text();

                // car
                if (StringUtils.startsWith(data, "")) {
                    bean.setCar(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }

                // price
                if (StringUtils.startsWith(data, "")) {
                    bean.setPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }

                // guide price
                if (StringUtils.startsWith(data, "")) {
                    bean.setGuidePrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }

                // total price
                if (StringUtils.startsWith(data, "?")) {
                    bean.setTotalPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }

                // purchase tax
                if (StringUtils.startsWith(data, "")) {
                    bean.setPurchaseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }

                // commercial insurance
                if (StringUtils.startsWith(data, "?")) {
                    bean.setCommercialInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }

                // vehicle use tax
                if (StringUtils.startsWith(data, "")) {
                    bean.setVehicleUseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                // compulsory insurance
                if (StringUtils.startsWith(data, "")) {
                    bean.setCompulsoryInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                // license fee
                if (StringUtils.startsWith(data, "")) {
                    bean.setLicenseFee(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                // promotion
                if (StringUtils.startsWith(data, "?")) {
                    bean.setPromotion(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                // buy time
                if (StringUtils.startsWith(data, "")) {
                    bean.setBuyTime(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
                // purchase area: "province,city"; a single value is used for both fields
                if (StringUtils.startsWith(data, "")) {
                    String area = StringUtils.trim(StringUtils.substringAfter(data, ""));
                    String[] pAndC = StringUtils.splitByWholeSeparator(area, ",", 2);

                    if (pAndC.length == 1) {
                        bean.setBuyProvince(pAndC[0]);
                        bean.setBuyCity(pAndC[0]);
                    }

                    if (pAndC.length == 2) {
                        bean.setBuyProvince(pAndC[0]);
                        bean.setBuyCity(pAndC[1]);
                    }

                }
                // seller row: rating, seller link and phone number
                if (StringUtils.startsWith(data, "")) {
                    Elements level = dataLi.select("span.level");
                    // seller rating text
                    if (!level.isEmpty()) {
                        bean.setSellerComment(level.first().text());
                    }

                    // seller link: the id is the last path segment of its href
                    Elements seller = dataLi.select("a.title");
                    if (!seller.isEmpty()) {
                        String sellerUrl = seller.first().absUrl("href");
                        String sellerName = seller.first().text();
                        String sellerId = StringUtils.substringAfterLast(sellerUrl, "/");

                        bean.setSellerId(sellerId);
                        bean.setSellerName(sellerName);
                        bean.setSellerUrl(sellerUrl);
                    }

                    // seller phone
                    Elements sellerPhone = dataLi.select("em.phone-num");
                    if (!sellerPhone.isEmpty()) {
                        bean.setSellerPhone(sellerPhone.first().text());
                    }

                    // seller address: not extracted (disabled in the original)
                    // Elements sellerAddress =
                    // dataLi.select("em.phone-num");

                }
                // buy feeling
                if (StringUtils.startsWith(data, "?")) {
                    bean.setBuyFeeling(StringUtils.trim(StringUtils.substringAfter(data, "")));
                }
            }
            bean.saveOnNotExist();
        } catch (Exception e) {
            // A failure on one item must not stop the remaining items.
            e.printStackTrace();
            continue;
        }
    }
}

From source file:eu.masconsult.bgbanking.banks.dskbank.DskClient.java

/**
 * Downloads the accounts page for the given session and converts the rows of the first
 * table inside {@code PageContent} into bank accounts.
 *
 * <p>Rows for which {@code obtainBankAccountFromHtmlTableRow} returns {@code null} are
 * skipped. An account without a currency inherits the currency of the closest preceding
 * account that had one.
 *
 * @param authToken query-string fragment appended to the request URI
 * @return the accounts in table order
 * @throws IOException on transport failure or when the response body cannot be read
 * @throws ParseException on a non-200 status, a response without a body, or when the
 *         expected page structure ({@code PageContent}, a table, table rows) is missing
 * @throws AuthenticationException when {@code checkLoggedIn} rejects the returned page
 */
@Override
public List<RawBankAccount> getBankAccounts(String authToken)
        throws IOException, ParseException, AuthenticationException {
    String uri = BASE_URL + "?" + URLEncodedUtils.format(
            Arrays.asList(new BasicNameValuePair(XML_ID, LIST_ACCOUNTS_XML_ID)), ENCODING) + "&" + authToken;

    // Get the accounts list
    Log.i(TAG, "Getting from: " + uri);
    final HttpGet get = new HttpGet(uri);
    get.setHeader("Accept", "*/*");

    DefaultHttpClient httpClient = getHttpClient();

    Log.v(TAG, "sending " + get.toString());
    final HttpResponse resp = httpClient.execute(get);

    if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
        throw new ParseException("getBankAccounts: unhandled http status "
                + resp.getStatusLine().getStatusCode() + " " + resp.getStatusLine().getReasonPhrase());
    }

    // A 200 response may still come without a body; report it as a parse problem rather
    // than failing with a NullPointerException.
    HttpEntity entity = resp.getEntity();
    if (entity == null) {
        throw new ParseException("getBankAccounts: empty response");
    }

    // Always close the content stream so the underlying connection can be released,
    // including when parsing fails.
    final java.io.InputStream body = entity.getContent();
    final Document doc;
    try {
        doc = Jsoup.parse(body, "utf-8", BASE_URL);
    } finally {
        body.close();
    }

    if (!checkLoggedIn(doc)) {
        throw new AuthenticationException("session expired!");
    }

    Element content = doc.getElementById("PageContent");
    if (content == null) {
        throw new ParseException("getBankAccounts: can't find PageContent");
    }

    Elements tables = content.getElementsByTag("table");
    if (tables == null || tables.size() == 0) {
        throw new ParseException("getBankAccounts: can't find table in PageContent");
    }

    Elements rows = tables.first().getElementsByTag("tr");
    if (rows == null || rows.size() == 0) {
        throw new ParseException("getBankAccounts: first table is empty in PageContent");
    }

    List<RawBankAccount> bankAccounts = new ArrayList<RawBankAccount>(rows.size());

    // Rows without their own currency reuse the last currency seen.
    String lastCurrency = null;
    for (Element row : rows) {
        RawBankAccount bankAccount = obtainBankAccountFromHtmlTableRow(row);
        if (bankAccount == null) {
            continue;
        }
        if (bankAccount.getCurrency() == null) {
            bankAccount.setCurrency(lastCurrency);
        } else {
            lastCurrency = bankAccount.getCurrency();
        }
        bankAccounts.add(bankAccount);
    }

    return bankAccounts;
}