Example usage for org.jsoup.nodes Element select

List of usage examples for org.jsoup.nodes Element select

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:mobi.jenkinsci.ci.client.JenkinsClient.java

/**
 * Extracts the commit id from a changeset row.
 *
 * <p>Takes the first {@code div[class=changeset-message]} in the row, then the first
 * {@code <b>} inside it, and searches the string form of that element's first child node
 * for the text {@code "Commit <id>"}.
 *
 * @param row the row element to inspect
 * @return the token that follows {@code "Commit "}, or {@code null} when the row has no
 *         changeset message, no {@code <b>} element with content, or no matching text
 */
private String getCommitIdFromRow(final Element row) {
    final Element fullChangeDesc = row.select("div[class=changeset-message]").first();
    if (fullChangeDesc == null) {
        return null;
    }
    final Element message = fullChangeDesc.select("b").first();
    // first() yields null when nothing matches, and childNode(0) throws on an element
    // without children; both cases simply mean "no commit id in this row".
    if (message == null || message.childNodes().isEmpty()) {
        return null;
    }
    final String messageText = message.childNode(0).toString();
    final Matcher commitMatch = Pattern.compile("Commit ([^ ]+)").matcher(messageText);
    return commitMatch.find() ? commitMatch.group(1) : null;
}

From source file:net.parser.JobParser.java

/**
 * Checks whether the first {@code .searchlist} element of the document carries the
 * "Regionalni poslovi" heading.
 *
 * @return {@code true} if a first {@code .searchlist} exists and the text of its
 *         {@code h2 span} contains "Regionalni poslovi"; {@code false} otherwise
 */
public boolean regionalJobFirst() {
    Elements firstList = doc.select(".searchlist").eq(0);
    if (firstList.isEmpty()) {
        return false;
    }
    String heading = firstList.first().select("h2 span").text();
    return heading.contains("Regionalni poslovi");
}

From source file:gov.medicaid.screening.dao.impl.LicensedProviderCommonDAO.java

/**
 * Parse the ProviderProfile information.
 * /*from  w ww .  ja v a 2s . co m*/
 * @param header
 *            the header element
 * @param body
 *            the body element
 * @throws ParsingException
 *             if any error occurs when parsing
 * @return parsed ProviderProfile
 */
private ProviderProfile parseProfile(Element header, Element body) throws ParsingException {
    try {
        String name = header.select("a").html();
        String str1 = body.select("td").get(0).html();
        String str2 = body.select("td").get(1).html();

        String licenseNumber = Util.getStringInBetween(str2, "License number:", "<br />");
        String[] sp = str1.split("<br />");
        String address = "", phone = "", county = "", city = "", state = "", zipcode = "";
        address = sp[0].trim();
        if (sp.length > 3) {
            phone = sp[2].trim();
        }
        county = sp[sp.length - 1].trim();
        sp = sp[1].trim().split(",");
        city = sp[0].trim();
        sp = sp[1].trim().split(" ");
        state = sp[0].trim();
        zipcode = sp[1].trim();
        ProviderProfile profile = new ProviderProfile();
        // name
        Business business = new Business();
        profile.setBusiness(business);
        business.setName(name);

        // address
        List<Address> addresses = new ArrayList<Address>();
        Address addressObj = new Address();
        addresses.add(addressObj);
        profile.setAddresses(addresses);
        addressObj.setCity(city);
        addressObj.setCounty(county);
        addressObj.setState(state);
        addressObj.setLocation(address);
        addressObj.setZipcode(zipcode);

        // phone
        profile.setContactPhoneNumber(phone);

        // license
        List<License> licenses = new ArrayList<License>();
        License licenseObj = new License();
        licenses.add(licenseObj);
        profile.setLicenses(licenses);
        licenseObj.setLicenseNumber(licenseNumber);
        profile.setProviderType(getProviderType());
        return profile;
    } catch (Throwable e) {
        throw new ParsingException("Failed to parse the html", e);
    }
}

From source file:lolth.autohome.buy.AutohomeBuyInfoListTaskFetch.java

/**
 * Parses every {@code li.price-item} of the page into an {@code AutohomeBuyInfoBean}
 * and stores it through {@code persistOnNotExist()}.
 *
 * <p>For each item the post id, user, post time and the individual data rows under
 * {@code div.price-item-bd li} are copied into the bean. An item whose share link
 * yields a blank id is skipped; an item without a share link is still processed,
 * just without an id.
 *
 * <p>NOTE(review): the label literals passed to {@code startsWith},
 * {@code substringAfter} and {@code substringBefore} show up here as {@code ""} or
 * {@code "?"}. They look like non-ASCII labels that were lost to an encoding problem;
 * confirm against the original file. As written, an empty prefix presumably matches
 * every data row, so the rows cannot be told apart.
 *
 * @param doc  the fetched page
 * @param task the fetch task; its URL and extra value are copied into each bean
 * @throws Exception declared by the signature; nothing in this body throws explicitly
 */
@Override
protected void parsePage(Document doc, FetchTask task) throws Exception {
    Elements lis = doc.select("li.price-item");

    for (Element li : lis) {
        AutohomeBuyInfoBean bean = new AutohomeBuyInfoBean();
        bean.setUrl(task.getUrl());
        bean.setForumId(task.getExtra());

        // post id: the part of the share link's data-target after the last '_'
        Elements id = li.select("div.price-share a.share");
        if (!id.isEmpty()) {
            String idStr = id.first().attr("data-target");
            idStr = StringUtils.substringAfterLast(idStr, "_");
            if (StringUtils.isBlank(idStr)) {
                // no usable id: skip this item entirely
                continue;
            }

            bean.setId(idStr);
        }

        // user: the id is the last path segment of the profile link
        Elements user = li.select("div.user-name a");
        if (!user.isEmpty()) {
            String userUrl = user.first().absUrl("href");
            String userId = StringUtils.substringAfterLast(userUrl, "/");
            String userName = user.first().text();

            bean.setUserId(userId);
            bean.setUserUrl(userUrl);
            bean.setUserName(userName);
        }

        // post time: text of the span before the (garbled) separator literal
        Elements postTime = li.select("div.user-name span");
        if (!postTime.isEmpty()) {
            bean.setPostTime(StringUtils.trim(StringUtils.substringBefore(postTime.first().text(), "?")));
        }

        // data rows: each row is matched by its leading label and the remainder is stored
        Elements dataLis = li.select("div.price-item-bd li");
        for (Element dataLi : dataLis) {
            String data = dataLi.text();

            if (StringUtils.startsWith(data, "")) {
                bean.setCar(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "")) {
                bean.setPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "")) {
                bean.setGuidePrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "?")) {
                bean.setTotalPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "")) {
                bean.setPurchaseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "?")) {
                bean.setCommercialInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "")) {
                bean.setVehicleUseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setCompulsoryInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setLicenseFee(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "?")) {
                bean.setPromotion(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setBuyTime(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                // purchase area: "province,city"; a single value is used for both fields
                String area = StringUtils.trim(StringUtils.substringAfter(data, ""));
                String[] pAndC = StringUtils.splitByWholeSeparator(area, ",", 2);

                if (pAndC.length == 1) {
                    bean.setBuyProvince(pAndC[0]);
                    bean.setBuyCity(pAndC[0]);
                }

                if (pAndC.length == 2) {
                    bean.setBuyProvince(pAndC[0]);
                    bean.setBuyCity(pAndC[1]);
                }

            }
            if (StringUtils.startsWith(data, "")) {
                Elements level = dataLi.select("span.level");
                // seller comment: text of span.level
                if (!level.isEmpty()) {
                    bean.setSellerComment(level.first().text());
                }

                // seller: the id is the last path segment of the seller link
                Elements seller = dataLi.select("a.title");
                if (!seller.isEmpty()) {
                    String sellerUrl = seller.first().absUrl("href");
                    String sellerName = seller.first().text();
                    String sellerId = StringUtils.substringAfterLast(sellerUrl, "/");

                    bean.setSellerId(sellerId);
                    bean.setSellerName(sellerName);
                    bean.setSellerUrl(sellerUrl);
                }

                // seller phone
                Elements sellerPhone = dataLi.select("em.phone-num");
                if (!sellerPhone.isEmpty()) {
                    bean.setSellerPhone(sellerPhone.first().text());
                }

                // seller address (not extracted)
                // Elements sellerAddress = dataLi.select("em.phone-num");

            }
            if (StringUtils.startsWith(data, "?")) {
                bean.setBuyFeeling(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
        }

        log.debug("Bean : {}", bean);

        bean.persistOnNotExist();
    }
}

From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesArticleExtractor.java

/**
 * Builds an {@code Article} from the HTML of a single debate contribution.
 *
 * <p>Everything article-specific is read from the first {@code article.rfd} element;
 * the debate title, debate URL, debate description and document URL are read from the
 * whole document.
 *
 * @param html the page markup
 * @return the populated article
 * @throws ParseException if the publication date matches neither
 *                        {@code "MMM dd, yyyy, hh:mm aaa"} nor {@code "MMM dd, yyyy"}
 * @throws IOException    if the page contains no {@code article.rfd} element
 */
public Article extractArticle(String html) throws ParseException, IOException {
    Article result = new Article();

    Document doc = Jsoup.parse(html, getBaseName());

    Element articleRoot = doc.select("article.rfd").first();
    if (articleRoot == null) {
        throw new IOException("Cannot find article.rfd element");
    }

    // timestamp: try the full date-time form first, then the date-only form
    // (e.g. "June 24, 2015")
    String dateText = articleRoot.select("p.pubdate").text().replaceAll("Updated[\\s]+", "");
    Date timestamp;
    try {
        timestamp = new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH).parse(dateText);
    } catch (ParseException e) {
        timestamp = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH).parse(dateText);
    }
    result.setTimestamp(timestamp);

    // title
    String rawTitle = articleRoot.select("h1").text();
    result.setTitle(TextCleaningUtils.normalizeWithParagraphs(rawTitle));

    // text: one line per paragraph directly under the post body
    StringBuilder body = new StringBuilder();
    for (Element paragraph : articleRoot.select("div.nytint-post > p")) {
        body.append(paragraph.text()).append("\n");
    }
    result.setText(TextCleaningUtils.normalizeWithParagraphs(body.toString()));

    // debate title
    String rawDebateTitle = doc.select("div.nytint-discussion-overview > h2").text();
    result.setDebateTitle(TextCleaningUtils.normalizeWithParagraphs(rawDebateTitle));

    // debate url
    Element debateLink = doc.select("div.nytint-discussion-overview > h2 > a").iterator().next();
    result.setDebateUrl(debateLink.attr("href"));

    // document url
    result.setUrl(doc.select("meta[name=communityAssetURL]").attr("content"));

    // debate description: the first child node of the overview paragraph
    Element overviewParagraph = doc.select("div.nytint-discussion-overview > p").iterator().next();
    TextNode leadingText = (TextNode) overviewParagraph.childNodes().iterator().next();
    result.setDebateDescription(TextCleaningUtils.normalizeWithParagraphs(leadingText.text()));

    // author
    Element mugshot = articleRoot.select("div.nytint-mugshots > img").iterator().next();
    result.setAuthor(mugshot.attr("alt"));

    // topics
    for (Element tagLink : articleRoot.select("p.nytint-tags > a")) {
        result.getTopics().add(tagLink.attr("href"));
    }

    return result;
}

From source file:de.geeksfactory.opacclient.apis.BiBer1992.java

/**
 * Parses the reservation table of an account page into {@code ReservedItem}s.
 *
 * <p>Rows are taken from {@code form[name=vorml] table tr}; the first row and every row
 * whose first cell is a {@code th} are skipped. Which column feeds which field is taken
 * from {@code data.reservationtable} (field name to column index, negative meaning
 * "not available"); when that object is missing a built-in default mapping is used.
 *
 * @param doc  the parsed page, or {@code null} if the request produced an error page
 * @param data the library configuration
 * @return the reservations found; empty when {@code doc} is {@code null}
 * @throws JSONException if the column mapping cannot be read
 */
static List<ReservedItem> parseResList(Document doc, JSONObject data) throws JSONException {
    List<ReservedItem> reservations = new ArrayList<>();
    if (doc == null) {
        // error message as html result
        return reservations;
    }

    // column mapping
    JSONObject columnMap;
    if (data.has("reservationtable")) {
        columnMap = data.getJSONObject("reservationtable");
    } else {
        // reservations not specifically supported, let's just try it
        // with default values but fail silently
        columnMap = new JSONObject();
        columnMap.put("author", 3);
        columnMap.put("availability", 6);
        columnMap.put("branch", -1);
        columnMap.put("cancelurl", -1);
        columnMap.put("expirationdate", 5);
        columnMap.put("title", 3);
    }

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    Elements rows = doc.select("form[name=vorml] table tr");

    // start at 1: the first row is the title row
    for (int rowIdx = 1; rowIdx < rows.size(); rowIdx++) {
        Element row = rows.get(rowIdx);
        if ("th".equals(row.child(0).tagName())) {
            continue;
        }

        ReservedItem item = new ReservedItem();
        item.setCancelData(row.select("input[type=checkbox]").attr("name"));

        // columns: all elements of one media
        for (Iterator<?> fields = columnMap.keys(); fields.hasNext();) {
            String key = (String) fields.next();
            int column = columnMap.getInt(key);
            if (column < 0) {
                continue;
            }

            String value = row.child(column).text().trim();
            switch (key) {
            case "author":
                value = findTitleAndAuthor(value)[1];
                break;
            case "title":
                value = findTitleAndAuthor(value)[0];
                break;
            case "availability":
            case "expirationdate":
                // a cell that is not a date is stored as free-text status instead
                try {
                    value = fmt.parseLocalDate(value).toString();
                } catch (IllegalArgumentException notADate) {
                    key = "status";
                }
                break;
            }

            if (value != null && !value.isEmpty()) {
                item.set(key, value);
            }
        }
        reservations.add(item);
    }
    return reservations;
}

From source file:net.devietti.ArchConfMapServlet.java

/**
 * Fetch info for a list of conferences from WikiCFP.
 *
 * <p>Each hit in the result table spans two consecutive rows: the first holds the
 * conference name and the link to its CFP, the second holds dates, location and
 * deadline. Hits whose short name is not in {@code CONFERENCE_NAMES}, that lack a
 * second row or a {@code start - end} date range, or whose link carries no numeric
 * {@code eventid} are left out of the result.
 *
 * @param confs the conference names to search for; joined with {@code +} into the query
 * @return the matching conferences, in page order
 * @throws IOException declared by the signature; presumably raised by {@code getURL}
 *                     when the page cannot be fetched
 */
private List<Conf> getConfInfo(List<String> confs) throws IOException {
    String query = StringUtils.join(confs, "+");
    List<Conf> results = new LinkedList<Conf>();

    /*
     * NB: year=f returns hits for this year and future years. This is exactly what we want, since
     * we automatically discard conferences that have already happened.
     */
    Document doc = getURL("http://www.wikicfp.com/cfp/servlet/tool.search?year=f&q=" + query);

    Elements rows = doc.select("div[class=contsec] table table tr");
    for (Iterator<Element> iter = rows.iterator(); iter.hasNext();) {
        final Element firstRow = iter.next();
        final Elements confName = firstRow.select("td a");
        if (confName.isEmpty()) {
            continue;
        }

        final Conf conf = new Conf();

        // make sure we match one of the conferences we're interested in
        // (binarySearch relies on CONFERENCE_NAMES being sorted)
        String cn = confName.first().text().split(" ")[0];
        int found = Arrays.binarySearch(CONFERENCE_NAMES, cn);
        if (found < 0) {
            continue; // not found
        }

        final String confFullName = firstRow.select("td").get(1).text();
        // don't match other ICS conferences, eg Information, Communication, Society
        if (CONFERENCE_NAMES[found].equals("ICS")) {
            if (!confFullName.toLowerCase().contains("supercomputing")) {
                continue;
            }
        }
        // don't match other CC conferences, eg Creative Construction
        if (CONFERENCE_NAMES[found].equals("CC")) {
            if (!confFullName.toLowerCase().contains("compiler")) {
                continue;
            }
        }

        conf.name = confName.first().text();

        /*
         * we found a hit! The conference information is split across two <tr> table elements.
         * Conference name and link to cfp are in the first <tr>, and dates, location and deadline
         * in the second.
         */

        if (!iter.hasNext()) {
            // truncated table: the hit has no detail row, so there is nothing more to read
            break;
        }
        final Element secondRow = iter.next();
        String dates = secondRow.select("td").first().text();
        final int dash = dates.indexOf('-');
        if (dash < 0) {
            // no "start - end" range in this cell; skip the hit instead of failing in substring
            continue;
        }
        String startDate = dates.substring(0, dash).trim();
        conf.start = cfpDateFormat.parseDateTime(startDate);
        conf.end = cfpDateFormat.parseDateTime(dates.substring(dash + 1).trim());

        conf.dates = cfpDateFormat.print(conf.start) + " - " + cfpDateFormat.print(conf.end);
        if (conf.start.year().equals(conf.end.year())
                && conf.start.monthOfYear().equals(conf.end.monthOfYear())) {
            // same month and year: use the compact "Month d1-d2 yyyy" form
            conf.dates = monthFormat.print(conf.start) + " " + dayFormat.print(conf.start) + "-"
                    + dayFormat.print(conf.end) + " " + yearFormat.print(conf.start);
        }

        String deadline = secondRow.select("td").get(2).text().trim();
        if (deadline.contains("(")) { // abstract deadline may be in parentheses
            deadline = deadline.substring(0, deadline.indexOf('(')).trim();
        }
        conf.deadline = cfpDateFormat.parseDateTime(deadline);

        conf.url = "http://www.wikicfp.com" + confName.attr("href");
        /*
         * extract the WikiCFP eventid from the link, so that, later on, the client can pull the
         * cfp page and get the direct conference site link.
         */

        com.shopobot.util.URL url = new com.shopobot.util.URL(conf.url);
        String[] eid = url.getParameters("eventid");
        if (0 == eid.length) {
            continue;
        }
        try {
            conf.eventid = Integer.valueOf(eid[0]);
        } catch (NumberFormatException e) {
            // report the offending value itself, not the array reference
            error("invalid event id " + eid[0]);
            continue;
        }

        conf.location = secondRow.select("td").get(1).text();

        results.add(conf);
    }
    return results;
}

From source file:net.parser.JobParser.java

/**
 * Decides from the {@code .searchlist} elements of the document whether this is the
 * last result page.
 *
 * @return {@code true} when a second {@code .searchlist} exists or when there is none
 *         at all; otherwise whether the text of the only list's {@code h2 span}
 *         contains "Regionalni poslovi"
 */
public boolean checkIfLastPage() {
    if (!doc.select(".searchlist").eq(1).isEmpty()) {
        return true;
    }

    Elements onlyList = doc.select(".searchlist").eq(0);
    if (onlyList.isEmpty()) {
        return true;
    }
    String heading = onlyList.first().select("h2 span").text();
    return heading.contains("Regionalni poslovi");
}

From source file:mobi.jenkinsci.ci.client.JenkinsClient.java

/**
 * Builds the {@code Issue} referenced by a changeset row.
 *
 * <p>The issue link is the first {@code a} inside the first {@code pre} of the row's
 * {@code div[class=changeset-message]}.
 *
 * @param row the row element to inspect
 * @return the issue built from the link's {@code href}, its {@code tooltip} attribute
 *         and the {@code src} of its icon, or {@code null} when the row has no changeset
 *         message, no {@code pre} block or no link inside it
 * @throws MalformedURLException declared by the signature; presumably raised by
 *                               {@code getUrl} for an invalid address
 */
private Issue getIssueFromRow(final Element row) throws MalformedURLException {
    final Element fullChangeMessage = row.select("div[class=changeset-message]").first();
    if (fullChangeMessage == null) {
        return null;
    }

    // first() yields null when the message has no <pre>; treat that like "no issue"
    final Element preformatted = fullChangeMessage.select("pre").first();
    if (preformatted == null) {
        return null;
    }

    final Element issueLink = preformatted.select("a").first();
    if (issueLink == null) {
        return null;
    }

    // NOTE(review): issueIcon is null when the link contains no <img>; whether getUrl
    // accepts a null element is not visible from here - confirm.
    final Element issueIcon = issueLink.select("img").first();
    return new Issue(getUrl(issueLink, "href"), issueLink.attr("tooltip"), getUrl(issueIcon, "src"));
}

From source file:de.geeksfactory.opacclient.apis.Littera.java

/**
 * Returns the text of the cell that follows a label cell in a detail table.
 *
 * @param detailTable the table element to search
 * @param pattern     regular expression matched against the own text of each
 *                    {@code td.label}; it is inserted verbatim into the selector
 * @return the text of the element right after the first matching label, or {@code null}
 *         when no label matches or the label has no following sibling element
 */
private String getCellContent(Element detailTable, String pattern) {
    final Element label = detailTable.select("td.label:matchesOwn(" + pattern + ")").first();
    if (label == null) {
        return null;
    }
    // nextElementSibling() is null when the label is the last cell of its row
    final Element valueCell = label.nextElementSibling();
    return valueCell == null ? null : valueCell.text();
}