Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find usage examples for the select method of org.jsoup.nodes.Document.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:de.geeksfactory.opacclient.apis.Littera.java

/**
 * Appends up to three ORDER dropdown fields, one per sort row found in the search form.
 *
 * <p>The page is fetched from {@code getApiUrl() + "&mode=a"} and the rows are looked up as
 * {@code #sort_editor tr.sort_0} .. {@code tr.sort_2}. Each field starts with an empty
 * entry; the option carrying a {@code selected} attribute is inserted at index 0, all other
 * options are appended.
 *
 * <p>A row that is missing from the page (or that has no {@code td}) is skipped instead of
 * causing a {@link NullPointerException}, because {@code Elements.first()} returns
 * {@code null} when nothing matched.
 *
 * @param fields list the generated fields are added to
 * @throws IOException   declared for the page fetch
 * @throws JSONException declared by the signature; not thrown directly in this method
 */
protected void addSortingSearchFields(List<SearchField> fields) throws IOException, JSONException {
    final String html = httpGet(getApiUrl() + "&mode=a", getDefaultEncoding());
    final Document doc = Jsoup.parse(html);
    for (int i = 0; i < 3; i++) {
        final Element tr = doc.select("#sort_editor tr.sort_" + i).first();
        if (tr == null) {
            // No such sort row on this page: nothing to build a field from.
            continue;
        }
        final Element labelCell = tr.select("td").first();
        if (labelCell == null) {
            // Row without any cell: there is no label to show for the field.
            continue;
        }

        final DropdownSearchField field = new DropdownSearchField();
        field.setMeaning(SearchField.Meaning.ORDER);
        field.setId("sort_" + i);
        field.setDisplayName(labelCell.text());
        field.addDropdownValue("", "");
        for (final Element option : tr.select(".crit option")) {
            if (option.hasAttr("selected")) {
                // The preselected criterion goes to the front of the dropdown.
                field.addDropdownValue(0, option.attr("value"), option.text());
            } else {
                field.addDropdownValue(option.attr("value"), option.text());
            }
        }
        fields.add(field);
    }
}

From source file:io.seldon.importer.articles.dynamicextractors.FirstElementAttrValueDateWithFormatDynamicExtractor.java

/**
 * Reads a date from an attribute of the first element matching a CSS selector and
 * re-formats it as {@code yyyy-MM-dd HH:mm:ss}.
 *
 * <p>{@code attributeDetail.extractor_args} must hold at least three entries: the CSS
 * selector, the date format used to parse the attribute value, and one or more attribute
 * names. The attribute names are tried in order and the first non-blank value wins.
 *
 * @param attributeDetail carries the extractor arguments described above
 * @param url             not used by this extractor
 * @param articleDoc      document the selector is evaluated against
 * @return the re-formatted date, or {@code null} when there are too few arguments, the
 *         selector is blank or matches nothing, or the value cannot be parsed
 * @throws Exception declared by the signature
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {

    String attrib_value = null;
    String dateFormatString = null;

    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 3)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        dateFormatString = attributeDetail.extractor_args.get(1);
        // Check the selector BEFORE handing it to jsoup: the original ran select() first,
        // so a blank selector reached the selector parser instead of being ignored.
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            if (element != null) {
                // Entries 0 and 1 are the selector and the date format; attribute names start at 2.
                for (int i = 2; i < attributeDetail.extractor_args.size(); i++) {
                    String value_name = attributeDetail.extractor_args.get(i);
                    if (element.attr(value_name) != null) {
                        attrib_value = element.attr(value_name);
                        if (StringUtils.isNotBlank(attrib_value)) {
                            break;
                        }
                    }
                }
            }
        }
    }

    if ((attrib_value != null) && (dateFormatString != null)) {
        String pubtext = attrib_value;
        // Fixed locale on the output side too, so the result does not depend on the
        // JVM default locale (the parser below already uses Locale.ENGLISH).
        SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
        DateFormat df = new SimpleDateFormat(dateFormatString, Locale.ENGLISH);
        Date result = null;
        try {
            result = df.parse(pubtext);
        } catch (ParseException e) {
            logger.info("Failed to parse date with format [" + dateFormatString + "] " + pubtext);
        }

        if (result != null) {
            String attrib_value_orig = attrib_value;
            attrib_value = dateFormatter.format(result);
            String msg = "Extracted date [" + attrib_value_orig + "] - > [" + attrib_value + "]";
            logger.info(msg);
        } else {
            logger.error("Failed to parse date " + pubtext);
            // An unparseable value is reported as "nothing extracted".
            attrib_value = null;
        }
    }

    return attrib_value;
}

From source file:gov.medicaid.screening.dao.impl.BusinessLienDAOBean.java

/**
 * Parses the excluded provider profile details page.
 *
 * @param page the details page/*w  ww. j a  v a  2  s  . c om*/
 * @return the parsed license details
 * @throws ParsingException if the expected tags were not found
 */
private ProviderProfile parseProfile(Document page) throws ParsingException {
    ProviderProfile profile = new ProviderProfile();

    // business
    String businessName = page.select("#searchItemDetail #recordReview h3").text();
    Business business = new Business();
    profile.setBusiness(business);
    business.setName(businessName);

    Elements detailMaster = page.select("#searchItemDetail #RecordDetailMaster #filingSummary dl");
    // business type
    BusinessType businessType = new BusinessType();
    business.setType(businessType);
    businessType.setName(getValuePairOfLabel(detailMaster, "Business Type"));

    // statute
    business.setStatute(getValuePairOfLabel(detailMaster, "MN Statute"));

    // file number
    business.setFileNumber(getValuePairOfLabel(detailMaster, "File Number"));

    // home jurisdiction
    business.setHomeJurisdiction(getValuePairOfLabel(detailMaster, "Home Jurisdiction"));

    // filing date
    Date filingDate = parseDate(getValuePairOfLabel(detailMaster, "Filing Date"), DATE_FORMAT);
    if (filingDate != null) {
        business.setFilingDate(filingDate);
    }

    // status
    BusinessStatus status = new BusinessStatus();
    status.setName(getValuePairOfLabel(detailMaster, "Status"));
    business.setStatus(status);

    // renewal date
    Date renewalDate = parseDate(getValuePairOfLabel(detailMaster, "Renewal Due Date:"), DATE_FORMAT);
    if (renewalDate != null) {
        business.setRenewalDueDate(renewalDate);
    }

    // registered office address
    business.setRegisteredOfficeAddress(
            parseAddress(getValuePairOfLabel(detailMaster, "Registered Office Address")));

    // registered agents
    ProviderProfile agent = new ProviderProfile();
    List<ProviderProfile> agents = new ArrayList<ProviderProfile>();
    business.setRegisteredAgents(agents);
    agents.add(agent);
    agent.setAgency(getValuePairOfLabel(detailMaster, "Registered Agent(s)"));

    List<FilingHistory> filingHistories = new ArrayList<FilingHistory>();
    business.setFilingHistory(filingHistories);

    Elements fileHistories = page.select("#filing table tr");
    for (Element fileHistory : fileHistories) {
        FilingHistory fh = new FilingHistory();
        Date fileDate = parseDate(fileHistory.select("td.date").text(), DATE_FORMAT);
        if (fileDate != null) {
            fh.setDate(fileDate);
        }
        fh.setDescription(fileHistory.select("td.action").text());
        filingHistories.add(fh);
    }

    return profile;
}

From source file:com.elevenpaths.googleindexretriever.GoogleSearch.java

/**
 * Fetches the data for the current query and returns every element matching {@code .st}.
 *
 * @return all elements selected by {@code .st}
 * @throws EmptyQueryException when the query is empty
 */
public Elements getResultSpam()
        throws EmptyQueryException, ManyResultsException, CaptchaException, UnsupportedEncodingException {
    if (!this.query.isEmpty()) {
        final Document resultPage = getData(this.query);
        return resultPage.select(".st");
    }
    throw new EmptyQueryException();
}

From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java

/**
 * Builds an entity from the rows of the first table under {@code .listmain} in the seed's HTML.
 *
 * <p>The entity is created from {@code PREFIX_ENTITY_PATH + syjTableBean.getTableClass()}.
 * Each accepted row's value is passed to the method named {@code PREFIX_ATTRIBUTE + rowNo},
 * then the content id (taken from the seed URL, 0 when absent) and the creation time are set.
 *
 * @param seed supplies the HTML to parse and the URL the id is extracted from
 * @return the populated entity
 * @throws Exception declared by the signature
 */
private Object analyzeLicenseDetail(HttpSeed seed) throws Exception {

    Document doc = parse(seed.getHtml());

    // All rows of the first table found under ".listmain".
    Elements rows = doc.select(".listmain table").get(0).select("tr");

    Object entity = AnalyzeUtil.getInstant(PREFIX_ENTITY_PATH + syjTableBean.getTableClass());

    // Counter used to build the name of the next method to invoke on the entity.
    int rowNo = 1;
    for (int idx = 0; idx < rows.size(); idx++) {
        Element row = rows.get(idx);

        // Every row except the last must have nowrap="true" on its first cell; the last
        // row is always accepted and its cells are not inspected here.
        boolean isLastRow = (idx == rows.size() - 1);
        if (!isLastRow) {
            Element firstCell = row.select("td").get(0);
            boolean nowrapIsTrue = firstCell.hasAttr("nowrap") && "true".equals(firstCell.attr("nowrap"));
            if (!nowrapIsTrue) {
                continue;
            }
        }

        String tdVal = parseDetailTr(row);

        // For TABLE74 nothing is written once the counter stands at 11.
        // NOTE(review): rowNo is not advanced here, so every later row is skipped as
        // well - confirm this is intended.
        if (syjTableBean.getTableClass().equals("TABLE74") && rowNo == 11) {
            continue;
        }

        String methodName = PREFIX_ATTRIBUTE + rowNo;
        rowNo++;
        AnalyzeUtil.executeMethod(entity, methodName, new Object[] { tdVal },
                new Class[] { String.class });
    }

    // Content id: whatever AnalyzeUtil.regex extracts from the URL, 0 when it yields null.
    String regex = ".+?&Id=(.+?)";
    Object idMatch = AnalyzeUtil.regex(seed.getUrl(), regex);
    long contentId = (null == idMatch) ? 0L : Long.valueOf(idMatch.toString());
    AnalyzeUtil.executeMethod(entity, "setContentId", new Object[] { contentId },
            new Class[] { Long.class });

    // Creation time: now.
    Timestamp createdAt = new Timestamp(new Date().getTime());
    AnalyzeUtil.executeMethod(entity, "setCreateTime", new Object[] { createdAt },
            new Class[] { Timestamp.class });

    return entity;
}

From source file:com.elevenpaths.googleindexretriever.GoogleSearch.java

/**
 * Returns the {@code src} of the page's image when the page contains exactly one
 * {@code img} element.
 *
 * @param doc page to inspect
 * @return the {@code src} attribute of the single image, or an empty string when the page
 *         has no image or more than one
 */
public String getImageCaptcha(Document doc) {
    // Query once and reuse the result; the original ran the same selector a second time.
    Elements images = doc.select("img");
    if (images.size() != 1) {
        return "";
    }
    return images.first().attr("src");
}

From source file:com.elevenpaths.googleindexretriever.GoogleSearch.java

/**
 * Fetches the data for the current query and returns the text of the {@code .st} match.
 *
 * @return the combined text of the elements selected by {@code .st} (at most one element)
 * @throws EmptyQueryException  when the query is empty
 * @throws ManyResultsException when more than one element matches {@code .st}
 */
public String getResults()
        throws EmptyQueryException, ManyResultsException, CaptchaException, UnsupportedEncodingException {
    if (this.query.isEmpty()) {
        throw new EmptyQueryException();
    }

    final Elements snippets = getData(this.query).select(".st");
    if (snippets.size() <= 1) {
        return snippets.text();
    }
    throw new ManyResultsException();
}

From source file:de.fzi.ALERT.actor.MessageObserver.NotificationObserver.JMSNotificationParser.java

/**
 * Dumps the first {@code ns1|eventData} element of the document as "tag : text" lines,
 * prints the result and hands it to a new {@code RssFeedGenerator}.
 *
 * <p>Any exception raised while doing so is caught and only its message is printed.
 *
 * @param doc document containing the event data
 */
private void parseIssueAlertIusse(Document doc) {
    RssFeedGenerator feedGenerator = new RssFeedGenerator();
    try {
        StringBuilder body = new StringBuilder("EventName: AlertIusse\n");
        // Instantiated but not otherwise used in this method.
        Message message = new Message();

        org.jsoup.nodes.Element eventData = doc.select("ns1|eventData").get(0);
        // getAllElements() is iterated as-is; each entry contributes one line.
        for (org.jsoup.nodes.Element node : eventData.getAllElements()) {
            body.append(node.tagName()).append(" : ").append(node.ownText()).append("\n");
        }

        String content = body.toString();
        System.out.println("content:" + content);
        feedGenerator.RssFeedXml("title", "Link", content);
        System.out.println("!!!");
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }

}

From source file:de.geeksfactory.opacclient.apis.WebOpacNet.java

/**
 * Builds the list of search fields from two server responses.
 *
 * <p>Text fields come from the {@code option} elements of {@code #drpOptSearchT} on the
 * mobile start page. Filter fields come from the {@code restrcontainers} array returned by
 * {@code GetRestrictions.ashx}: the container with query type {@code EJ} becomes a text
 * field, every other container becomes a dropdown starting with an "Alle" entry.
 *
 * @return the text fields followed by the filter fields
 * @throws IOException   declared for the HTTP requests
 * @throws JSONException when the restrictions response lacks an expected key
 */
@Override
public List<SearchField> getSearchFields() throws IOException, JSONException {
    final List<SearchField> result = new ArrayList<>();

    // Text fields
    final String startPage = httpGet(opac_url + "/de/mobile/default.aspx", getDefaultEncoding());
    for (Element opt : Jsoup.parse(startPage).select("#drpOptSearchT option")) {
        final TextSearchField textField = new TextSearchField();
        textField.setDisplayName(opt.text());
        textField.setId(opt.attr("value"));
        textField.setData(new JSONObject("{\"filter\":false}"));
        textField.setHint("");
        result.add(textField);
    }

    // Filters
    final String restrictionsJson = httpGet(opac_url + "/de/mobile/GetRestrictions.ashx", getDefaultEncoding());
    final JSONArray containers = new JSONObject(restrictionsJson).getJSONArray("restrcontainers");
    for (int c = 0; c < containers.length(); c++) {
        final JSONObject container = containers.getJSONObject(c);
        final String queryType = container.getString("querytyp");
        final String heading = container.getString("kopf");

        if ("EJ".equals(queryType)) {
            // Querying by year also works for years that are not listed,
            // so this filter is offered as a text field instead of a dropdown.
            final TextSearchField yearField = new TextSearchField();
            yearField.setDisplayName(heading);
            yearField.setId(queryType);
            yearField.setData(new JSONObject("{\"filter\":true}"));
            yearField.setHint("");
            result.add(yearField);
            continue;
        }

        final DropdownSearchField dropdown = new DropdownSearchField();
        dropdown.setId(queryType);
        dropdown.setDisplayName(heading);

        final JSONArray entries = container.getJSONArray("restrictions");
        dropdown.addDropdownValue("", "Alle");
        for (int e = 0; e < entries.length(); e++) {
            final JSONObject entry = entries.getJSONObject(e);
            dropdown.addDropdownValue(entry.getString("id"), entry.getString("bez"));
        }

        dropdown.setData(new JSONObject("{\"filter\":true}"));
        result.add(dropdown);
    }

    return result;
}

From source file:mobi.jenkinsci.alm.assembla.client.AssemblaClient.java

/**
 * Fills in and submits the login form found on the given page and returns the page the
 * server redirects to.
 *
 * <p>All {@code input} elements of {@code form[id=login-box]} are posted back with their
 * page values, except that the inputs with id {@code user_login} / {@code user_password}
 * receive the configured credentials and submit buttons other than {@code commit} are
 * left out. A successful login is recognised by a 302 response.
 *
 * @param pinDoc page that contains the login form
 * @return the document loaded from the redirect target
 * @throws IOException when the form is missing, the response is not a 302 redirect, the
 *         redirect carries no {@code Location} header, or the request itself fails
 */
private Document postLoginForm(final Document pinDoc) throws IOException {
    final Element form = pinDoc.select("form[id=login-box]").first();
    if (form == null) {
        // first() returns null when nothing matched; report it instead of failing with an NPE.
        throw new IOException("Form-based login to Assembla failed: login form not found");
    }

    final HttpPost formPost = new HttpPost(getUrl(form.attr("action")).toString());
    final List<NameValuePair> formNvps = new ArrayList<NameValuePair>();
    for (final Element element : form.select("input")) {
        final String fieldName = element.attr("name");
        final String fieldId = element.attr("id");
        final String fieldType = element.attr("type");
        final boolean isPasswordField = fieldId.equalsIgnoreCase("user_password");

        String fieldValue = element.attr("value");
        if (fieldId.equalsIgnoreCase("user_login")) {
            fieldValue = username;
        } else if (isPasswordField) {
            fieldValue = password;
        }

        // Of the submit buttons only "commit" is sent.
        if (fieldType.equals("submit") && !fieldName.equalsIgnoreCase("commit")) {
            continue;
        }

        // Never write the password to the log, not even at debug level.
        final boolean secret = isPasswordField || fieldType.equalsIgnoreCase("password");
        LOG.debug(String.format("Processing form field: name='%s' value='%s' id='%s'", fieldName,
                secret ? "********" : fieldValue, fieldId));
        formNvps.add(new BasicNameValuePair(fieldName, fieldValue));
    }

    try {
        formPost.setEntity(new UrlEncodedFormEntity(formNvps, "UTF-8"));
    } catch (final UnsupportedEncodingException e) {
        // This would never happen; keep the cause in case it ever does.
        throw new IllegalArgumentException("UTF-8 not recognised", e);
    }

    LOG.debug("Login via posting form-data to " + formPost.getURI());
    try {
        final HttpResponse response = sendHttpPost(formPost);
        if (response.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_MOVED_TEMP) {
            throw new IOException("Form-based login to Assembla failed: " + response.getStatusLine());
        }
        if (response.getFirstHeader("Location") == null) {
            throw new IOException("Form-based login to Assembla failed: redirect without Location header");
        }
        return Jsoup.parse(getData(response.getFirstHeader("Location").getValue(), false));
    } finally {
        formPost.releaseConnection();
    }
}