Example usage for org.jsoup.nodes Element select

List of usage examples for org.jsoup.nodes Element select

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:net.parser.JobParser.java

/**
 * Reads the entries matched by ".job .details" inside the first ".searchlist" element of
 * {@code doc} and appends one {@code Job} (with its {@code Employer}) per entry to {@code jobs}.
 *
 * <p>For both the job link and the employer link, the numeric id is taken from the text that
 * follows the fourth "/" of the href, up to the next "/".
 *
 * @param jobs the list that receives the parsed jobs
 * @return the same list instance that was passed in
 */
public List<Job> getJobIdAndLink(List<Job> jobs) {
    final Elements detailBlocks = doc.select(".searchlist").eq(0).select(".job .details");

    for (Element detailBlock : detailBlocks) {
        final Elements anchors = detailBlock.select("a");
        final Job currentJob = new Job();
        final Employer currentEmployer = new Employer();

        // First anchor is the job link; the first "www" in the href is replaced by "m".
        final String jobHref = anchors.get(0).attr("href").replaceFirst("www", "m");
        currentJob.setLink(jobHref);

        // Drop everything up to and including the fourth "/".
        String jobTail = jobHref;
        for (int skipped = 0; skipped < 4; skipped++) {
            jobTail = jobTail.substring(jobTail.indexOf("/") + 1);
        }
        final String jobId = jobTail.substring(0, jobTail.indexOf("/"));
        currentJob.setId(Integer.parseInt(jobId));

        try {
            // Second anchor, when present, is the employer link.
            final String employerHref = anchors.get(1).attr("href");
            currentEmployer.setLink(employerHref);

            String employerTail = employerHref;
            for (int skipped = 0; skipped < 4; skipped++) {
                employerTail = employerTail.substring(employerTail.indexOf("/") + 1);
            }
            final String employerId = employerTail.substring(0, employerTail.indexOf("/"));
            currentEmployer.setId(Integer.parseInt(employerId));
        } catch (IndexOutOfBoundsException e) {
            // Raised when there is no second anchor, and also (as StringIndexOutOfBoundsException)
            // when the href has no "/" left after stripping; both cases fall back to id 0.
            currentEmployer.setId(0);
        }

        currentEmployer.setName(getEmployerName(detailBlock, currentEmployer.getId()));

        currentJob.setEmployer(currentEmployer);
        jobs.add(currentJob);
    }

    return jobs;
}

From source file:gov.medicaid.screening.dao.impl.BusinessLienDAOBean.java

/**
 * Builds a provider profile from a business details page.
 *
 * <p>Reads the business name, the labelled values of the filing summary, a single registered
 * agent entry and one filing-history entry per row of the "#filing" table.
 *
 * @param page the details page
 * @return the profile populated with the business data read from the page
 * @throws ParsingException declared by the signature; presumably raised by the parsing helpers
 *         when expected tags are missing (not visible in this method)
 */
private ProviderProfile parseProfile(Document page) throws ParsingException {
    ProviderProfile result = new ProviderProfile();

    // Business header
    String headerName = page.select("#searchItemDetail #recordReview h3").text();
    Business biz = new Business();
    result.setBusiness(biz);
    biz.setName(headerName);

    // All labelled values below come from the filing summary definition lists.
    Elements summary = page.select("#searchItemDetail #RecordDetailMaster #filingSummary dl");

    BusinessType type = new BusinessType();
    biz.setType(type);
    type.setName(getValuePairOfLabel(summary, "Business Type"));

    biz.setStatute(getValuePairOfLabel(summary, "MN Statute"));
    biz.setFileNumber(getValuePairOfLabel(summary, "File Number"));
    biz.setHomeJurisdiction(getValuePairOfLabel(summary, "Home Jurisdiction"));

    // Dates are only stored when parseDate returned a value.
    Date filed = parseDate(getValuePairOfLabel(summary, "Filing Date"), DATE_FORMAT);
    if (filed != null) {
        biz.setFilingDate(filed);
    }

    BusinessStatus currentStatus = new BusinessStatus();
    currentStatus.setName(getValuePairOfLabel(summary, "Status"));
    biz.setStatus(currentStatus);

    // NOTE(review): this is the only label passed with a trailing colon - confirm it is intended.
    Date renewalDue = parseDate(getValuePairOfLabel(summary, "Renewal Due Date:"), DATE_FORMAT);
    if (renewalDue != null) {
        biz.setRenewalDueDate(renewalDue);
    }

    biz.setRegisteredOfficeAddress(
            parseAddress(getValuePairOfLabel(summary, "Registered Office Address")));

    // Registered agents: a single profile carrying the raw label value as agency.
    ProviderProfile registeredAgent = new ProviderProfile();
    List<ProviderProfile> registeredAgents = new ArrayList<ProviderProfile>();
    biz.setRegisteredAgents(registeredAgents);
    registeredAgents.add(registeredAgent);
    registeredAgent.setAgency(getValuePairOfLabel(summary, "Registered Agent(s)"));

    // Filing history: one entry per table row.
    List<FilingHistory> history = new ArrayList<FilingHistory>();
    biz.setFilingHistory(history);

    Elements historyRows = page.select("#filing table tr");
    for (Element historyRow : historyRows) {
        FilingHistory entry = new FilingHistory();
        Date entryDate = parseDate(historyRow.select("td.date").text(), DATE_FORMAT);
        if (entryDate != null) {
            entry.setDate(entryDate);
        }
        entry.setDescription(historyRow.select("td.action").text());
        history.add(entry);
    }

    return result;
}

From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java

/**
 * ?/*from  w w  w .j  a  v  a  2s . c  om*/
 * 
 * @throws Exception
 */
private int getTotalPageNum(Document document) throws Exception {
    // ?table?
    Element pageTable = document.select("table").get(4);

    String pageHtml = pageTable.select("tr td").get(0).html();

    // ??
    String regex = ".+?(.+?).+?";

    Object result = AnalyzeUtil.regex(pageHtml, regex);
    if (null == result) {
        return 0;
    }

    int totalPageNum = Integer.parseInt(result.toString());

    return totalPageNum;
}

From source file:net.acesinc.convergentui.ConvergentUIResponseFilter.java

/**
 * Composes the downstream response and writes it out.
 *
 * <p>Every {@code div[data-loc]} element of the downstream body is filled with content fetched
 * through {@code contentManager} from the URL in its {@code data-loc} attribute. The optional
 * data attributes {@code fragment-name}, {@code cache-name}, {@code disable-caching} and
 * {@code fail-quietly} control fragment selection, caching and error rendering. When nothing is
 * replaceable the original body is written unchanged.
 *
 * @return always {@code null}
 */
@Override
public Object run() {

    String origBody = contentManager.getDownstreamResponse();
    if (origBody == null || origBody.isEmpty()) {
        return null;
    }

    String composedBody = null;
    log.trace("Response from downstream server: " + origBody);

    Document doc = Jsoup.parse(origBody);
    if (hasReplaceableElements(doc)) {
        log.debug("We have replaceable elements. Let's get em!");
        Elements elementsToUpdate = doc.select("div[data-loc]");
        for (Element e : elementsToUpdate) {
            StringBuilder content = new StringBuilder();
            String location = e.dataset().get("loc");
            String fragmentName = e.dataset().get("fragment-name");
            String cacheName = e.dataset().get("cache-name");
            // parseBoolean(null) is false, so caching is on and errors are shown by default.
            boolean useCaching = !Boolean.parseBoolean(e.dataset().get("disable-caching"));
            boolean failQuietly = Boolean.parseBoolean(e.dataset().get("fail-quietly"));
            try {
                URL url = new URL(location);
                String protocol = url.getProtocol();
                String service = url.getHost();

                log.debug("Fetching content at location [ " + location + " ] with cacheName = [ " + cacheName
                        + " ]");

                try {
                    RequestContext context = RequestContext.getCurrentContext();
                    ContentResponse response = contentManager.getContentFromService(location, cacheName,
                            useCaching, context);

                    log.trace(response.toString());

                    if (!response.isError()) {
                        Object resp = response.getContent();
                        if (String.class.isAssignableFrom(resp.getClass())) {
                            String subContentResponse = (String) resp;
                            //TODO You better trust the source of your downstream HTML!
                            Document subDocument = Jsoup.parse(subContentResponse);

                            if (fragmentName != null) {
                                Elements fragments = subDocument
                                        .select("div[data-fragment-name=\"" + fragmentName + "\"]");

                                if (fragments != null && fragments.size() > 0) {
                                    if (fragments.size() == 1) {
                                        Element frag = fragments.first();

                                        // Root-relative image URLs (but not protocol-relative "//")
                                        // are rewritten to point back at the originating service.
                                        Elements images = frag.select("img");
                                        for (Element i : images) {
                                            String src = i.attr("src");
                                            if (src.startsWith("/") && !src.startsWith("//")) {
                                                i.attr("src", "/cui-req://" + protocol + "://" + service + src);
                                            } //else what do we do about relative urls?
                                        }

                                        content.append(frag.toString());

                                    } else {
                                        // Several matches: concatenate them without image rewriting.
                                        for (Element frag : fragments) {
                                            content.append(frag.toString()).append("\n\n");
                                        }
                                    }
                                } else {
                                    log.debug("Found no matching fragments for [ " + fragmentName + " ]");
                                    if (failQuietly) {
                                        content.append("<div class='cui-error'></div>");
                                    } else {
                                        content.append(
                                                "<span class='cui-error'>Failed getting content from remote service. Possible reason in reponse below</span>");
                                        content.append(subDocument.toString());
                                    }
                                }
                            } else {
                                //take the whole thing and cram it in there!
                                content.append(subDocument.toString());
                            }
                        } else {
                            //not text...
                            if (!failQuietly) {
                                content.append(
                                        "<span class='cui-error'>Failed getting content from remote service. Reason: content was not text</span>");
                            } else {
                                content.append("<div class='cui-error'></div>");
                            }
                        }

                    } else {
                        if (!failQuietly) {
                            content.append(
                                    "<span class='cui-error'>Failed getting content from remote service. Reason: "
                                            + response.getMessage() + "</span>");
                        } else {
                            content.append("<div class='cui-error'></div>");
                        }
                    }

                    //now append it to the page
                    if (!content.toString().isEmpty()) {
                        e.html(content.toString());
                    }
                } catch (Throwable t) {
                    // Best effort: a failure for one placeholder must not break the whole page.
                    if (!failQuietly) {
                        e.html("<span class='cui-error'>Failed getting content from remote service. Reason: "
                                + t.getMessage() + "</span>");
                    }
                    log.warn("Failed replacing content", t);
                }
            } catch (MalformedURLException ex) {
                log.warn("location was invalid: [ " + location + " ]", ex);
                if (!failQuietly) {
                    content.append(
                            "<span class='cui-error'>Failed getting content from remote service. Reason: data-loc was an invalid location.</span>");
                } else {
                    content.append("<div class='cui-error'></div>");
                }
                // Write the error markup into the placeholder; previously it was built but
                // never applied, so an invalid data-loc left the element silently untouched.
                e.html(content.toString());
            }

        }

        composedBody = doc.toString();
    } else {
        log.debug("Document has no replaeable elements. Skipping");
    }

    try {
        addResponseHeaders();
        if (composedBody != null && !composedBody.isEmpty()) {
            writeResponse(composedBody, getMimeType(RequestContext.getCurrentContext()));
        } else {
            writeResponse(origBody, getMimeType(RequestContext.getCurrentContext()));
        }
    } catch (Exception ex) {
        log.error("Error sending response", ex);

    }
    return null;
}

From source file:com.amastigote.xdu.query.module.EduSystem.java

/**
 * Fetches the grade page and converts it to JSON.
 *
 * <p>The result maps each semester title (text of a {@code b} element) to an object that maps
 * the course key (third cell of each 7-cell group) to its id, credit, attribute and grade.
 *
 * @return the grades grouped by semester, or {@code null} when {@code checkIsLogin(ID)} fails
 * @throws IOException if the page cannot be fetched or read
 * @throws JSONException if a value cannot be stored in the JSON result
 */
private @Nullable JSONObject gradesQuery() throws IOException, JSONException {
    if (!checkIsLogin(ID)) {
        return null;
    }

    URL url = new URL(SYS_HOST + GRADE_QUERY_SUFFIX);
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    Document document;
    try {
        connection.setRequestProperty("Cookie", "JSESSIONID=" + SYS_JSESSIONID);
        connection.connect();

        // Close the stream explicitly so the connection's resources are released whether or
        // not the parser closes it itself.
        java.io.InputStream body = connection.getInputStream();
        try {
            document = Jsoup.parse(body, "gb2312", connection.getURL().toString());
        } finally {
            body.close();
        }
    } finally {
        connection.disconnect();
    }
    // Re-parse with all "&nbsp;" entities stripped from the serialized document.
    document = Jsoup.parse(document.toString().replaceAll("&nbsp;", ""));

    JSONObject grades = new JSONObject();
    Elements semesterTables = document.select("td[class=pageAlign]");
    Elements semesterTitles = document.select("b");
    for (int i = 0; i < semesterTitles.size(); i++) {
        JSONObject semester = new JSONObject();
        String semesterKey = semesterTitles.get(i).text().trim();

        // The i-th title is paired with the i-th "pageAlign" cell.
        Element semesterTable = semesterTables.get(i);
        Elements cells = semesterTable.select("td[align=center]");

        // Cells are consumed in groups of seven; a trailing partial group is ignored.
        for (int j = 0; j < cells.size() / 7; j++) {
            int base = j * 7;
            JSONObject course = new JSONObject();
            String courseKey = cells.get(base + 2).text().trim();

            course.put(GradeKey.ID, cells.get(base).text().trim());
            course.put(GradeKey.CREDIT, cells.get(base + 4).text().trim());
            course.put(GradeKey.ATTR, cells.get(base + 5).text().trim());
            course.put(GradeKey.GRADE, cells.get(base + 6).text().trim());
            semester.put(courseKey, course);
        }
        grades.put(semesterKey, semester);
    }
    return grades;
}

From source file:me.vertretungsplan.parser.SVPlanParser.java

/**
 * Builds a substitution schedule from the given documents.
 *
 * <p>Per document one of three layouts is handled: one day per ".svp" element; days separated
 * by ".Trennlinie" elements among the body's children; or the whole document as a single day.
 *
 * @param docs the documents to parse
 * @return the schedule, with classes and teachers set from getAllClasses()/getAllTeachers()
 */
@NotNull
SubstitutionSchedule parseSVPlanSchedule(List<Document> docs) throws IOException, JSONException {
    SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);

    for (Document doc : docs) {
        Elements svpBlocks = doc.select(".svp");
        if (!svpBlocks.isEmpty()) {
            for (Element svpBlock : svpBlocks) {
                parseSvPlanDay(schedule, svpBlock, doc);
            }
            continue;
        }

        if (doc.select(".Trennlinie").isEmpty()) {
            // No known structure: treat the whole document as one day.
            parseSvPlanDay(schedule, doc, doc);
            continue;
        }

        // Collect body nodes into a synthetic div until a separator closes the current day.
        Element dayContainer = new Element(Tag.valueOf("div"), "");
        for (Node child : doc.body().childNodesCopy()) {
            boolean isSeparator = child instanceof Element && ((Element) child).hasClass("Trennlinie");
            // A separator only ends a day once the container already holds a table.
            if (isSeparator && !dayContainer.select("table").isEmpty()) {
                parseSvPlanDay(schedule, dayContainer, doc);
                dayContainer = new Element(Tag.valueOf("div"), "");
            } else {
                dayContainer.appendChild(child);
            }
        }
        // Whatever was collected after the last separator forms the final day.
        parseSvPlanDay(schedule, dayContainer, doc);
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}

From source file:com.johan.vertretungsplan.parser.SVPlanParser.java

/**
 * Logs in, loads every configured URL and converts each loaded document into one day of the
 * substitution plan.
 *
 * <p>The school data must provide a "urls" array (objects with a "url" entry) and an "encoding"
 * string. Days are keyed by their date text, so a later document with the same date replaces an
 * earlier one.
 *
 * @return the plan containing one entry per distinct date, in load order
 * @throws IOException if a document contains no ".svp-tabelle" element, or loading fails
 * @throws JSONException if the school data lacks the expected entries
 */
public Vertretungsplan getVertretungsplan() throws IOException, JSONException {
    new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); //

    JSONArray urls = schule.getData().getJSONArray("urls");
    String encoding = schule.getData().getString("encoding");
    List<Document> docs = new ArrayList<Document>();

    for (int i = 0; i < urls.length(); i++) {
        JSONObject url = urls.getJSONObject(i);
        loadUrl(url.getString("url"), encoding, docs);
    }

    // The title prefix had lost its umlaut ("fr" instead of "f\u00fcr") and could never match.
    final String titlePrefix = "Vertretungsplan f\u00fcr ";

    LinkedHashMap<String, VertretungsplanTag> tage = new LinkedHashMap<String, VertretungsplanTag>();
    for (Document doc : docs) {
        if (doc.select(".svp-tabelle").size() == 0) {
            throw new IOException("keine SVPlan-Tabelle gefunden");
        }

        VertretungsplanTag tag = new VertretungsplanTag();

        // Date: prefer the explicit date element, fall back to the page title.
        String date = "Unbekanntes Datum";
        Elements dateElements = doc.select(".svp-plandatum-heute, .svp-plandatum-morgen");
        if (dateElements.size() > 0) {
            date = dateElements.text();
        } else if (doc.title().startsWith(titlePrefix)) {
            date = doc.title().substring(titlePrefix.length());
        }
        tag.setDatum(date);

        Elements uploadDate = doc.select(".svp-uploaddatum");
        if (uploadDate.size() > 0) {
            tag.setStand(uploadDate.text().replace("Aktualisierung: ", ""));
        }

        Elements rows = doc.select(".svp-tabelle tr");
        // Rows without a lesson cell inherit the lesson of the previous row that had one.
        String lastLesson = "";
        for (Element row : rows) {
            if (row.hasClass("svp-header")) {
                continue;
            }

            Vertretung vertretung = new Vertretung();
            List<String> affectedClasses = new ArrayList<String>();

            for (Element column : row.select("td")) {
                String text = column.text();
                if (!hasData(text)) {
                    continue;
                }
                // The cell's CSS class name decides which field the text belongs to.
                String type = column.className();
                if (type.startsWith("svp-stunde")) {
                    vertretung.setLesson(text);
                    lastLesson = text;
                } else if (type.startsWith("svp-klasse")) {
                    affectedClasses = Arrays.asList(text.split(", "));
                } else if (type.startsWith("svp-esfehlt")) {
                    vertretung.setPreviousTeacher(text);
                } else if (type.startsWith("svp-esvertritt")) {
                    vertretung.setTeacher(text);
                } else if (type.startsWith("svp-fach")) {
                    vertretung.setSubject(text);
                } else if (type.startsWith("svp-bemerkung")) {
                    vertretung.setDesc(text);
                    vertretung.setType(recognizeType(text));
                } else if (type.startsWith("svp-raum")) {
                    vertretung.setRoom(text);
                }

                if (vertretung.getLesson() == null) {
                    vertretung.setLesson(lastLesson);
                }
            }

            if (vertretung.getType() == null) {
                vertretung.setType("Vertretung");
            }

            // The same entry is registered under every affected class.
            for (String klasse : affectedClasses) {
                KlassenVertretungsplan kv = tag.getKlassen().get(klasse);
                if (kv == null) {
                    kv = new KlassenVertretungsplan(klasse);
                }
                kv.add(vertretung);
                tag.getKlassen().put(klasse, kv);
            }
        }

        // Messages: consecutive <p> siblings after the "Mitteilungen" heading, split on
        // double line breaks.
        List<String> nachrichten = new ArrayList<String>();
        Element h2 = doc.select("h2:contains(Mitteilungen)").first();
        if (h2 != null) {
            Element sibling = h2.nextElementSibling();
            while (sibling != null && sibling.tagName().equals("p")) {
                for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText()
                        .split("<br />\\s*<br />")) {
                    if (hasData(nachricht)) {
                        nachrichten.add(nachricht);
                    }
                }
                sibling = sibling.nextElementSibling();
            }
        }
        tag.setNachrichten(nachrichten);

        tage.put(date, tag);
    }
    Vertretungsplan v = new Vertretungsplan();
    v.setTage(new ArrayList<VertretungsplanTag>(tage.values()));

    return v;
}

From source file:de.geeksfactory.opacclient.apis.IOpac.java

/**
 * Appends one {@code ReservedItem} per data row of the reservations table to {@code media}.
 *
 * <p>The table is the one containing "Titel" that follows the {@code a[name=RES]} anchor. Its
 * first row is skipped as the header. Nothing is added when the anchor or the table is missing,
 * or when the table has no data rows. Rows with fewer than five cells are skipped.
 *
 * @param media the list that receives the parsed items; existing entries are left untouched
 * @param doc   the account page; its base URI is set from {@code data}'s "baseurl" entry
 * @param data  configuration object providing "baseurl"
 */
static void parseResList(List<ReservedItem> media, Document doc, JSONObject data) {
    if (doc.select("a[name=RES]").size() == 0) {
        return;
    }
    // first() is null when no matching table follows the anchor; previously this was an NPE.
    Element table = doc.select("a[name=RES] ~ table:contains(Titel)").first();
    if (table == null) {
        return;
    }
    Elements copytrs = table.select("tr");
    doc.setBaseUri(data.optString("baseurl"));
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    int trs = copytrs.size();
    if (trs < 2) {
        return;
    }
    for (int i = 1; i < trs; i++) {
        Element tr = copytrs.get(i);
        // The fifth cell is read below; skip rows that do not have one.
        if (tr.children().size() < 5) {
            continue;
        }
        ReservedItem item = new ReservedItem();

        item.setTitle(tr.child(0).text().trim().replace("\u00a0", ""));
        item.setAuthor(tr.child(1).text().trim().replace("\u00a0", ""));

        // The fifth cell holds either a date (dd.MM.yyyy) or a free-text status.
        String readyOrStatus = tr.child(4).text().trim().replace("\u00a0", "");
        try {
            item.setReadyDate(fmt.parseLocalDate(readyOrStatus));
        } catch (IllegalArgumentException e) {
            item.setStatus(readyOrStatus);
        }

        Elements links = tr.select("a");
        if (links.size() > 0) {
            item.setCancelData(links.last().attr("href"));
        }

        media.add(item);
    }
    // The former closing assertion (media.size() == trs - 1) was dropped: it only held when the
    // caller passed an empty list.
}

From source file:eu.masconsult.bgbanking.banks.sgexpress.SGExpressClient.java

/**
 * Loads the account list page and converts the rows of its first result table into bank
 * accounts.
 *
 * <p>Rows containing a {@code th} are treated as section headers: the text of their first child
 * becomes the type passed along with every following data row.
 *
 * @param authTokenString the auth token in its JSON form
 * @return the accounts for which {@code obtainBankAccountFromHtmlTableRow} returned a value
 * @throws ParseException if "#main", the result table, or its rows cannot be found
 */
@Override
public List<RawBankAccount> getBankAccounts(String authTokenString)
        throws IOException, ParseException, AuthenticationException {
    AuthToken token = AuthToken.fromJson(authTokenString);

    String pageSource = loadPageWithAuth(getHttpClient(), token, LIST_ACCOUNTS_XML_ID);

    Document page = Jsoup.parse(pageSource, BASE_URL);

    Element main = page.getElementById("main");
    if (null == main) {
        throw new ParseException("getBankAccounts: can't find #main");
    }

    Elements resultTables = main.select("section.result table.data");
    if (resultTables == null || resultTables.isEmpty()) {
        throw new ParseException("getBankAccounts: can't find table section.result table.data");
    }

    Elements tableRows = resultTables.first().getElementsByTag("tr");
    if (tableRows == null || tableRows.isEmpty()) {
        throw new ParseException("getBankAccounts: first table is empty");
    }

    ArrayList<RawBankAccount> accounts = new ArrayList<RawBankAccount>(tableRows.size());

    // Type of the current section; "undef" until the first header row is seen.
    String currentType = "undef";
    for (Element tableRow : tableRows) {
        boolean isHeaderRow = !tableRow.getElementsByTag("th").isEmpty();
        if (!isHeaderRow) {
            RawBankAccount account = obtainBankAccountFromHtmlTableRow(currentType, tableRow);
            if (account != null) {
                accounts.add(account);
            }
            continue;
        }
        currentType = tableRow.child(0).text();
    }

    return accounts;
}

From source file:de.geeksfactory.opacclient.apis.Littera.java

/**
 * Loads the detail view for one item and converts it into a {@code DetailledItem} with a single
 * {@code Copy} describing its availability.
 *
 * <p>Each two-cell row of the "table.titel" detail table becomes a {@code Detail}. A row whose
 * first cell is blank is appended as a continuation line to the previous detail. Rows with
 * fewer than two cells are skipped.
 *
 * @param id         the item id, appended to the detail URL as-is
 * @param homebranch not read by this method
 * @return the parsed item
 * @throws IOException declared by the signature; presumably raised by {@code httpGet}
 * @throws OpacErrorException declared by the signature; not raised directly in this method
 */
@Override
public DetailledItem getResultById(String id, String homebranch) throws IOException, OpacErrorException {
    if (!initialised) {
        start();
    }
    final String html = httpGet(getApiUrl() + "&view=detail&id=" + id, getDefaultEncoding());
    final Document doc = Jsoup.parse(html);
    final Element detailData = doc.select(".detailData").first();
    final Element detailTable = detailData.select("table.titel").first();
    final Element availabilityTable = doc.select(".bibliothek table").first();

    final DetailledItem result = new DetailledItem();
    final Copy copy = new Copy();
    result.addCopy(copy);
    result.setId(id);
    result.setCover(getCover(doc));
    result.setTitle(detailData.select("h3").first().text());
    result.setMediaType(MEDIA_TYPES.get(getCellContent(detailTable, "Medienart|Type of media")));
    // The German label had lost its umlaut ("Verfgbar"); restored via a unicode escape.
    copy.setStatus(getCellContent(availabilityTable, "Verf\u00fcgbar|Available"));
    copy.setReturnDate(parseCopyReturn(getCellContent(availabilityTable, "Exemplare verliehen|Copies lent")));
    copy.setReservations(getCellContent(availabilityTable, "Reservierungen|Reservations"));
    for (final Element tr : detailTable.select("tr")) {
        // Both child(0) and child(1) are read below; skip rows that lack a second cell.
        if (tr.children().size() < 2) {
            continue;
        }
        final String desc = tr.child(0).text();
        final String content = tr.child(1).text();
        if (!desc.trim().isEmpty()) {
            result.addDetail(new Detail(desc, content));
        } else if (!result.getDetails().isEmpty()) {
            // Blank label: continuation of the previous detail, joined on a new line.
            final Detail lastDetail = result.getDetails().get(result.getDetails().size() - 1);
            lastDetail.setHtml(true);
            lastDetail.setContent(lastDetail.getContent() + "\n" + content);
        }
    }
    return result;
}