Example usage of org.jsoup.nodes.Element.text()

List of usage examples for org.jsoup.nodes.Element.text()

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.text().

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:org.aliuge.crawler.fetcher.FetchWorker.java

/**
 * Fetches one URL, parses the page and queues the page and its outgoing links.
 *
 * <p>Outcome reporting, as implemented below:
 * <ul>
 *   <li>a {@code null} URL or a blank URL string is skipped without any callback;</li>
 *   <li>status {@code CustomFetchStatus.PageTooBig} calls {@code onIgnored(url)};</li>
 *   <li>any other non-200 status, a failed {@code fetchContent}/{@code parse}, or any
 *       exception calls {@code onFailed(url)};</li>
 *   <li>when {@code extractFilterAndChangeUrl} returns a non-blank URL, the page is queued in
 *       {@code pendingPages} under that URL and no links are followed;</li>
 *   <li>otherwise each anchor whose absolute {@code href} passes {@code fetchFilter} or
 *       {@code extractFilter} and is unknown to the bloom filter is queued in
 *       {@code pendingUrls} with depth {@code url.getDepth() + 1}.</li>
 * </ul>
 *
 * @param url the URL to fetch
 */
public void fetchPage(WebURL url) {

    PageFetchResult result = null;
    try {
        if (null != url && StringUtils.isNotBlank(url.getUrl())) {

            result = fetcher.fetch(url, true);
            // Dispatch on the fetch status
            int statusCode = result.getStatusCode();
            if (statusCode == CustomFetchStatus.PageTooBig) {
                onIgnored(url);
                return;
            }
            if (statusCode != HttpStatus.SC_OK) {
                onFailed(url);
            } else {
                Page page = new Page(url);
                onSuccessed();
                if (!result.fetchContent(page)) {
                    onFailed(url);
                    return;
                }
                if (!parser.parse(page, url.getUrl())) {
                    onFailed(url);
                    return;
                }
                // A non-blank result means the page itself is queued for extraction
                String e_url = extractFilterAndChangeUrl(url.getUrl());
                if (StringUtils.isNoneBlank(e_url)) {
                    url.setUrl(e_url);
                    page.setWebURL(url);
                    pendingPages.addElement(page);
                    return;
                }

                // Depth limit; -1 disables the limit
                if (url.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) {
                    return;
                }
                // Collect outgoing links, resolved against the page's base URL
                Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                        urlUtils.getBaseUrl(page.getWebURL().getUrl()));
                Elements links = doc.getElementsByTag("a");
                if (!links.isEmpty()) {
                    for (Element link : links) {
                        String linkHref = link.absUrl("href");

                        // Keep links accepted by either filter and not seen before
                        if ((fetchFilter(linkHref) || extractFilter(linkHref))
                                && !bloomfilterHelper.exist(linkHref)) {
                            WebURL purl = new WebURL();
                            purl.setName(link.text());
                            purl.setUrl(linkHref);

                            purl.setDepth((short) (url.getDepth() + 1));
                            // All children share the same depth, so one child over the limit
                            // means every remaining one is too.
                            if (purl.getDepth() > config.getMaxDepthOfCrawling()
                                    && config.getMaxDepthOfCrawling() != -1)
                                return;
                            try {
                                if (!pendingUrls.addElement(purl)) {
                                    // NOTE(review): this records the parent URL, not purl - confirm intent
                                    FileUtils.writeStringToFile(new File("status/_urls.good"),
                                            url.getUrl() + "\n", true);
                                }
                            } catch (QueueException e) {
                                // Keep the stack trace; the message alone is often empty
                                log.error(e.getMessage(), e);
                            }
                        }
                    }
                }
            }

        }
    } catch (QueueException e) {
        log.error("Queue error while processing " + (url == null ? null : url.getUrl()), e);
        onFailed(url);
    } catch (Exception e) {
        // Route through the logger instead of printStackTrace so failures reach the log files
        log.error("Failed to fetch " + (url == null ? null : url.getUrl()), e);
        onFailed(url);
    } finally {
        if (null != result)
            result.discardContentIfNotConsumed();
    }
}

From source file:de.geeksfactory.opacclient.apis.Littera.java

/**
 * Adds one search field per {@code <option>} of the {@code select#adv_search_crit_0}
 * dropdown found on the page fetched from {@code getApiUrl() + "&mode=a"}.
 *
 * <p>Options whose value is listed in {@code SEARCH_FIELDS_FOR_DROPDOWN} become dropdown
 * fields populated via {@code addDropdownValuesForField}; all others become text fields
 * with an empty hint. Each field stores its own id under the {@code "meaning"} data key.
 *
 * @param fields list the created fields are appended to
 * @throws IOException   declared by the method; raised by callees such as {@code httpGet}
 * @throws JSONException declared by the method; raised by callees such as the JSON {@code put}
 */
protected void addAdvancedSearchFields(List<SearchField> fields) throws IOException, JSONException {
    final String advancedSearchHtml = httpGet(getApiUrl() + "&mode=a", getDefaultEncoding());
    final Element criterionSelect = Jsoup.parse(advancedSearchHtml).select("select#adv_search_crit_0").first();
    for (final Element criterion : criterionSelect.select("option")) {
        final String key = criterion.val();
        final SearchField field;
        if (SEARCH_FIELDS_FOR_DROPDOWN.contains(key)) {
            final DropdownSearchField dropdown = new DropdownSearchField();
            addDropdownValuesForField(dropdown, key);
            field = dropdown;
        } else {
            final TextSearchField textField = new TextSearchField();
            textField.setHint("");
            field = textField;
        }
        field.setDisplayName(criterion.text());
        field.setId(key);
        field.setData(new JSONObject());
        field.getData().put("meaning", field.getId());
        fields.add(field);
    }
}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * ??:/*from   ww  w. j a  v  a  2  s.  c o m*/
 * 1. ???
 * 2. ???????
 * 3. ??
 * 4. ?? ??
 * 5. ?
 *
 * @return
 * @throws XpathSyntaxErrorException
 */
private String getAuthor() throws XpathSyntaxErrorException {
    String author = "";
    if (StringUtils.isBlank(srcTime)) {
        author = getAuthor(doc.body().html());
        return author;
    }
    Element cur = doc.body().select("*:containsOwn(" + srcTime + ")").first();
    if (cur == null) {
        LOG.warn("?srcTime=" + srcTime);
        author = getAuthor(doc.body().html());
        return author;
    }

    if (!noText(cur)) {
        String arr[] = cur.html().split(srcTime);
        for (String text : arr) {
            author = getShortText(text);
            if (!StringUtils.isBlank(author))
                return author;
        }
    }
    Element parent = cur.parent();
    while (parent != null && noText(parent)) {
        cur = parent;
        parent = parent.parent();
    }
    author = getAuthor(parent.html());
    if (!StringUtils.isBlank(author))
        return author;

    Element pre = cur.previousElementSibling();
    while (pre != null && noText(pre)) {
        pre = pre.previousElementSibling();
    }
    if (pre != null) {
        author = getShortText(pre.text());
    }
    if (!StringUtils.isBlank(author))
        return author;
    Element next = cur.nextElementSibling();
    while (next != null && noText(next)) {
        next = next.nextElementSibling();
    }
    if (next != null) {
        author = getShortText(next.text());
    }
    if (!StringUtils.isBlank(author))
        return author;

    author = getShortText(parent.html().replace(srcTime, " "));
    if (!StringUtils.isBlank(author))
        return author;

    author = getAuthor(doc.body().html());
    if (StringUtils.isBlank(author)) {
        return author_bak;
    }
    return author;
}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * Determines the page title.
 *
 * <p>Strategies, in order:
 * <ol>
 *   <li>If the document's {@code <title>} (after {@code getText}) is non-empty, every
 *       {@code h1}-{@code h6} element is scored by {@code strSim} against it, weighted by
 *       its 1-based position. A best match with similarity of at least 0.3 is returned.
 *       Otherwise the headings are ranked (preferring 7-26 characters in the
 *       U+4E00-U+9FA5 range, then the lower heading level) and the top one is returned.</li>
 *   <li>The first element whose id or class starts or ends with {@code title}, if its text
 *       is 6-39 characters long.</li>
 *   <li>{@code getTitleByEditDistance(contentElement)}.</li>
 * </ol>
 *
 * @param contentElement the main content element, passed to the edit-distance fallback
 * @return the detected title
 * @throws Exception with message {@code "title not found"} (and the original failure as
 *         cause) when the last fallback fails; other exceptions from helpers propagate
 */
protected String getTitle(final Element contentElement) throws Exception {
    final ArrayList<Element> titleList = new ArrayList<Element>();
    final ArrayList<Double> titleSim = new ArrayList<Double>();
    final String metaTitle = getText(doc.title().trim());
    if (!metaTitle.isEmpty()) {
        doc.body().traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int i) {
                if (node instanceof Element) {
                    Element tag = (Element) node;
                    String tagName = tag.tagName();
                    if (Pattern.matches("h[1-6]", tagName)) {
                        String title = tag.text().trim();
                        double sim = strSim(title, metaTitle);
                        titleSim.add(sim);
                        titleList.add(tag);
                    }
                }
            }

            @Override
            public void tail(Node node, int i) {
            }
        });

        // Position-weighted best match: the (i + 1) factor favours later headings.
        double maxScore = 0;
        int maxIndex = -1;
        for (int i = 0; i < titleSim.size(); i++) {
            double score = (i + 1) * titleSim.get(i);
            if (score > maxScore) {
                maxScore = score;
                maxIndex = i;
            }
        }

        if (maxIndex == -1 || titleSim.get(maxIndex) < 0.3) {
            String title = getText(metaTitle);
            // NOTE(review): endsWith("") is always true, so this return can never fire. The
            // literal looks like a character lost to a bad encoding - restore it from the
            // upstream source.
            if (!title.endsWith("") && title.length() > 7) {
                return title;
            }
            // With no headings at all there is nothing to rank; fall through to the
            // remaining strategies instead of failing on get(0).
            if (!titleList.isEmpty()) {
                Collections.sort(titleList, new Comparator<Element>() {
                    @Override
                    public int compare(Element o1, Element o2) {
                        int han1 = o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length();
                        int han2 = o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length();
                        int len1 = (han1 > 26 || han1 < 7) ? 0 : 1;
                        int len2 = (han2 > 26 || han2 < 7) ? 0 : 1;
                        if (len1 == len2) {
                            // Same length class: the lower heading level (h1 before h2) wins
                            return o1.tagName().charAt(1) - o2.tagName().charAt(1);
                        }
                        return len2 - len1;
                    }
                });
                return getText(titleList.get(0).text());
            }
        } else {
            return titleList.get(maxIndex).text();
        }
    }

    // Fallback: elements whose id or class starts or ends with "title"
    Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
    if (titles.size() > 0) {
        String title = titles.first().text();
        if (title.length() > 5 && title.length() < 40) {
            return title;
        }
    }
    try {
        return getTitleByEditDistance(contentElement);
    } catch (Exception ex) {
        // Keep the root cause so callers can see why the last strategy failed
        throw new Exception("title not found", ex);
    }

}

From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java

/**
 * Creates a {@link ConfluenceLink} for every element and stores it in
 * {@code confluenceLinkMap} under the element's {@code href} attribute value.
 *
 * <p>The Confluence page title is built from the link text and a 1-based running counter
 * (prefixed by {@code numericPrefix} for {@code INDIVIDUAL} pages); the link markup depends
 * on the configured pagination mode.
 *
 * @param confluenceLinkMap target map, keyed by original href
 * @param elements          the link elements to convert
 * @param pageType          page type recorded on each link
 * @param numericPrefix     prefix passed to the title builder for {@code INDIVIDUAL} pages
 * @throws SwaggerConfluenceConfigurationException if the pagination mode is not handled
 */
private void addLinksByType(final Map<String, ConfluenceLink> confluenceLinkMap, final Elements elements,
        final PageType pageType, final Integer numericPrefix) {
    final SwaggerConfluenceConfig config = SWAGGER_CONFLUENCE_CONFIG.get();

    int position = 0;

    for (final Element anchor : elements) {
        position++;

        final String href = anchor.attr("href");
        final String label = anchor.text();

        final String pageTitle = pageType == INDIVIDUAL
                ? buildConfluenceTitle(label, numericPrefix, position)
                : buildConfluenceTitle(label, position, null);

        final String markup;
        switch (config.getPaginationMode()) {
        case SINGLE_PAGE:
            markup = formatSinglePageLink(label);
            break;

        case CATEGORY_PAGES:
            markup = formatCategoryPageLink(label, pageTitle, pageType);
            break;

        case INDIVIDUAL_PAGES:
            markup = formatIndividualPageLink(label, pageTitle);
            break;

        default:
            throw new SwaggerConfluenceConfigurationException("Unhandled Pagination Mode!");
        }

        final ConfluenceLink link = ConfluenceLinkBuilder.aConfluenceLink()
                .withPageType(pageType)
                .withOriginalHref(href)
                .withText(label)
                .withConfluenceLinkMarkup(markup)
                .build();

        LOG.debug("LINK MAP: {} -> {}", href, markup);

        confluenceLinkMap.put(href, link);
    }
}

From source file:net.kevxu.purdueassist.course.ScheduleDetail.java

/**
 * Builds a {@code ScheduleDetailEntry} for the current {@code term}/{@code crn} from the
 * class-detail page.
 *
 * <p>For every detail table the basic info ({@code ddlabel}), the seat rows and the
 * remaining detail HTML are passed to the corresponding setters. The seat table is removed
 * from the detail element before the remaining HTML is read.
 *
 * @param document the parsed detail page
 * @return the populated entry
 * @throws HtmlParseException      if an expected element is missing or has an unexpected size
 * @throws CourseNotFoundException if the page has no detail table but carries the
 *                                 "No detailed class information found" message
 * @throws ResultNotMatchException declared by the method; not thrown directly here
 */
private ScheduleDetailEntry parseDocument(Document document)
        throws HtmlParseException, CourseNotFoundException, ResultNotMatchException {
    ScheduleDetailEntry entry = new ScheduleDetailEntry(term, crn);
    Elements detailTables = document.getElementsByAttributeValue("summary",
            "This table is used to present the detailed class information.");

    if (detailTables.isEmpty()) {
        // No detail table: distinguish "course does not exist" from an unexpected layout
        Elements messages = document.getElementsByAttributeValue("summary",
                "This layout table holds message information");
        if (!messages.isEmpty() && messages.text().contains("No detailed class information found")) {
            throw new CourseNotFoundException(messages.text());
        }
        throw new HtmlParseException(
                "Course table not found, but page does not contain message stating no course found.");
    }

    for (Element detailTable : detailTables) {
        // basic info for the selected course
        Element basicInfo = detailTable.getElementsByClass("ddlabel").first();
        if (basicInfo == null) {
            throw new HtmlParseException("Basic info element empty.");
        }
        setBasicInfo(entry, basicInfo.text());

        // detailed course info
        Element detailedInfo = detailTable.getElementsByClass("dddefault").first();
        if (detailedInfo == null) {
            throw new HtmlParseException("Detailed info element empty.");
        }

        // seat info
        Elements seatTables = detailedInfo.getElementsByAttributeValue("summary",
                "This layout table is used to present the seating numbers.");
        if (seatTables.size() != 1) {
            throw new HtmlParseException(
                    "Seat detail elements size not 1. We have " + seatTables.size() + ".");
        }
        Elements seatRows = seatTables.first().getElementsByTag("tbody").first().children();
        int rowCount = seatRows.size();
        if (rowCount != 3 && rowCount != 4) {
            throw new HtmlParseException("Seat detail entry elements size not 3. We have "
                    + seatRows.size() + ".");
        }
        setSeats(entry, seatRows.get(1).text());
        setWaitlistSeats(entry, seatRows.get(2).text());
        if (rowCount == 4) {
            setCrosslistSeats(entry, seatRows.get(3).text());
        }

        // drop the seat table so it does not show up in the remaining info
        seatTables.remove();

        setRemainingInfo(entry, detailedInfo.html());
    }

    return entry;
}

From source file:de.geeksfactory.opacclient.apis.Open.java

/**
 * Converts a detail page into a {@code DetailledItem}: title, optional subtitle, cover,
 * id, description, further name/value details and the list of copies.
 *
 * <p>The copies table is optional: when the page has no
 * {@code table[id$=grdViewMediumCopies]}, or the table has no rows, the item is returned
 * without copies. Cells beyond the last header column are ignored.
 *
 * @param doc the parsed detail page
 * @return the populated item
 */
protected DetailledItem parse_result(Document doc) {
    DetailledItem item = new DetailledItem();

    // Title and Subtitle
    item.setTitle(doc.select("span[id$=LblShortDescriptionValue]").text());
    String subtitle = doc.select("span[id$=LblSubTitleValue]").text();
    if (!subtitle.equals("")) {
        item.addDetail(new Detail(stringProvider.getString(StringProvider.SUBTITLE), subtitle));
    }

    // Cover
    Elements mediumImage = doc.select("input[id$=mediumImage]");
    if (mediumImage.size() > 0) {
        item.setCover(mediumImage.attr("src"));
    } else {
        Elements coverImage = doc.select("img[id$=CoverView_Image]");
        if (coverImage.size() > 0) {
            item.setCover(getCoverUrl(coverImage.first()));
        }
    }

    // ID
    item.setId(doc.select("input[id$=regionmednr]").val());

    // Description
    Elements annotation = doc.select("span[id$=ucCatalogueContent_LblAnnotation]");
    if (annotation.size() > 0) {
        String name = doc.select("span[id$=lblCatalogueContent]").text();
        String value = annotation.text();
        item.addDetail(new Detail(name, value));
    }
    // Details: the selector guarantees at least two spans per row
    for (Element detail : doc.select("div[id$=CatalogueDetailView] .spacingBottomSmall:has(span+span)")) {
        Elements spans = detail.select("span");
        String name = spans.get(0).text().replace(": ", "");
        String value = spans.get(1).text();
        item.addDetail(new Detail(name, value));
    }

    // Copies
    Element table = doc.select("table[id$=grdViewMediumCopies]").first();
    if (table == null) {
        // Page without a copies table: nothing more to parse
        return item;
    }
    Elements trs = table.select("tr");
    if (trs.isEmpty()) {
        return item;
    }
    List<String> columnmap = new ArrayList<>();
    for (Element th : trs.first().select("th")) {
        columnmap.add(getCopyColumnKey(th.text()));
    }

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
    for (int i = 1; i < trs.size(); i++) {
        Elements tds = trs.get(i).select("td");
        Copy copy = new Copy();
        // Stop at the last known header so an extra cell cannot index past columnmap
        int columns = Math.min(tds.size(), columnmap.size());
        for (int j = 0; j < columns; j++) {
            String key = columnmap.get(j);
            if (key == null)
                continue;
            String text = tds.get(j).text().replace("\u00a0", "");
            if (text.equals(""))
                continue;
            copy.set(key, text, fmt);
        }
        item.addCopy(copy);
    }

    return item;
}

From source file:de.geeksfactory.opacclient.apis.Heidi.java

/**
 * Logs in and loads the account data: pending fees, lent items and reservations.
 *
 * <p>Lent items are read from the rows of {@code table.kontopos}; reservations come from
 * two further requests ({@code konto=v} and {@code konto=b}) parsed by
 * {@code parse_reservations}. A row whose date cell is too short to hold a
 * {@code dd.MM.yyyy} date, or cannot be parsed, is kept without a deadline.
 *
 * @param account the account to log in with
 * @return the collected account data
 * @throws IOException        declared by the method; raised by callees such as {@code httpGet}
 * @throws JSONException      declared by the method; not thrown directly here
 * @throws OpacErrorException declared by the method; not thrown directly here
 */
@Override
public AccountData account(Account account) throws IOException, JSONException, OpacErrorException {
    login(account);
    String html;
    Document doc;
    AccountData adata = new AccountData(account.getId());
    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    html = httpGet(opac_url + "/konto.cgi?sess=" + sessid, getDefaultEncoding());
    doc = Jsoup.parse(html);
    doc.setBaseUri(opac_url + "/");

    for (Element td : doc.select("table.konto td")) {
        if (td.text().contains("Offene")) {
            // Reduce the cell to "<amount> <currency>"
            String text = td.text().trim().replaceAll(
                    "Offene[^0-9]+Geb.+hren:[^0-9]+([0-9.," + "]+)[^0-9A-Z]*(|EUR|CHF|Fr.)", "$1 $2");
            adata.setPendingFees(text);
        }
    }

    List<LentItem> lent = new ArrayList<>();
    for (Element tr : doc.select("table.kontopos tr")) {
        LentItem item = new LentItem();
        Element desc = tr.child(1).select("label").first();
        String dates = tr.child(2).text().trim();
        if (tr.child(1).select("a").size() > 0) {
            String kk = getQueryParamsFirst(tr.child(1).select("a").first().absUrl("href")).get("katkey");
            item.setId(kk);
        }

        // The label's text nodes are, in order: author, title, then further lines such as
        // the media number. A row without a label simply carries no description.
        if (desc != null) {
            int i = 0;
            for (Node node : desc.childNodes()) {
                if (node instanceof TextNode) {
                    String text = ((TextNode) node).text().trim();
                    if (i == 0) {
                        item.setAuthor(text);
                    } else if (i == 1) {
                        item.setTitle(text);
                    } else if (text.contains("Mediennummer")) {
                        item.setBarcode(text.replace("Mediennummer: ", ""));
                    }
                    i++;
                }
            }
        }

        if (tr.child(0).select("input").size() == 1) {
            item.setProlongData(tr.child(0).select("input").first().val());
            item.setRenewable(true);
        } else {
            item.setProlongData("" + tr.child(0).select("span").first().attr("class"));
            item.setRenewable(false);
        }

        // A date range "from - to" uses the part after the first dash as the deadline
        String todate = dates;
        if (todate.contains("-")) {
            String[] datesplit = todate.split("-");
            // split() drops trailing empty parts, so "x-" yields a single element
            if (datesplit.length > 1) {
                todate = datesplit[1].trim();
            }
        }
        // Only the first 10 characters ("dd.MM.yyyy") are parsed; shorter cells hold no
        // date and must not abort the whole account load.
        if (todate.length() >= 10) {
            try {
                item.setDeadline(fmt.parseLocalDate(todate.substring(0, 10)));
            } catch (IllegalArgumentException e) {
                e.printStackTrace();
            }
        }

        lent.add(item);
    }
    adata.setLent(lent);

    List<ReservedItem> reservations = new ArrayList<>();
    html = httpGet(opac_url + "/konto.cgi?konto=v&sess=" + sessid, getDefaultEncoding());
    reservations.addAll(parse_reservations(html));
    html = httpGet(opac_url + "/konto.cgi?konto=b&sess=" + sessid, getDefaultEncoding());
    reservations.addAll(parse_reservations(html));

    adata.setReservations(reservations);

    return adata;
}

From source file:gov.medicaid.screening.dao.impl.OptometryLicenseDAOBean.java

/**
 * Performs a search for all possible results.
 *
 * <p>Loads the landing page to obtain the {@code __VIEWSTATE}, posts the search form and
 * then follows the "Next >>" link until no further page is offered, collecting every
 * 8-column data row of {@code table.Datagrid}. The HTTP client created here is shut down
 * before the method returns, on success and on failure alike.
 *
 * @param identifier The value to be searched.
 * @return the search result for licenses
 * @throws URISyntaxException When an error occurs while building the URL.
 * @throws ClientProtocolException When client does not support protocol used.
 * @throws IOException When an error occurs while parsing response.
 * @throws ParseException When an error occurs while parsing response.
 * @throws PersistenceException for database related errors
 * @throws ServiceException for any other problems encountered
 */
private SearchResult<License> getAllResults(String identifier) throws URISyntaxException,
        ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient();
    try {
        URIBuilder builder = new URIBuilder(getSearchURL()).setPath("/Default.aspx");
        String hostId = builder.build().toString();
        builder.setParameter("tabid", "799");

        HttpGet httpget = new HttpGet(builder.build());
        HttpResponse landing = client.execute(httpget);
        Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity()));

        HttpPost httppost = new HttpPost(builder.build());
        HttpEntity entity = postForm(hostId, client, httppost,
                new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                        { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" },
                        { "__EVENTARGUMENT", "" },
                        { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                true);

        // licenses list
        List<License> licenseList = new ArrayList<License>();
        while (entity != null) {
            String pageHtml = EntityUtils.toString(entity);
            document = Jsoup.parse(pageHtml);

            // select() never returns null, so no null check is needed here
            for (Element element : document.select("table.Datagrid tr")) {
                String cssClass = element.attr("class");
                if (!"DatagridHeaderStyle".equals(cssClass.trim()) && element.children().size() == 8) {
                    Elements tds = element.children();
                    licenseList.add(parseLicense(tds));
                }
            }

            // done, check if there are additional results
            entity = null;
            Elements elements = document.getElementsByTag("a");
            for (Element element : elements) {
                if (element.text().equals("Next >>")) {
                    entity = postForm(hostId, client, httppost,
                            new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                                    { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" },
                                    { "__EVENTARGUMENT", "" },
                                    { "__VIEWSTATE",
                                            document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                            true);
                    break;
                }
            }
        }

        SearchResult<License> searchResult = new SearchResult<License>();
        searchResult.setItems(licenseList);
        return searchResult;
    } finally {
        // The client is local to this call; release its connections so they are not leaked.
        client.getConnectionManager().shutdown();
    }
}

From source file:me.vertretungsplan.parser.UntisCommonParser.java

/**
 * Parses an Untis substitution table ({@link UntisSubstitutionParser}).
 *
 * <p>If the schedule's {@code "columns"} configuration contains a {@code "date"} column,
 * every matching table row is assigned to the day named in that column (reusing a day
 * already present in {@code v}, otherwise adding a new one). Without a date column the
 * whole table is parsed into one new day whose date is taken from the page heading.
 * Nothing is added when the table only says "Keine Vertretungen".
 *
 * @param v          the schedule that receives the parsed days
 * @param lastChange the last-change string, stored on every newly created day
 * @param doc        the parsed page containing the substitution table
 * @throws JSONException              declared by the method; raised by the JSON accessors
 * @throws CredentialInvalidException declared by the method; not thrown directly here
 */
protected void parseSubstitutionTable(SubstitutionSchedule v, String lastChange, Document doc)
        throws JSONException, CredentialInvalidException {
    JSONObject data = scheduleData.getData();

    LocalDateTime lastChangeDate = ParserUtils.parseDateTime(lastChange);

    // Position of the "date" column in the configured column list, or -1 if there is none
    JSONArray columns = data.getJSONArray("columns");
    int dateColumn = -1;
    int idx = 0;
    while (dateColumn == -1 && idx < columns.length()) {
        if (columns.getString(idx).equals("date")) {
            dateColumn = idx;
        }
        idx++;
    }

    Element table = doc.select("table[rules=all], table:has(tr:has(td[align=center]))").first();
    String tableText = table.text().replace("\u00a0", "").trim();
    if (tableText.equals("Keine Vertretungen")) {
        return;
    }

    if (dateColumn != -1) {
        // One table covering several days: group the rows by their date cell
        Elements rows = table.select("tr.list.odd:not(:has(td.inline_header)), "
                + "tr.list.even:not(:has(td.inline_header)), " + "tr:has(td[align=center]):gt(0)");
        for (Element row : rows) {
            String date = row.select("td").get(dateColumn).text().trim();

            int dash = date.indexOf("-");
            if (dash > 0) {
                date = date.substring(0, dash - 1).trim();
            }

            LocalDate parsedDate = ParserUtils.parseDate(date);

            SubstitutionScheduleDay day = null;
            for (SubstitutionScheduleDay candidate : v.getDays()) {
                boolean sameDate = Objects.equals(candidate.getDate(), parsedDate);
                if (sameDate || Objects.equals(candidate.getDateString(), date)) {
                    day = candidate;
                    break;
                }
            }
            if (day == null) {
                day = new SubstitutionScheduleDay();
                day.setDateString(date);
                day.setDate(parsedDate);
                day.setLastChangeString(lastChange);
                day.setLastChange(lastChangeDate);
                v.addDay(day);
            }
            parseSubstitutionScheduleTable(row, data, day);
        }
        return;
    }

    // No date column: the whole table belongs to one day named in the page heading
    SubstitutionScheduleDay singleDay = new SubstitutionScheduleDay();
    singleDay.setLastChangeString(lastChange);
    singleDay.setLastChange(lastChangeDate);
    String heading = doc.select("font[size=5], font[size=4], font[size=3] b").text();
    Matcher dayMatcher = Pattern.compile("\\d\\d?.\\d\\d?. / \\w+").matcher(heading);
    if (dayMatcher.find()) {
        String date = dayMatcher.group();
        singleDay.setDateString(date);
        singleDay.setDate(ParserUtils.parseDate(date));
    }
    parseSubstitutionScheduleTable(table, data, singleDay);
    v.addDay(singleDay);
}