Example usage for org.jsoup.nodes Element text

Introduction

This page collects example usages of the text() method of org.jsoup.nodes.Element.

Prototype

public String text()

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * Element overload of {@code noText}: extracts the combined text of the element
 * and hands it to the String overload.
 *
 * @param ele the element whose text is inspected
 * @return the result of the String overload for the element's combined text
 */
private boolean noText(Element ele) {
    final String combinedText = ele.text();
    return noText(combinedText);
}

From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java

/**
 * Collects the text that follows each {@code <meta name="generator">} element of the document.
 *
 * <p>For every such meta element, the text of its following element siblings is concatenated
 * up to (but not including) the next {@code <meta>} element. Blocks that are empty after
 * {@code removeNonBreakingSpacesAndTrim} are dropped.
 *
 * @param htmlDoc the parsed HTML document
 * @return the non-empty text blocks, in document order
 */
private ImmutableList<String> druckSachenContents(Document htmlDoc) {
    /*
     * In this way we can identify the bits of "RTF" like text inserted into the overall HTML.
     * JSoup cleans up the broken HTML removing the xml declaration and inserted html roots
     * that ALLRIS manages to put in.
     */
    Elements contentMetaElements = htmlDoc.getElementsByAttributeValue("name", "generator");
    ImmutableList.Builder<String> listBuilder = ImmutableList.builder();

    /*
     * Iterate over our candidates. Sometimes there are several.
     */
    for (Element contentMetaElement : contentMetaElements) {
        StringBuilder contentAsTextBuilder = new StringBuilder();
        Element nextSibling = contentMetaElement.nextElementSibling();

        /*
         * In the cleaned up HTML DOM returned by JSoup the "RTF" content is
         * rendered as siblings of the meta node (JSoup having removed the html, head, body
         * elements which should never have been there in the first place).
         *
         * The tag NAME is compared here: Element.tag() returns a Tag object, which is never
         * equal to a String, so comparing it with "meta" would never stop the walk.
         */
        while (nextSibling != null && !"meta".equalsIgnoreCase(nextSibling.tagName())) {
            contentAsTextBuilder.append(nextSibling.text());
            nextSibling = nextSibling.nextElementSibling();
        }
        /*
         * Only carry over non-empty content.
         */
        String contentAsText = contentAsTextBuilder.toString();
        if (!removeNonBreakingSpacesAndTrim(contentAsText).isEmpty()) {
            listBuilder.add(contentAsText);
        }
    }

    return listBuilder.build();
}

From source file:me.vertretungsplan.parser.UntisInfoParser.java

/**
 * Logs in, reads the available weeks from the navigation bar and parses the schedule page(s)
 * of every week into one {@link SubstitutionSchedule}.
 *
 * <p>The "last change" text is taken from the part of the {@code .description} element that
 * follows the marker {@code "Stand:"}. If the navigation bar does not contain the marker, the
 * page {@code /frames/title.htm} is tried instead; if that fails too, an empty string is used.
 *
 * @return the schedule filled with the parsed pages, classes, teachers and website
 * @throws IOException if a page cannot be loaded; when pages are fetched per week (not per
 *         class) an {@link HttpResponseException} is only propagated if no week could be parsed
 * @throws JSONException declared for the JSON configuration access performed here and in helpers
 * @throws CredentialInvalidException declared for the login and page loading helpers
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    Document navbarDoc = Jsoup.parse(getNavbarDoc().replace("&nbsp;", ""));
    Element select = navbarDoc.select("select[name=week]").first();

    SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData);

    String lastChange = extractLastChange(navbarDoc.select(".description").text());
    if (lastChange == null) {
        // Marker not present in the navigation bar: try the title frame instead.
        try {
            String infoHtml = httpGet(baseUrl + "/frames/title.htm", data.optString(PARAM_ENCODING, null));
            lastChange = extractLastChange(Jsoup.parse(infoHtml).select(".description").text());
        } catch (Exception e1) {
            // Deliberately best effort: the last-change text is optional, so any failure
            // while loading the fallback page just leaves it empty.
            lastChange = null;
        }
        if (lastChange == null) {
            lastChange = "";
        }
    }

    int successfulWeeks = 0;
    HttpResponseException lastException = null;
    for (Element option : select.children()) {
        String week = option.attr("value");
        String weekName = option.text();
        if (data.optBoolean(PARAM_SINGLE_CLASSES, data.optBoolean("single_classes", false)) // backwards compatibility
                || data.optString(PARAM_SCHEDULE_TYPE, "substitution").equals("timetable")) {
            int classNumber = 1;
            for (String klasse : getAllClasses()) {
                String url = getScheduleUrl(week, classNumber, data);
                try {
                    parsePage(v, lastChange, klasse, url, weekName);
                } catch (HttpResponseException e) {
                    // A 500 for a single class occurs in Hannover_MMBS; skip that class only.
                    if (e.getStatusCode() != 500) {
                        throw e;
                    }
                }
                classNumber++;
            }
            successfulWeeks++;
        } else {
            String url = getScheduleUrl(week, 0, data);
            try {
                parsePage(v, lastChange, null, url, weekName);
                successfulWeeks++;
            } catch (HttpResponseException e) {
                // Remember the failure; it is only reported if no week succeeds at all.
                lastException = e;
            }
        }
    }
    if (successfulWeeks == 0 && lastException != null) {
        throw lastException;
    }
    v.setClasses(getAllClasses());
    v.setTeachers(getAllTeachers());
    v.setWebsite(baseUrl + "/default.htm");
    return v;
}

/**
 * Returns the trimmed text following the marker {@code "Stand:"}, or {@code null} when the
 * marker does not occur in {@code info}.
 */
private static String extractLastChange(String info) {
    final String marker = "Stand:";
    int markerStart = info.indexOf(marker);
    if (markerStart < 0) {
        return null;
    }
    return info.substring(markerStart + marker.length()).trim();
}

From source file:gov.medicaid.screening.dao.impl.DieteticsAndNutritionPracticeLicenseDAOBean.java

/**
 * Performs a search for all possible results.
 *
 * <p>Loads the search page to obtain the current {@code __VIEWSTATE}, submits the search form
 * and then follows the "Next &gt;&gt;" link page by page, collecting one license per grid row.
 * The HTTP client created for the search is shut down before the method returns.
 *
 * @param identifier The value to be searched.
 * @return the search result for licenses
 * @throws URISyntaxException When an error occurs while building the URL.
 * @throws ClientProtocolException When client does not support protocol used.
 * @throws IOException When an error occurs while parsing response.
 * @throws ParseException When an error occurs while parsing response.
 * @throws PersistenceException for database related errors
 * @throws ServiceException for any other errors
 */
private SearchResult<License> getAllResults(String identifier) throws URISyntaxException,
        ClientProtocolException, IOException, ParseException, PersistenceException, ServiceException {
    DefaultHttpClient client = new DefaultHttpClient();
    try {
        URIBuilder builder = new URIBuilder(getSearchURL());
        String hostId = builder.build().toString();

        HttpGet httpget = new HttpGet(builder.build());
        HttpResponse landing = client.execute(httpget);
        Document document = Jsoup.parse(EntityUtils.toString(landing.getEntity()));

        HttpPost httppost = new HttpPost(builder.build());
        HttpEntity entity = postForm(hostId, client, httppost,
                new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                        { "_ctl0:_ctl1:_ctl0:btnSubmit", "Search" }, { "__EVENTTARGET", "" },
                        { "__EVENTARGUMENT", "" },
                        { "__VIEWSTATE", document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                true);

        // licenses list
        List<License> licenseList = new ArrayList<License>();
        while (entity != null) {
            document = Jsoup.parse(EntityUtils.toString(entity));

            // select() returns an empty list (never null) when nothing matches
            for (Element row : document.select(GRID_ROW_SELECTOR)) {
                licenseList.add(parseLicense(row.children()));
            }

            // done, check if there are additional results
            entity = null;
            for (Element link : document.getElementsByTag("a")) {
                if (link.text().equals("Next >>")) {
                    entity = postForm(hostId, client, httppost,
                            new String[][] { { "_ctl0:_ctl1:_ctl0:txtCriteria", identifier },
                                    { "__EVENTTARGET", "_ctl0:_ctl1:_ctl0:dgrdLicensee:_ctl29:_ctl1" },
                                    { "__EVENTARGUMENT", "" },
                                    { "__VIEWSTATE",
                                            document.select("#Form input[name=__VIEWSTATE]").first().val() } },
                            true);
                    break;
                }
            }
        }

        SearchResult<License> searchResult = new SearchResult<License>();
        searchResult.setItems(licenseList);
        return searchResult;
    } finally {
        // The client is local to this call; release its connections on every exit path.
        client.getConnectionManager().shutdown();
    }
}

From source file:org.confab.VBulletinParser.java

/**
 * Parses the forum rows of a board index page into {@link Forum} objects.
 *
 * <p>Every {@code tr} below a {@code tbody} whose id contains {@code collapseobj_forumbit_} is
 * inspected. Header rows ({@code td.thead}), rows with fewer than three cells and rows whose
 * second cell holds no link are skipped. The first link of the second cell supplies the forum
 * URL and title; any further links in that cell are recorded as sub-forums.
 *
 * @param root the parsed index page
 * @param parent the board passed to every created {@link Forum}
 * @return the parsed forums, in row order
 */
public List<Forum> parseForums(Document root, BulletinBoard parent) {
    Utilities.debug("parseForums");

    List<Forum> ret = new ArrayList<Forum>();

    // get table
    Elements forum_table = root.select("tbody[id*=collapseobj_forumbit_] tr");
    assert !forum_table.isEmpty();

    for (Element el_tr : forum_table) {
        Forum new_forum = new Forum(parent);

        // Get the table data for this row
        Elements el_tds = el_tr.select("td");
        assert !el_tds.isEmpty() : el_tr.html();

        // xbox360achievements has a lot of subforums and puts these in their own table
        // The <a>'s are picked up as children of the parent <td> so don't parse this sub-
        // tables row's seperatly
        if (!el_tds.select("td.thead").isEmpty() || el_tds.size() < 3) {
            //Utilities.debug("tr doesn't seem to have anything we want, skipping.");
            continue;
        }

        // Get the title URL
        Elements els_a = el_tds.get(1).select("a");
        assert !els_a.isEmpty() : el_tds.html();
        if (els_a.isEmpty()) {
            // Assertions are normally disabled at runtime; without this guard a row
            // lacking a link would fail with a NullPointerException on first().
            continue;
        }
        Element el_title = els_a.first();
        new_forum.url = el_title.attr("href");
        assert new_forum.url != null;
        Utilities.debug("new_forum.url : " + new_forum.url);

        // Get the title text
        new_forum.title = el_title.text();
        assert new_forum.title != null;
        Utilities.debug("new_forum.title : " + new_forum.title);

        // Check for any subforums in remaining a elements.
        // Start at index 1 instead of removing the title link from the selection, so the
        // selection list is left untouched.
        for (int i = 1; i < els_a.size(); i++) {
            Element el_a = els_a.get(i);
            Forum sub_forum = new Forum(parent);
            sub_forum.url = el_a.attr("href");
            assert sub_forum.url != null;
            sub_forum.title = el_a.text();
            assert sub_forum.title != null;
            new_forum.subForums.add(sub_forum);
            Utilities.debug("added subForum: " + sub_forum.title);
        }

        // Get num viewing the current forum
        Element el_viewing = el_tr.select(":matchesOwn((\\d+ Viewing))").first();
        if (el_viewing != null) {
            new_forum.numViewing = el_viewing.text();
        } else {
            new_forum.numViewing = "0";
        }
        Utilities.debug("new_forum.numViewing : " + new_forum.numViewing);

        // Get the description/message of this topic
        Element el_description = el_tds.get(1).select("div.smallfont").first();
        if (el_description != null) {
            new_forum.description = el_description.text();
        } else {
            new_forum.description = "";
        }
        Utilities.debug("new_forum.description : " + new_forum.description);

        Utilities.debug("new_forum.parent.url : " + new_forum.parent.url);

        ret.add(new_forum);
        Utilities.debug("-----");
    }
    Utilities.debug("end parseForums");
    return ret;
}

From source file:com.spd.ukraine.lucenewebsearch1.web.IndexingController.java

/**
 * Tells whether the text of {@code element} occurs inside the text of {@code prevElement}.
 *
 * @param prevElement the element whose text is searched
 * @param element the element whose text is looked for
 * @return {@code true} only if both elements have text and the text of {@code prevElement}
 *         contains the text of {@code element}
 */
private boolean prevElementContainsElementText(Element prevElement, Element element) {
    if (!prevElement.hasText()) {
        return false;
    }
    if (!element.hasText()) {
        return false;
    }
    final String previousText = prevElement.text();
    return previousText.contains(element.text());
}

From source file:net.groupbuy.entity.Article.java

/**
 * Splits the article content into pages.
 *
 * <p>If the content contains {@code PAGE_BREAK_SEPARATOR}, it is split at that separator.
 * Otherwise the top-level nodes of the parsed body are accumulated in order, and a page is
 * closed whenever the accumulated text length reaches {@code PAGE_CONTENT_LENGTH}. Text nodes
 * are split with {@code PARAGRAPH_SEPARATOR_PATTERN}, each piece keeping the separator that
 * followed it.
 *
 * @return the page contents; a single empty string when the content is empty
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(content)) {
        return new String[] { "" };
    }
    if (content.contains(PAGE_BREAK_SEPARATOR)) {
        return content.split(PAGE_BREAK_SEPARATOR);
    }

    List<String> pageContents = new ArrayList<String>();
    // Local, single-threaded accumulation: StringBuilder is sufficient.
    StringBuilder html = new StringBuilder();
    int textLength = 0;
    Document document = Jsoup.parse(content);
    for (Node node : document.body().childNodes()) {
        if (node instanceof Element) {
            Element element = (Element) node;
            html.append(element.outerHtml());
            textLength += element.text().length();
            if (textLength >= PAGE_CONTENT_LENGTH) {
                pageContents.add(html.toString());
                textLength = 0;
                html.setLength(0);
            }
        } else if (node instanceof TextNode) {
            String text = ((TextNode) node).text();
            // split() drops the separators; the matcher walks the same separators in order
            // so that each piece can be re-joined with the separator that followed it.
            Matcher matcher = PARAGRAPH_SEPARATOR_PATTERN.matcher(text);
            for (String piece : PARAGRAPH_SEPARATOR_PATTERN.split(text)) {
                String segment = piece;
                if (matcher.find()) {
                    segment += matcher.group();
                }
                html.append(segment);
                textLength += segment.length();
                if (textLength >= PAGE_CONTENT_LENGTH) {
                    pageContents.add(html.toString());
                    textLength = 0;
                    html.setLength(0);
                }
            }
        }
    }
    // Whatever is left over forms the last page.
    String pageContent = html.toString();
    if (StringUtils.isNotEmpty(pageContent)) {
        pageContents.add(pageContent);
    }
    return pageContents.toArray(new String[pageContents.size()]);
}

From source file:gov.medicaid.screening.dao.impl.MedicalPracticeLicenseDAOBean.java

/**
 * Searches for the available specialty options matching the criteria.
 *
 * <p>If the criteria specialty has a non-blank name, the option whose text equals that name
 * (ignoring case) is selected; when the specialty also carries a positive code, the option
 * value must be that same code. If the name is blank, the option whose value equals the
 * specialty code is selected. Option values that are not integers never match a code.
 *
 * @param criteria the criteria for specialty search
 * @param document the current page
 * @return the matched code
 * @throws ServiceException if the code provided is not present
 */
private String matchSpecialtyCode(MedicalPracticeLicenseSearchCriteria criteria, Document document)
        throws ServiceException {
    Elements specialtyOptions = document.select("select#_ctl7_ddlbSpecialty option");
    Specialty specialty = criteria.getSpecialty();
    for (Element option : specialtyOptions) {
        String code = option.attr("value");
        Integer numericCode = parseOptionCode(code);
        if (Util.isNotBlank(specialty.getName())) { // match the name
            if (specialty.getName().equalsIgnoreCase(option.text())) {
                if (specialty.getCode() > 0
                        && (numericCode == null || numericCode.intValue() != specialty.getCode())) {
                    throw new ServiceException(ErrorCode.MITA10007.getDesc());
                }
                return code;
            }
        } else if (numericCode != null && numericCode.intValue() == specialty.getCode()) {
            // match only the code
            return code;
        }
    }
    throw new ServiceException(ErrorCode.MITA10007.getDesc());
}

/**
 * Parses an option value as an integer code; returns {@code null} for values that are not
 * integers (for example an empty placeholder value) instead of throwing.
 */
private static Integer parseOptionCode(String value) {
    try {
        return Integer.valueOf(Integer.parseInt(value));
    } catch (NumberFormatException ignored) {
        // A non-numeric option cannot correspond to a numeric specialty code.
        return null;
    }
}

From source file:us.colloquy.index.IndexHandler.java

/**
 * Finds all {@code .ncx} files below a directory and adds a {@link DocumentPointer} to
 * {@code uriList} for every navigation entry whose label matches the patterns below.
 *
 * <p>Errors are not propagated: exceptions are printed with {@code printStackTrace()}.
 *
 * <p>NOTE(review): the regular-expression literals in this method contain {@code ?} where
 * non-ASCII characters were presumably lost in a re-encoding. As written, {@code "?"} and
 * {@code "[?--?]"} do not look like valid {@code java.util.regex} patterns; restore the
 * intended literals from the upstream source before relying on this method.
 *
 * @param uriList receives the document pointers that are found
 * @param letterDirectory directory that is searched (up to 6 levels deep) for {@code .ncx} files
 * @param useOnlyNumber when {@code true}, labels that merely contain a 1-3 digit number are
 *        accepted as well
 */
public void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory, boolean useOnlyNumber) {
    // example location (from the original comment): /Documents/Tolstoy/diaries

    Path pathToLetters = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> results = new ArrayList<>();

    int maxDepth = 6;

    // Collect every path ending in ".ncx"; the stream is closed by try-with-resources.
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {

        stream.forEach(results::add);

        //            String joined = stream
        //                    .sorted()
        //                    .map(String::valueOf)
        //                    .collect(Collectors.joining("; "));
        //
        //            System.out.println("\nFound: " + joined);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    // A single try surrounds the whole loop, so the first exception ends the processing
    // of all remaining files.
    try {

        for (Path res : results) {
            Path parent = res.getParent();

            //                System.out.println("---------------------------------------------");
            //                System.out.println(parent.toString());
            //use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            // The title ends up being the text of the last child of the last docTitle element.
            for (Element element : doc.getElementsByTag("docTitle")) {
                //Letter letter = new Letter();

                // StringBuilder content = new StringBuilder();

                for (Element child : element.children()) {
                    title = child.text();
                    // System.out.println("Title: " + title);
                }
            }

            for (Element element : doc.getElementsByTag("navPoint")) {
                //Letter letter = new Letter();

                // StringBuilder content = new StringBuilder();

                for (Element child : element.children()) {
                    String label = child.text();

                    if (StringUtils.isNotEmpty(label)) {
                        // NOTE(review): pattern literal looks corrupted (see method comment).
                        if (label.matches("?")) {
                            System.out.println("------------------");
                        }

                        String url = child.getElementsByTag("content").attr("src");

                        // NOTE(review): the character class in this pattern looks corrupted
                        // (see method comment).
                        if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
                            // The fragment ("#...") is stripped from the src before it is
                            // appended to the directory of the .ncx file.
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                            uriList.add(documentPointer);
                            //                                System.out.println("nav point: " + label + " src " + parent.toString()
                            //                                        + System.lineSeparator() + url.replaceAll("#.*",""));

                        } else if (label.matches(".*\\d{1,3}.*") && StringUtils.isNotEmpty(url)
                                && useOnlyNumber) {
                            DocumentPointer documentPointer = new DocumentPointer(
                                    parent.toString() + File.separator + url.replaceAll("#.*", ""), title);

                            uriList.add(documentPointer);
                            //                                System.out.println("nav point: " + label + " src " + parent.toString()
                            //                                        + System.lineSeparator() + url.replaceAll("#.*",""));

                        } else {
                            // System.out.println("nav point: " + label + " src " + child.getElementsByTag("content").attr("src"));
                        }

                    }
                }
            }

        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    //        System.out.println("Size: " + uriList.size());

    //        for (DocumentPointer pointer : uriList)
    //        {
    //            //parse and
    //            System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
    //        }
}

From source file:us.colloquy.index.IndexHandler.java

/**
 * Finds all {@code .ncx} files below a directory and appends a {@link DocumentPointer} to
 * {@code documentPointers} for the navigation entries between a start label and an end label.
 *
 * <p>Errors are not propagated: exceptions are printed with {@code printStackTrace()}.
 *
 * <p>NOTE(review): the string literals used for label matching contain {@code ?} where
 * non-ASCII characters were presumably lost in a re-encoding; as written they do not look
 * like valid {@code java.util.regex} patterns. Restore the intended literals from the
 * upstream source before relying on this method.
 *
 * @param documentPointers receives the document pointers that are found
 * @param pathToLetters directory that is searched (up to 6 levels deep) for {@code .ncx} files
 */
public void getURIForAllDiaries(List<DocumentPointer> documentPointers, Path pathToLetters) {
    List<Path> results = new ArrayList<>();

    int maxDepth = 6;

    // Collect every path ending in ".ncx"; the stream is closed by try-with-resources.
    try (Stream<Path> stream = Files.find(pathToLetters, maxDepth, (path, attr) -> {
        return String.valueOf(path).endsWith(".ncx");
    })) {

        stream.forEach(results::add);

    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + results.size());

    // A single try surrounds the whole loop, so the first exception ends the processing
    // of all remaining files.
    try {

        for (Path res : results) {
            Path parent = res.getParent();

            //                System.out.println("---------------------------------------------");
            //                System.out.println(parent.toString());
            //use jsoup to list all files that contain something useful
            Document doc = Jsoup.parse(res.toFile(), "UTF-8");

            String title = "";

            // The title ends up being the text of the last child of the last docTitle element.
            for (Element element : doc.getElementsByTag("docTitle")) {
                //Letter letter = new Letter();

                // StringBuilder content = new StringBuilder();

                for (Element child : element.children()) {
                    title = child.text();
                    // System.out.println("Title: " + title);
                }
            }

            //  System.out.println("==========================   " + res.toString() + " ==========================");

            // startPrinting: true while entries are being collected for the current file.
            boolean startPrinting = false;

            // newFile: true until the start label has been seen once in the current file.
            boolean newFile = true;

            for (Element element : doc.getElementsByTag("navPoint")) {

                //get nav label and content

                Element navLabelElement = element.select("navLabel").first();
                Element srsElement = element.select("content").first();

                String navLabel = "";
                String srs = "";

                if (navLabelElement != null) {
                    // asterisks are removed from the label before it is compared
                    navLabel = navLabelElement.text().replaceAll("\\*", "").trim();
                }

                if (srsElement != null) {
                    srs = srsElement.attr("src");
                }

                // NOTE(review): here the literal is the input and navLabel is used as the
                // regular expression; the receiver and argument are possibly swapped - confirm.
                // The literal itself also looks corrupted (see method comment).
                if ("??".matches(navLabel))

                {
                    startPrinting = false;

                    // System.out.println("----------------- end of file pointer ---------------");
                }

                // NOTE(review): pattern literal looks corrupted (see method comment).
                if (StringUtils.isNotEmpty(navLabel)
                        && navLabel.matches("??.*|?? ?.*") && newFile) {
                    newFile = false;
                    startPrinting = true;
                }

                // Entries with an empty label (first alternative of the pattern) are skipped.
                // NOTE(review): the second alternative looks corrupted (see method comment).
                if (startPrinting && !navLabel
                        .matches("(|??? ??)")) {
                    // System.out.println("----------------- file pointer ---------------");
                    //   System.out.println(navLabel + "\t" + srs);

                    // The fragment ("#...") is stripped from the src before it is appended
                    // to the directory of the .ncx file.
                    DocumentPointer documentPointer = new DocumentPointer(
                            parent.toString() + File.separator + srs.replaceAll("#.*", ""), title);

                    documentPointers.add(documentPointer);
                }

            }

            //   System.out.println("==========================   END OF FILE ==========================");

        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + documentPointers.size());

    //  for (DocumentPointer pointer : documentPointers)
    // {
    //parse and
    //     System.out.println(pointer.getSourse() + "\t" + pointer.getUri());
}