Example usage for org.jsoup.nodes Element text

List of usage examples for org.jsoup.nodes Element text

Introduction

In this page you can find the example usage for org.jsoup.nodes Element text.

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:mergedoc.core.APIDocument.java

/**
 * Javadoc ? ??????/*from  www .  j av  a 2  s. c  om*/
 * @param className ??
 * @param context 
 * @param comment 
 */
private void parseCommonTag(String className, Element element, Comment comment) {
    Elements dts = element.select("dl dt");
    for (Element dt : dts) {
        String dtText = dt.text();
        if (dtText.contains("")) {
            Elements aTags = dt.nextElementSibling().select("a:has(code)");
            for (Element a : aTags) {
                String url = a.attr("href");
                String ref;
                if (a.childNodeSize() != 1) {
                    ref = aTags.outerHtml();
                } else {
                    ref = formatClassName(className, url);
                    ref = FastStringUtils.replace(ref, "%28", "(");
                    ref = FastStringUtils.replace(ref, "%29", ")");

                    Pattern methodRefPat = PatternCache.getPattern("-(.*)-$");
                    Matcher methodRefMat = methodRefPat.matcher(ref);
                    if (methodRefMat.find()) {
                        ref = FastStringUtils.replaceAll(ref, "-(.*)-$", "($1)"); // for Java8
                        ref = FastStringUtils.replace(ref, "-", ","); // for Java8
                        ref = FastStringUtils.replace(ref, ":A", "[]"); // for Java8
                    }
                }
                comment.addSee(ref);
            }
        } else if (dtText.contains("???:")) {
            comment.addSince(dt.nextElementSibling().text());
        }
    }
}

From source file:by.heap.remark.convert.TextCleaner.java

/**
 * Replaces all {@code <br/>} tags with a newline in a copy of the input node, and
 * returns the resulting innter text./* ww w  . j  a  va 2s . co  m*/
 * This is necessary to ensure that manual linebreaks are supported in preformatted code.
 * 
 * @param oinput Preformatted node to process
 * @return inner text of the node.
 */
private String getPreformattedText(Element oinput) {
    Element el = oinput.clone();
    fixLineBreaks(el);
    return el.text();
}

From source file:accountgen.controller.Controller.java

/**
 * Reads the text of the first {@code .bday} element and stores it as the person's birthday.
 * <p>
 * The text is split the way the parsing below requires: the first space-separated token is an
 * English month abbreviation, the second is the day (a trailing comma is removed), and the four
 * characters after the first {@code ", "} are the year.
 * <p>
 * If the element is missing or the month cannot be parsed, the problem is logged and the
 * birthday is left unset.
 *
 * @param doc document that contains the {@code .bday} element
 * @param p person whose birthday is set
 */
private void setBday(Document doc, Person p) {
    Element bday = doc.select(".bday").first();
    if (bday == null) {
        Logger.getLogger(Controller.class.getName()).log(Level.WARNING,
                "No .bday element found; birthday not set");
        return;
    }

    String text = bday.text();
    String[] words = text.split(" ");

    Date monthOnly;
    try {
        monthOnly = new SimpleDateFormat("MMM", Locale.ENGLISH).parse(words[0]);
    } catch (ParseException ex) {
        Logger.getLogger(Controller.class.getName()).log(Level.SEVERE, null, ex);
        // Without a month there is no usable date; continuing would only fail on a null date.
        return;
    }

    Calendar monthCal = Calendar.getInstance();
    monthCal.setTime(monthOnly);
    int month = monthCal.get(Calendar.MONTH);
    int day = Integer.parseInt(words[1].replace(",", ""));
    int year = Integer.parseInt(text.split(",")[1].substring(1, 5));

    // Set year, month and day in one step. Changing them one at a time on "now" can roll over
    // into the next month when today's day-of-month does not exist in the target month.
    // The time-of-day of "now" is retained, as before.
    Calendar birthday = Calendar.getInstance();
    birthday.set(year, month, day);
    p.setBirthday(birthday.getTime());
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
 * Parses an HTML file, walks every table accepted by {@code isParsableTable} and passes each
 * data row to {@code extractAddressesFromText}. The result sink is committed once all tables
 * have been walked.
 * <p>
 * A table is only processed when its first row is accepted by {@code isParsableTable}.
 * Rows without cells, rows that look like the table header and rows whose cells are all
 * empty are skipped. If a polling station ID cannot be parsed, the error is logged and the
 * method returns immediately; the rest of the file is ignored and the sink is not committed.
 *
 * @param input_html HTML file to process, read as UTF-8
 * @throws TableExtractorException if a row has a first cell but no second (address) cell
 */
public void processHTMLfile(File input_html) throws IOException, TableExtractorException,
            CloneNotSupportedException, SQLException, ResultSinkException {

        logger.info("Start processing " + input_html);

        Document doc = Jsoup.parse(input_html, "UTF-8");
        Elements tables = doc.getElementsByTag("table");

        /* count of parseable tables found */
        int tables_found = 0;

        /* determine raion name */
        String raion_name = extractRaionFromFileName(input_html.getName());

        // TODO: inflect raion name in  case

        for (Element table : tables) {
            boolean firstRow = true;

            for (Element row : table.getElementsByTag("tr")) {
                Elements cells = row.getElementsByTag("td");

                if (firstRow) {
                    /* the first row decides whether the whole table is processed */
                    if (!isParsableTable(row))
                        break;
                    firstRow = false;
                    logger.info("Processing table #" + ++tables_found + " in " + input_html);
                }

                /* a row without any <td> has nothing to extract; cells.first() would be null */
                if (cells.isEmpty())
                    continue;

                /* skip the row if it looks like a table header */
                /* NOTE(review): the reference header text below looks encoding-damaged - verify */
                if (StringUtils.getLevenshteinDistance(cleanupUNICODE(cells.first().text()),
                        " . -") < 3)
                    continue;

                /* skip rows with all cells empty */
                boolean emptyRow = true;
                for (Element cell : cells) {
                    if (!cleanupUNICODE(cell.text()).isEmpty()) {
                        emptyRow = false;
                        break;
                    }
                }
                if (emptyRow)
                    continue;

                /* columns by position: station id, address list, organisation address, station address */
                Element station_id = cells.get(0);
                Element address_field = cells.size() > 1 ? cells.get(1) : null;
                Element org_address = cells.size() > 2 ? cells.get(2) : null;
                Element station_address = cells.size() > 3 ? cells.get(3) : null;

                if (address_field == null)
                    throw new TableExtractorException("Address list not found", row, input_html);

                /* extract int from poll station id */
                String station_id_text = cleanupUNICODE(station_id.text()).trim();
                int psid;
                try {
                    psid = Integer.parseInt(station_id_text.replaceAll("[^\\d]", ""));
                } catch (NumberFormatException e) {
                    /* the exception object is only built to obtain its formatted message */
                    Exception te = new TableExtractorException("Failed to parse polling station ID >"
                            + station_id_text + "<: ", station_id, input_html);
                    logger.severe(te.getMessage() + "; rest of " + input_html + " ignored.");
                    return;
                }

                /* extraction from HTML completely finished, now we work only with the addresses in the text form */
                extractAddressesFromText(raion_name.trim(), psid, cleanLeftoverHTML(address_field),
                        cleanLeftoverHTML(org_address), cleanLeftoverHTML(station_address));
            }
        }

        if (tables_found == 0)
            logger.severe("No parsable tables found in " + input_html);
        resultSink.commit();

        logger.info("" + tables_found + " table(s) processed in " + input_html);
    }

From source file:me.vertretungsplan.parser.SVPlanParser.java

/**
 * Parses one day of an SVPlan page into a {@code SubstitutionScheduleDay} and adds it to the
 * schedule.
 * <p>
 * The element is accepted when it contains one of the known date/title classes or when the
 * document title starts with "Vertretungsplan für ". Table rows become substitutions; an empty
 * lesson or class cell inherits the value of the last row that had one. Blocks for planned
 * teachers, absences and general messages are added as day messages.
 *
 * @param v schedule that receives the parsed day
 * @param svp element that holds the plan of one day
 * @param doc document the element belongs to (used for its title and header detection)
 * @throws IOException if the element does not look like an SVPlan day
 */
private void parseSvPlanDay(SubstitutionSchedule v, Element svp, Document doc) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    // "f\u00fcr" is "für"; the escape keeps the literal independent of the source file encoding.
    boolean looksLikePlan = svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() > 0
            || doc.title().startsWith("Vertretungsplan f\u00fcr ");
    if (!looksLikePlan) {
        throw new IOException("keine SVPlan-Tabelle gefunden");
    }

    setDate(svp, doc, day);

    if (svp.select(".svp-tabelle, table:has(.Klasse)").size() > 0) {
        boolean docHasHeaderClass = doc.select(".svp-header").size() > 0;
        String classSeparator = data.optString(PARAM_CLASS_SEPARATOR, ", ");
        String lastLesson = "";
        String lastClass = "";

        for (Element row : svp.select(".svp-tabelle tr, table:has(.Klasse) tr")) {
            // Skip header rows and rows without any text.
            if ((docHasHeaderClass && row.hasClass("svp-header"))
                    || row.select("th").size() > 0 || row.text().trim().equals("")) {
                continue;
            }

            Substitution substitution = new Substitution();

            for (Element column : row.select("td")) {
                String type = column.className();
                String text = column.text();
                boolean isLesson = type.startsWith("svp-stunde") || type.startsWith("Stunde");
                boolean isClass = type.startsWith("svp-klasse") || type.startsWith("Klasse");

                if (!hasData(text)) {
                    // Empty lesson/class cells repeat the value of the previous row.
                    if (isLesson && hasData(lastLesson)) {
                        substitution.setLesson(lastLesson);
                    } else if (isClass && hasData(lastClass)) {
                        substitution.getClasses().addAll(Arrays.asList(lastClass.split(classSeparator)));
                    }
                    continue;
                }

                if (isLesson) {
                    substitution.setLesson(text);
                    lastLesson = text;
                } else if (isClass) {
                    substitution.getClasses().addAll(Arrays.asList(text.split(classSeparator)));
                    lastClass = text;
                } else if (type.startsWith("svp-esfehlt") || type.startsWith("Lehrer")) {
                    if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                        substitution.setPreviousTeacher(text);
                    }
                } else if (type.startsWith("svp-esvertritt") || type.startsWith("Vertretung")) {
                    if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) {
                        substitution.setTeacher(text.replaceAll(" \\+$", ""));
                    }
                } else if (type.startsWith("svp-fach") || type.startsWith("Fach")) {
                    substitution.setSubject(text);
                } else if (type.startsWith("svp-bemerkung") || type.startsWith("Anmerkung")) {
                    substitution.setDesc(text);
                    String recognizedType = recognizeType(text);
                    substitution.setType(recognizedType);
                    substitution.setColor(colorProvider.getColor(recognizedType));
                } else if (type.startsWith("svp-raum") || type.startsWith("Raum")) {
                    substitution.setRoom(text);
                }
            }

            // Rows whose remark did not yield a type fall back to the generic one.
            if (substitution.getType() == null) {
                substitution.setType("Vertretung");
                substitution.setColor(colorProvider.getColor("Vertretung"));
            }

            day.addSubstitution(substitution);
        }
    }

    Elements plannedTeachers = svp.select(".LehrerVerplant");
    if (plannedTeachers.size() > 0) {
        day.addMessage("<b>Verplante Lehrer:</b> " + plannedTeachers.text());
    }
    Elements absences = svp.select(".Abwesenheiten");
    if (absences.size() > 0) {
        day.addMessage("<b>Abwesenheiten:</b> " + absences.text());
    }

    Elements messageHeadings = svp.select("h2:contains(Mitteilungen)");
    if (messageHeadings.size() > 0) {
        // Messages are the run of <p> elements that directly follows the heading.
        Element sibling = messageHeadings.first().nextElementSibling();
        while (sibling != null && sibling.tagName().equals("p")) {
            addSplitMessages(day, sibling.html());
            sibling = sibling.nextElementSibling();
        }
    } else {
        for (Element p : svp.select(".Mitteilungen")) {
            addSplitMessages(day, p.html());
        }
    }

    v.addDay(day);
}

/** Splits a block of HTML at double line breaks and adds every non-empty part as a day message. */
private void addSplitMessages(SubstitutionScheduleDay day, String html) {
    for (String nachricht : TextNode.createFromEncoded(html, null).getWholeText()
            .split("<br />\\s*<br />")) {
        if (hasData(nachricht)) {
            day.addMessage(nachricht);
        }
    }
}

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
     * Cleaning up leftover of HTML code from the cell content.
     * /*  ww  w  . j  a  v a 2s.co m*/
     * @param cell_content HTML code contains in the table cell 
     * @return an array list containing each line of the cell_content withh all HTML markup removed
     */
    private ArrayList<String> cleanLeftoverHTML(Element cell_content) {

        ArrayList<String> streets_and_numbers = new ArrayList<String>();

        /* <div>s designate separate lines inside the table cell */
        for (Element addr_line : cell_content.getElementsByTag("div")) {

            /* skip empty address lines */
            String addr_line_text = cleanupUNICODE(addr_line.text());
            if (StringUtils.isBlank(addr_line_text))
                continue;

            /* <strong> is not particularly useful, but can designate placement of simple separators like space */
            Elements streets = addr_line.getElementsByTag("strong");
            if (!streets.isEmpty()) {
                addr_line_text = addr_line_text.replaceFirst(Pattern.quote(streets.text()),
                        " " + streets.text() + " ");
            }

            streets_and_numbers.add(addr_line_text);
        }
        return streets_and_numbers;
    }

From source file:net.parser.JobParser.java

/**
 * Collects the categories linked in the details section of the job position.
 *
 * @return one {@code Category} per matching link, with its id resolved from the link text by
 *         {@code CategoryEnum.getId}; {@code null} when no link matches
 */
public Set<Category> getCategories() {
    Elements links = doc.select("#job-position .details dd a");

    // No links at all is reported as null rather than as an empty set.
    if (links.size() == 0) {
        return null;
    }

    Set<Category> found = new HashSet<>();
    for (Element link : links) {
        Category current = new Category();
        current.setId(CategoryEnum.getId(link.text()));
        found.add(current);
    }
    return found;
}

From source file:eu.sisob.uma.extractors.adhoc.websearchers_cv.WebSearchersCVExtractor.java

/**
 *
 * @param nextLine/*  www .ja v  a 2 s.  c  om*/
 * @param idStaffIdentifier
 * @param idName
 * @param idFirstName
 * @param idLastName
 * @param idInitials
 * @param idSubject
 * @param idInstitutionName
 * @param idWebAddress
 * @param expression
 * @param params
 * @return
 */
@Override
protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName,
        int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress,
        String expression, Object[] params) {

    String domain = clean_site(nextLine[idWebAddress]);
    String subject = nextLine[idSubject];
    String expression_subject = expression + " " + subject + " " + files + " " + cv_keywords_in_query;
    expression_subject = expression_subject.replaceAll("\t", " ");
    expression_subject = expression_subject.replaceAll("  ", " ");

    String url = "https://duckduckgo.com/html/?q=" + expression_subject;
    Logger.getRootLogger().info("Go with " + url);
    boolean again = false;
    Document doc = null;
    do {
        doc = getDocumentFromPage(url, 10, 2000, 5000);

        if (doc != null && doc.text().contains("If this error persists, please let us know")) {
            try {
                Thread.sleep(30000);
            } catch (InterruptedException ex) {
            }
            again = true;
        } else {
            again = false;
        }
    } while (again);

    //if(doc.select("div[class*=links_main] > a[href*=" + domain + "]").size() > 0){
    String final_result = "";
    if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) {

        /* Write resercher founded */
        Elements elements = doc.select("div[class*=links_main] > a");

        /* We will take the first html page and the first pdf */

        List<String[]> results = new ArrayList<String[]>();
        final int EXT_I = 0;
        final int SCORE_INT_I = 1;
        final int SCORE_LETTER_I = 2;
        final int RESULT_I = 3;
        final int WORST_SCORE = 67;

        //int max_results = elements.size();
        //int i_result = 0; 
        for (Element e : elements) {
            if ((e.text().startsWith("[") && !e.text().startsWith("[PDF]"))
                    || e.absUrl("href").contains("duckduckgo.com/y.js")
                    || e.absUrl("href").contains("wikipedia.") || e.absUrl("href").contains("microsoft.com")
                    || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin")
                    || e.absUrl("href").contains("www.biography.com")
                    || e.absUrl("href").contains("biomedexperts.com")
                    || e.absUrl("href").contains("www.experts.scival.com")
                    || e.absUrl("href").contains("ratemyprofessors.com")
                    || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt")
                    || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml")
                    || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx")
                    || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs")
                    || e.absUrl("href").contains("www.amazon")) {
                continue;
            }

            boolean add = false;
            int score_int = WORST_SCORE;
            String score = "";
            String ext = "";

            if (e.text().startsWith("[PDF]") || e.text().startsWith("[DOCX]") || e.text().startsWith("[DOC]")
                    || e.text().startsWith("[RTF]")) {

                String clean_name_1 = e.text().replaceAll("[^\\w\\s]", "").toLowerCase();
                int i = e.absUrl("href").lastIndexOf("/");
                int f = e.absUrl("href").lastIndexOf(".");
                String clean_name_2 = "";
                if (i != -1 && f != -1)
                    clean_name_2 = e.absUrl("href").substring(i, f).toLowerCase();
                boolean b = false;
                for (String k : cv_keywords_in_name_list) {
                    if (clean_name_1.contains(k) || clean_name_2.contains(k)) {
                        b = true;
                        break;
                    }
                }
                if (b) {
                    score_int--;
                }

                if (clean_name_1.contains(nextLine[idLastName])
                        || clean_name_2.contains(nextLine[idLastName])) {
                    score_int--;
                }

                score = Character.toChars(score_int)[0] + "";
                add = true;
                ext = "PDF";
            }

            //if(!results.containsKey("HTML") && !e.text().startsWith("[")){
            //}                                                 

            if (add) {
                String result = "";
                result += "\"" + nextLine[idStaffIdentifier] + "\";";
                result += "\"" + nextLine[idLastName] + "\";";
                result += "\"" + nextLine[idInitials] + "\";";
                if (idFirstName != -1)
                    result += "\"" + nextLine[idFirstName] + "\";";
                if (idName != -1)
                    result += "\"" + nextLine[idName] + "\";";
                result += "\"" + e.absUrl("href") + "\";";
                result += "\"" + ext + "\";";
                result += "\"" + "CV" + "\";";
                result += "\"" + score + "\"";
                result += "\r\n";
                results.add(new String[] { ext, score_int + "", score, result });

                Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + score + " - " + e.text());
            }
        }

        final_result = "";
        int best_score = WORST_SCORE;
        for (String[] result : results) {

            if (result[EXT_I].equals("PDF")) {
                int act_score = Integer.parseInt(result[SCORE_INT_I]);

                if (act_score < best_score) {
                    best_score = act_score;
                    final_result = result[RESULT_I];
                }

            }
        }
    }

    return final_result;
}

From source file:com.obnsoft.ptcm3.MyApplication.java

/**
 * Rebuilds {@code mCommands} and {@code mCategories} from the stored command HTML file.
 * <p>
 * The direct children of the content area element are walked in order: every {@code h3}
 * adds its text as a category and advances the category id, and every table without a class
 * name becomes a {@code Command} tagged with the current category id.
 * <p>
 * If the file cannot be read, or the content area element is missing, both lists are set
 * to {@code null}.
 */
private void parseCommandHtml() {
    mCommands = new ArrayList<Command>();
    mCategories = new ArrayList<String>();
    int categoryId = -1;
    try {
        InputStream in = openFileInput(FNAME_CMD_HTML);
        Document document;
        try {
            document = Jsoup.parse(in, "UTF-8", URL_CMD_HTML);
        } finally {
            // Close the stream even when parsing fails.
            in.close();
        }

        Element divContentArea = document.getElementById(ID_CONTENTAREA);
        if (divContentArea == null) {
            // Unexpected page layout: report it the same way as an unreadable file.
            mCommands = null;
            mCategories = null;
            return;
        }

        for (Element e : divContentArea.children()) {
            if (e.tagName().equals(TAG_TABLE)) {
                if (e.className().equals("")) {
                    mCommands.add(new Command(e, categoryId));
                }
            } else if (e.tagName().equals(TAG_H3)) {
                mCategories.add(e.text());
                categoryId++;
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
        mCommands = null;
        mCategories = null;
    }
}

From source file:org.confab.PhpBB3Parser.java

/**
 * Parses each topic for a particular forum.
 * <p>
 * Topics are the {@code a[id*=thread_title_]} links inside the forum's thread table. A topic
 * is flagged as sticky when its title equals the title of a link found in a cell that
 * contains "Sticky:".
 *
 * @param  forum        Document of html containing topics
 * @param  parent       Forum the threads belong to
 * @return              List of ForumThread objects; empty when the forum has no topics
 */
public List<ForumThread> parseForumThreads(Document forum, Forum parent) {
    Utilities.debug("parseForumThreads");

    List<ForumThread> ret = new ArrayList<ForumThread>();

    // Get topic table
    Elements thread_table_tds = forum.select("tbody[id*=threadbits_forum_] td");
    if (thread_table_tds.isEmpty()) {
        Utilities.debug("It seems " + parent.url + " has no topics.");
        return ret;
    }

    // Get any stickies and remember their titles once. Comparing whole titles avoids false
    // hits on titles that are merely part of a sticky title, and misses caused by characters
    // that are escaped in the HTML form.
    Elements stickies = thread_table_tds.select("td:contains(Sticky:)  a[id*=thread_title_]");
    java.util.Set<String> sticky_titles = new java.util.HashSet<String>();
    for (Element sticky : stickies) {
        sticky_titles.add(sticky.text());
    }

    // Get all topics
    Elements els_a = thread_table_tds.select("a[id*=thread_title_]");
    assert !els_a.isEmpty();

    // Loop topics and grab info about each
    for (Element el_a : els_a) {
        ForumThread new_topic = new ForumThread(parent);

        // Get topic
        new_topic.title = el_a.text();
        assert new_topic.title != null;
        Utilities.debug("new_topic.title: " + new_topic.title);

        // Check if sticky
        if (sticky_titles.contains(new_topic.title)) {
            new_topic.isSticky = true;
            Utilities.debug("new_topic.isSticky: " + new_topic.isSticky);
        }

        // Get URL
        new_topic.url = el_a.attr("href");
        assert new_topic.url != null;
        Utilities.debug("new_topic.url:" + new_topic.url);

        ret.add(new_topic);
    }

    Utilities.debug("end printForumThreads");
    return ret;
}