Example usage for org.jsoup.nodes Element text

List of usage examples for org.jsoup.nodes Element text

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.text().

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java

/**
 * Extracts the Drucksache identifier from the page heading.
 *
 * <p>Reads the text of the first element matching {@code #risname > h1}, drops as many
 * leading characters as the literal {@code "Drucksache - "} is long, and hands the
 * remainder to {@code removeNonBreakingSpacesAndTrim}. The prefix is removed by length
 * only; the heading is not checked to actually start with it.
 *
 * @param htmlDoc parsed HTML page to read the heading from
 * @return the cleaned-up identifier text following the prefix
 */
private String druckSacheId(Document htmlDoc) {
    final int prefixLength = "Drucksache - ".length();
    String headingText = htmlDoc.select("#risname > h1").first().text();
    return removeNonBreakingSpacesAndTrim(headingText.substring(prefixLength));
}

From source file:org.ala.lucene.CreateWordPressIndex.java

/**
 * Indexes the WordPress pages by fetching each one with Jsoup and adding it to SOLR.
 *
 * <p>All existing documents of type {@code IndexedTypes.WORDPRESS} are deleted first.
 * Each URL in {@code pageUrls} is then fetched, its title, id meta tag, body text and
 * post categories are extracted, and a SOLR document is added unless one of the
 * categories is listed in {@code excludedCategories}. Commits happen every 100 added
 * documents and once more at the end.
 *
 * @return the number of documents successfully added to the index
 * @throws Exception if the initial delete, a commit, or any non-{@link IOException}
 *         failure while processing a page occurs; an {@link IOException} for a single
 *         page is logged and that page is skipped
 */
protected int indexPages() throws Exception {
    int documentCount = 0;
    // Initialise SOLR
    SolrServer solrServer = solrUtils.getSolrServer();
    logger.info("Deleting all WordPress documents in SOLR index...");
    solrServer.deleteByQuery("idxtype:" + IndexedTypes.WORDPRESS); // delete WP pages
    solrServer.commit();

    for (String pageUrl : this.pageUrls) {
        try {
            // Crawl and extract text from WP pages
            Document document = Jsoup.connect(pageUrl + CONTENT_ONLY_PARAM).get();
            String title = document.select("head > title").text();
            String id = document.select("head > meta[name=id]").attr("content");
            String bodyText = document.body().text();
            Elements postCategories = document.select("ul[class=post-categories]");
            List<String> categoriesOut = new ArrayList<String>();
            boolean excludePost = false;

            if (!postCategories.isEmpty()) {
                // Is a WP post (not page)
                Elements categoriesIn = postCategories.select("li > a"); // category links inside the list items

                for (Element cat : categoriesIn) {
                    String thisCat = cat.text();

                    if (thisCat != null) {
                        if (excludedCategories.contains(thisCat)) {
                            // one excluded category is enough to drop the whole post
                            excludePost = true;
                        }
                        // keep collecting so the debug message below lists every category
                        categoriesOut.add(thisCat.replaceAll(" ", "_"));
                    }
                }
            }

            if (excludePost) {
                logger.debug("Excluding post (id: " + id + ") with category: "
                        + StringUtils.join(categoriesOut, "|"));
                continue;
            }

            // Index with SOLR
            logger.debug((documentCount + 1) + ". Indexing WP page - id: " + id + " | title: " + title
                    + " | text: " + StringUtils.substring(bodyText, 0, 100) + "... ");
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("idxtype", IndexedTypes.WORDPRESS);
            doc.addField("guid", WP_BASE_URI + id); // use page_id based URI instead of permalink in case permalink is too long for id field
            doc.addField("id", "wp" + id); // probably not needed but safer to leave in
            doc.addField("name", title, 1.2f);
            doc.addField("content", bodyText);
            doc.addField("australian_s", "recorded"); // so they appear in default QF search
            doc.addField("categories", categoriesOut);
            // add to index
            solrServer.add(doc);
            // count only after the add succeeded, so a failed add is not reported as indexed
            documentCount++;

            if (documentCount % 100 == 0) {
                logger.info("Committing to SOLR (count = " + documentCount + ")...");
                solrServer.commit();
            }
        } catch (IOException ex) {
            // catch it so we don't stop indexing other pages
            logger.warn("Problem accessing/reading WP page: " + ex.getMessage(), ex);
        }
    }

    logger.info("Final Committing to SOLR...");
    solrServer.commit();
    logger.info("Committed to SOLR. Final document count: " + documentCount);
    return documentCount;
}

From source file:me.vertretungsplan.parser.UntisCommonParser.java

/**
 * Fills the teacher fields of a substitution from a table cell.
 *
 * <p>The cell is first unwrapped with {@code getContentElement}. If it contains no
 * {@code <s>} elements, its whole text becomes the current teachers. Otherwise the
 * text of the {@code <s>} elements becomes the previous teachers, and the cell's own
 * text, when non-empty, becomes the current teachers after a leading {@code ?} is removed.
 *
 * @param subst substitution to update
 * @param cell  table cell holding the teacher information
 * @param data  configuration passed through to {@code splitTeachers}
 */
static void handleTeacher(Substitution subst, Element cell, JSONObject data) {
    Element content = getContentElement(cell);

    if (content.select("s").isEmpty()) {
        subst.setTeachers(splitTeachers(content.text(), data));
        return;
    }

    subst.setPreviousTeachers(splitTeachers(content.select("s").text(), data));

    String ownText = content.ownText();
    if (!ownText.isEmpty()) {
        // NOTE(review): the second replaceFirst has an empty pattern and is a no-op as
        // written; it looks like a character was lost from the literal - confirm upstream.
        String current = ownText.replaceFirst("^\\?", "").replaceFirst("", "");
        subst.setTeachers(splitTeachers(current, data));
    }
}

From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java

/**
 * Collects the people listed under "Principal Author" definition terms.
 *
 * <p>Every {@code dd} that follows a matching {@code dt} is considered. Entries are
 * skipped from the point a differently-titled {@code dt} directly precedes a
 * {@code dd}, until the element after a skipped {@code dd} starts with
 * "principal author" again. Each remaining {@code dd} is split on commas; fragments
 * that look like cross-references or notes are ignored with a warning, the rest are
 * parsed into a {@code Person} whose e-mail and websites come from the fragment's links.
 *
 * @param url specification URL, used only in the warning message
 * @param doc parsed specification document
 * @return the parsed people, or {@code null} when nothing was found
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    Elements candidates = doc.select("dt:contains(Principal Author) ~ dd");
    if (candidates.isEmpty()) {
        return null;
    }

    ArrayList<Person> authors = new ArrayList<>();
    boolean skipping = false;

    for (Element entry : candidates) {
        Element before = entry.previousElementSibling();
        if (before.tagName().equals("dt")
                && !before.text().trim().toLowerCase().startsWith("principal author")) {
            // a different definition term starts here; its entries are not authors
            skipping = true;
        }

        if (skipping) {
            Element after = entry.nextElementSibling();
            if (after != null && after.text().trim().toLowerCase().startsWith("principal author")) {
                // the next element opens another author section, so resume from there
                skipping = false;
            }
            continue;
        }

        for (String fragment : entry.html().split(",")) {
            if (fragment.isEmpty()) {
                continue;
            }

            String lowered = fragment.toLowerCase();
            boolean looksLikeReference = lowered.startsWith("(in alphabetic")
                    || lowered.startsWith("see acknowl")
                    || lowered.startsWith("the w3")
                    || lowered.startsWith("(see ac")
                    || lowered.startsWith("see participants")
                    || lowered.contains("note:");
            if (looksLikeReference) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (fragment.equals("WHATWG:") || fragment.equals("W3C:")) {
                continue;
            }

            Document parsed = Jsoup.parse(fragment.replaceAll("\n", ""));
            Person person = NameParser.parse(parsed.text());
            if (person == null) {
                continue;
            }

            for (Element link : parsed.select("a")) {
                String href = link.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    person.setEmail(href.replace("mailto:", ""));
                } else {
                    person.addWebsite(href);
                }
            }

            authors.add(person);
        }
    }

    return authors.isEmpty() ? null : authors;
}

From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java

/**
 * Builds a key/value map from the elements carrying the CSS class {@code kb1}.
 *
 * <p>Each {@code kb1} element supplies a key (its cleaned text, minus one trailing
 * colon) and its next element sibling supplies the value. A pair is skipped when
 * there is no sibling, when the sibling is itself a {@code kb1} key element, or when
 * either the key or the value is empty after clean-up.
 *
 * @param htmlDoc parsed HTML page to read the properties from
 * @return an immutable map of property names to values
 */
private ImmutableMap<String, String> druckSachenProperties(Document htmlDoc) {
    ImmutableMap.Builder<String, String> mapBuilder = ImmutableMap.builder();

    // NOTE(review): ImmutableMap.Builder is documented to reject duplicate keys when
    // build() is called; a page repeating a key would fail there - confirm inputs.
    for (Element keyElement : htmlDoc.getElementsByClass("kb1")) {
        Element valueElement = keyElement.nextElementSibling();
        // kb1 is a CSS class, so the sibling must be tested with hasClass; the former
        // hasAttr("kb1") looked for an attribute of that name and never filtered anything.
        if (valueElement == null || valueElement.hasClass("kb1")) {
            continue;
        }

        String key = removeNonBreakingSpacesAndTrim(keyElement.text());
        if (key.endsWith(":")) {
            key = key.substring(0, key.length() - 1);
        }
        String value = removeNonBreakingSpacesAndTrim(valueElement.text());

        if (!key.isEmpty() && !value.isEmpty()) {
            mapBuilder.put(key, value);
        }
    }
    return mapBuilder.build();
}

From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java

/**
 * Flattens inline emphasis markup under {@code topNode} into plain text.
 *
 * <p>Every {@code <strong>}, {@code <b>} and {@code <i>} element, processed in that
 * order, is replaced by a text node holding the element's text, so no formatting
 * tags of these kinds remain.
 */
private void replaceTagsWithText() {
    // each tag is looked up only after the previous one was flattened
    for (String tagName : new String[] { "strong", "b", "i" }) {
        for (Element styled : topNode.getElementsByTag(tagName)) {
            styled.replaceWith(new TextNode(styled.text(), topNode.baseUri()));
        }
    }
}

From source file:com.johan.vertretungsplan.parser.UntisInfoHeadlessParser.java

/**
 * Logs in, downloads the substitution plan page and parses one entry per day heading.
 *
 * <p>Day headings are the {@code <b>} elements selected under {@code #vertretung}.
 * For each heading the following section is located; if it has the class
 * {@code subst} it is parsed as the substitution table, otherwise it is parsed as
 * messages and the table is taken from two siblings further on. Days whose table
 * states that substitutions are not released are left out.
 *
 * @return the parsed plan with one day entry per released day
 * @throws IOException   declared for the login and download steps
 * @throws JSONException declared for reading the school configuration
 */
@Override
public Vertretungsplan getVertretungsplan() throws IOException, JSONException {
    new LoginHandler(schule).handleLogin(executor, cookieStore, username, password);

    Vertretungsplan plan = new Vertretungsplan();
    List<VertretungsplanTag> dayPlans = new ArrayList<VertretungsplanTag>();

    String html = httpGet(url, schule.getData().getString("encoding"));
    Document page = Jsoup.parse(html);

    for (Element heading : page.select("#vertretung > p > b, #vertretung > b")) {
        VertretungsplanTag dayPlan = new VertretungsplanTag();
        dayPlan.setStand("");
        dayPlan.setDatum(heading.text());

        // the section sits in a different place depending on whether the heading is wrapped in a <p>
        Element container = heading.parent();
        Element section = container.tagName().equals("p")
                ? container.nextElementSibling().nextElementSibling()
                : container.select("p").first().nextElementSibling();

        if (!section.className().equals("subst")) {
            // messages come first, the substitution table follows two siblings later
            parseNachrichten(section, data, dayPlan);
            section = section.nextElementSibling().nextElementSibling();
            parseVertretungsplanTable(section, data, dayPlan);
        } else {
            // substitution table
            if (section.text().contains("Vertretungen sind nicht freigegeben")) {
                continue;
            }
            parseVertretungsplanTable(section, data, dayPlan);
        }
        dayPlans.add(dayPlan);
    }

    plan.setTage(dayPlans);
    return plan;
}

From source file:cvegrabber.CVEController.java

/**
 * Returns up to ten of the newest CVE entries listed on the NVD search results page.
 *
 * <p>The page is fetched with Jsoup, the CVE detail anchors are selected, and for
 * each anchor the description and references are looked up via {@code grabMitreData}.
 * Failures are logged and do not propagate.
 *
 * @return an array of length 10; slots beyond the number of entries actually found
 *         (or all slots, if fetching failed before any entry was built) are {@code null}
 */
@RequestMapping(value = "/newest", produces = { "application/json" })
public CVE[] cve() {

    String url = "https://web.nvd.nist.gov/view/vuln/search-results?query=&search_type=all&cves=on";
    CVE[] cvearray = new CVE[10];
    try {
        Document doc = Jsoup.connect(url).get();
        Elements newest = doc.select(
                "a[id*=BodyPlaceHolder_cplPageContent_plcZones_lt_zoneCenter_VulnerabilitySearchResults_VulnResultsRepeater_CveDetailAnchor_]");
        int counter = 0;
        for (Element cveid : newest) {
            if (counter == cvearray.length)
                break;
            String id = cveid.text();
            cvearray[counter] = new CVE(id, grabMitreData(id, "description"),
                    grabMitreData(id, "references"));
            counter++;
        }
        // log only the filled slots: with fewer than ten results the remaining entries
        // are null and dereferencing them was misreported as a fetch failure
        for (int i = 0; i < counter; i++) {
            logger.info("CVEID: " + cvearray[i].getCVE() + " CVE Description: " + cvearray[i].getDescription()
                    + " CVE References: " + cvearray[i].getReferences());
        }
    } catch (Exception ex) {
        // pass the exception along so the stack trace is not lost
        logger.error("Unable to fetch latest cves. " + ex.getMessage(), ex);
    }
    return cvearray;
}

From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java

/**
 * Parses an account page into lent items, reservations, pending fees and the card
 * validity date.
 *
 * <p>The column layout is not hard-coded: {@code data.accounttable} and
 * {@code data.reservationtable} map field names to the child index of the cell that
 * holds the field within a {@code tr.tabKonto} row. Entries whose value is negative
 * or cannot be read as an int are ignored.
 *
 * @param acc  account whose id is stored in the result
 * @param doc  parsed account page
 * @param data configuration object containing the two column mappings
 * @return the parsed account data, or {@code null} if no table is found under
 *         {@code .kontozeile_center}
 * @throws JSONException if one of the column mappings is missing from {@code data}
 */
public static AccountData parse_account(Account acc, Document doc, JSONObject data) throws JSONException {
    JSONObject copymap = data.getJSONObject("accounttable");

    List<LentItem> media = new ArrayList<>();

    if (doc.select(".kontozeile_center table").size() == 0) {
        return null;
    }

    // first table: lent items
    Elements exemplartrs = doc.select(".kontozeile_center table").get(0).select("tr.tabKonto");

    DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);

    for (int i = 0; i < exemplartrs.size(); i++) {
        Element tr = exemplartrs.get(i);
        LentItem item = new LentItem();

        Iterator<?> keys = copymap.keys();
        while (keys.hasNext()) {
            String key = (String) keys.next();
            int index;
            try {
                index = copymap.has(key) ? copymap.getInt(key) : -1;
            } catch (JSONException e1) {
                // non-integer mapping value: treat the field as not configured
                index = -1;
            }
            if (index >= 0) {
                if (key.equals("prolongurl")) {
                    // the link is taken from the cell's first child; empty cells are skipped
                    if (tr.child(index).children().size() > 0) {
                        item.setProlongData(tr.child(index).child(0).attr("href"));
                        item.setRenewable(tr.child(index).child(0).attr("href").contains("vermsg"));
                    }
                } else if (key.equals("returndate")) {
                    try {
                        item.setDeadline(fmt.parseLocalDate(tr.child(index).text()));
                    } catch (IllegalArgumentException e1) {
                        // unparseable date: the deadline is left unset
                        e1.printStackTrace();
                    }
                } else {
                    item.set(key, tr.child(index).text());
                }
            }
        }

        media.add(item);
    }
    assert (doc.select(".kontozeile_center table").get(0).select("tr").size() > 0);
    assert (exemplartrs.size() == media.size());

    copymap = data.getJSONObject("reservationtable");

    List<ReservedItem> reservations = new ArrayList<>();
    // second table: reservations
    // NOTE(review): only size() == 0 is checked above, so get(1) throws when the page
    // contains a single table - confirm that two tables are always present.
    exemplartrs = doc.select(".kontozeile_center table").get(1).select("tr.tabKonto");
    for (int i = 0; i < exemplartrs.size(); i++) {
        Element tr = exemplartrs.get(i);
        ReservedItem item = new ReservedItem();

        Iterator<?> keys = copymap.keys();
        while (keys.hasNext()) {
            String key = (String) keys.next();
            int index;
            try {
                index = copymap.has(key) ? copymap.getInt(key) : -1;
            } catch (JSONException e1) {
                // non-integer mapping value: treat the field as not configured
                index = -1;
            }
            if (index >= 0) {
                if (key.equals("cancelurl")) {
                    if (tr.child(index).children().size() > 0) {
                        item.setCancelData(tr.child(index).child(0).attr("href"));
                    }
                } else if (key.equals("availability")) {
                    try {
                        item.setReadyDate(fmt.parseLocalDate(tr.child(index).text()));
                    } catch (IllegalArgumentException e1) {
                        // not a date: keep the cell text as the status instead
                        item.setStatus(tr.child(index).text());
                    }
                } else if (key.equals("expirationdate")) {
                    try {
                        item.setExpirationDate(fmt.parseLocalDate(tr.child(index).text()));
                    } catch (IllegalArgumentException e1) {
                        // not a date: keep the cell text as the status instead
                        item.setStatus(tr.child(index).text());
                    }
                } else {
                    item.set(key, tr.child(index).text());
                }
            }
        }

        reservations.add(item);
    }
    assert (doc.select(".kontozeile_center table").get(1).select("tr").size() > 0);
    assert (exemplartrs.size() == reservations.size());

    AccountData res = new AccountData(acc.getId());

    // scan the account rows for the pending-fees line and the card validity line
    for (Element row : doc.select(".kontozeile_center, div[align=center]")) {
        String text = row.text().trim();
        // the patterns use '.' in place of the umlaut, so they match whatever character appears there
        if (text.matches(".*Ausstehende Geb.+hren:[^0-9]+([0-9.,]+)[^0-9A-Z]*(|EUR|CHF|Fr.).*")) {
            // reduce the line to "<amount> <currency>"
            text = text.replaceAll(
                    ".*Ausstehende Geb.+hren:[^0-9]+([0-9.," + "]+)[^0-9A-Z]*(|EUR|CHF|Fr.).*", "$1 $2");
            res.setPendingFees(text);
        }
        if (text.matches("Ihr Ausweis ist g.ltig bis:.*")) {
            text = text.replaceAll("Ihr Ausweis ist g.ltig bis:[^A-Za-z0-9]+", "");
            res.setValidUntil(text);
        } else if (text.matches("Ausweis g.ltig bis:.*")) {
            text = text.replaceAll("Ausweis g.ltig bis:[^A-Za-z0-9]+", "");
            res.setValidUntil(text);
        }
    }

    res.setLent(media);
    res.setReservations(reservations);
    return res;
}

From source file:com.webbfontaine.valuewebb.timer.RatesUpdater.java

/**
 * Reads exchange rates from the first table on the fetched page whose text contains
 * "Dollar".
 *
 * <p>The header cells of the table's first row determine which columns hold the
 * currency pair ("Pairs Code") and the rate ("Selling"); both default to column 0
 * when no matching header is found. For every row with {@code td} cells the rate is
 * parsed with thousands separators removed. Pairs starting with {@code GHS} are keyed
 * by the rest of the code and store the inverted rate; all other pairs are keyed by
 * the part before the last {@code GHS}.
 *
 * @return currency code to rate; empty when no suitable table is found
 */
public HashMap<String, BigDecimal> ratesFromBank() {
    HashMap<String, BigDecimal> result = new HashMap<String, BigDecimal>();

    Document page = getPage();
    Element ratesTable = null;
    for (Element candidate : page.getElementsByTag("table")) {
        if (candidate.text().contains("Dollar")) {
            ratesTable = candidate;
            break;
        }
    }

    if (ratesTable == null) {
        LOGGER.error("Error reading rates from URL");
        return result;
    }

    Elements rows = ratesTable.getElementsByTag("tr");
    Elements headers = rows.get(0).getElementsByTag("th");

    int pairColumn = 0;
    int sellingColumn = 0;
    for (int col = 0; col < headers.size(); col++) {
        String heading = headers.get(col).text();
        if (heading.equalsIgnoreCase("Pairs Code")) {
            pairColumn = col;
        }
        if (heading.equalsIgnoreCase("Selling")) {
            sellingColumn = col;
        }
    }

    for (Element row : rows) {
        Elements cells = row.getElementsByTag("td");
        if (cells.isEmpty()) {
            // header rows carry no td cells
            continue;
        }

        String pair = cells.get(pairColumn).text().trim();
        String sellingText = cells.get(sellingColumn).text().trim().replace(",", "");
        BigDecimal selling = new BigDecimal(sellingText);

        if (pair.startsWith("GHS")) {
            // quoted from GHS, so store the reciprocal under the other currency
            String currency = pair.substring(3);
            BigDecimal inverted = BigDecimal.ONE.divide(selling, Constants.FRACTION_DIGITS_NUMBER_4,
                    Utils.getRoundingMode());
            result.put(currency, inverted);
        } else {
            String currency = pair.substring(0, pair.lastIndexOf("GHS"));
            result.put(currency, selling);
        }
    }
    return result;
}