Example usage for org.jsoup.nodes.Element.text()

Introduction

This page collects example usages of the org.jsoup.nodes.Element.text() method.

Prototype

public String text()

Source Link

Documentation

Gets the combined text of this element and all its children.

Usage

From source file:GIST.IzbirkomExtractor.TableExtractor.java

/**
     * Decides whether a row looks like the first row of a table this extractor can parse.
     *
     * <p>The row qualifies when it has exactly four {@code td} cells and either its first
     * cell is within an edit distance of 2 from the expected header marker, or no cell is
     * empty and the first cell holds an integer. All cell text is passed through
     * {@code cleanupUNICODE} before it is examined.
     *
     * @param row the candidate table row
     * @return {@code true} if the row looks like the start of a parsable table
     */
    private boolean isParsableTable(Element row) {
        final Elements tds = row.getElementsByTag("td");

        // A parsable table always has four columns.
        if (tds.size() != 4) {
            return false;
        }

        // Header row: the first cell is (nearly) the number-sign marker.
        final String leadingText = cleanupUNICODE(tds.first().text());
        if (StringUtils.getLevenshteinDistance(leadingText, " . -") < 3) {
            return true;
        }

        // Data row: a single empty cell disqualifies the whole table.
        for (Element td : tds) {
            if (cleanupUNICODE(td.text()).isEmpty()) {
                return false;
            }
        }

        // Data row: the first column has to be a number.
        try {
            Integer.parseInt(leadingText.trim());
        } catch (NumberFormatException notANumber) {
            return false;
        }
        return true;
    }

From source file:eu.sisob.uma.extractors.adhoc.websearchers.WebSearchersExtractor.java

/**
 *
 * @param nextLine/*w w  w.  j a  v a2s .  c om*/
 * @param idStaffIdentifier
 * @param idName
 * @param idFirstName
 * @param idLastName
 * @param idInitials
 * @param idSubject
 * @param idInstitutionName
 * @param idWebAddress
 * @param expression
 * @param params
 * @return
 */
@Override
protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName,
        int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress,
        String expression, Object[] params) {

    String keywords = " (PROFILE OR PHD OR RESEARCHER OR FACULTY OR PROFESSOR OR RESEARCH) AND ";
    keywords = "";

    String domain = clean_site(nextLine[idWebAddress]);
    String subject = nextLine[idSubject];
    String and_institution_name = (idInstitutionName != -1 ? " AND " + nextLine[idInstitutionName] : "");
    String expression_subject = expression + " AND " + subject;
    String expression_site = expression + " site: " + domain;
    String expression_inst_name = expression + and_institution_name;
    String expression_inst_name_and_subject = expression + and_institution_name + " AND " + subject;

    String url = "";

    switch (search_patterns) {
    case P1:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression;
        break;
    case P2:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject;
        break;
    case P3:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_site;
        break;
    case P4:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name;
        break;
    case P5:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name_and_subject;
        break;
    default:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject;
        break;
    }
    Logger.getRootLogger().info("Go with " + url);
    boolean again = false;
    Document doc = null;
    do {
        doc = getDocumentFromPage(url, 10, 1000, 5000);

        if (doc != null && doc.text().contains("If this error persists, please let us know")) {
            try {
                Thread.sleep(30000);
            } catch (InterruptedException ex) {
            }
            again = true;
        } else {
            again = false;
        }
    } while (again);

    String final_result = "";
    if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) {

        /* Write resercher founded */
        Elements elements = doc.select("div[class*=links_main] > a");

        /* We will take the first html page and the first pdf */

        HashMap<String, String> results = new HashMap<String, String>();

        int max_results = 2;
        int i_result = 0;
        for (Element e : elements) {
            if ((e.text().startsWith("[")
            //&& !e.text().startsWith("[PDF]")
            ) || e.absUrl("href").contains("duckduckgo.com/y.js") || e.absUrl("href").contains("wikipedia.")
                    || e.absUrl("href").contains("facebook.com") || e.absUrl("href").contains("microsoft.com")
                    || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin")
                    || e.absUrl("href").contains("www.biography.com")
                    || e.absUrl("href").contains("biomedexperts.com")
                    || e.absUrl("href").contains("www.experts.scival.com")
                    || e.absUrl("href").contains("ratemyprofessors.com")
                    || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt")
                    || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml")
                    || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx")
                    || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs")
                    || e.absUrl("href").contains("www.amazon")) {
                max_results++;
                continue;
            }

            boolean add = false;
            String score = "";
            String ext = "";
            if (!results.containsKey("HTML") && !e.text().startsWith("[")) {
                //results.put("html", )

                File temp;
                try {
                    temp = File.createTempFile("temp-file-name", ".tmp");
                    URL fetched_url = Downloader.fetchURL(e.absUrl("href"));
                    FileUtils.copyURLToFile(fetched_url, temp);
                    long sizeInBytes = temp.length();
                    long sizeInMb = sizeInBytes / (1024 * 1024);
                    if (sizeInMb > 100) {
                        score = "B";
                    } else {
                        String content = FileUtils.readFileToString(temp);
                        if (content.contains(nextLine[idLastName])) {
                            score = "A";
                        } else {
                            score = "B";
                        }
                    }
                } catch (IOException ex) {
                    score = "B";
                }

                ext = "HTML";
                add = true;
            }

            //if(!results.containsKey("PDF") && e.text().startsWith("[PDF]")){                                                        
            //    score = "A";
            //    ext = "PDF";
            //    add = true;
            //}                          

            if (add) {
                String result = "";
                result += "\"" + nextLine[idStaffIdentifier] + "\";";
                result += "\"" + nextLine[idLastName] + "\";";
                result += "\"" + nextLine[idInitials] + "\";";
                if (idFirstName != -1)
                    result += "\"" + nextLine[idFirstName] + "\";";
                if (idName != -1)
                    result += "\"" + nextLine[idName] + "\";";
                result += "\"" + e.absUrl("href") + "\";";
                result += "\"" + ext + "\";";
                result += "\"" + "CV" + "\";";
                result += "\"" + score + "\"";
                result += "\r\n";
                results.put(ext, result);

                Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + e.text());
            }

            //                if(results.containsKey("PDF") && results.containsKey("HTML")){
            //                    break;
            //                }

            i_result++;
            if (max_results <= i_result) {
                break;
            }
        }

        //            if(results.containsKey("PDF"))
        //                final_result = results.get("PDF");
        //            else 
        if (results.containsKey("HTML"))
            final_result = results.get("HTML");
        else
            final_result = "";
    }

    return final_result;
}

From source file:com.johan.vertretungsplan.parser.UntisCommonParser.java

/**
 * Builds a {@link VertretungsplanTag} from one page of the monitor view.
 *
 * <p>The date is the text of the first {@code .mon_title} element with any
 * " (Seite x / y)" suffix removed. The "Stand" value is read from the right-aligned
 * paragraph of {@code table.mon_head}; if that paragraph is missing, or the school's
 * {@code stand_links} option is set, it is taken from the body HTML that precedes the
 * first {@code <p>} instead. Messages and the substitution table are parsed when the
 * corresponding tables are present.
 *
 * @param doc the parsed monitor page
 * @param data passed through to {@code parseNachrichten} and {@code parseVertretungsplanTable}
 * @return the populated tag
 * @throws JSONException declared for the delegated parsing of {@code data}
 */
protected VertretungsplanTag parseMonitorVertretungsplanTag(Document doc, JSONObject data)
        throws JSONException {
    VertretungsplanTag tag = new VertretungsplanTag();
    tag.setDatum(doc.select(".mon_title").first().text().replaceAll(" \\(Seite \\d+ / \\d+\\)", ""));

    // first() is null exactly when the selector matched nothing.
    Element stand = doc.select("table.mon_head td[align=right] p").first();
    if (stand == null || schule.getData().optBoolean("stand_links", false)) {
        String bodyHtml = doc.select("body").html();
        int firstParagraph = bodyHtml.indexOf("<p>");
        // Everything before the first paragraph, minus the character directly in front of it.
        // Without a usable prefix the value is left empty instead of failing the whole parse.
        tag.setStand(firstParagraph > 0 ? bodyHtml.substring(0, firstParagraph - 1) : "");
    } else {
        String info = stand.text();
        int standStart = info.indexOf("Stand:");
        // Fall back to the full paragraph text when the "Stand:" marker is absent.
        tag.setStand(standStart >= 0 ? info.substring(standStart) : info);
    }

    // NACHRICHTEN (messages)
    Element infoTable = doc.select("table.info").first();
    if (infoTable != null) {
        parseNachrichten(infoTable, data, tag);
    }

    // VERTRETUNGSPLAN (substitution table)
    Element planTable = doc.select("table:has(tr.list)").first();
    if (planTable != null) {
        parseVertretungsplanTable(planTable, data, tag);
    }

    return tag;
}

From source file:net.parser.JobParser.java

/**
 * Reads the employer's address from the details list of the employer profile.
 *
 * @return the text of the last list item containing "Adresa:" with the "Adresa: " label
 *         removed, or {@code null} if no item contains it
 */
public String getEmployerAddress() {
    String address = null;
    for (Element item : doc.select("#employer-profile .details li")) {
        String itemText = item.text();
        if (itemText.contains("Adresa:")) {
            address = itemText.replace("Adresa: ", "");
        }
    }
    return address;
}

From source file:net.poemerchant.scraper.ShopScraper.java

/**
 * Extracts the account name from the profile link and remembers it in {@code accountName}.
 *
 * @return the text of the first profile link whose href starts with /account/view-profile
 */
public String scrapeAccountName() {
    // TODO, take the challenge data also
    // TODO, take the twitch data
    Element link = doc.select(".profile-link")
            .select("a[href^=/account/view-profile]")
            .first();
    String name = link.text();
    accountName = name;
    return name;
}

From source file:com.liato.bankdroid.banking.banks.Bioklubben.java

/**
 * Logs in and refreshes the bonus-points account together with its transactions.
 *
 * @throws LoginException if the username or password is missing
 * @throws BankException if the purchases page cannot be loaded, the points element is
 *         missing, or no account was found
 * @throws BankChoiceException declared by the overridden method
 */
@Override
public void update() throws BankException, LoginException, BankChoiceException {
    super.update();
    if (username == null || password == null || username.length() == 0 || password.length() == 0) {
        throw new LoginException(res.getText(R.string.invalid_username_password).toString());
    }
    urlopen = login();
    try {
        Document page = Jsoup
                .parse(urlopen.open("http://bioklubben.sf.se/MyPurchases.aspx?ParentTreeID=1&TreeID=1"));
        Element pointsLabel = page.getElementById("ctl00_ContentPlaceHolder1_BonusPointsLabel");
        if (pointsLabel == null) {
            throw new BankException(res.getText(R.string.unable_to_find).toString() + " points element.");
        }
        BigDecimal points = Helpers.parseBalance(pointsLabel.text());
        Account account = new Account("Pong", points, "1");
        account.setCurrency(context.getString(R.string.points));
        accounts.add(account);
        balance = balance.add(account.getBalance());

        // Each grid row holds three cells, passed to Transaction in document order.
        List<Transaction> transactions = new ArrayList<Transaction>();
        for (Element row : page.select(".GridViewStd_Item,.GridViewStd_ItemAlt")) {
            transactions.add(new Transaction(row.child(0).text().trim(), row.child(1).text().trim(),
                    Helpers.parseBalance(row.child(2).text())));
        }
        account.setTransactions(transactions);

    } catch (IOException e) {
        // The former "if (e == null)" guard could never be true, which silently hid every
        // I/O failure; report it to the caller instead.
        throw new BankException(e.getMessage());
    }
    if (accounts.isEmpty()) {
        throw new BankException(res.getText(R.string.no_accounts_found).toString());
    }
    super.updateComplete();
}

From source file:DownloadDialog.java

/********************************************************************
 * Method: storeTerms/*  w  w w.  j a va  2  s  . c o m*/
 * Purpose: store available terms to use
/*******************************************************************/
public void storeTerms() {

    try {

        // Default terms
        termsName = new ArrayList<String>();
        termsValue = new ArrayList<String>();

        // Create client for terms
        DefaultHttpClient client = new DefaultHttpClient();
        HttpGet dynamicGet = new HttpGet("http://jweb.kettering.edu/cku1/xhwschedule.P_SelectSubject");

        // Execute post call
        HttpResponse response = client.execute(dynamicGet);
        Document doc = Jsoup.parse(HTMLParser.parse(response));
        Elements options = doc.getElementsByTag("option");

        // Store every option
        for (Element option : options) {

            // First term option
            if (!option.text().contains("None")) {

                this.termsName.add(option.text());
                this.termsValue.add(option.val());
            }
        }

        //client.close();
    }

    // Catch all exceptions
    catch (Exception e) {

        // Print track, set false, return false
        e.printStackTrace();
    }
}

From source file:me.vertretungsplan.parser.DaVinciParser.java

/**
 * Returns the names of all classes.
 *
 * <p>When the schedule configuration has a {@code PARAM_CLASSES_SOURCE} entry, the page at
 * that address is downloaded and the class names are read from its {@code li.Class}
 * elements, falling back to the links in left-aligned table cells when there are none.
 * Without such an entry the result comes from {@code getClassesFromJson()}.
 *
 * @return the class names in document order
 * @throws IOException declared for the page download
 * @throws JSONException if the configuration entry cannot be read as a string
 * @throws CredentialInvalidException declared for the page download
 */
@Override
public List<String> getAllClasses() throws IOException, JSONException, CredentialInvalidException {
    if (!scheduleData.getData().has(PARAM_CLASSES_SOURCE)) {
        return getClassesFromJson();
    }

    // Read the value with the same key constant that was used for the presence check.
    String classesUrl = scheduleData.getData().getString(PARAM_CLASSES_SOURCE);
    Document doc = Jsoup.parse(httpGet(classesUrl, ENCODING));

    Elements elems = doc.select("li.Class");
    if (elems.size() == 0) {
        // daVinci 5
        elems = doc.select("td[align=left] a");
    }

    List<String> classes = new ArrayList<>();
    for (Element elem : elems) {
        classes.add(elem.text());
    }
    return classes;
}

From source file:com.lingxiang2014.entity.Article.java

/**
 * Splits the article content into pages.
 *
 * <p>Empty content yields a single empty page. Content containing
 * {@code PAGE_BREAK_SEPARATOR} is split at that separator. Otherwise the content is parsed
 * as HTML and the top-level nodes of the body are collected into pages: a page is closed
 * as soon as its accumulated text length reaches {@code PAGE_CONTENT_LENGTH}. Text nodes
 * are divided at {@code PARAGRAPH_SEPARATOR_PATTERN}, each piece keeping the separator
 * that followed it.
 *
 * @return the page contents, never {@code null}
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(content)) {
        return new String[] { "" };
    }
    if (content.contains(PAGE_BREAK_SEPARATOR)) {
        return content.split(PAGE_BREAK_SEPARATOR);
    }

    List<String> pages = new ArrayList<String>();
    List<Node> bodyNodes = Jsoup.parse(content).body().childNodes();
    if (bodyNodes != null) {
        StringBuilder pageHtml = new StringBuilder();
        int pageTextLength = 0;
        for (Node bodyNode : bodyNodes) {
            if (bodyNode instanceof Element) {
                // Elements are never split: the whole element goes onto the current page.
                Element element = (Element) bodyNode;
                pageHtml.append(element.outerHtml());
                pageTextLength += element.text().length();
                if (pageTextLength >= PAGE_CONTENT_LENGTH) {
                    pages.add(pageHtml.toString());
                    pageHtml.setLength(0);
                    pageTextLength = 0;
                }
            } else if (bodyNode instanceof TextNode) {
                String text = ((TextNode) bodyNode).text();
                // The matcher walks the separators in step with the pieces produced by split.
                Matcher separators = PARAGRAPH_SEPARATOR_PATTERN.matcher(text);
                for (String paragraph : PARAGRAPH_SEPARATOR_PATTERN.split(text)) {
                    String piece = paragraph;
                    if (separators.find()) {
                        piece += separators.group();
                    }
                    pageHtml.append(piece);
                    pageTextLength += piece.length();
                    if (pageTextLength >= PAGE_CONTENT_LENGTH) {
                        pages.add(pageHtml.toString());
                        pageHtml.setLength(0);
                        pageTextLength = 0;
                    }
                }
            }
        }
        // Whatever is left over forms the last page.
        String remainder = pageHtml.toString();
        if (StringUtils.isNotEmpty(remainder)) {
            pages.add(remainder);
        }
    }
    return pages.toArray(new String[pages.size()]);
}

From source file:de.stkl.gbgvertretungsplan.sync.SyncAdapter.java

/**
 * Reads the rows of the first {@code table.mon_list} below {@code root} into lists of
 * cell texts.
 *
 * <p>Rows are selected with {@code tr:gt(0)}. The first cell of a row holds the class
 * name(s), separated by commas; the row is emitted once per class, with the trimmed class
 * name as the first value followed by the texts of the remaining cells.
 *
 * @param root the element to search for the table
 * @return one list of values per row and class; empty if the table is missing
 */
private List<List<String>> parseRows(Element root) {
    List<List<String>> allRows = new ArrayList<List<String>>();

    Element table = root.select("table.mon_list").first();
    if (table == null) {
        // No substitution table on this page: nothing to parse.
        return allRows;
    }

    for (Element row : table.select("tr:gt(0)")) {
        List<String> rowValues = new ArrayList<String>();
        String[] pendingClasses = null;

        int column = 0;
        for (Element cell : row.select("td")) {
            String cellText = cell.text();
            if (column == 0) {
                // split class field by separator (,)
                pendingClasses = cellText.split(",");
            }
            // The class cell itself is only kept when splitting it produced no class names;
            // otherwise the names are prepended per class below.
            if (column != 0 || pendingClasses.length == 0) {
                rowValues.add(cellText);
            }
            column++;
        }

        if (pendingClasses == null || pendingClasses.length == 0) {
            // Row without td cells or without class names: keep it as it is.
            allRows.add(rowValues);
        } else {
            // One output row per class, each with its own copy of the remaining values.
            for (String className : pendingClasses) {
                List<String> expanded = new ArrayList<String>(rowValues);
                expanded.add(0, className.trim());
                allRows.add(expanded);
            }
        }
    }

    return allRows;
}