Example usage of org.jsoup.nodes.Element.text()

Introduction

This page collects example usages of the org.jsoup.nodes.Element.text() method.

Prototype

public String text()

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:FILER.java

/**
 * Parses an HTML file and returns its "important" strings.
 *
 * <p>Layout of the returned array:
 * <ul>
 *   <li>index 0 - the document title</li>
 *   <li>index 1 - the text of every heading, grouped by level (all h1, then all h2, ...),
 *       each followed by a single space</li>
 *   <li>index 2 - every non-empty {@code alt} attribute of an {@code img} tag, each preceded by a
 *       single space</li>
 *   <li>index 3 - reserved for the url; this method leaves it as an empty string</li>
 * </ul>
 *
 * <p>Side effect: the shared {@code Text} field is reset and then set to a space, the full document
 * text, and the same image alt texts that go into index 2.
 *
 * @param f the HTML file to parse, read as UTF-8
 * @return a four-element array as described above
 * @throws IOException if the file cannot be read
 */
public static String[] Dealing_Files(File f) throws IOException {
    Text = "";
    String[] importants = { "", "", "", "" };
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");
    importants[0] = doc.title();

    // HTML defines heading levels h1..h6 only. Every level is scanned: stopping at the first
    // absent level would drop all headings of a page that, for example, starts at h2.
    StringBuilder allHeaders = new StringBuilder();
    for (int level = 1; level <= 6; level++) {
        for (Element header : doc.getElementsByTag("h" + level)) {
            allHeaders.append(header.text()).append(' ');
        }
    }
    importants[1] = allHeaders.toString();

    StringBuilder fullText = new StringBuilder(" ").append(doc.text());
    StringBuilder altTexts = new StringBuilder();
    for (Element image : doc.getElementsByTag("img")) {
        // jsoup returns an empty string (never null) for a missing attribute
        String alt = image.attr("alt");
        if (!alt.isEmpty()) {
            fullText.append(' ').append(alt);
            altTexts.append(' ').append(alt);
        }
    }
    importants[2] = altTexts.toString();
    Text = fullText.toString();
    return importants;
}

From source file:er.java

/**
 * Purpose: jsoup?Html/*from   w  w w .  j  a  v  a  2  s .  co  m*/
 * 
 * @param html
 * @return: String[]
 */
private static String[] htmlToPlainText(String html) {
    String[] content = new String[] { "", "" };
    Document doc = Jsoup.parse(html);
    // ???blog
    Elements titles = doc.select("h3.title-article>strong");
    for (Element oneSelect : titles)
        content[0] += oneSelect.text();
    // ???blog
    Elements contents = doc.select("div#blogContent");
    for (Element oneSelect : contents)
        content[1] += oneSelect.text();
    return content;
}

From source file:controllers.CNNProxy.java

/**
 * Proxies a search query to the CNN search endpoint and returns the hits as JSON.
 *
 * <p>The upstream response is HTML; the JSON payload is read from the text of the
 * {@code textarea#jsCode} element. Each hit is reduced to a map with the keys
 * {@code image}, {@code title}, {@code content}, {@code date} and {@code url}.
 *
 * @param query the search text; when empty, a promise carrying an explanatory message is returned
 *              and no upstream request is made
 * @return a promise of the JSON result (an empty JSON array when the payload element is missing)
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned here; otherwise execution falls through and the
        // upstream service is queried with an empty search text.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter  (q) not provided "));
            }

        });
    }

    String feedUrl = "http://searchapp.cnn.com/search/query.jsp";

    String page = "1";
    String npp = "10";
    String start = "1";
    String type = "all";
    String bucket = "true";
    String sort = "relevance";
    String csiID = "csi1";

    F.Promise<WSResponse> wsResponsePromise = WS.url(feedUrl).setQueryParameter("page", page)
            .setQueryParameter("npp", npp).setQueryParameter("start", start).setQueryParameter("text", query)
            .setQueryParameter("type", type).setQueryParameter("bucket", bucket).setQueryParameter("sort", sort)
            .setQueryParameter("csiID", csiID).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();

            List<Map<String, String>> ret = new ArrayList<Map<String, String>>();

            try {
                // The JSON payload is embedded in the HTML response inside a textarea
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Element resultElement = doc.select("textarea#jsCode").first();

                // first() yields null when the element is absent; answer with an empty list then
                if (resultElement != null) {
                    String resultString = resultElement.text();

                    // Parse the json code
                    JSONObject obj = new JSONObject(resultString);
                    JSONArray array = new JSONArray(obj.get("results").toString());
                    JSONArray internalArray = new JSONArray(array.get(0).toString());

                    // Insert each result's elements into map with corresponding key
                    for (int i = 0; i < internalArray.length(); i++) {
                        JSONObject elementObj = new JSONObject(internalArray.get(i).toString());

                        String image = elementObj.get("thumbnail").toString();
                        String title = elementObj.get("title").toString();
                        String content = elementObj.get("description").toString();
                        String date = elementObj.get("mediaDateUts").toString();
                        String url = elementObj.get("url").toString();

                        // LinkedHashMap keeps the keys in insertion order
                        Map<String, String> keyValue = new LinkedHashMap<String, String>();

                        keyValue.put("image", image);
                        keyValue.put("title", title);
                        keyValue.put("content", content);
                        keyValue.put("date", date);
                        keyValue.put("url", url);

                        ret.add(keyValue);
                    }
                }
            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(ret));

        }
    });
}

From source file:org.shareok.data.plosdata.PlosUtil.java

/**
 * Extracts the subject names listed in the "subject areas" container of a web page.
 *
 * <p>The second child of the first {@code div[class=subject-areas-container]} is taken as the
 * list; for every {@code li} below it, the text of the item's first child element is collected.
 * List items without a child element are skipped.
 *
 * @param html the string of the web page source; must not be null
 * @return the subject names in document order, or {@code null} when the container, the list or
 *         any subject is missing
 */
public static String[] getSubjects(String html) {
    List<String> subjectsList = new ArrayList<>();

    Document doc = Jsoup.parse(html);
    Element container = doc.select("div[class=subject-areas-container]").first();

    // child(1) throws when the container has fewer than two child elements, so check first
    if (container != null && container.children().size() > 1) {
        Element subjectList = container.child(1);
        for (Element li : subjectList.select("li")) {
            // same guard for li.child(0): an item without child elements has no link to read
            if (li.children().isEmpty()) {
                continue;
            }
            subjectsList.add(li.child(0).text());
        }
    }

    // Existing contract: callers receive null (not an empty array) when nothing was found
    if (subjectsList.isEmpty()) {
        return null;
    }
    return subjectsList.toArray(new String[subjectsList.size()]);
}

From source file:com.dajodi.scandic.JSoupScraper.java

/**
 * Reads the text of the element with the given id below {@code accountOverview}.
 *
 * @param accountOverview the element to search in
 * @param id              the id of the wanted element
 * @param defaultValue    the value returned when no element has that id
 * @return the element's text passed through {@code Util.trimIfNonNull}, or {@code defaultValue}
 */
private static String getStringFromNode(Element accountOverview, String id, String defaultValue) {
    Element match = accountOverview.getElementById(id);
    return (match != null) ? Util.trimIfNonNull(match.text()) : defaultValue;
}

From source file:app.data.parse.WebPageUtil.java

/**
 * Loads a web page and derives its final url, title and a short description.
 *
 * <p>Title lookup order: {@code og:title} meta, {@code twitter:title} meta, document title.
 * Description lookup order: {@code og:description} meta, {@code twitter:description} meta,
 * {@code description} meta, then the {@code p} elements of the body (stopping at the first one
 * with at least 20 characters). The description is finally passed through
 * {@code ellipsis(..., 140, "...")}.
 *
 * @param url          the address to load
 * @param urlInfoCache optional cache keyed by the (rewritten) request url; may be null
 * @return the cached info when present, otherwise a freshly built one (which is then cached)
 * @throws IOException declared for failures while requesting the page
 */
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException {
    // hit toutiao.io: rewrite "/posts/" links to "/k/"; the rewritten url is also the cache key
    // fixme http://toutiao.io/shares/640539/url
    String cacheKey = url.startsWith("https://toutiao.io/posts/") ? url.replace("/posts/", "/k/") : url;

    // check cache
    if (urlInfoCache != null) {
        WebPageInfo cached = urlInfoCache.getIfPresent(cacheKey);
        if (cached != null) {
            return cached;
        }
    }

    WebPageInfo info = new WebPageInfo();
    info.url = cacheKey;

    // attach url
    Document doc = requestUrl(info.url);
    info.url = doc.baseUri(); // or doc.location()

    // hit gold.xitu.io: follow the share link to the original article
    if (info.url.startsWith("http://gold.xitu.io/entry/")) {
        info.url = doc.select("div[class=ellipsis]").select("a[class=share-link]").attr("href");

        // reconnect
        doc = requestUrl(info.url);
        info.url = doc.baseUri(); // or doc.location()
    }

    info.url = smartUri(info.url);

    // get title: try each meta selector in turn, stop at the first non-empty value
    String[] titleSelectors = { "meta[property=og:title]", "meta[property=twitter:title]" };
    for (String selector : titleSelectors) {
        Elements meta = doc.select(selector);
        if (meta != null) {
            info.title = meta.attr("content");
        }
        if (!StringUtils.isEmpty(info.title)) {
            break;
        }
    }
    if (StringUtils.isEmpty(info.title)) {
        info.title = doc.title();
    }

    // get desc: same approach with three meta selectors
    String[] descriptionSelectors = { "meta[property=og:description]", "meta[property=twitter:description]",
            "meta[name=description]" };
    for (String selector : descriptionSelectors) {
        Elements meta = doc.select(selector);
        if (meta != null) {
            info.description = meta.attr("content");
        }
        if (!StringUtils.isEmpty(info.description)) {
            break;
        }
    }
    if (StringUtils.isEmpty(info.description)) {
        // last resort: walk the paragraphs; the last one visited wins if none reaches 20 characters
        Elements paragraphs = doc.body().select("p");
        if (paragraphs != null) {
            for (Element paragraph : paragraphs) {
                info.description = paragraph.text();
                if (info.description != null && info.description.length() >= 20) {
                    break;
                }
            }
        }
    }
    info.description = ellipsis(info.description, 140, "...");

    // cache info
    if (urlInfoCache != null) {
        urlInfoCache.put(cacheKey, info);
    }
    return info;
}

From source file:org.brnvrn.Main.java

/**
 * Parses a {@code tr} HTML element describing a tool and copies its data into {@code tool}.
 *
 * <p>Expected cells: 0 = name (with link), 1 = description, 2 = license, 3 = compatibility.
 * Inside the description cell, {@code small} elements mentioning "Author" or "ource" are read
 * (authors / source-code link) and removed; what remains becomes the description.
 *
 * @param tool is to be updated
 * @param tr   brings the data
 * @return true if successful; false when any of the four expected cells is missing, in which
 *         case {@code tool} is left untouched
 */
private static boolean parseTrTool(Tool tool, Element tr) {
    Element nameLink = tr.select("td:eq(0)").first();
    Element tdDescription = tr.select("td:eq(1)").first();
    Element tdLicense = tr.select("td:eq(2)").first();
    Element tdCompatibility = tr.select("td:eq(3)").first();

    // first() yields null for a missing cell; report failure instead of a NullPointerException
    if (nameLink == null || tdDescription == null || tdLicense == null || tdCompatibility == null) {
        return false;
    }

    tool.setName(nameLink.text());
    tool.setUrl(nameLink.getElementsByTag("a").attr("href"));
    tool.setLicense(tdLicense.text());
    tool.setCompatibility(tdCompatibility.text());

    // More complicated: We will extract and remove known nodes, the rest will be description
    for (Element small : tdDescription.getElementsByTag("small")) {
        boolean consumed = false;

        Element author = small.getElementsContainingText("Author").first();
        if (author != null) {
            String authorsString = author.text();
            // keep only what follows the first ':' (the whole text when there is none)
            authorsString = authorsString.substring(authorsString.indexOf(":") + 1);
            tool.addAuthor(authorsString.split(","));
            consumed = true;
        }

        Element sourceCode = small.getElementsContainingText("ource").last();
        if (sourceCode != null) {
            tool.setUrl_src(sourceCode.attr("href"));
            consumed = true;
        }

        // Detach the <small> exactly once, even when it matched both patterns
        if (consumed) {
            small.remove();
        }
    }
    tdDescription.getElementsByTag("br").remove();
    tool.setDescription(Jsoup.clean(tdDescription.html(), Whitelist.relaxed())); // ownText will miss the contained links in the description
    tool.setDescriptionText(tdDescription.text());

    bestEffortThemeLanguage(tool);

    return true;
}

From source file:damo.three.ie.util.HtmlUtilities.java

/**
 * Converts the My3 account usage page into a JSON array of usage items.
 *
 * <p>Every table row with exactly three cells becomes one object with the keys {@code item},
 * {@code value1} and {@code value2}; rows whose first cell contains "Total" are skipped. Items
 * whose name starts with "Internet" additionally get {@code value3} when the whole page matches
 * {@code Constants.OUT_OF_BUNDLE_REGEX}.
 *
 * @param pageContent Page content as HTML.
 * @return Usage information stripped out and formatted as JSON.
 * @throws JSONException if a value cannot be stored in a JSON object
 */
public static JSONArray parseUsageAsJSONArray(String pageContent) throws JSONException {
    // The prepay HTML is messy, so jsoup is used to clean and parse it.
    Document doc = Jsoup.parse(pageContent);
    HtmlUtilities.removeComments(doc);

    JSONArray usageItems = new JSONArray();

    for (Element table : doc.getElementsByTag("table")) {

        // The 3-to-3 calls entry has no sub label, unlike the other items. Once a row of this
        // table mentions it, the following items of the same table are labelled accordingly.
        // The flag starts out false for every table.
        boolean three2threeCallsBug = false;

        for (Element row : table.select("tbody > tr")) {

            String rowText = row.text();
            if (rowText.contains("3 to 3 Calls") && rowText.contains("Valid until")) {
                three2threeCallsBug = true;
            }

            Elements cells = row.select("td");
            if (cells.size() != 3) {
                continue;
            }

            Elements cellTds = cells.select("td");

            // skip the "total" entries
            if (cellTds.get(0).text().contains("Total")) {
                continue;
            }

            JSONObject usageItem = new JSONObject();

            String itemName;
            if (three2threeCallsBug) {
                itemName = "3 to 3 Calls";
            } else {
                // Get rid of the "non-breaking space" character if it exists
                itemName = cellTds.get(0).text().replace("\u00a0", "").trim();
            }
            usageItem.put("item", itemName);

            // "Today" is replaced by an explicit date; otherwise a stored value would still
            // read "Today" on the following day.
            String expiry = cellTds.get(1).text();
            if (expiry.equals("Today")) {
                DateTimeFormatter dayFormat = DateTimeFormat.forPattern("dd/MM/yy").withLocale(Locale.UK);
                DateTime now = new DateTime(); // current datetime
                expiry = "Expires " + dayFormat.print(now);
            }
            usageItem.put("value1", expiry);
            usageItem.put("value2", cellTds.get(2).text());

            // Out of Bundle charges have an extra property
            if (itemName.startsWith("Internet")) {
                Matcher outOfBundle = Pattern.compile(Constants.OUT_OF_BUNDLE_REGEX, Pattern.DOTALL)
                        .matcher(pageContent);
                if (outOfBundle.matches()) {
                    String cleanedDate = outOfBundle.group(1) + ' ' + outOfBundle.group(2) + ' '
                            + outOfBundle.group(3);
                    usageItem.put("value3", cleanedDate);
                }
            }

            usageItems.put(usageItem);
        }
    }

    return usageItems;
}

From source file:module.entities.NameFinder.RegexNameFinder.java

/**
 * Scans paragraphs for a signature: tokens found in {@code names} / {@code surnames}, plus
 * nearby "role" lines.
 *
 * <p>Each paragraph text is upper-cased with {@code locale}, NFD-normalised and stripped of
 * combining marks. Only paragraphs that contain a space, contain no digit and split into fewer
 * than 7 space-separated tokens are examined.
 *
 * @param paragraphs the paragraph elements to scan, in document order
 * @return {@code signName + "#" + roleName} when at least one name token was found, otherwise an
 *         empty string
 */
public static String getSignatureFromParagraphs(Elements paragraphs) {
    String signature = "";
    String signName = "", roleName = "";
    // 1-based paragraph indices of the last name hit / last non-name line
    int signIdx = 0, roleIdx = 0;
    int row = 0;
    // candidate role lines keyed by paragraph index (sorted by key)
    TreeMap<Integer, String> roles = new TreeMap<Integer, String>();
    for (Element n : paragraphs) {
        row++;
        // upper-case, decompose accented characters and drop the combining marks
        String formatedText = Normalizer.normalize(n.text().toUpperCase(locale), Normalizer.Form.NFD)
                .replaceAll("\\p{M}", "");
        if (formatedText.contains(" ") && !formatedText.matches(".*[0-9].*")) {
            //                  if (formatedText.contains("<br>")) {
            //                      formatedText = formatedText.replaceAll("<br\\s*/>", " ");
            //                   }
            String[] splitedText = formatedText.split(" ");
            //                    System.out.println(splitedText.length);
            if (splitedText.length < 7) {
                boolean isSign = false;
                String text = "";
                for (int z = 0; z < splitedText.length; z++) {
                    // strip whitespace, dots, non-breaking spaces (U+00A0) and commas from the token
                    // NOTE(review): the third pattern literal appears empty here - possibly a
                    // character lost in transit; as written that call changes nothing. Confirm.
                    String splText = splitedText[z].replaceAll("[\\s.]", "").replaceAll("\u00a0", "")
                            .replaceAll("", "").replaceAll(",", "");
                    if (names.contains(splText) || surnames.contains(splText)) {
                        signName += splText + " ";
                        signIdx = row;
                        isSign = true;
                    }
                    // every cleaned token is collected, each followed by one space
                    text += splText + " ";
                    //                            if (z == splitedText.length-1){
                    //                                System.out.println(signName.trim());
                    //                            }
                }
                if (!isSign) {
                    roleIdx = row;
                    // NOTE(review): both literals look like they lost a character. As written,
                    // contains("") is always true, so this condition never holds and no role
                    // is ever stored. Verify against the original file.
                    if (!text.contains(" ") && !text.contains("")) {
                        roles.put(roleIdx, text.trim());
                    }
                }
            }
        }
    }
    // keep only roles fewer than 4 paragraphs away from the last name hit
    for (Integer roleRow : roles.keySet()) {
        //                    if (signName.length() == 0) {
        if (Math.abs(signIdx - roleRow) < 4) {
            roleName += roles.get(roleRow) + " ";
        }

    }

    if (signName.length() > 0) {
        signature = signName + "#" + roleName;
    }
    return signature;
}

From source file:uk.co.blackpepper.support.retrofit.jsoup.spring.AbstractBeanHtmlConverterTest.java

/**
 * Creates a function that maps an element to its combined text.
 *
 * @return a new function returning {@code element.text()} for its argument
 */
private static Function<Element, String> elementToText() {
    final Function<Element, String> textExtractor = new Function<Element, String>() {
        @Override
        public String apply(Element node) {
            return node.text();
        }
    };
    return textExtractor;
}