Usage examples for org.jsoup.nodes.Element#text()
public String text()
From source file:FILER.java
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file { Text = ""; String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is the url org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8"); Importants[0] = doc.title(); //get the title of the file //Text=Text+" "+doc.title(); String tag = "h"; String All_Headers = ""; Elements Header;/*from www . j a va2 s. co m*/ for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file { tag = "h" + String.valueOf(i); Header = doc.select(tag); if (Header.size() > 0) { Header = doc.getElementsByTag(tag); String pConcatenated = ""; for (Element x : Header) { pConcatenated += x.text() + " "; } All_Headers = All_Headers + pConcatenated; } else break; } Importants[1] = All_Headers; Text = Text + " " + doc.text(); //get the text of the document Elements img = doc.getElementsByTag("img"); //get the text with img tag for (Element element : img) { if (element.attr("alt") != null && !(element.attr("alt").equals(""))) { Text = Text + " " + element.attr("alt"); Importants[2] = Importants[2] + " " + element.attr("alt"); } } return Importants; }
From source file:er.java
/** * Purpose: jsoup?Html/*from w w w . j a v a 2 s . co m*/ * * @param html * @return: String[] */ private static String[] htmlToPlainText(String html) { String[] content = new String[] { "", "" }; Document doc = Jsoup.parse(html); // ???blog Elements titles = doc.select("h3.title-article>strong"); for (Element oneSelect : titles) content[0] += oneSelect.text(); // ???blog Elements contents = doc.select("div#blogContent"); for (Element oneSelect : contents) content[1] += oneSelect.text(); return content; }
From source file:controllers.CNNProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override/*from www. j a va2s . co m*/ public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } String feedUrl = "http://searchapp.cnn.com/search/query.jsp"; //query = query + "&type=all"; String page = "1"; String npp = "10"; String start = "1"; String type = "all"; String bucket = "true"; String sort = "relevance"; String csiID = "csi1"; F.Promise<WSResponse> wsResponsePromise = WS.url(feedUrl).setQueryParameter("page", page) .setQueryParameter("npp", npp).setQueryParameter("start", start).setQueryParameter("text", query) .setQueryParameter("type", type).setQueryParameter("bucket", bucket).setQueryParameter("sort", sort) .setQueryParameter("csiID", csiID).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> ret = new ArrayList<Map<String, String>>(); try { // Reach json code into html response from ajax call org.jsoup.nodes.Document doc = Jsoup.parse(body); Element resultElement = doc.select("textarea#jsCode").first(); String resultString = resultElement.text(); // Parse the json code JSONObject obj = new JSONObject(resultString); JSONArray array = new JSONArray(obj.get("results").toString()); JSONArray internalArray = new JSONArray(array.get(0).toString()); // Insert each result's elements into map with corresponding key for (int i = 0; i < internalArray.length(); i++) { JSONObject elementObj = new JSONObject(internalArray.get(i).toString()); String image = elementObj.get("thumbnail").toString(); String title = elementObj.get("title").toString(); String content = elementObj.get("description").toString(); String date = elementObj.get("mediaDateUts").toString(); String url = elementObj.get("url").toString(); Map<String, String> 
keyValue = new LinkedHashMap<String, String>(); keyValue.put("image", image); keyValue.put("title", title); keyValue.put("content", content); keyValue.put("date", date); keyValue.put("url", url); ret.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(ret)); } }); }
From source file:org.shareok.data.plosdata.PlosUtil.java
/**
 * Extracts the subject areas listed on a web page.
 *
 * <p>The subjects are read from the second child element of the first
 * {@code div[class=subject-areas-container]}: for every {@code li} below it, the text of
 * the item's first child element is collected.
 *
 * @param html the string of the web page source
 * @return the subject names in document order, or {@code null} when the page source is
 *         {@code null} or no subject could be found
 */
public static String[] getSubjects(String html) {
    if (html == null) {
        return null;
    }
    List<String> subjectsList = new ArrayList<>();
    Document doc = Jsoup.parse(html);
    Element container = doc.select("div[class=subject-areas-container]").first();

    // child(int) throws IndexOutOfBoundsException instead of returning null, so the
    // number of children has to be checked before indexing.
    if (container != null && container.children().size() > 1) {
        Element subjectList = container.child(1);
        for (Element li : subjectList.select("li")) {
            if (li.children().isEmpty()) {
                continue; // list item without a child element: nothing to read
            }
            subjectsList.add(li.child(0).text());
        }
    }

    if (subjectsList.isEmpty()) {
        return null;
    }
    return subjectsList.toArray(new String[subjectsList.size()]);
}
From source file:com.dajodi.scandic.JSoupScraper.java
/**
 * Reads the text of the element with the given id below {@code accountOverview}.
 *
 * @param accountOverview element whose subtree is searched
 * @param id id of the wanted element
 * @param defaultValue value returned when no element has that id
 * @return the element's text passed through {@code Util.trimIfNonNull}, or
 *         {@code defaultValue} when the element is absent
 */
private static String getStringFromNode(Element accountOverview, String id, String defaultValue) {
    final Element match = accountOverview.getElementById(id);
    return (match != null) ? Util.trimIfNonNull(match.text()) : defaultValue;
}
From source file:app.data.parse.WebPageUtil.java
public static WebPageInfo parse(String url, Cache<String, WebPageInfo> urlInfoCache) throws IOException { String original = url;//from w ww .j a va 2 s .com // hit toutiao.io // fixme http://toutiao.io/shares/640539/url if (original.startsWith("https://toutiao.io/posts/")) { original = original.replace("/posts/", "/k/"); } // check cache WebPageInfo info = urlInfoCache != null ? urlInfoCache.getIfPresent(original) : null; if (info != null) { return info; } else { info = new WebPageInfo(); info.url = original; } // attach url Document doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() // hit gold.xitu.io if (info.url.startsWith("http://gold.xitu.io/entry/")) { Elements origin = doc.select("div[class=ellipsis]"); Elements originLink = origin.select("a[class=share-link]"); info.url = originLink.attr("href"); // reconnect doc = requestUrl(info.url); info.url = doc.baseUri(); // or doc.location() } info.url = smartUri(info.url); // get title Elements metaTitle = doc.select("meta[property=og:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } if (StringUtils.isEmpty(info.title)) { metaTitle = doc.select("meta[property=twitter:title]"); if (metaTitle != null) { info.title = metaTitle.attr("content"); } info.title = StringUtils.isEmpty(info.title) ? 
doc.title() : info.title; } // get desc Elements metaDesc = doc.select("meta[property=og:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[property=twitter:description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.select("meta[name=description]"); if (metaDesc != null) { info.description = metaDesc.attr("content"); } if (StringUtils.isEmpty(info.description)) { metaDesc = doc.body().select("p"); if (metaDesc != null) { for (Element element : metaDesc) { info.description = element.text(); if (info.description != null && info.description.length() >= 20) { break; } } } } } } info.description = ellipsis(info.description, 140, "..."); // cache info if (urlInfoCache != null) { urlInfoCache.put(original, info); } return info; }
From source file:org.brnvrn.Main.java
/** * Parse a tr HTML element describing the tool * @param tool is to be updated//from w ww.ja va 2 s . c o m * @param tr brings the data * @return true if successful */ private static boolean parseTrTool(Tool tool, Element tr) { boolean success = true; Element nameLink = tr.select("td:eq(0)").first(); if (nameLink == null) return false; tool.setName(nameLink.text()); tool.setUrl(nameLink.getElementsByTag("a").attr("href")); tool.setLicense(tr.select("td:eq(2)").first().text()); tool.setCompatibility(tr.select("td:eq(3)").first().text()); // More complicated: We will extract and remove known nodes, the rest will be description Element tdDescription = tr.select("td:eq(1)").first(); Elements smalls = tdDescription.getElementsByTag("small"); for (Element small : smalls) { Element author = small.getElementsContainingText("Author").first(); if (author != null) { String authorsString = author.text(); authorsString = authorsString.substring(authorsString.indexOf(":") + 1); tool.addAuthor(authorsString.split(",")); small.remove(); } Element sourceCode = small.getElementsContainingText("ource").last(); if (sourceCode != null) { tool.setUrl_src(sourceCode.attr("href")); small.remove(); } } tdDescription.getElementsByTag("br").remove(); tool.setDescription(Jsoup.clean(tdDescription.html(), Whitelist.relaxed())); // ownText will miss the contained links in the description tool.setDescriptionText(tdDescription.text()); bestEffortThemeLanguage(tool); return success; }
From source file:damo.three.ie.util.HtmlUtilities.java
/** * Parses the My3 account usage page to nicer JSON format. * * @param pageContent Page content as HTML. * @return Usage information stripped out and formatted as JSON. * @throws JSONException//from ww w . j a v a2s .co m */ public static JSONArray parseUsageAsJSONArray(String pageContent) throws JSONException { // The HTML on prepay is pig-ugly, so we will use JSoup to // clean and parse it. Document doc = Jsoup.parse(pageContent); HtmlUtilities.removeComments(doc); Elements elements = doc.getElementsByTag("table"); JSONArray jsonArray = new JSONArray(); // three don't have a sub label for the 3-to-3 calls, which is not consistent with other items. // .. feck them! boolean three2threeCallsBug = false; for (Element element : elements) { for (Element subelement : element.select("tbody > tr")) { if ((subelement.text().contains("3 to 3 Calls")) && (subelement.text().contains("Valid until"))) { three2threeCallsBug = true; } Elements subsubelements = subelement.select("td"); if (subsubelements.size() == 3) { // skip the "total" entries if (subsubelements.select("td").get(0).text().contains("Total")) { continue; } JSONObject currentItem = new JSONObject(); if (three2threeCallsBug) { currentItem.put("item", "3 to 3 Calls"); } else { // Get rid of that "non-breaking space" character if it exists String titleToClean = subsubelements.select("td").get(0).text().replace("\u00a0", "") .trim(); currentItem.put("item", titleToClean); } /** * Check if date contains "Today", if so, change it to a date. * Otherwise we will never know when usage ends, unless user refreshes, As 'today' * is 'today', tomorrow.. see! 
*/ String value1 = subsubelements.select("td").get(1).text(); if (value1.equals("Today")) { DateTimeFormatter formatter = DateTimeFormat.forPattern("dd/MM/yy").withLocale(Locale.UK); DateTime dt = new DateTime(); // current datetime value1 = "Expires " + formatter.print(dt); } currentItem.put("value1", value1); currentItem.put("value2", subsubelements.select("td").get(2).text()); // Out of Bundle charges have an extra property if (currentItem.getString("item").startsWith("Internet")) { Pattern p1 = Pattern.compile(Constants.OUT_OF_BUNDLE_REGEX, Pattern.DOTALL); Matcher m1 = p1.matcher(pageContent); StringBuilder cleanedDate = new StringBuilder(); if (m1.matches()) { cleanedDate.append(m1.group(1)); cleanedDate.append(' '); cleanedDate.append(m1.group(2)); cleanedDate.append(' '); cleanedDate.append(m1.group(3)); currentItem.put("value3", cleanedDate.toString()); } } jsonArray.put(currentItem); } } // reset the 3-to-3 call bug flag for next Element if (three2threeCallsBug) { three2threeCallsBug = false; } } return jsonArray; }
From source file:module.entities.NameFinder.RegexNameFinder.java
public static String getSignatureFromParagraphs(Elements paragraphs) { String signature = ""; String signName = "", roleName = ""; int signIdx = 0, roleIdx = 0; int row = 0;/*from w w w . jav a 2 s .c o m*/ TreeMap<Integer, String> roles = new TreeMap<Integer, String>(); for (Element n : paragraphs) { row++; String formatedText = Normalizer.normalize(n.text().toUpperCase(locale), Normalizer.Form.NFD) .replaceAll("\\p{M}", ""); if (formatedText.contains(" ") && !formatedText.matches(".*[0-9].*")) { // if (formatedText.contains("<br>")) { // formatedText = formatedText.replaceAll("<br\\s*/>", " "); // } String[] splitedText = formatedText.split(" "); // System.out.println(splitedText.length); if (splitedText.length < 7) { boolean isSign = false; String text = ""; for (int z = 0; z < splitedText.length; z++) { String splText = splitedText[z].replaceAll("[\\s.]", "").replaceAll("\u00a0", "") .replaceAll("", "").replaceAll(",", ""); if (names.contains(splText) || surnames.contains(splText)) { signName += splText + " "; signIdx = row; isSign = true; } text += splText + " "; // if (z == splitedText.length-1){ // System.out.println(signName.trim()); // } } if (!isSign) { roleIdx = row; if (!text.contains(" ") && !text.contains("")) { roles.put(roleIdx, text.trim()); } } } } } for (Integer roleRow : roles.keySet()) { // if (signName.length() == 0) { if (Math.abs(signIdx - roleRow) < 4) { roleName += roles.get(roleRow) + " "; } } if (signName.length() > 0) { signature = signName + "#" + roleName; } return signature; }
From source file:uk.co.blackpepper.support.retrofit.jsoup.spring.AbstractBeanHtmlConverterTest.java
private static Function<Element, String> elementToText() { return new Function<Element, String>() { @Override//from ww w . j a v a 2s .c om public String apply(Element element) { return element.text(); } }; }