Usage examples for the select method of org.jsoup.nodes.Element
public Elements select(String cssQuery)
From source file:controllers.BIProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override// ww w .j a v a 2 s . c om public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.businessinsider.com/s") .setQueryParameter("q", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("div.search-result"); // Iterate through results for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); keyValue.put("image", item.select("img").attr("src")); keyValue.put("title", item.select("h3").text()); keyValue.put("content", item.select("div.excerpt").first().text()); keyValue.put("date", item.select("li.date").text()); keyValue.put("url", item.select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:downloadwolkflow.getWorkFlowList.java
public static String[] getPageList() { String[] pageList = null;/*w w w. j a v a 2 s. co m*/ CloseableHttpClient httpclient = HttpClients.createDefault(); try { HttpGet httpget = new HttpGet("http://www.myexperiment.org/workflows"); HttpResponse response = httpclient.execute(httpget); String mainpage = EntityUtils.toString(response.getEntity()); Document mainDoc = Jsoup.parse(mainpage); Element pageinfo = mainDoc.select("div.pagination ").first(); // System.out.println(pageinfo.toString()); Elements pagesElemenets = pageinfo.select("[href]"); int pageSize = Integer.parseInt(pagesElemenets.get(pagesElemenets.size() - 2).text()); pageList = new String[pageSize + 1]; for (int i = 1; i <= pageSize; i++) { pageList[i] = "http://www.myexperiment.org/workflows?page=" + i; } } catch (IOException ex) { Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE, null, ex); } return pageList; }
From source file:org.brnvrn.Main.java
/** * Parse the HTML containing a category table and the interleaved comments ... */// w ww .j av a 2 s . co m private static void parseCategory(List<Tool> tools, Element tool_div, String category, boolean obsolete) { Tool tool = new Tool(obsolete); for (Node child : tool_div.select("tbody").first().childNodes()) { switch (child.nodeName()) { case "#comment": parseComment(tool, (Comment) child); break; case "tr": Element tr = (Element) child; if (tr.select("th").size() > 0) // Skip headings break; tool.setCategory(category); if (!parseTrTool(tool, tr)) System.out.println(" Could not parse: " + tr.outerHtml()); tools.add(tool); tool = new Tool(obsolete); break; } } }
From source file:com.cbmapi.CbmAPI.java
private static String parseHtmlForInfo(Document html) { //Instead of parsing the the whole html page everytime, only useful table section is used. Element table = html.select("table.desc").first(); //<span> containing the name is clearly labeled as cpuname. String cpuName = table.select("span.cpuname").text(); //Score is the last one to use <span> tag and will be parsed to int. int cpuScore = Integer.parseInt(table.select("span").last().text()); //There are 2 <em> tags containing information. First one has description and second one has "Other names" eg.alternative name. String description = table.select("em").first().text(); String altName = table.select("em").last().text(); //Name -> Score -> possible description -> AltName. String infoString = cpuName + ",Score:" + cpuScore + "," + description + ",AltName:" + altName; return infoString; }
From source file:controllers.NWProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override/* w w w . j a va2 s . com*/ public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } final String officialUrl = "http://www.newsweek.com"; F.Promise<WSResponse> wsResponsePromise = WS.url(officialUrl + "/search/site/" + query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("li.search-result"); // All articles belong to this class for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); keyValue.put("image", item.select("img").attr("src")); keyValue.put("title", item.select("h2").select("a").text()); keyValue.put("content", item.select("div.article-summary").first().text()); // Get date from each article separately org.jsoup.nodes.Document articleDoc = RedirectionHandler( officialUrl + item.select("a").attr("href")); keyValue.put("date", articleDoc.select("span.timedate").text()); keyValue.put("url", officialUrl + item.select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:damo.three.ie.util.HtmlUtilities.java
/** * Parses the My3 account usage page to nicer JSON format. * * @param pageContent Page content as HTML. * @return Usage information stripped out and formatted as JSON. * @throws JSONException//from w w w . ja v a2s . co m */ public static JSONArray parseUsageAsJSONArray(String pageContent) throws JSONException { // The HTML on prepay is pig-ugly, so we will use JSoup to // clean and parse it. Document doc = Jsoup.parse(pageContent); HtmlUtilities.removeComments(doc); Elements elements = doc.getElementsByTag("table"); JSONArray jsonArray = new JSONArray(); // three don't have a sub label for the 3-to-3 calls, which is not consistent with other items. // .. feck them! boolean three2threeCallsBug = false; for (Element element : elements) { for (Element subelement : element.select("tbody > tr")) { if ((subelement.text().contains("3 to 3 Calls")) && (subelement.text().contains("Valid until"))) { three2threeCallsBug = true; } Elements subsubelements = subelement.select("td"); if (subsubelements.size() == 3) { // skip the "total" entries if (subsubelements.select("td").get(0).text().contains("Total")) { continue; } JSONObject currentItem = new JSONObject(); if (three2threeCallsBug) { currentItem.put("item", "3 to 3 Calls"); } else { // Get rid of that "non-breaking space" character if it exists String titleToClean = subsubelements.select("td").get(0).text().replace("\u00a0", "") .trim(); currentItem.put("item", titleToClean); } /** * Check if date contains "Today", if so, change it to a date. * Otherwise we will never know when usage ends, unless user refreshes, As 'today' * is 'today', tomorrow.. see! 
*/ String value1 = subsubelements.select("td").get(1).text(); if (value1.equals("Today")) { DateTimeFormatter formatter = DateTimeFormat.forPattern("dd/MM/yy").withLocale(Locale.UK); DateTime dt = new DateTime(); // current datetime value1 = "Expires " + formatter.print(dt); } currentItem.put("value1", value1); currentItem.put("value2", subsubelements.select("td").get(2).text()); // Out of Bundle charges have an extra property if (currentItem.getString("item").startsWith("Internet")) { Pattern p1 = Pattern.compile(Constants.OUT_OF_BUNDLE_REGEX, Pattern.DOTALL); Matcher m1 = p1.matcher(pageContent); StringBuilder cleanedDate = new StringBuilder(); if (m1.matches()) { cleanedDate.append(m1.group(1)); cleanedDate.append(' '); cleanedDate.append(m1.group(2)); cleanedDate.append(' '); cleanedDate.append(m1.group(3)); currentItem.put("value3", cleanedDate.toString()); } } jsonArray.put(currentItem); } } // reset the 3-to-3 call bug flag for next Element if (three2threeCallsBug) { three2threeCallsBug = false; } } return jsonArray; }
From source file:controllers.WDCDProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override// w ww . jav a 2 s .c om public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.whatdesigncando.com/") .setQueryParameter("s", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("div.item"); // Iterate through results for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); String imageUrl = item.select("a").attr("style"); keyValue.put("image", imageUrl.substring(imageUrl.indexOf("'") + 1, imageUrl.indexOf("'", imageUrl.indexOf("'") + 1))); keyValue.put("title", item.select("h3").text()); // Get date and the first sentence as "content" from each article separately (or the "sub-title" tag) org.jsoup.nodes.Document articleDoc = Jsoup.connect(item.select("a").attr("href")).get(); String datePublished = articleDoc.select("div#maincontent p.metainfo").text().substring(0, articleDoc.select("div#maincontent p.metainfo").text().indexOf("Published")); String firstSentence; if (articleDoc.select("div#maincontent p.sub-title").text().length() == 0) { firstSentence = articleDoc.select("div#maincontent p:not(.metainfo)").text().substring( 0, articleDoc.select("div#maincontent p:not(.metainfo)").text().indexOf(".") + 1); firstSentence = firstSentence + "."; } else { firstSentence = articleDoc.select("div#maincontent p.sub-title").text(); firstSentence = firstSentence + ".."; } keyValue.put("content", firstSentence); keyValue.put("date", datePublished); keyValue.put("url", 
item.select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:controllers.TAXIProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override/*from www. j a v a 2 s . co m*/ public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } final String baseUrl = "http://designtaxi.com/"; F.Promise<WSResponse> wsResponsePromise = WS.url(baseUrl + "news-search.php") .setQueryParameter("news_keyword", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("div.news-cover"); // Iterate through results for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); keyValue.put("image", baseUrl + item.select("img").attr("src")); keyValue.put("title", item.select("a.addthis_button_expanded").attr("addthis:title")); // Connect to each and every article to get date and first sentence as content try { org.jsoup.nodes.Document articleDoc = Jsoup .connect(item.select("a.addthis_button_expanded").attr("addthis:url")) .userAgent("Mozilla").get(); // If connection successful(STATUS 200), the add content and date keys to map keyValue.put("content", articleDoc.select("div#news-content").text().substring(0, articleDoc.select("div#news-content").text().indexOf(".") + 1) + "."); keyValue.put("date", articleDoc.select("span.date").text()); } catch (IOException e) { System.out.println(e); } keyValue.put("url", item.select("a.addthis_button_expanded").attr("addthis:url")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:org.shareok.data.plosdata.PlosUtil.java
public static String getPlosAck(String html) { String ack = ""; Document doc = Jsoup.parse(html.toString()); Elements ackLinks = doc.select("a[id=ack]"); if (!ackLinks.isEmpty()) { Element ackDiv = ackLinks.first().parent(); if (null != ackDiv) { Elements ackParagraphs = ackDiv.select("p"); if (!ackParagraphs.isEmpty()) { for (Element element : ackParagraphs) { if (element.hasText()) ack += element.text(); }/*from ww w .j a va2 s . c o m*/ } //System.out.println("the ack = "+ack+"\n\n"); } } return ack; }
From source file:org.shareok.data.plosdata.PlosUtil.java
/**
 * Extracts the subject areas listed on a PLOS article page.
 *
 * <p>The second child element of the first {@code div[class=subject-areas-container]}
 * is searched for {@code li} elements; the text of each item's first child element
 * is taken as one subject.
 *
 * @param html the string of the web page source
 * @return the subjects in document order, or {@code null} when none are found
 */
public static String[] getSubjects(String html) {
    List<String> subjectsList = new ArrayList<>();
    Document doc = Jsoup.parse(html);
    Elements subjectListDiv = doc.select("div[class=subject-areas-container]");

    if (!subjectListDiv.isEmpty()) {
        Element container = subjectListDiv.first();
        // child(1) throws IndexOutOfBoundsException when the container has fewer
        // than two child elements, so check before indexing.
        if (container.children().size() > 1) {
            Element subjectList = container.child(1);
            for (Element li : subjectList.select("li")) {
                // Items without a child element have nothing to read; the original
                // failed on child(0) here. They are skipped.
                if (li.children().isEmpty()) {
                    continue;
                }
                subjectsList.add(li.child(0).text());
            }
        }
    }

    // null (not an empty array) is returned for "no subjects", as in the original.
    if (subjectsList.isEmpty()) {
        return null;
    }
    return subjectsList.toArray(new String[subjectsList.size()]);
}