Usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:downloadwolkflow.getWorkFlowList.java
private static void downloadWorkFlow(String detailUrl, CloseableHttpClient httpclient) { try {//ww w .j a va 2 s . c o m HttpGet httpget = new HttpGet(detailUrl); HttpResponse response = httpclient.execute(httpget); String page = EntityUtils.toString(response.getEntity()); Document mainDoc = Jsoup.parse(page); Element downloadEle = mainDoc.select("div#myexp_content ul li a").first(); if (downloadEle == null) { downloadEle = mainDoc.select("div#myexp_content ul li:nth-child(1) span a").first(); } String downloadUrl = downloadEle.attributes().get("href"); Thread.sleep(500); if (downloadUrl.contains("download")) { downloadFiles(downloadUrl, httpclient); } else { System.out.println(detailUrl + " do not contain valuable resource"); } } catch (IOException ex) { Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE, null, ex); } catch (InterruptedException ex) { Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:io.sightly.tck.html.HTMLExtractor.java
/** * Retrieves the content of an element, without its own markup tags, identified by the {@code selector} from the given {@code markup}. * The {@code url} is used only for caching purposes, to avoid parsing multiple times the markup returned for the same resource. * * @param url the url that identifies the markup * @param markup the markup/*from ww w.ja v a 2 s.co m*/ * @param selector the selector used for retrieval * @return the contents of the selected element */ public static String innerHTML(String url, String markup, String selector) { ensureMarkup(url, markup); Document document = documents.get(url); Elements elements = document.select(selector); return elements.html(); }
From source file:io.sightly.tck.html.HTMLExtractor.java
/** * Checks if the element from the {@code markup} identified by the {@code selector} contains the text from {@code value}. The * {@code url} is used only for caching purposes, to avoid parsing multiple times the markup returned for the same resource. * * @param url the url that identifies the markup * @param markup the markup/* w w w. j a va 2 s.com*/ * @param selector the selector used for retrieval * @param value the text that should exist in the markup * @return {@code true} if the {@code value} was found in the markup, {@code false} otherwise */ public static boolean contains(String url, String markup, String selector, String value) { ensureMarkup(url, markup); Document document = documents.get(url); Elements elements = document.select(selector); return elements.outerHtml().contains(value); }
From source file:io.sightly.tck.html.HTMLExtractor.java
/** * Checks if the {@code selector} identifies an element from the {@code markup}. The {@code url} is used only for caching purposes, * to avoid parsing multiple times the markup returned for the same resource. * * @param url the url that identifies the markup * @param markup the markup/*ww w . ja v a 2 s . c om*/ * @param selector the selector used for retrieval * @return {@code true} if the element identified by the {@code selector} exists, {@code false} otherwise */ public static boolean exists(String url, String markup, String selector) { ensureMarkup(url, markup); Document document = documents.get(url); Elements elements = document.select(selector); return elements.size() > 0; }
From source file:io.sightly.tck.html.HTMLExtractor.java
/**
 * Tells whether the first element matched by {@code selector} has exactly {@code howMany}
 * child elements.
 *
 * @param url      the url that identifies the markup
 * @param markup   the markup
 * @param selector the selector used for retrieval
 * @param howMany  the number of expected children
 * @return {@code true} if an element is matched and its number of children equals
 *         {@code howMany}, {@code false} otherwise (including when nothing matches)
 */
public static boolean hasChildren(String url, String markup, String selector, int howMany) {
    ensureMarkup(url, markup);
    Element matched = documents.get(url).select(selector).first();
    return matched != null && matched.children().size() == howMany;
}
From source file:controllers.BIProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override//from w w w.ja v a2s. co m public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.businessinsider.com/s") .setQueryParameter("q", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("div.search-result"); // Iterate through results for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); keyValue.put("image", item.select("img").attr("src")); keyValue.put("title", item.select("h3").text()); keyValue.put("content", item.select("div.excerpt").first().text()); keyValue.put("date", item.select("li.date").text()); keyValue.put("url", item.select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:io.sightly.tck.html.HTMLExtractor.java
/**
 * Checks the attribute {@code attributeName} on the elements matched by the {@code selector}.
 * <ul>
 * <li>If nothing is matched, {@code false} is returned.</li>
 * <li>If {@code exists} is {@code true} and {@code attributeValue} is not empty, the
 * attribute's value must equal {@code attributeValue}.</li>
 * <li>If {@code exists} is {@code true} and {@code attributeValue} is empty, the attribute
 * must be present.</li>
 * <li>If {@code exists} is {@code false}, the result is whether the attribute is present.</li>
 * </ul>
 *
 * @param url            the url that identifies the markup
 * @param markup         the markup
 * @param selector       the selector used for retrieval
 * @param exists         flag that defines if the attribute is expected to exist or not
 * @param attributeName  the attribute's name
 * @param attributeValue the attribute's value
 * @return see the rules above
 */
public static boolean hasAttribute(String url, String markup, String selector, boolean exists,
        String attributeName, String attributeValue) {
    ensureMarkup(url, markup);
    Document document = documents.get(url);
    Elements elements = document.select(selector);
    if (elements.isEmpty()) {
        return false;
    }
    if (!exists) {
        // NOTE(review): this reports presence even though the attribute is expected to be
        // absent; presumably callers negate the result. Left unchanged; confirm against callers.
        return elements.hasAttr(attributeName);
    }
    if (StringUtils.isNotEmpty(attributeValue)) {
        String value = elements.attr(attributeName);
        return attributeValue.equals(value);
    }
    // No expected value given: the attribute still has to be present. Previously this
    // returned true unconditionally, so a missing attribute went undetected.
    return elements.hasAttr(attributeName);
}
From source file:controllers.FRBProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override//from w ww . j a v a 2s .c om public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.forbes.com/search/") .setQueryParameter("q", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("li.edittools-contentitem"); // All articles belong to this class for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); // Check if specific article belongs to gallery class (therefore it contains an image) if (item.hasClass("gallery")) { // Add image key and value to map keyValue.put("image", item.select("img").attr("src")); } // Add the rest of keys and values keyValue.put("title", item.select("h2").select("a").text()); keyValue.put("content", item.select("p").first().ownText()); keyValue.put("date", item.select("time").text()); keyValue.put("url", item.select("h2").select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:controllers.KWProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override/*from w ww . java2s . c o m*/ public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } F.Promise<WSResponse> wsResponsePromise = WS.url("http://knowledge.wharton.upenn.edu/") .setQueryParameter("s", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("div.article.type-article.status-publish"); // All articles belong to this classes for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); // Check if specific article belongs to "has-post-thumbnail" class (therefore it contains an image) if (item.hasClass("has-post-thumbnail")) { // Add image key and value to map keyValue.put("image", item.select("img").attr("src")); } // Add the rest of keys and values keyValue.put("title", item.select("h2").select("a").text()); keyValue.put("content", item.select("div.attribute.categorythumbs").first().text()); keyValue.put("date", item.select("ul.datestamp").select("li").first().text()); keyValue.put("url", item.select("h2").select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:controllers.NWProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override/*w w w .java 2s.c o m*/ public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } final String officialUrl = "http://www.newsweek.com"; F.Promise<WSResponse> wsResponsePromise = WS.url(officialUrl + "/search/site/" + query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("li.search-result"); // All articles belong to this class for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); keyValue.put("image", item.select("img").attr("src")); keyValue.put("title", item.select("h2").select("a").text()); keyValue.put("content", item.select("div.article-summary").first().text()); // Get date from each article separately org.jsoup.nodes.Document articleDoc = RedirectionHandler( officialUrl + item.select("a").attr("href")); keyValue.put("date", articleDoc.select("span.timedate").text()); keyValue.put("url", officialUrl + item.select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }