List of usage examples for org.jsoup.nodes.Document select
public Elements select(String cssQuery)
From source file:controllers.TAXIProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override//from w w w . j a v a 2s.c o m public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } final String baseUrl = "http://designtaxi.com/"; F.Promise<WSResponse> wsResponsePromise = WS.url(baseUrl + "news-search.php") .setQueryParameter("news_keyword", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("div.news-cover"); // Iterate through results for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); keyValue.put("image", baseUrl + item.select("img").attr("src")); keyValue.put("title", item.select("a.addthis_button_expanded").attr("addthis:title")); // Connect to each and every article to get date and first sentence as content try { org.jsoup.nodes.Document articleDoc = Jsoup .connect(item.select("a.addthis_button_expanded").attr("addthis:url")) .userAgent("Mozilla").get(); // If connection successful(STATUS 200), the add content and date keys to map keyValue.put("content", articleDoc.select("div#news-content").text().substring(0, articleDoc.select("div#news-content").text().indexOf(".") + 1) + "."); keyValue.put("date", articleDoc.select("span.date").text()); } catch (IOException e) { System.out.println(e); } keyValue.put("url", item.select("a.addthis_button_expanded").attr("addthis:url")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:controllers.WDCDProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override// www . j a v a 2 s .c o m public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.whatdesigncando.com/") .setQueryParameter("s", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Insert into map org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("div.item"); // Iterate through results for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); String imageUrl = item.select("a").attr("style"); keyValue.put("image", imageUrl.substring(imageUrl.indexOf("'") + 1, imageUrl.indexOf("'", imageUrl.indexOf("'") + 1))); keyValue.put("title", item.select("h3").text()); // Get date and the first sentence as "content" from each article separately (or the "sub-title" tag) org.jsoup.nodes.Document articleDoc = Jsoup.connect(item.select("a").attr("href")).get(); String datePublished = articleDoc.select("div#maincontent p.metainfo").text().substring(0, articleDoc.select("div#maincontent p.metainfo").text().indexOf("Published")); String firstSentence; if (articleDoc.select("div#maincontent p.sub-title").text().length() == 0) { firstSentence = articleDoc.select("div#maincontent p:not(.metainfo)").text().substring( 0, articleDoc.select("div#maincontent p:not(.metainfo)").text().indexOf(".") + 1); firstSentence = firstSentence + "."; } else { firstSentence = articleDoc.select("div#maincontent p.sub-title").text(); firstSentence = firstSentence + ".."; } keyValue.put("content", firstSentence); keyValue.put("date", datePublished); keyValue.put("url", 
item.select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:com.vaadin.sass.testcases.scss.W3ConformanceTests.java
/**
 * Downloads an index page and collects the links whose {@code href} matches a regular
 * expression.
 *
 * @param url         address of the index page; relative links are resolved against it
 * @param regexp      regular expression applied to each anchor's {@code href}
 * @param maxTests    upper bound on the number of URIs collected
 * @param excludeUrls URIs that must not be collected
 * @return the collected URIs in document order, without duplicates
 * @throws Exception if the page cannot be fetched or a URI cannot be parsed
 */
protected static Collection<URI> scrapeIndexForTests(String url, String regexp, int maxTests,
        Collection<URI> excludeUrls) throws Exception {
    final URI indexUri = new URI(url);
    final Document indexPage = Jsoup.connect(url).timeout(10000).get();
    final String linkSelector = String.format("a[href~=%s]", regexp);

    final LinkedHashSet<URI> collected = new LinkedHashSet<URI>();
    for (Element anchor : indexPage.select(linkSelector)) {
        URI candidate = new URI(anchor.attr("href"));
        if (!candidate.isAbsolute()) {
            candidate = indexUri.resolve(candidate);
        }
        // Stop scanning as soon as the quota is filled.
        if (collected.size() >= maxTests) {
            break;
        }
        if (!excludeUrls.contains(candidate)) {
            collected.add(candidate);
        }
    }
    return collected;
}
From source file:controllers.CNNProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override//from w ww . ja v a 2 s.c o m public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } String feedUrl = "http://searchapp.cnn.com/search/query.jsp"; //query = query + "&type=all"; String page = "1"; String npp = "10"; String start = "1"; String type = "all"; String bucket = "true"; String sort = "relevance"; String csiID = "csi1"; F.Promise<WSResponse> wsResponsePromise = WS.url(feedUrl).setQueryParameter("page", page) .setQueryParameter("npp", npp).setQueryParameter("start", start).setQueryParameter("text", query) .setQueryParameter("type", type).setQueryParameter("bucket", bucket).setQueryParameter("sort", sort) .setQueryParameter("csiID", csiID).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> ret = new ArrayList<Map<String, String>>(); try { // Reach json code into html response from ajax call org.jsoup.nodes.Document doc = Jsoup.parse(body); Element resultElement = doc.select("textarea#jsCode").first(); String resultString = resultElement.text(); // Parse the json code JSONObject obj = new JSONObject(resultString); JSONArray array = new JSONArray(obj.get("results").toString()); JSONArray internalArray = new JSONArray(array.get(0).toString()); // Insert each result's elements into map with corresponding key for (int i = 0; i < internalArray.length(); i++) { JSONObject elementObj = new JSONObject(internalArray.get(i).toString()); String image = elementObj.get("thumbnail").toString(); String title = elementObj.get("title").toString(); String content = elementObj.get("description").toString(); String date = elementObj.get("mediaDateUts").toString(); String url = elementObj.get("url").toString(); Map<String, 
String> keyValue = new LinkedHashMap<String, String>(); keyValue.put("image", image); keyValue.put("title", title); keyValue.put("content", content); keyValue.put("date", date); keyValue.put("url", url); ret.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(ret)); } }); }
From source file:com.geecko.QuickLyric.lyrics.Genius.java
public static Lyrics fromURL(String url, String artist, String title) { Document lyricsPage; String text;//from w w w .j av a2 s .c o m try { lyricsPage = Jsoup.connect(url).get(); Elements lyricsDiv = lyricsPage.select("div.lyrics"); if (lyricsDiv.isEmpty()) throw new StringIndexOutOfBoundsException(); else text = Jsoup.clean(lyricsDiv.html(), Whitelist.none().addTags("br")).trim(); } catch (HttpStatusException e) { return new Lyrics(Lyrics.NO_RESULT); } catch (IOException | StringIndexOutOfBoundsException e) { e.printStackTrace(); return new Lyrics(Lyrics.ERROR); } if (artist == null) { title = lyricsPage.getElementsByClass("text_title").get(0).text(); artist = lyricsPage.getElementsByClass("text_artist").get(0).text(); } Lyrics result = new Lyrics(Lyrics.POSITIVE_RESULT); if ("[Instrumental]".equals(text)) result = new Lyrics(Lyrics.NEGATIVE_RESULT); result.setArtist(artist); result.setTitle(title); result.setText(text); result.setURL(url); result.setSource("Genius"); return result; }
From source file:com.megatome.j2d.support.JavadocSupport.java
/**
 * Parses a file as UTF-8 HTML and returns every anchor element in it.
 *
 * @param f the file to parse
 * @return all {@code a} elements of the parsed document
 * @throws BuilderException if the file cannot be read
 */
private static Elements loadAndFindLinks(final File f) throws BuilderException {
    final Document parsed;
    try {
        parsed = Jsoup.parse(f, "UTF-8");
    } catch (IOException readFailure) {
        throw new BuilderException("Failed to index javadoc files", readFailure);
    }
    return parsed.select("a");
}
From source file:controllers.CNBCProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override//from ww w. ja v a 2s . c om public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } String target = "all"; String categories = "exclude"; String partnerId = "2000"; // ?target=all&categories=exclude&partnerId=2000&keywords=apple F.Promise<WSResponse> wsResponsePromise = WS.url("http://search.cnbc.com/main.do") .setQueryParameter("target", target).setQueryParameter("categories", categories) .setQueryParameter("partnerId", partnerId).setQueryParameter("keywords", query).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Parse html document org.jsoup.nodes.Document doc = Jsoup.parse(body); Elements items = doc.select("div:not(.clr).padL.padR"); // Choose elements that contain classes "padL" and "padR", but not "clr" // Iterate through results for (Element item : items) { Map<String, String> keyValue = new LinkedHashMap<String, String>(); // Add the keys and values keyValue.put("title", item.select("a").text()); keyValue.put("content", item.select("span.cnbc_bio_content").text()); keyValue.put("date", CalculateDateFormat(Long .parseLong(item.getElementsByTag("script").html().replaceAll("[^0-9]", ""), 10))); // Edit the date format keyValue.put("url", item.select("a").attr("href")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:er.java
/** * Purpose: jsoup?Html// w w w . j a v a 2s .c om * * @param html * @return: String[] */ private static String[] htmlToPlainText(String html) { String[] content = new String[] { "", "" }; Document doc = Jsoup.parse(html); // ???blog Elements titles = doc.select("h3.title-article>strong"); for (Element oneSelect : titles) content[0] += oneSelect.text(); // ???blog Elements contents = doc.select("div#blogContent"); for (Element oneSelect : contents) content[1] += oneSelect.text(); return content; }
From source file:com.geecko.QuickLyric.lyrics.LyricWiki.java
public static Lyrics fromURL(String url, String artist, String song) { if (url.endsWith("action=edit")) { return new Lyrics(NO_RESULT); }/*w w w . j a v a 2 s .c o m*/ String text; try { //url = URLDecoder.decode(url, "utf-8"); Document lyricsPage = Jsoup.connect(url).get(); Element lyricbox = lyricsPage.select("div.lyricBox").get(0); lyricbox.after(lyricbox.childNode(0)); String lyricsHtml = lyricbox.html(); text = lyricsHtml.substring(0, lyricsHtml.indexOf("<!--")).replaceAll("<.*?>", "").replaceAll("\n", "<br />"); if (text.contains("&#")) text = Parser.unescapeEntities(text, true); } catch (IndexOutOfBoundsException | IOException e) { e.printStackTrace(); return new Lyrics(ERROR); } if (artist == null) artist = url.substring(24).replace("Gracenote:", "").split(":", 2)[0].replace('_', ' '); if (song == null) song = url.substring(24).replace("Gracenote:", "").split(":", 2)[1].replace('_', ' '); try { artist = URLDecoder.decode(artist, "UTF-8"); song = URLDecoder.decode(song, "UTF-8"); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } if (text.contains( "Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") || text.equals("Instrumental <br />")) { Lyrics result = new Lyrics(NEGATIVE_RESULT); result.setArtist(artist); result.setTitle(song); return result; } else if (text.equals("") || text.length() < 3) return new Lyrics(NO_RESULT); else { Lyrics lyrics = new Lyrics(POSITIVE_RESULT); lyrics.setArtist(artist); lyrics.setTitle(song); lyrics.setText(text); lyrics.setSource("LyricsWiki"); lyrics.setURL(url); return lyrics; } }
From source file:controllers.NWProxy.java
private static org.jsoup.nodes.Document RedirectionHandler(String url) throws IOException { org.jsoup.nodes.Document articleDoc = Jsoup.connect(url).get(); String officialUrl = "http://www.newsweek.com"; String redirectedUrl = null;//from w ww . j a va 2s . c o m Elements meta = articleDoc.select("html head meta"); if (meta.attr("http-equiv").contains("refresh")) { redirectedUrl = officialUrl + meta.attr("content").substring(meta.attr("content").indexOf("=") + 1).replaceAll("'", ""); return RedirectionHandler(redirectedUrl); } return articleDoc; }