List of usage examples for org.jsoup.nodes Document select
public Elements select(String cssQuery)
From source file:com.nuance.expertassistant.ContentCrawler.java
public static ArrayList<String> listURLs(String StartUrl, int depth) { System.out.println(" Current Depth is : [" + depth + "]"); // System.out.println(" PARENT URL is : [" + StartUrl + "]"); // System.out.println(" URL CRAWL Pattern is : [" + URLCrawlPattern + // "]");//from w ww . j a v a 2 s. com final ArrayList<String> tempURLs = new ArrayList<String>(); try { final Document doc = Jsoup.connect(StartUrl).timeout(0).get(); final Elements links = doc.select("a"); for (final Element link : links) { final String absLink = link.attr("abs:href"); if (!visitedURLs.contains(absLink) && absLink.contains(URLCrawlPattern)) { visitedURLs.add(absLink); if (visitedURLs.size() > PageLimit) { ContentExtractor.endDocument(); System.out.println(" Max URL Limit Reached - [Stopping ....] "); System.out.println(" [Stopped] "); exit(0); } tempURLs.add(absLink); System.out.println(" URLs Extracted So Far : [" + visitedURLs.size() + "]"); System.out.println(" Extracting Content From : [" + absLink + "]"); ContentExtractor.extract(absLink); } } } catch (final Exception e) { e.printStackTrace(); } return tempURLs; }
From source file:com.ettoremastrogiacomo.sktradingjava.starters.Temp.java
public static String getYahooQuotes(String symbol) throws Exception { //http://real-chart.finance.yahoo.com/table.csv?s=ENEL.MI&d=0&e=26&f=2017&g=d&a=6&b=9&c=2001&ignore=.csv URL url = new URL("https://finance.yahoo.com/quote/" + symbol + "/history?p=" + symbol); HttpFetch http = new HttpFetch(); String res = new String(http.HttpGetUrl(url.toString(), Optional.empty(), Optional.empty())); int k0 = res.indexOf("consent-form single-page-form single-page-agree-form"); if (k0 > 0) { java.util.HashMap<String, String> pmap = new java.util.HashMap<>(); Document dy = Jsoup.parse(res); Elements els = dy.select( "form[class='consent-form single-page-form single-page-agree-form'] input[type='hidden']"); els.forEach((x) -> {/*w w w.ja v a 2 s . co m*/ pmap.put(x.attr("name"), x.attr("value")); }); HttpURLConnection huc = http.sendPostRequest("https://guce.oath.com/consent", pmap); BufferedReader in = new BufferedReader(new InputStreamReader(huc.getInputStream())); String inputLine; StringBuilder response = new StringBuilder(); while ((inputLine = in.readLine()) != null) { response.append(inputLine); } in.close(); res = response.toString(); //cookieList = cookieManager.getCookieStore().getCookies(); } int k1 = res.indexOf("CrumbStore"); int k2 = res.indexOf("\"", k1 + 22); String crumb = res.substring(k1 + 21, k2).replace("\"", "").replace("\\u00", "%"); LOG.info("crumb=" + crumb); String u2 = "https://query1.finance.yahoo.com/v7/finance/download/" + symbol + "?period1=0&period2=" + System.currentTimeMillis() + "&interval=1d&events=history&crumb=" + crumb; res = new String(http.HttpGetUrl(u2, Optional.empty(), Optional.of(http.getCookies()))); LOG.debug("getting " + u2); LOG.debug(res); return res; }
From source file:org.brnvrn.Main.java
/** * Parse a HTML document, add tools to the list *//*ww w .j av a 2 s.co m*/ private static ObjectMapper parseDocument(List<Tool> tools, Document doc, boolean obsolete) { // http://jsoup.org/apidocs/org/jsoup/select/Selector.html Elements category_div = doc.select("div.container div.row:has(table)"); // we loop over each category table System.out.println("Parsing " + (obsolete ? "obsolete" : "") + " doc. ###"); System.out.println(" Found " + category_div.size() + " categories."); for (Element tool_div : category_div) { String category = tool_div.select("strong").text(); parseCategory(tools, tool_div, category, obsolete); } System.out.println(" Got " + tools.size() + " tools."); ObjectMapper objectMapper = new ObjectMapper(); objectMapper.enable(SerializationFeature.INDENT_OUTPUT); return objectMapper; }
From source file:org.shareok.data.plosdata.PlosUtil.java
/**
 * Returns the text of the first {@code <title>} element of a web page.
 *
 * <p>For some correspondences there is no metadata about the article title;
 * the page only carries a title tag, which is what this method reads.
 *
 * @param html the string of the web page source; may be {@code null}
 * @return the title text, or an empty string when {@code html} is
 *         {@code null} or the page has no title element
 */
public static String getTitleFromHtml(String html) {
    // A null page is treated like a page without a title instead of failing with an NPE.
    if (html == null) {
        return "";
    }
    Elements titleElements = Jsoup.parse(html).select("title");
    // select() yields an empty collection, never null, when nothing matches.
    return titleElements.isEmpty() ? "" : titleElements.get(0).text();
}
From source file:com.cbmapi.CbmAPI.java
private static String parseHtmlForInfo(Document html) { //Instead of parsing the the whole html page everytime, only useful table section is used. Element table = html.select("table.desc").first(); //<span> containing the name is clearly labeled as cpuname. String cpuName = table.select("span.cpuname").text(); //Score is the last one to use <span> tag and will be parsed to int. int cpuScore = Integer.parseInt(table.select("span").last().text()); //There are 2 <em> tags containing information. First one has description and second one has "Other names" eg.alternative name. String description = table.select("em").first().text(); String altName = table.select("em").last().text(); //Name -> Score -> possible description -> AltName. String infoString = cpuName + ",Score:" + cpuScore + "," + description + ",AltName:" + altName; return infoString; }
From source file:controllers.NYTProxy.java
public static F.Promise<Result> index(String query) { if (StringUtils.isEmpty(query)) { F.Promise.promise(new F.Function0<Object>() { @Override/*from w ww .j a v a2s . c o m*/ public Object apply() throws Throwable { return ok(Json.toJson("Query parameter (q) not provided ")); } }); } String basicUrl = "http://query.nytimes.com/svc/add/v1/sitesearch.json"; // Additional query parameters String spotlight = "true"; String facet = "true"; F.Promise<WSResponse> wsResponsePromise = WS.url(basicUrl).setQueryParameter("q", query) .setQueryParameter("spotlight", spotlight).setQueryParameter("facet", facet).get(); return wsResponsePromise.map(new F.Function<WSResponse, Result>() { @Override public Result apply(WSResponse wsResponse) throws Throwable { String body = wsResponse.getBody(); List<Map<String, String>> results = new ArrayList<Map<String, String>>(); try { // Reach json code into html response from ajax call org.jsoup.nodes.Document doc = Jsoup.parse(body); String resultJSONbody = doc.select("body").first().text(); // Parse the json code JSONObject resultJSONobj = new JSONObject(resultJSONbody); resultJSONobj = (JSONObject) resultJSONobj.get("response"); // Reach array of results and set to JSONArray JSONArray resultJSONarray = new JSONArray(resultJSONobj.get("docs").toString()); // Insert each result's elements into map with corresponding key for (int i = 0; i < resultJSONarray.length(); i++) { // Set internal map Map<String, String> keyValue = new LinkedHashMap<String, String>(); // Set basic image url where image exists String basicImgUrl = "http://static01.nyt.com/"; resultJSONobj = (JSONObject) resultJSONarray.get(i); // Check if article contains "multimedia" key (image), else it shall be empty list if (resultJSONobj.getJSONArray("multimedia").length() != 0) { // Iterate through multimedia list of values and get the thumbnail image's url for (int j = 0; j < resultJSONobj.getJSONArray("multimedia").length(); j++) { if 
(resultJSONobj.getJSONArray("multimedia").getJSONObject(j).getString("subtype") .equals("thumbnail")) { // Prepend the basic image url keyValue.put("image", basicImgUrl + resultJSONobj.getJSONArray("multimedia") .getJSONObject(j).getString("url")); break; } } } keyValue.put("title", resultJSONobj.getJSONObject("headline").getString("main")); keyValue.put("content", resultJSONobj.getString("snippet").replace("</strong>", "")); // Format date String date = resultJSONobj.getString("pub_date").substring(0, resultJSONobj.getString("pub_date").length() - 1); Pattern pattern = Pattern.compile("[A-Z]"); Matcher matcher = pattern.matcher(date); if (matcher.find()) { date = date.substring(0, date.indexOf(matcher.group(0))) + " " + matcher.group(0) + " " + date.substring(date.indexOf(matcher.group(0)) + 1, date.length()); } keyValue.put("date", date); keyValue.put("url", resultJSONobj.getString("web_url")); results.add(keyValue); } } catch (DOMException e) { e.printStackTrace(); } return ok(Json.toJson(results)); } }); }
From source file:org.shareok.data.plosdata.PlosUtil.java
public static String getPlosAck(String html) { String ack = ""; Document doc = Jsoup.parse(html.toString()); Elements ackLinks = doc.select("a[id=ack]"); if (!ackLinks.isEmpty()) { Element ackDiv = ackLinks.first().parent(); if (null != ackDiv) { Elements ackParagraphs = ackDiv.select("p"); if (!ackParagraphs.isEmpty()) { for (Element element : ackParagraphs) { if (element.hasText()) ack += element.text(); }// w ww. j ava 2s . c o m } //System.out.println("the ack = "+ack+"\n\n"); } } return ack; }
From source file:org.shareok.data.plosdata.PlosUtil.java
public static String getPlosCitation(String html) { String citation = ""; Document doc = Jsoup.parse(html.toString()); Elements articleInfoDiv = doc.select("div[class=articleinfo]"); if (!articleInfoDiv.isEmpty()) { Element citationParagraph = articleInfoDiv.first().child(0); if (null != citationParagraph) { citation = citationParagraph.text().replace("Citation:", ""); //System.out.println("the citation = "+citation+"\n\n"); }/*ww w .j a v a2 s . c om*/ } return citation; }
From source file:org.shareok.data.plosdata.PlosUtil.java
/** * /*from w w w . ja v a 2 s.c om*/ * @param html : The string of the web page source * @return author contribution statement */ public static String getAuthorContributions(String html) { String contributions = ""; Document doc = Jsoup.parse(html.toString()); Elements articleInfoDiv = doc.select("div[class=contributions]"); if (!articleInfoDiv.isEmpty()) { Element contributionsParagraph = articleInfoDiv.first().child(2); if (null != contributionsParagraph) { contributions = contributionsParagraph.text(); //System.out.println("the contributions = "+contributions+"\n\n");System.exit(0); } } return contributions; }
From source file:org.shareok.data.plosdata.PlosUtil.java
/** * /*from www.j a va 2 s. c o m*/ * @param html : The string of the web page source * @return acknowledge statement */ public static String[] getSubjects(String html) { List<String> subjectsList = new ArrayList<>(); Document doc = Jsoup.parse(html.toString()); Elements subjectListDiv = doc.select("div[class=subject-areas-container]"); if (null != subjectListDiv && !subjectListDiv.isEmpty()) { Element subjectList = subjectListDiv.first().child(1); if (null != subjectList) { Elements lis = subjectList.select("li"); if (null != lis && lis.size() > 0) { for (Element li : lis) { Element link = li.child(0); subjectsList.add(link.text()); } } } } if (subjectsList.size() > 0) { return subjectsList.toArray(new String[subjectsList.size()]); } else { return null; } }