Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

On this page you can find example usages of org.jsoup.nodes.Document.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:com.nuance.expertassistant.ContentCrawler.java

public static ArrayList<String> listURLs(String StartUrl, int depth) {

    System.out.println(" Current Depth is : [" + depth + "]");
    // System.out.println(" PARENT URL is : [" + StartUrl + "]");
    // System.out.println(" URL CRAWL Pattern  is : [" + URLCrawlPattern +
    // "]");//from  w  ww  . j a v  a  2 s.  com

    final ArrayList<String> tempURLs = new ArrayList<String>();

    try {
        final Document doc = Jsoup.connect(StartUrl).timeout(0).get();
        final Elements links = doc.select("a");

        for (final Element link : links) {
            final String absLink = link.attr("abs:href");
            if (!visitedURLs.contains(absLink) && absLink.contains(URLCrawlPattern)) {
                visitedURLs.add(absLink);
                if (visitedURLs.size() > PageLimit) {
                    ContentExtractor.endDocument();
                    System.out.println(" Max URL Limit Reached - [Stopping ....] ");
                    System.out.println(" [Stopped] ");

                    exit(0);
                }
                tempURLs.add(absLink);
                System.out.println(" URLs Extracted So Far : [" + visitedURLs.size() + "]");
                System.out.println(" Extracting Content From : [" + absLink + "]");
                ContentExtractor.extract(absLink);
            }

        }

    } catch (final Exception e) {
        e.printStackTrace();
    }

    return tempURLs;

}

From source file:com.ettoremastrogiacomo.sktradingjava.starters.Temp.java

public static String getYahooQuotes(String symbol) throws Exception {
    //http://real-chart.finance.yahoo.com/table.csv?s=ENEL.MI&d=0&e=26&f=2017&g=d&a=6&b=9&c=2001&ignore=.csv
    URL url = new URL("https://finance.yahoo.com/quote/" + symbol + "/history?p=" + symbol);

    HttpFetch http = new HttpFetch();
    String res = new String(http.HttpGetUrl(url.toString(), Optional.empty(), Optional.empty()));
    int k0 = res.indexOf("consent-form single-page-form single-page-agree-form");

    if (k0 > 0) {
        java.util.HashMap<String, String> pmap = new java.util.HashMap<>();
        Document dy = Jsoup.parse(res);
        Elements els = dy.select(
                "form[class='consent-form single-page-form single-page-agree-form'] input[type='hidden']");
        els.forEach((x) -> {/*w  w  w.ja  v a  2  s .  co m*/
            pmap.put(x.attr("name"), x.attr("value"));
        });
        HttpURLConnection huc = http.sendPostRequest("https://guce.oath.com/consent", pmap);
        BufferedReader in = new BufferedReader(new InputStreamReader(huc.getInputStream()));
        String inputLine;
        StringBuilder response = new StringBuilder();
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
        in.close();
        res = response.toString();
        //cookieList = cookieManager.getCookieStore().getCookies();

    }
    int k1 = res.indexOf("CrumbStore");
    int k2 = res.indexOf("\"", k1 + 22);
    String crumb = res.substring(k1 + 21, k2).replace("\"", "").replace("\\u00", "%");
    LOG.info("crumb=" + crumb);
    String u2 = "https://query1.finance.yahoo.com/v7/finance/download/" + symbol + "?period1=0&period2="
            + System.currentTimeMillis() + "&interval=1d&events=history&crumb=" + crumb;
    res = new String(http.HttpGetUrl(u2, Optional.empty(), Optional.of(http.getCookies())));
    LOG.debug("getting " + u2);
    LOG.debug(res);
    return res;
}

From source file:org.brnvrn.Main.java

/**
 * Parse a HTML document, add tools to the list
 *//*ww  w .j av  a  2 s.co m*/
private static ObjectMapper parseDocument(List<Tool> tools, Document doc, boolean obsolete) {
    // http://jsoup.org/apidocs/org/jsoup/select/Selector.html
    Elements category_div = doc.select("div.container div.row:has(table)"); // we loop over each category table
    System.out.println("Parsing " + (obsolete ? "obsolete" : "") + " doc.   ###");
    System.out.println(" Found " + category_div.size() + " categories.");

    for (Element tool_div : category_div) {
        String category = tool_div.select("strong").text();
        parseCategory(tools, tool_div, category, obsolete);
    }
    System.out.println(" Got " + tools.size() + " tools.");

    ObjectMapper objectMapper = new ObjectMapper();
    objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    return objectMapper;
}

From source file:org.shareok.data.plosdata.PlosUtil.java

/**
 * For some correspondences, there are no metadata about article title, <br>
 * instead, they is a title tag//from  w w  w  . j  a v a  2 s  . c  om
 * @param html : The string of the web page source
 * @return title
 */
public static String getTitleFromHtml(String html) {
    String title = "";

    Document doc = Jsoup.parse(html.toString());
    Elements titleElements = doc.select("title");
    if (null != titleElements && titleElements.size() > 0) {
        title = titleElements.get(0).text();
    }
    return title;
}

From source file:com.cbmapi.CbmAPI.java

private static String parseHtmlForInfo(Document html) {
    //Instead of parsing the the whole html page everytime, only useful table section is used.
    Element table = html.select("table.desc").first();
    //<span> containing the name is clearly labeled as cpuname.
    String cpuName = table.select("span.cpuname").text();
    //Score is the last one to use <span> tag and will be parsed to int.
    int cpuScore = Integer.parseInt(table.select("span").last().text());
    //There are 2 <em> tags containing information. First one has description and second one has "Other names" eg.alternative name.
    String description = table.select("em").first().text();
    String altName = table.select("em").last().text();
    //Name -> Score -> possible description -> AltName.
    String infoString = cpuName + ",Score:" + cpuScore + "," + description + ",AltName:" + altName;
    return infoString;
}

From source file:controllers.NYTProxy.java

public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {

        F.Promise.promise(new F.Function0<Object>() {
            @Override/*from   w  ww  .j  a v a2s  .  c o m*/
            public Object apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }

        });
    }

    String basicUrl = "http://query.nytimes.com/svc/add/v1/sitesearch.json";

    // Additional query parameters
    String spotlight = "true";
    String facet = "true";

    F.Promise<WSResponse> wsResponsePromise = WS.url(basicUrl).setQueryParameter("q", query)
            .setQueryParameter("spotlight", spotlight).setQueryParameter("facet", facet).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();
            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                // Reach json code into html response from ajax call
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                String resultJSONbody = doc.select("body").first().text();

                // Parse the json code
                JSONObject resultJSONobj = new JSONObject(resultJSONbody);
                resultJSONobj = (JSONObject) resultJSONobj.get("response");

                // Reach array of results and set to JSONArray
                JSONArray resultJSONarray = new JSONArray(resultJSONobj.get("docs").toString());

                // Insert each result's elements into map with corresponding key
                for (int i = 0; i < resultJSONarray.length(); i++) {
                    // Set internal map
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    // Set basic image url where image exists
                    String basicImgUrl = "http://static01.nyt.com/";
                    resultJSONobj = (JSONObject) resultJSONarray.get(i);

                    // Check if article contains "multimedia" key (image), else it shall be empty list
                    if (resultJSONobj.getJSONArray("multimedia").length() != 0) {
                        // Iterate through multimedia list of values and get the thumbnail image's url
                        for (int j = 0; j < resultJSONobj.getJSONArray("multimedia").length(); j++) {
                            if (resultJSONobj.getJSONArray("multimedia").getJSONObject(j).getString("subtype")
                                    .equals("thumbnail")) {
                                // Prepend the basic image url
                                keyValue.put("image", basicImgUrl + resultJSONobj.getJSONArray("multimedia")
                                        .getJSONObject(j).getString("url"));
                                break;
                            }
                        }

                    }

                    keyValue.put("title", resultJSONobj.getJSONObject("headline").getString("main"));
                    keyValue.put("content", resultJSONobj.getString("snippet").replace("</strong>", ""));

                    // Format date
                    String date = resultJSONobj.getString("pub_date").substring(0,
                            resultJSONobj.getString("pub_date").length() - 1);
                    Pattern pattern = Pattern.compile("[A-Z]");
                    Matcher matcher = pattern.matcher(date);

                    if (matcher.find()) {
                        date = date.substring(0, date.indexOf(matcher.group(0))) + " " + matcher.group(0) + " "
                                + date.substring(date.indexOf(matcher.group(0)) + 1, date.length());
                    }

                    keyValue.put("date", date);
                    keyValue.put("url", resultJSONobj.getString("web_url"));

                    results.add(keyValue);

                }
            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:org.shareok.data.plosdata.PlosUtil.java

public static String getPlosAck(String html) {

    String ack = "";
    Document doc = Jsoup.parse(html.toString());
    Elements ackLinks = doc.select("a[id=ack]");
    if (!ackLinks.isEmpty()) {
        Element ackDiv = ackLinks.first().parent();
        if (null != ackDiv) {
            Elements ackParagraphs = ackDiv.select("p");
            if (!ackParagraphs.isEmpty()) {
                for (Element element : ackParagraphs) {
                    if (element.hasText())
                        ack += element.text();
                }// w  ww.  j ava 2s .  c o m
            }
            //System.out.println("the ack = "+ack+"\n\n");
        }
    }

    return ack;
}

From source file:org.shareok.data.plosdata.PlosUtil.java

public static String getPlosCitation(String html) {

    String citation = "";

    Document doc = Jsoup.parse(html.toString());
    Elements articleInfoDiv = doc.select("div[class=articleinfo]");
    if (!articleInfoDiv.isEmpty()) {
        Element citationParagraph = articleInfoDiv.first().child(0);
        if (null != citationParagraph) {
            citation = citationParagraph.text().replace("Citation:", "");
            //System.out.println("the citation = "+citation+"\n\n");
        }/*ww  w  .j a  v  a2  s  .  c  om*/
    }

    return citation;
}

From source file:org.shareok.data.plosdata.PlosUtil.java

/**
 * /*from   w w w  .  ja  v  a  2 s.c om*/
 * @param html : The string of the web page source
 * @return author contribution statement
 */
public static String getAuthorContributions(String html) {
    String contributions = "";

    Document doc = Jsoup.parse(html.toString());
    Elements articleInfoDiv = doc.select("div[class=contributions]");
    if (!articleInfoDiv.isEmpty()) {
        Element contributionsParagraph = articleInfoDiv.first().child(2);
        if (null != contributionsParagraph) {
            contributions = contributionsParagraph.text();
            //System.out.println("the contributions = "+contributions+"\n\n");System.exit(0);
        }
    }

    return contributions;
}

From source file:org.shareok.data.plosdata.PlosUtil.java

/**
 * /*from www.j  a va  2  s.  c  o m*/
 * @param html : The string of the web page source
 * @return acknowledge statement
 */
public static String[] getSubjects(String html) {
    List<String> subjectsList = new ArrayList<>();

    Document doc = Jsoup.parse(html.toString());
    Elements subjectListDiv = doc.select("div[class=subject-areas-container]");
    if (null != subjectListDiv && !subjectListDiv.isEmpty()) {
        Element subjectList = subjectListDiv.first().child(1);
        if (null != subjectList) {
            Elements lis = subjectList.select("li");
            if (null != lis && lis.size() > 0) {
                for (Element li : lis) {
                    Element link = li.child(0);
                    subjectsList.add(link.text());
                }
            }
        }
    }
    if (subjectsList.size() > 0) {
        return subjectsList.toArray(new String[subjectsList.size()]);
    } else {
        return null;
    }
}