Example usage for org.jsoup.nodes Element select

List of usage examples for org.jsoup.nodes Element select

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.select.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:controllers.BIProxy.java

/**
 * Proxies a Business Insider search and returns the hits as a JSON array.
 *
 * <p>Each {@code div.search-result} element of the response page becomes one
 * map with the keys {@code image}, {@code title}, {@code content},
 * {@code date} and {@code url} (in that order).
 *
 * @param query the search term; when empty, a JSON message is returned and no
 *              remote request is made
 * @return a promise of the JSON result
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned: previously it was created and dropped,
        // so an empty query still fell through to the remote search below.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.businessinsider.com/s")
            .setQueryParameter("q", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();
            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("div.search-result");

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    keyValue.put("image", item.select("img").attr("src"));
                    keyValue.put("title", item.select("h3").text());

                    // first() is null when a hit has no excerpt; fall back to an
                    // empty string instead of failing the whole response.
                    Element excerpt = item.select("div.excerpt").first();
                    keyValue.put("content", excerpt == null ? "" : excerpt.text());

                    keyValue.put("date", item.select("li.date").text());
                    keyValue.put("url", item.select("a").attr("href"));

                    results.add(keyValue);
                }

            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:downloadwolkflow.getWorkFlowList.java

/**
 * Fetches the myExperiment workflow index and builds the URL of every result page.
 *
 * <p>The page count is read from the second-to-last link inside the first
 * {@code div.pagination} element.
 *
 * @return an array of length {@code pageCount + 1} where index {@code i}
 *         (1-based) holds the URL of page {@code i} and index 0 is left
 *         {@code null}; or {@code null} when the page could not be fetched or
 *         the pagination block could not be interpreted
 */
public static String[] getPageList() {
    String[] pageList = null;
    // try-with-resources: the client owns a connection pool and must be closed.
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        HttpGet httpget = new HttpGet("http://www.myexperiment.org/workflows");
        HttpResponse response = httpclient.execute(httpget);
        String mainpage = EntityUtils.toString(response.getEntity());
        Document mainDoc = Jsoup.parse(mainpage);

        Element pageinfo = mainDoc.select("div.pagination ").first();
        if (pageinfo == null) {
            Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE,
                    "No pagination block found on workflow index page");
            return null;
        }

        Elements pagesElemenets = pageinfo.select("[href]");
        if (pagesElemenets.size() < 2) {
            Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE,
                    "Pagination block has fewer than two links");
            return null;
        }

        int pageSize = Integer.parseInt(pagesElemenets.get(pagesElemenets.size() - 2).text());
        // Index 0 is intentionally unused so that the index equals the page number.
        pageList = new String[pageSize + 1];
        for (int i = 1; i <= pageSize; i++) {
            pageList[i] = "http://www.myexperiment.org/workflows?page=" + i;
        }

    } catch (IOException ex) {
        Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NumberFormatException ex) {
        // The second-to-last link was not a page number; report it the same way
        // as a fetch failure rather than propagating an unchecked exception.
        Logger.getLogger(getWorkFlowList.class.getName()).log(Level.SEVERE, null, ex);
    }
    return pageList;
}

From source file:org.brnvrn.Main.java

/**
 * Walks the direct child nodes of the first {@code tbody} under
 * {@code tool_div}, turning each non-heading table row into a {@code Tool}.
 *
 * <p>Comment nodes are passed to {@code parseComment} together with the tool
 * currently being built, so comments that precede a row are applied to that
 * row's tool. Every non-heading row is added to {@code tools}, even when
 * {@code parseTrTool} reports failure.
 *
 * @param tools    list that receives the parsed tools
 * @param tool_div element containing the category table
 * @param category category assigned to every tool found
 * @param obsolete flag passed to each {@code Tool} constructor
 */
private static void parseCategory(List<Tool> tools, Element tool_div, String category, boolean obsolete) {
    Tool current = new Tool(obsolete);
    Element tableBody = tool_div.select("tbody").first();
    for (Node node : tableBody.childNodes()) {
        String kind = node.nodeName();

        if (kind.equals("#comment")) {
            parseComment(current, (Comment) node);
            continue;
        }
        if (!kind.equals("tr")) {
            continue;
        }

        Element row = (Element) node;
        if (!row.select("th").isEmpty()) {
            continue; // heading row, nothing to parse
        }

        current.setCategory(category);
        boolean parsed = parseTrTool(current, row);
        if (!parsed) {
            System.out.println("  Could not parse: " + row.outerHtml());
        }
        tools.add(current);
        // Start a fresh tool for the comments and row that follow.
        current = new Tool(obsolete);
    }
}

From source file:com.cbmapi.CbmAPI.java

/**
 * Builds a one-line summary from the first {@code table.desc} element of the page.
 *
 * <p>The result has the form
 * {@code <name>,Score:<score>,<description>,AltName:<altName>} where the name
 * comes from {@code span.cpuname}, the score is the integer text of the last
 * {@code span}, and description / alternative name are the first and last
 * {@code em} elements of the table.
 *
 * @param html parsed page
 * @return the comma-separated summary string
 */
private static String parseHtmlForInfo(Document html) {
    // Only the description table is inspected, not the whole page.
    Element descTable = html.select("table.desc").first();

    String name = descTable.select("span.cpuname").text();
    // The score is taken from the last <span> inside the table.
    int score = Integer.parseInt(descTable.select("span").last().text());
    // First <em> is read as the description, last <em> as the alternative name.
    String details = descTable.select("em").first().text();
    String otherName = descTable.select("em").last().text();

    StringBuilder summary = new StringBuilder();
    summary.append(name);
    summary.append(",Score:").append(score);
    summary.append(",").append(details);
    summary.append(",AltName:").append(otherName);
    return summary.toString();
}

From source file:controllers.NWProxy.java

/**
 * Proxies a Newsweek site search and returns the hits as a JSON array.
 *
 * <p>Each {@code li.search-result} element becomes one map with the keys
 * {@code image}, {@code title}, {@code content}, {@code date} and
 * {@code url}. The date is read from the article page itself, which is
 * fetched through {@code RedirectionHandler} for every hit.
 *
 * @param query the search term; when empty, a JSON message is returned and no
 *              remote request is made
 * @return a promise of the JSON result
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned: previously it was created and dropped,
        // so an empty query still fell through to the remote search below.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    final String officialUrl = "http://www.newsweek.com";

    // NOTE(review): query is appended to the URL path as-is; confirm whether
    // WS.url escapes it or whether it needs encoding before concatenation.
    F.Promise<WSResponse> wsResponsePromise = WS.url(officialUrl + "/search/site/" + query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();

            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("li.search-result"); // All articles belong to this class

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    // Relative link of the article; used for both the date lookup and the url key.
                    String articleUrl = officialUrl + item.select("a").attr("href");

                    keyValue.put("image", item.select("img").attr("src"));
                    keyValue.put("title", item.select("h2").select("a").text());

                    // first() is null when a hit has no summary; fall back to an
                    // empty string instead of failing the whole response.
                    Element summary = item.select("div.article-summary").first();
                    keyValue.put("content", summary == null ? "" : summary.text());

                    // Get date from each article separately
                    org.jsoup.nodes.Document articleDoc = RedirectionHandler(articleUrl);

                    keyValue.put("date", articleDoc.select("span.timedate").text());
                    keyValue.put("url", articleUrl);

                    results.add(keyValue);
                }
            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:damo.three.ie.util.HtmlUtilities.java

/**
 * Parses the My3 account usage page into a JSON array.
 *
 * <p>Every table row ({@code tbody > tr}) that yields exactly three
 * {@code td} cells becomes one JSON object with the keys {@code item},
 * {@code value1} and {@code value2}; rows whose first cell contains
 * "Total" are skipped. Items whose name starts with "Internet" may
 * additionally get a {@code value3} key.
 *
 * @param pageContent Page content as HTML.
 * @return Usage information stripped out and formatted as JSON.
 * @throws JSONException if populating or reading a JSON object fails
 */
public static JSONArray parseUsageAsJSONArray(String pageContent) throws JSONException {
    // The HTML on prepay is pig-ugly, so we will use JSoup to
    // clean and parse it.
    Document doc = Jsoup.parse(pageContent);
    HtmlUtilities.removeComments(doc);

    Elements elements = doc.getElementsByTag("table");

    JSONArray jsonArray = new JSONArray();

    // Three does not provide a sub label for the 3-to-3 calls, which is not
    // consistent with the other items, so that row is detected by its text.
    boolean three2threeCallsBug = false;

    for (Element element : elements) {

        for (Element subelement : element.select("tbody > tr")) {

            // NOTE(review): once set, the flag stays true for every later row of
            // the same table (it is only reset after the inner loop) - confirm
            // that is intended.
            if ((subelement.text().contains("3 to 3 Calls")) && (subelement.text().contains("Valid until"))) {
                three2threeCallsBug = true;
            }

            Elements subsubelements = subelement.select("td");

            if (subsubelements.size() == 3) {

                // skip the "total" entries
                if (subsubelements.select("td").get(0).text().contains("Total")) {
                    continue;
                }

                JSONObject currentItem = new JSONObject();

                if (three2threeCallsBug) {
                    currentItem.put("item", "3 to 3 Calls");
                } else {
                    // Get rid of that "non-breaking space" character if it exists
                    String titleToClean = subsubelements.select("td").get(0).text().replace("\u00a0", "")
                            .trim();
                    currentItem.put("item", titleToClean);
                }

                /**
                 * Check if date contains "Today", if so, change it to a date.
                 * Otherwise we will never know when usage ends, unless user refreshes, As 'today'
                 * is 'today', tomorrow.. see!
                 */
                String value1 = subsubelements.select("td").get(1).text();
                if (value1.equals("Today")) {
                    DateTimeFormatter formatter = DateTimeFormat.forPattern("dd/MM/yy").withLocale(Locale.UK);
                    DateTime dt = new DateTime(); // current datetime
                    value1 = "Expires " + formatter.print(dt);
                }
                currentItem.put("value1", value1);
                currentItem.put("value2", subsubelements.select("td").get(2).text());

                // Out of Bundle charges have an extra property
                if (currentItem.getString("item").startsWith("Internet")) {

                    // The pattern is applied to the raw page text, not to this row.
                    Pattern p1 = Pattern.compile(Constants.OUT_OF_BUNDLE_REGEX, Pattern.DOTALL);
                    Matcher m1 = p1.matcher(pageContent);

                    StringBuilder cleanedDate = new StringBuilder();
                    // matches() succeeds only if the regex covers the entire page content.
                    if (m1.matches()) {
                        // Join the three captured groups with single spaces.
                        cleanedDate.append(m1.group(1));
                        cleanedDate.append(' ');
                        cleanedDate.append(m1.group(2));
                        cleanedDate.append(' ');
                        cleanedDate.append(m1.group(3));
                        currentItem.put("value3", cleanedDate.toString());
                    }

                }
                jsonArray.put(currentItem);
            }

        }

        // reset the 3-to-3 call bug flag for next Element
        if (three2threeCallsBug) {
            three2threeCallsBug = false;
        }
    }

    return jsonArray;
}

From source file:controllers.WDCDProxy.java

/**
 * Proxies a What Design Can Do search and returns the hits as a JSON array.
 *
 * <p>Each {@code div.item} element becomes one map with the keys
 * {@code image}, {@code title}, {@code content}, {@code date} and
 * {@code url}. Date and content come from the article page, which is fetched
 * for every hit; an I/O failure while fetching propagates out of the mapping
 * function.
 *
 * @param query the search term; when empty, a JSON message is returned and no
 *              remote request is made
 * @return a promise of the JSON result
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned: previously it was created and dropped,
        // so an empty query still fell through to the remote search below.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.whatdesigncando.com/")
            .setQueryParameter("s", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();
            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("div.item");

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    Elements links = item.select("a");
                    String articleUrl = links.attr("href");

                    // The image URL is the text between the first pair of single
                    // quotes in the link's style attribute. Without both quotes
                    // substring() would throw, so fall back to an empty string.
                    String style = links.attr("style");
                    int openQuote = style.indexOf("'");
                    int closeQuote = style.indexOf("'", openQuote + 1);
                    String imageUrl = (openQuote >= 0 && closeQuote > openQuote)
                            ? style.substring(openQuote + 1, closeQuote)
                            : "";

                    keyValue.put("image", imageUrl);
                    keyValue.put("title", item.select("h3").text());

                    // Get date and the first sentence as "content" from each article separately (or the "sub-title" tag)
                    org.jsoup.nodes.Document articleDoc = Jsoup.connect(articleUrl).get();

                    // The date is whatever precedes the word "Published"; if the
                    // marker is absent keep the whole meta line instead of throwing.
                    String metaInfo = articleDoc.select("div#maincontent p.metainfo").text();
                    int publishedAt = metaInfo.indexOf("Published");
                    String datePublished = publishedAt >= 0 ? metaInfo.substring(0, publishedAt) : metaInfo;

                    String subTitle = articleDoc.select("div#maincontent p.sub-title").text();
                    String firstSentence;
                    if (subTitle.length() == 0) {
                        // No sub-title: use the text up to and including the first period.
                        String paragraphs = articleDoc.select("div#maincontent p:not(.metainfo)").text();
                        firstSentence = paragraphs.substring(0, paragraphs.indexOf(".") + 1) + ".";
                    } else {
                        firstSentence = subTitle + "..";
                    }

                    keyValue.put("content", firstSentence);
                    keyValue.put("date", datePublished);
                    keyValue.put("url", articleUrl);

                    results.add(keyValue);
                }

            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:controllers.TAXIProxy.java

/**
 * Proxies a DesignTAXI news search and returns the hits as a JSON array.
 *
 * <p>Each {@code div.news-cover} element becomes one map with the keys
 * {@code image}, {@code title} and {@code url}; {@code content} and
 * {@code date} are added only when the article page could be fetched.
 *
 * @param query the search term; when empty, a JSON message is returned and no
 *              remote request is made
 * @return a promise of the JSON result
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The promise must be returned: previously it was created and dropped,
        // so an empty query still fell through to the remote search below.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    final String baseUrl = "http://designtaxi.com/";

    F.Promise<WSResponse> wsResponsePromise = WS.url(baseUrl + "news-search.php")
            .setQueryParameter("news_keyword", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();
            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("div.news-cover");

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    // Title and article URL are both carried as attributes of the share button.
                    Elements shareButton = item.select("a.addthis_button_expanded");
                    String articleUrl = shareButton.attr("addthis:url");

                    keyValue.put("image", baseUrl + item.select("img").attr("src"));
                    keyValue.put("title", shareButton.attr("addthis:title"));

                    // Connect to each and every article to get date and first sentence as content
                    try {
                        org.jsoup.nodes.Document articleDoc = Jsoup.connect(articleUrl)
                                .userAgent("Mozilla").get();

                        // Content is the text up to and including the first period, plus a trailing dot.
                        String newsText = articleDoc.select("div#news-content").text();
                        keyValue.put("content", newsText.substring(0, newsText.indexOf(".") + 1) + ".");
                        keyValue.put("date", articleDoc.select("span.date").text());

                    } catch (IOException e) {
                        // Best effort: the hit is still returned without content/date.
                        System.out.println(e);
                    }

                    keyValue.put("url", articleUrl);

                    results.add(keyValue);
                }

            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:org.shareok.data.plosdata.PlosUtil.java

/**
 * Collects the acknowledgement text of a PLOS article page.
 *
 * <p>Locates the first {@code a[id=ack]} element, takes its parent, and
 * concatenates (without separator) the text of every {@code p} under that
 * parent that has text.
 *
 * @param html the page source
 * @return the concatenated paragraph text, or an empty string when the anchor,
 *         its parent, or any text-bearing paragraph is missing
 */
public static String getPlosAck(String html) {
    Document page = Jsoup.parse(html.toString());

    Elements anchors = page.select("a[id=ack]");
    if (anchors.isEmpty()) {
        return "";
    }

    Element container = anchors.first().parent();
    if (container == null) {
        return "";
    }

    StringBuilder collected = new StringBuilder();
    for (Element paragraph : container.select("p")) {
        if (paragraph.hasText()) {
            collected.append(paragraph.text());
        }
    }
    return collected.toString();
}

From source file:org.shareok.data.plosdata.PlosUtil.java

/**
 * Extracts the subject-area labels of a PLOS article page.
 *
 * <p>Looks at the second child of the first
 * {@code div[class=subject-areas-container]} element and, for every
 * {@code li} under it, takes the text of that item's first child element.
 * Containers with fewer than two children and list items without a child
 * element are skipped instead of raising an index error.
 *
 * @param html the string of the web page source
 * @return the subject labels in document order, or {@code null} when none
 *         were found
 */
public static String[] getSubjects(String html) {
    List<String> subjectsList = new ArrayList<>();

    Document doc = Jsoup.parse(html);
    Elements subjectListDiv = doc.select("div[class=subject-areas-container]");
    if (!subjectListDiv.isEmpty()) {
        Elements containerChildren = subjectListDiv.first().children();
        // The list is expected as the second child; child(1) would throw if absent.
        if (containerChildren.size() > 1) {
            Element subjectList = containerChildren.get(1);
            for (Element li : subjectList.select("li")) {
                Elements liChildren = li.children();
                if (!liChildren.isEmpty()) {
                    subjectsList.add(liChildren.first().text());
                }
            }
        }
    }

    // Callers receive null (not an empty array) when nothing was found.
    if (subjectsList.isEmpty()) {
        return null;
    }
    return subjectsList.toArray(new String[subjectsList.size()]);
}