Example usage for org.jsoup.nodes Document select

List of usage examples for org.jsoup.nodes Document select

Introduction

This page collects example usages of the select method of org.jsoup.nodes.Document.

Prototype

public Elements select(String cssQuery) 

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:controllers.TAXIProxy.java

/**
 * Searches designtaxi.com news for {@code query} and renders the scraped hits as JSON.
 *
 * <p>Each hit is a map with the keys {@code image}, {@code title}, {@code content},
 * {@code date} and {@code url}, inserted in that order. {@code content} and {@code date}
 * are read from the article page itself and are left out when fetching that page fails
 * with an {@link IOException}.
 *
 * @param query search keyword; when {@code StringUtils.isEmpty} reports it empty, no
 *              search is performed
 * @return a promise of an {@code ok} response holding either the JSON list of hits or,
 *         for an empty query, a JSON string saying the parameter is missing
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The error promise must be returned: it used to be created and dropped, so the
        // remote search was still executed with an empty keyword.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    final String baseUrl = "http://designtaxi.com/";

    F.Promise<WSResponse> wsResponsePromise = WS.url(baseUrl + "news-search.php")
            .setQueryParameter("news_keyword", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();
            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("div.news-cover");

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    // The share button carries both the article title and its URL.
                    Elements shareLink = item.select("a.addthis_button_expanded");
                    String articleUrl = shareLink.attr("addthis:url");

                    keyValue.put("image", baseUrl + item.select("img").attr("src"));
                    keyValue.put("title", shareLink.attr("addthis:title"));

                    // Fetch the article page to obtain its date and leading sentence.
                    try {
                        org.jsoup.nodes.Document articleDoc = Jsoup.connect(articleUrl)
                                .userAgent("Mozilla").get();

                        // Text up to and including the first '.', followed by one more '.'.
                        String articleText = articleDoc.select("div#news-content").text();
                        keyValue.put("content",
                                articleText.substring(0, articleText.indexOf(".") + 1) + ".");
                        keyValue.put("date", articleDoc.select("span.date").text());

                    } catch (IOException e) {
                        // Best effort: the hit is still returned, just without content/date.
                        System.out.println(e);
                    }

                    keyValue.put("url", articleUrl);

                    results.add(keyValue);
                }

            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:controllers.WDCDProxy.java

/**
 * Searches whatdesigncando.com for {@code query} and renders the scraped hits as JSON.
 *
 * <p>Each hit is a map with the keys {@code image}, {@code title}, {@code content},
 * {@code date} and {@code url}, inserted in that order. {@code content} and {@code date}
 * are read from the linked article page.
 *
 * @param query search keyword; when {@code StringUtils.isEmpty} reports it empty, no
 *              search is performed
 * @return a promise of an {@code ok} response holding either the JSON list of hits or,
 *         for an empty query, a JSON string saying the parameter is missing
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The error promise must be returned: it used to be created and dropped, so the
        // remote search was still executed with an empty keyword.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    F.Promise<WSResponse> wsResponsePromise = WS.url("http://www.whatdesigncando.com/")
            .setQueryParameter("s", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();
            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Elements items = doc.select("div.item");

                for (Element item : items) {
                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    Elements link = item.select("a");
                    String articleUrl = link.attr("href");

                    // The image URL is the text between the first pair of single quotes
                    // in the anchor's style attribute.
                    String style = link.attr("style");
                    int openQuote = style.indexOf("'");
                    int closeQuote = style.indexOf("'", openQuote + 1);

                    keyValue.put("image", style.substring(openQuote + 1, closeQuote));
                    keyValue.put("title", item.select("h3").text());

                    // Date and leading sentence come from the article page itself.
                    org.jsoup.nodes.Document articleDoc = Jsoup.connect(articleUrl).get();

                    // The date is whatever precedes the word "Published" in the meta line.
                    String metaInfo = articleDoc.select("div#maincontent p.metainfo").text();
                    String datePublished = metaInfo.substring(0, metaInfo.indexOf("Published"));

                    String subTitle = articleDoc.select("div#maincontent p.sub-title").text();
                    String firstSentence;
                    if (subTitle.length() == 0) {
                        // No sub-title: take the body text through its first '.', plus one '.'.
                        String paragraphs = articleDoc.select("div#maincontent p:not(.metainfo)").text();
                        firstSentence = paragraphs.substring(0, paragraphs.indexOf(".") + 1) + ".";
                    } else {
                        firstSentence = subTitle + "..";
                    }

                    keyValue.put("content", firstSentence);
                    keyValue.put("date", datePublished);
                    keyValue.put("url", articleUrl);

                    results.add(keyValue);
                }

            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:com.vaadin.sass.testcases.scss.W3ConformanceTests.java

/**
 * Downloads the index page at {@code url} and collects the links whose {@code href}
 * matches {@code regexp}.
 *
 * <p>Relative links are resolved against {@code url}. Links contained in
 * {@code excludeUrls} are skipped, and collection stops once {@code maxTests} links
 * have been gathered. Encounter order is preserved and duplicates are dropped.
 *
 * @param url         address of the index page
 * @param regexp      regular expression placed into an {@code a[href~=...]} selector
 * @param maxTests    upper bound on the number of links returned
 * @param excludeUrls links that must not be returned
 * @return the collected links
 * @throws Exception if the URL or an href is not a valid URI, or the page cannot be fetched
 */
protected static Collection<URI> scrapeIndexForTests(String url, String regexp, int maxTests,
        Collection<URI> excludeUrls) throws Exception {

    final URI indexUri = new URI(url);
    final String linkSelector = String.format("a[href~=%s]", regexp);
    final Elements anchors = Jsoup.connect(url).timeout(10000).get().select(linkSelector);

    final LinkedHashSet<URI> collected = new LinkedHashSet<URI>();
    for (Element anchor : anchors) {
        final URI href = new URI(anchor.attr("href"));
        final URI candidate = href.isAbsolute() ? href : indexUri.resolve(href);

        if (collected.size() >= maxTests) {
            break;
        }
        if (!excludeUrls.contains(candidate)) {
            collected.add(candidate);
        }
    }

    return collected;
}

From source file:controllers.CNNProxy.java

/**
 * Queries the CNN search endpoint for {@code query} and renders the hits as JSON.
 *
 * <p>The endpoint answers with HTML that embeds a JSON document inside
 * {@code textarea#jsCode}; the entries of the first array under its {@code results}
 * key are mapped to the keys {@code image}, {@code title}, {@code content},
 * {@code date} and {@code url}, inserted in that order.
 *
 * @param query search text; when {@code StringUtils.isEmpty} reports it empty, no
 *              search is performed
 * @return a promise of an {@code ok} response holding either the JSON list of hits
 *         (empty when the response carries no embedded JSON) or, for an empty query,
 *         a JSON string saying the parameter is missing
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The error promise must be returned: it used to be created and dropped, so the
        // remote search was still executed with an empty text parameter.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter  (q) not provided "));
            }
        });
    }

    String feedUrl = "http://searchapp.cnn.com/search/query.jsp";

    String page = "1";
    String npp = "10";
    String start = "1";
    String type = "all";
    String bucket = "true";
    String sort = "relevance";
    String csiID = "csi1";

    F.Promise<WSResponse> wsResponsePromise = WS.url(feedUrl).setQueryParameter("page", page)
            .setQueryParameter("npp", npp).setQueryParameter("start", start).setQueryParameter("text", query)
            .setQueryParameter("type", type).setQueryParameter("bucket", bucket).setQueryParameter("sort", sort)
            .setQueryParameter("csiID", csiID).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();

            List<Map<String, String>> ret = new ArrayList<Map<String, String>>();

            try {
                // The JSON payload is embedded in a textarea of the HTML response.
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                Element resultElement = doc.select("textarea#jsCode").first();
                if (resultElement == null) {
                    // first() yields null when nothing matches; answer with no hits
                    // instead of failing on a NullPointerException.
                    return ok(Json.toJson(ret));
                }
                String resultString = resultElement.text();

                // "results" holds an array whose first entry is the array of hits.
                JSONObject obj = new JSONObject(resultString);
                JSONArray array = new JSONArray(obj.get("results").toString());
                JSONArray internalArray = new JSONArray(array.get(0).toString());

                for (int i = 0; i < internalArray.length(); i++) {
                    JSONObject elementObj = new JSONObject(internalArray.get(i).toString());

                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    keyValue.put("image", elementObj.get("thumbnail").toString());
                    keyValue.put("title", elementObj.get("title").toString());
                    keyValue.put("content", elementObj.get("description").toString());
                    keyValue.put("date", elementObj.get("mediaDateUts").toString());
                    keyValue.put("url", elementObj.get("url").toString());

                    ret.add(keyValue);
                }
            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(ret));

        }
    });
}

From source file:com.geecko.QuickLyric.lyrics.Genius.java

/**
 * Downloads a Genius lyrics page and wraps its lyrics in a {@code Lyrics} object.
 *
 * <p>The lyrics are taken from {@code div.lyrics}, stripped of every tag except
 * {@code br}. When {@code artist} is null, both artist and title are read from the
 * page ({@code text_artist} / {@code text_title} classes) instead of the arguments.
 *
 * @param url    address of the lyrics page
 * @param artist artist name, or null to read artist and title from the page
 * @param title  song title; replaced by the page's title when {@code artist} is null
 * @return a {@code NO_RESULT} object on an HTTP status error, an {@code ERROR} object on
 *         other I/O failures or when the page has no lyrics block, a
 *         {@code NEGATIVE_RESULT} object when the text is exactly "[Instrumental]",
 *         otherwise a {@code POSITIVE_RESULT} object
 */
public static Lyrics fromURL(String url, String artist, String title) {
    final Document page;
    final String lyricsText;
    try {
        page = Jsoup.connect(url).get();
        final Elements lyricsBlock = page.select("div.lyrics");
        if (lyricsBlock.isEmpty()) {
            // Routed through the shared error handler below.
            throw new StringIndexOutOfBoundsException();
        }
        final String cleaned = Jsoup.clean(lyricsBlock.html(), Whitelist.none().addTags("br"));
        lyricsText = cleaned.trim();
    } catch (HttpStatusException e) {
        return new Lyrics(Lyrics.NO_RESULT);
    } catch (IOException | StringIndexOutOfBoundsException e) {
        e.printStackTrace();
        return new Lyrics(Lyrics.ERROR);
    }

    if (artist == null) {
        title = page.getElementsByClass("text_title").get(0).text();
        artist = page.getElementsByClass("text_artist").get(0).text();
    }

    final boolean instrumental = "[Instrumental]".equals(lyricsText);
    final Lyrics lyrics = new Lyrics(instrumental ? Lyrics.NEGATIVE_RESULT : Lyrics.POSITIVE_RESULT);
    lyrics.setArtist(artist);
    lyrics.setTitle(title);
    lyrics.setText(lyricsText);
    lyrics.setURL(url);
    lyrics.setSource("Genius");
    return lyrics;
}

From source file:com.megatome.j2d.support.JavadocSupport.java

/**
 * Parses {@code f} as UTF-8 HTML and returns every anchor element it contains.
 *
 * @param f the HTML file to parse
 * @return all {@code a} elements of the document
 * @throws BuilderException if the file cannot be read
 */
private static Elements loadAndFindLinks(final File f) throws BuilderException {
    final Document parsed;
    try {
        parsed = Jsoup.parse(f, "UTF-8");
    } catch (IOException ioe) {
        throw new BuilderException("Failed to index javadoc files", ioe);
    }
    return parsed.select("a");
}

From source file:controllers.CNBCProxy.java

/**
 * Queries the CNBC search page for {@code query} and renders the scraped hits as JSON.
 *
 * <p>Each hit is a map with the keys {@code title}, {@code content}, {@code date} and
 * {@code url}, inserted in that order. The date is built from the digits found in the
 * hit's inline script, passed through {@code CalculateDateFormat}.
 *
 * @param query search keywords; when {@code StringUtils.isEmpty} reports it empty, no
 *              search is performed
 * @return a promise of an {@code ok} response holding either the JSON list of hits or,
 *         for an empty query, a JSON string saying the parameter is missing
 */
public static F.Promise<Result> index(String query) {

    if (StringUtils.isEmpty(query)) {
        // The error promise must be returned: it used to be created and dropped, so the
        // remote search was still executed with empty keywords.
        return F.Promise.promise(new F.Function0<Result>() {
            @Override
            public Result apply() throws Throwable {
                return ok(Json.toJson("Query parameter (q) not provided "));
            }
        });
    }

    String target = "all";
    String categories = "exclude";
    String partnerId = "2000";
    // Resulting query string: ?target=all&categories=exclude&partnerId=2000&keywords=<query>
    F.Promise<WSResponse> wsResponsePromise = WS.url("http://search.cnbc.com/main.do")
            .setQueryParameter("target", target).setQueryParameter("categories", categories)
            .setQueryParameter("partnerId", partnerId).setQueryParameter("keywords", query).get();

    return wsResponsePromise.map(new F.Function<WSResponse, Result>() {
        @Override
        public Result apply(WSResponse wsResponse) throws Throwable {

            String body = wsResponse.getBody();

            List<Map<String, String>> results = new ArrayList<Map<String, String>>();

            try {
                org.jsoup.nodes.Document doc = Jsoup.parse(body);
                // Divs carrying both "padL" and "padR" classes, but not "clr".
                Elements items = doc.select("div:not(.clr).padL.padR");

                for (Element item : items) {

                    Map<String, String> keyValue = new LinkedHashMap<String, String>();

                    Elements link = item.select("a");

                    // Only the digits of the inline script are kept and parsed as a long.
                    String scriptDigits = item.getElementsByTag("script").html().replaceAll("[^0-9]", "");

                    keyValue.put("title", link.text());
                    keyValue.put("content", item.select("span.cnbc_bio_content").text());
                    keyValue.put("date", CalculateDateFormat(Long.parseLong(scriptDigits, 10)));
                    keyValue.put("url", link.attr("href"));

                    results.add(keyValue);
                }
            } catch (DOMException e) {
                e.printStackTrace();
            }

            return ok(Json.toJson(results));
        }
    });
}

From source file:er.java

/**
 * Extracts the title and body text of a blog page using jsoup.
 *
 * @param html the HTML markup of the page
 * @return a two-element array: index 0 holds the concatenated text of all
 *         {@code h3.title-article>strong} elements, index 1 the concatenated text of
 *         all {@code div#blogContent} elements; either is empty when nothing matches
 */
private static String[] htmlToPlainText(String html) {
    Document parsed = Jsoup.parse(html);

    // Title of the blog post.
    StringBuilder titleText = new StringBuilder();
    for (Element heading : parsed.select("h3.title-article>strong")) {
        titleText.append(heading.text());
    }

    // Body of the blog post.
    StringBuilder bodyText = new StringBuilder();
    for (Element section : parsed.select("div#blogContent")) {
        bodyText.append(section.text());
    }

    return new String[] { titleText.toString(), bodyText.toString() };
}

From source file:com.geecko.QuickLyric.lyrics.LyricWiki.java

/**
 * Downloads a LyricWiki page and wraps its lyrics in a {@code Lyrics} object.
 *
 * <p>The lyrics are the HTML of the first {@code div.lyricBox} up to its first HTML
 * comment, with tags removed and newlines turned into {@code <br />}. A missing artist
 * or song is derived from the part of {@code url} after its first 24 characters, split
 * at the first ':' (underscores become spaces), then URL-decoded.
 *
 * @param url    address of the lyrics page
 * @param artist artist name, or null to derive it from {@code url}
 * @param song   song title, or null to derive it from {@code url}
 * @return {@code NO_RESULT} for edit pages or lyrics shorter than three characters,
 *         {@code ERROR} when the page cannot be fetched or lacks the expected markup,
 *         {@code NEGATIVE_RESULT} for unlicensed or instrumental songs, otherwise a
 *         {@code POSITIVE_RESULT} carrying the lyrics
 */
public static Lyrics fromURL(String url, String artist, String song) {
    if (url.endsWith("action=edit")) {
        return new Lyrics(NO_RESULT);
    }

    String text;
    try {
        Document page = Jsoup.connect(url).get();
        Element box = page.select("div.lyricBox").get(0);
        // Move the box's first child node out so that it follows the box instead.
        box.after(box.childNode(0));

        String boxHtml = box.html();
        String beforeComment = boxHtml.substring(0, boxHtml.indexOf("<!--"));
        String withoutTags = beforeComment.replaceAll("<.*?>", "");
        text = withoutTags.replaceAll("\n", "<br />");
        if (text.contains("&#")) {
            text = Parser.unescapeEntities(text, true);
        }
    } catch (IndexOutOfBoundsException | IOException e) {
        e.printStackTrace();
        return new Lyrics(ERROR);
    }

    if (artist == null || song == null) {
        String[] nameParts = url.substring(24).replace("Gracenote:", "").split(":", 2);
        if (artist == null) {
            artist = nameParts[0].replace('_', ' ');
        }
        if (song == null) {
            song = nameParts[1].replace('_', ' ');
        }
    }

    try {
        artist = URLDecoder.decode(artist, "UTF-8");
        song = URLDecoder.decode(song, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }

    boolean unlicensed = text.contains(
            "Unfortunately, we are not licensed to display the full lyrics for this song at the moment.");
    if (unlicensed || text.equals("Instrumental <br />")) {
        Lyrics negative = new Lyrics(NEGATIVE_RESULT);
        negative.setArtist(artist);
        negative.setTitle(song);
        return negative;
    }

    // Also covers the empty string.
    if (text.length() < 3) {
        return new Lyrics(NO_RESULT);
    }

    Lyrics positive = new Lyrics(POSITIVE_RESULT);
    positive.setArtist(artist);
    positive.setTitle(song);
    positive.setText(text);
    positive.setSource("LyricsWiki");
    positive.setURL(url);
    return positive;
}

From source file:controllers.NWProxy.java

/**
 * Fetches {@code url} and follows HTML meta-refresh redirects until a page without one
 * is reached.
 *
 * <p>The redirect target is the part of the meta {@code content} attribute after its
 * first '=', with single quotes removed, appended to {@code http://www.newsweek.com}.
 *
 * @param url address of the page to fetch
 * @return the first fetched document whose meta tags do not announce a refresh
 * @throws IOException if a page cannot be fetched
 */
private static org.jsoup.nodes.Document RedirectionHandler(String url) throws IOException {

    final org.jsoup.nodes.Document fetched = Jsoup.connect(url).get();
    final Elements metaTags = fetched.select("html head meta");

    if (!metaTags.attr("http-equiv").contains("refresh")) {
        return fetched;
    }

    final String refreshContent = metaTags.attr("content");
    final String targetPath = refreshContent.substring(refreshContent.indexOf("=") + 1).replaceAll("'", "");
    return RedirectionHandler("http://www.newsweek.com" + targetPath);
}