Example usage for org.jsoup.nodes.Document.select(String)

Introduction

This page collects example usages of the org.jsoup.nodes.Document.select(String cssQuery) method.

Prototype

public Elements select(String cssQuery)

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:org.commonjava.indy.folo.ftest.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java

/**
 * Stores one file through the tracked content endpoint, requests the HTML listing of its
 * parent directory, and checks that every {@code a.source-link} anchor in that listing
 * points at the URL returned by {@code client.content().contentUrl(hosted, STORE)}.
 *
 * @throws Exception if storing the file or fetching the listing fails
 */
@Test
public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception {
    final byte[] data = "this is a test".getBytes();
    final ByteArrayInputStream stream = new ByteArrayInputStream(data);
    final String root = "/path/to/";
    final String path = root + "foo.txt";
    final String track = "track";

    content.store(track, hosted, STORE, path, stream);

    final IndyClientHttp http = getHttp();

    final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root));

    // Ask for the HTML rendering of the directory listing.
    request.addHeader("Accept", "text/html");

    final String html;
    // try-with-resources releases the client and the response even when reading the body throws.
    try (CloseableHttpClient hc = http.newClient();
            CloseableHttpResponse response = hc.execute(request)) {
        final InputStream listing = response.getEntity().getContent();
        // TODO: Charset!! IOUtils.toString(InputStream) decodes with the platform default charset.
        html = IOUtils.toString(listing);
    }

    final Document doc = Jsoup.parse(html);
    for (final Element item : doc.select("a.source-link")) {
        final String fname = item.text();
        System.out.printf("Listing contains: '%s'\n", fname);
        final String href = item.attr("href");
        final String expected = client.content().contentUrl(hosted, STORE);

        // NOTE(review): jsoup documents attr() as returning "" for a missing attribute, so this
        // null check may never fail - confirm whether an emptiness check was intended.
        assertThat(fname + " does not have a href", href, notNullValue());
        assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName()
                + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected));
    }
}

From source file:org.commonjava.indy.folo.ftest.urls.StoreOneAndVerifyInHtmlListingTest.java

/**
 * Stores one file through the tracked content endpoint, requests the HTML listing of its
 * parent directory, and checks that every {@code a.item-link} anchor in that listing points
 * at the URL returned by {@code client.content().contentUrl(hosted, STORE, root, fname)},
 * where {@code fname} is the anchor's text.
 *
 * @throws Exception if storing the file or fetching the listing fails
 */
@Test
public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception {
    final byte[] data = "this is a test".getBytes();
    final ByteArrayInputStream stream = new ByteArrayInputStream(data);
    final String root = "/path/to/";
    final String path = root + "foo.txt";
    final String track = "track";

    content.store(track, hosted, STORE, path, stream);

    final IndyClientHttp http = getHttp();

    final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root));

    // Ask for the HTML rendering of the directory listing.
    request.addHeader("Accept", "text/html");

    final String html;
    // try-with-resources releases the client and the response even when reading the body throws.
    try (CloseableHttpClient hc = http.newClient();
            CloseableHttpResponse response = hc.execute(request)) {
        final InputStream listing = response.getEntity().getContent();
        // TODO: Charset!! IOUtils.toString(InputStream) decodes with the platform default charset.
        html = IOUtils.toString(listing);
    }

    final Document doc = Jsoup.parse(html);
    for (final Element item : doc.select("a.item-link")) {
        final String fname = item.text();
        System.out.printf("Listing contains: '%s'\n", fname);
        final String href = item.attr("href");
        final String expected = client.content().contentUrl(hosted, STORE, root, fname);

        // NOTE(review): jsoup documents attr() as returning "" for a missing attribute, so this
        // null check may never fail - confirm whether an emptiness check was intended.
        assertThat(fname + " does not have a href", href, notNullValue());
        assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName()
                + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected));
    }
}

From source file:com.kasabi.data.movies.dbpedia.DBPediaBaseLinker.java

/**
 * Queries the DBpedia keyword-search endpoint and returns the text of the first
 * {@code result > uri} element found in the response.
 *
 * @param httpclient client used to execute the GET request
 * @param type value for the {@code QueryClass} parameter; the parameter is omitted when
 *        this is {@code null}
 * @param string value for the {@code QueryString} parameter
 * @return the text of the first matching element, or {@code null} when nothing matched or
 *         the request failed with an I/O error (the stack trace is printed in that case)
 */
protected String getURI(HttpClient httpclient, String type, String string) {
    try {
        final String classParam = (type == null) ? "" : "&QueryClass=" + URLEncoder.encode(type, "UTF-8");
        final String keywordParam = "?QueryString=" + URLEncoder.encode(string, "UTF-8");
        final HttpGet lookup = new HttpGet(
                "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch" + keywordParam + classParam);

        final ResponseHandler<String> asString = new BasicResponseHandler();
        final String body = httpclient.execute(lookup, asString);

        final Elements matches = Jsoup.parse(body).select("result > uri");
        return matches.isEmpty() ? null : matches.first().text();
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so one handler covers both original arms.
        e.printStackTrace();
        return null;
    }
}

From source file:it.polito.tellmefirst.apimanager.ImageManager.java

/**
 * Fetches the page at {@code pageURL} and returns the {@code src} attribute of the first
 * {@code img} element found in it.
 *
 * @param pageURL address of the page to fetch
 * @return the {@code src} value of the first image, or {@code DEFAULT_IMAGE} when the page
 *         contains no {@code img} element or fetching/parsing fails
 */
public String scrapeImageFromPage(String pageURL) {

    LOG.debug("[scrapeImageFromPage] - BEGIN url=" + pageURL);
    long startTime = System.currentTimeMillis();
    String result = DEFAULT_IMAGE;
    try {
        Document doc = Jsoup.connect(pageURL).get();
        Element image = doc.select("img").first();
        // first() yields null when nothing matched; check it explicitly instead of letting
        // a NullPointerException be reported as an error by the catch below.
        if (image != null) {
            result = image.attr("src");
        } else {
            LOG.debug("[scrapeImageFromPage] - no img element found, keeping default image");
        }
    } catch (Exception e) {
        // Best effort: any failure leaves the default image in place.
        LOG.error("[scrapeImageFromPage] - EXCEPTION: ", e);
    }
    long endTime = System.currentTimeMillis();
    // Integer division: the duration is reported in whole seconds.
    long duration = (endTime - startTime) / 1000;
    //no prod
    LOG.debug("########### [scrapeImageFromPage] took " + duration + " seconds. ###########");
    LOG.debug("[scrapeImageFromPage] - END");
    return result;
}

From source file:org.sonatype.nexus.testsuite.misc.nxcm4389.NXCM4389FavIconIT.java

/**
 * Extracts the markup between the first {@code IE]>} marker and the last {@code endif}
 * marker in the serialized {@code head} of {@code doc}, and parses it as a new document.
 *
 * @param doc document whose {@code head} is searched
 * @return a document parsed from the text between the two markers
 * @throws IllegalStateException if either marker is missing or they appear out of order
 */
private Document extractIELink(Document doc) {
    final String marker = "IE]>";
    final String head = doc.select("head").outerHtml();
    final int start = head.indexOf(marker);
    final int end = head.lastIndexOf("endif");
    final int contentStart = start + marker.length();
    // Without this check a missing opening marker would silently parse a bogus slice starting
    // at offset 3, and a missing closing marker would fail with an opaque index error.
    if (start < 0 || end < contentStart) {
        throw new IllegalStateException("No IE conditional block found in head: " + head);
    }
    return Jsoup.parse(head.substring(contentStart, end));
}

From source file:com.techcavern.wavetact.ircCommands.dnsinfo.ISup.java

/**
 * Checks the site named in {@code args[0]} against isup.me and sends the verdict back
 * through {@code IRCUtils.sendMessage}.
 *
 * <p>When {@code args[0]} has no {@code http://} or {@code https://} prefix, it is rewritten
 * in place with {@code http://} prepended.
 *
 * @throws Exception if the isup.me page cannot be fetched
 */
@Override
public void onCommand(String command, User user, PircBotX network, String prefix, Channel channel,
        boolean isPrivate, int userPermLevel, String... args) throws Exception {
    final boolean hasScheme = args[0].startsWith("http://") || args[0].startsWith("https://");
    if (!hasScheme) {
        args[0] = "http://" + args[0];
    }
    final String site = args[0];

    final String verdict = Jsoup.connect("http://www.isup.me/" + site)
            .userAgent(Registry.USER_AGENT)
            .get()
            .select("#container")
            .text();

    final String reply;
    // "not just you" must be tested first because it also contains "just you".
    if (verdict.contains("not just you")) {
        reply = "It's not just you! " + site
                + " looks down from here too. (Please note that isup.me - the service we use - lacks IPv6 support, so this might not be entirely accurate)";
    } else if (verdict.contains("just you")) {
        reply = "It's just you. " + site + " looks fine from here.";
    } else {
        reply = "isup.me can't find " + site
                + " on the interwho. This might be because isup.me lacks IPv6 support or simply because you put in an invalid url.";
    }

    IRCUtils.sendMessage(user, network, channel, reply, prefix);
}

From source file:com.liato.bankdroid.banking.banks.AppeakPoker.java

/**
 * Opens the login target produced by {@code preLogin()}, and stores the inner HTML of the
 * first element matching {@code #content > table tr:eq(2) td:eq(1)} in {@code mChips}.
 *
 * @return the {@code urlopen} instance used for the request
 * @throws LoginException if the expected table cell is not present in the response
 * @throws BankException if the request fails with an I/O error
 */
@Override
public Urllib login() throws LoginException, BankException {
    try {
        final LoginPackage loginPackage = preLogin();
        final String page = urlopen.open(loginPackage.getLoginTarget());
        final Element chipsCell = Jsoup.parse(page).select("#content > table tr:eq(2) td:eq(1)").first();
        if (chipsCell == null) {
            throw new LoginException(res.getText(R.string.invalid_username).toString());
        }
        mChips = chipsCell.html();
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so one handler covers both original arms.
        throw new BankException(e.getMessage());
    }
    return urlopen;
}

From source file:org.brunocvcunha.taskerbox.impl.custom.hardmob.HardmobEmailAction.java

/**
 * Sends an e-mail whose body is {@code url} followed by the inner HTML of every
 * {@code .postcontent} element of the document fetched for that URL, each preceded by
 * {@code <br>}.
 *
 * <p>If fetching the document fails, the stack trace is printed and the e-mail is still
 * sent with whatever body has been collected so far.
 *
 * @param url address of the post; also the first part of the e-mail body
 * @param postTitle title appended to the {@code "Hardmob - "} subject prefix
 */
@Override
public void spreadAction(final String url, String postTitle) {
    EmailAction email = getEmailAction();

    // The builder never leaves this method, so the unsynchronized StringBuilder is sufficient.
    StringBuilder body = new StringBuilder();
    body.append(url);
    EmailValueVO emailVO = new EmailValueVO();
    emailVO.setTitle("Hardmob - " + postTitle);

    try {
        Document doc = TaskerboxHttpBox.getInstance().getDocumentForURL(url);

        for (Element post : doc.select(".postcontent")) {
            body.append("<br>");
            body.append(post.html());
        }
    } catch (IllegalStateException | IOException | URISyntaxException e) {
        // ClientProtocolException is an IOException and is therefore handled here as well.
        e.printStackTrace();
    }

    emailVO.setBody(body.toString());
    email.action(emailVO);

}

From source file:com.switchfly.inputvalidation.sanitizer.StripHtmlSanitizer.java

/**
 * Parses {@code content} as HTML, removes every {@code script}, {@code link},
 * {@code iframe} and {@code style} element, and returns the text of what remains.
 *
 * @param content markup to sanitize
 * @return {@code content} unchanged when it is blank, otherwise the text of the parsed
 *         document after the listed elements have been removed
 */
@Override
public String execute(String content) {
    if (StringUtils.isBlank(content)) {
        return content;
    }

    final Document parsed = Jsoup.parse(content);
    parsed.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
    // Removes each matched element from the document tree in one call.
    parsed.select("script,link,iframe,style").remove();
    return parsed.text();
}

From source file:com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl.java

/**
 * Adds the resolved {@code abs:href} value of every {@code a[href]} element in the
 * document to {@code anchors}, cutting each value at its first {@code #} when one is
 * present.
 *
 * @param doc document whose links are scanned
 * @param baseUri when non-null, set as the base URI of each link before its
 *        {@code abs:href} value is read
 */
private void setAnchors(Document doc, String baseUri) {
    for (Element anchor : doc.select("a[href]")) {
        if (baseUri != null) {
            anchor.setBaseUri(baseUri);
        }

        String target = anchor.attr("abs:href");
        final int hashAt = target.indexOf("#");
        if (hashAt != -1) {
            target = target.substring(0, hashAt);
        }

        anchors.add(target);
    }
}