List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:org.commonjava.indy.folo.ftest.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; final String track = "track"; content.store(track, hosted, STORE, path, stream); final IndyClientHttp http = getHttp(); final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.source-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }/*w w w.j av a2 s . c om*/ }
From source file:org.commonjava.indy.folo.ftest.urls.StoreOneAndVerifyInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; final String track = "track"; content.store(track, hosted, STORE, path, stream); final IndyClientHttp http = getHttp(); final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.item-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE, root, fname); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }//from w w w. j a v a2 s . c om }
From source file:com.kasabi.data.movies.dbpedia.DBPediaBaseLinker.java
/**
 * Queries the DBpedia keyword-search endpoint and returns the text of the first
 * {@code result > uri} element in the response.
 *
 * @param httpclient client used to execute the GET request
 * @param type       optional value for the {@code QueryClass} parameter; omitted when {@code null}
 * @param string     value for the {@code QueryString} parameter
 * @return the first URI found, or {@code null} when there is no match or the request fails
 */
protected String getURI(HttpClient httpclient, String type, String string) {
    try {
        String classParam = "";
        if (type != null) {
            classParam = "&QueryClass=" + URLEncoder.encode(type, "UTF-8");
        }
        String keywordParam = "?QueryString=" + URLEncoder.encode(string, "UTF-8");

        HttpGet lookup = new HttpGet(
                "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch" + keywordParam + classParam);
        String body = httpclient.execute(lookup, new BasicResponseHandler());

        Elements matches = Jsoup.parse(body).select("result > uri");
        return matches.isEmpty() ? null : matches.first().text();
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this single handler covers both cases.
        e.printStackTrace();
    }
    return null;
}
From source file:it.polito.tellmefirst.apimanager.ImageManager.java
public String scrapeImageFromPage(String pageURL) { LOG.debug("[scrapeImageFromPage] - BEGIN url=" + pageURL); long startTime = System.currentTimeMillis(); String result = DEFAULT_IMAGE; try {//from w w w. ja v a 2 s. c o m Document doc = Jsoup.connect(pageURL).get(); Element image = doc.select("img").first(); result = image.attr("src"); } catch (Exception e) { LOG.error("[scrapeImageFromPage] - EXCEPTION: ", e); } long endTime = System.currentTimeMillis(); long duration = (endTime - startTime) / 1000; //no prod LOG.debug("########### [scrapeImageFromPage] took " + duration + " seconds. ###########"); LOG.debug("[scrapeImageFromPage] - END"); return result; }
From source file:org.sonatype.nexus.testsuite.misc.nxcm4389.NXCM4389FavIconIT.java
private Document extractIELink(Document doc) { // assert that IE elements are in there as well final String head = doc.select("head").outerHtml(); final int start = head.indexOf("IE]>"); final int end = head.lastIndexOf("endif"); doc = Jsoup.parse(head.substring(start + 4, end)); return doc;//from w ww. j a v a 2s. c om }
From source file:com.techcavern.wavetact.ircCommands.dnsinfo.ISup.java
/**
 * Asks isup.me whether the site named in {@code args[0]} is reachable and sends the verdict
 * back through {@code IRCUtils.sendMessage}.
 *
 * <p>{@code args[0]} is rewritten in place with an {@code http://} prefix when it carries no
 * http or https scheme.
 *
 * @throws Exception if fetching the isup.me page fails
 */
@Override
public void onCommand(String command, User user, PircBotX network, String prefix, Channel channel,
        boolean isPrivate, int userPermLevel, String... args) throws Exception {
    final boolean hasScheme = args[0].startsWith("http://") || args[0].startsWith("https://");
    if (!hasScheme) {
        args[0] = "http://" + args[0];
    }

    final Document page = Jsoup.connect("http://www.isup.me/" + args[0])
            .userAgent(Registry.USER_AGENT)
            .get();
    final String containerText = page.select("#container").text();

    // Order matters: "not just you" also contains "just you", so test the longer phrase first.
    final String reply;
    if (containerText.contains("not just you")) {
        reply = "It's not just you! " + args[0]
                + " looks down from here too. (Please note that isup.me - the service we use - lacks IPv6 support, so this might not be entirely accurate)";
    } else if (containerText.contains("just you")) {
        reply = "It's just you. " + args[0] + " looks fine from here.";
    } else {
        reply = "isup.me can't find " + args[0]
                + " on the interwho. This might be because isup.me lacks IPv6 support or simply because you put in an invalid url.";
    }

    IRCUtils.sendMessage(user, network, channel, reply, prefix);
}
From source file:com.liato.bankdroid.banking.banks.AppeakPoker.java
/**
 * Opens the login target and reads the inner HTML of the cell selected by
 * {@code #content > table tr:eq(2) td:eq(1)} into {@code mChips}.
 *
 * @return the {@code urlopen} instance used for the request
 * @throws LoginException if the expected cell is missing from the response
 * @throws BankException  if the request fails with an I/O error
 */
@Override
public Urllib login() throws LoginException, BankException {
    try {
        LoginPackage loginPackage = preLogin();
        String page = urlopen.open(loginPackage.getLoginTarget());
        Element chipsCell = Jsoup.parse(page).select("#content > table tr:eq(2) td:eq(1)").first();
        if (chipsCell == null) {
            throw new LoginException(res.getText(R.string.invalid_username).toString());
        }
        mChips = chipsCell.html();
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this single handler covers both cases.
        throw new BankException(e.getMessage());
    }
    return urlopen;
}
From source file:org.brunocvcunha.taskerbox.impl.custom.hardmob.HardmobEmailAction.java
/**
 * Sends an e-mail whose body is the post URL followed by the HTML of every
 * {@code .postcontent} element on that page, each preceded by {@code <br>}.
 *
 * <p>If the page cannot be loaded the stack trace is printed and the e-mail is still sent
 * with whatever body was built so far.
 *
 * @param url       address of the post; also the first part of the e-mail body
 * @param postTitle title appended to the {@code "Hardmob - "} subject prefix
 */
@Override
public void spreadAction(final String url, String postTitle) {
    EmailAction email = getEmailAction();

    StringBuilder body = new StringBuilder();
    body.append(url);

    EmailValueVO emailVO = new EmailValueVO();
    emailVO.setTitle("Hardmob - " + postTitle);

    try {
        Document page = TaskerboxHttpBox.getInstance().getDocumentForURL(url);
        for (Element post : page.select(".postcontent")) {
            body.append("<br>").append(post.html());
        }
    } catch (IllegalStateException | IOException | URISyntaxException e) {
        // ClientProtocolException is an IOException, so it is handled here as well.
        e.printStackTrace();
    }

    emailVO.setBody(body.toString());
    email.action(emailVO);
}
From source file:com.switchfly.inputvalidation.sanitizer.StripHtmlSanitizer.java
/**
 * Strips markup from the given content and returns only its text.
 *
 * <p>{@code script}, {@code link}, {@code iframe} and {@code style} elements are removed
 * before the text is extracted, so their contents do not appear in the result.
 *
 * @param content raw input, possibly containing HTML
 * @return the input unchanged when it is blank, otherwise the text of the parsed document
 */
@Override
public String execute(String content) {
    if (StringUtils.isBlank(content)) {
        return content;
    }

    Document parsed = Jsoup.parse(content);
    parsed.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
    // Detach every matched element from the DOM in one call.
    parsed.select("script,link,iframe,style").remove();
    return parsed.text();
}
From source file:com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl.java
/**
 * Collects the target of every {@code a[href]} element in the document into {@code anchors}.
 * Each target is read as {@code abs:href} and truncated at the first {@code #}.
 *
 * @param doc     document to scan for links
 * @param baseUri base URI applied to each link before resolution; left untouched when {@code null}
 */
private void setAnchors(Document doc, String baseUri) {
    for (Element anchor : doc.select("a[href]")) {
        if (baseUri != null) {
            anchor.setBaseUri(baseUri);
        }

        String target = anchor.attr("abs:href");
        int fragmentStart = target.indexOf('#');
        if (fragmentStart != -1) {
            target = target.substring(0, fragmentStart);
        }

        anchors.add(target);
    }
}