List of usage examples for org.jsoup.nodes.Element.attr
public String attr(String attributeKey)
From source file:org.keycloak.testsuite.util.saml.UpdateProfileBuilder.java
/**
 * Builds the HTTP request that submits the first {@code <form>} found in the given page.
 *
 * <p>Only {@code <input>} elements whose {@code name} is a key of {@code this.parameters}
 * are submitted, using the configured value. Configured names that do not appear in the
 * form are logged as a warning. A form whose {@code method} is "post" (case-insensitive)
 * yields an {@link HttpPost} with a UTF-8 URL-encoded body; any other form yields an
 * {@link HttpGet} with the values appended as query parameters.
 *
 * @param loginPage  HTML of the page containing the form
 * @param currentURI not used by this implementation
 * @return the request targeting the form's {@code action}
 * @throws IllegalArgumentException if the page contains no {@code <form>} element
 */
public HttpUriRequest handleUpdateProfile(String loginPage, URI currentURI) {
    org.jsoup.nodes.Document updateProfilePage = Jsoup.parse(loginPage);
    Set<String> notYetSeen = new HashSet<>(this.parameters.keySet());
    List<NameValuePair> formFields = new LinkedList<>();

    // The loop body always returns, so only the first form is ever processed.
    for (Element form : updateProfilePage.getElementsByTag("form")) {
        String method = form.attr("method");
        String action = form.attr("action");

        for (Element input : form.getElementsByTag("input")) {
            String fieldName = input.attr("name");
            if (!this.parameters.containsKey(fieldName)) {
                continue;
            }
            formFields.add(new BasicNameValuePair(fieldName, this.parameters.get(fieldName)));
            notYetSeen.remove(fieldName);
        }

        if (!notYetSeen.isEmpty()) {
            LOG.warnf("Unused parameter names at Update Profile page: %s", notYetSeen);
        }

        if (method == null || !"post".equalsIgnoreCase(method)) {
            UriBuilder query = UriBuilder.fromPath(action);
            for (NameValuePair field : formFields) {
                query.queryParam(field.getName(), field.getValue());
            }
            return new HttpGet(query.build());
        }

        UrlEncodedFormEntity body;
        try {
            body = new UrlEncodedFormEntity(formFields, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
        HttpPost post = new HttpPost(action);
        post.setEntity(body);
        return post;
    }

    throw new IllegalArgumentException("Invalid update profile form: " + loginPage);
}
From source file:com.github.brandtg.pantopod.crawler.CrawlingEventHandler.java
@Override public Set<CrawlEvent> handle(CrawlEvent event) throws Exception { Set<CrawlEvent> nextEvents = new HashSet<>(); // Get url//from w w w. ja v a 2s. c om URI url = URI.create(event.getUrl()); Document dom = null; boolean created = false; if (!checkErrors || !hasError(url)) { HttpGet req = new HttpGet(url); HttpResponse res = httpClient.execute(req); try { if (res.getStatusLine().getStatusCode() == 200) { byte[] domBytes = IOUtils.toByteArray(res.getEntity().getContent()); created = handleData(url, domBytes); dom = Jsoup.parse(new String(domBytes)); } else { LOG.error("Error for {} #=> {}", url, res.getStatusLine().getStatusCode()); markError(url, res.getStatusLine().getStatusCode()); } } finally { if (res.getEntity() != null) { EntityUtils.consumeQuietly(res.getEntity()); } } } // Extract links if ((created || traverseDuplicates) && dom != null) { for (Element element : dom.select("a")) { String href = element.attr("href"); if (href != null) { URI nextUri = getNextUri(url, href, event.getChroot()); if (shouldExplore(nextUri) && isSameDomain(url, nextUri) && isDifferentPage(url, nextUri)) { CrawlEvent nextEvent = new CrawlEvent(event); nextEvent.setUrl(nextUri.toString()); nextEvent.setParentUrl(event.getUrl()); nextEvent.setDepth(event.getDepth() + 1); nextEvents.add(nextEvent); LOG.debug("Exploring {}", nextUri); } else { LOG.debug("Skipping {}", nextUri); } } } } return nextEvents; }
From source file:com.johan.vertretungsplan.parser.UntisSubstitutionParser.java
@Override public Vertretungsplan getVertretungsplan() throws IOException, JSONException { new LoginHandler(schule).handleLogin(executor, cookieStore, username, password); String encoding = schule.getData().getString("encoding"); Document doc = Jsoup.parse(this.httpGet(baseUrl, encoding)); Elements classes = doc.select("td a"); Vertretungsplan v = new Vertretungsplan(); List<VertretungsplanTag> tage = new ArrayList<VertretungsplanTag>(); VertretungsplanTag tag = new VertretungsplanTag(); tage.add(tag);//w w w . j a v a 2s . co m v.setTage(tage); String stand = doc.select("td[align=right]:not(:has(b))").text(); tag.setStand(stand); Pattern dayPattern = Pattern.compile("\\d\\d?.\\d\\d?. / \\w+"); for (Element klasse : classes) { Document classDoc = Jsoup.parse( httpGet(baseUrl.substring(0, baseUrl.lastIndexOf("/")) + "/" + klasse.attr("href"), encoding)); if (tag.getDatum() == null) { String title = classDoc.select("font[size=5]").text(); Matcher matcher = dayPattern.matcher(title); if (matcher.find()) tag.setDatum(matcher.group()); } Element table = classDoc.select("table[rules=all]").first(); parseVertretungsplanTable(table, data, tag); } return v; }
From source file:com.clonephpscrapper.crawler.ClonePhpScrapper.java
public void crawledCategories() throws URISyntaxException, IOException, InterruptedException, Exception { String url = "http://clonephp.com/"; // Document doc = Jsoup.parse(fetchPage(new URI(url))); String response = ""; response = new GetRequestHandler().doGetRequest(new URL(url)); Document doc = Jsoup.parse(response); Elements ele = doc.select("table[class=dir] tbody tr td table[class=dir_cat] tbody tr th a");//.first(); for (Element ele1 : ele) { objCategories = new Categories(); String categoryName = ele1.text(); String categoryUrl = "http://clonephp.com/" + ele1.attr("href"); System.out.println("CATEGORY_NAME : " + categoryName); System.out.println("CATEGORY_URL : " + categoryUrl); objCategories.setCategoryName(categoryName); objCategories.setCategoryUrl(categoryUrl); objClonePhpDaoImpl.insertCategoriesData(objCategories); // objCrawlingEachUrlData.crawlingUrlData(categoryUrl); }/*from w w w. j a v a 2s. co m*/ List<Future<String>> list = new ArrayList<Future<String>>(); ExecutorService executor = Executors.newFixedThreadPool(5); List<Categories> listCatogories = objClonePhpDaoImpl.getCategoriesDataList(); for (Categories listCatogory : listCatogories) { try { Callable worker = new CrawlingEachUrlData(listCatogory, objClonePhpDaoImpl); Future<String> future = executor.submit(worker); list.add(future); } catch (Exception exx) { System.out.println(exx); } } for (Future<String> fut : list) { try { //print the return value of Future, notice the output delay in console // because Future.get() waits for task to get completed System.out.println(new Date() + "::" + fut.get()); } catch (InterruptedException | ExecutionException ep) { ep.printStackTrace(); } } //shut down the executor service now executor.shutdown(); }
From source file:com.vaushell.superpipes.tools.http.ImageExtractor.java
/** * Return the biggest image URI of this webpage. * * @param rootURI Webpage URI//from w w w . j a v a 2 s .co m * @return Biggest image * @throws IOException */ public BufferedImage extractBiggest(final URI rootURI) throws IOException { final List<URI> imagesURIs = new ArrayList<>(); HttpEntity responseEntity = null; try { // Exec request final HttpGet get = new HttpGet(rootURI); try (final CloseableHttpResponse response = client.execute(get)) { final StatusLine sl = response.getStatusLine(); if (sl.getStatusCode() != 200) { throw new IOException(sl.getReasonPhrase()); } responseEntity = response.getEntity(); try (final InputStream is = responseEntity.getContent()) { final Document doc = Jsoup.parse(is, "UTF-8", rootURI.toString()); final Elements elts = doc.select("img"); if (elts != null) { for (final Element elt : elts) { final String src = elt.attr("src"); if (src != null && !src.isEmpty()) { try { imagesURIs.add(rootURI.resolve(src)); } catch (final IllegalArgumentException ex) { // Ignore wrong encoded URI } } } } } } } finally { if (responseEntity != null) { EntityUtils.consume(responseEntity); } } final BufferedImage[] images = new BufferedImage[imagesURIs.size()]; final ExecutorService service = Executors.newCachedThreadPool(); for (int i = 0; i < imagesURIs.size(); ++i) { final int num = i; service.execute(new Runnable() { @Override public void run() { try { images[num] = HTTPhelper.loadPicture(client, imagesURIs.get(num)); } catch (final IOException ex) { images[num] = null; } } }); } service.shutdown(); try { service.awaitTermination(1L, TimeUnit.DAYS); } catch (final InterruptedException ex) { // Ignore } BufferedImage biggest = null; int biggestSize = Integer.MIN_VALUE; for (int i = 0; i < imagesURIs.size(); ++i) { if (images[i] != null) { final int actualSize = images[i].getWidth() * images[i].getHeight(); if (actualSize > biggestSize) { biggest = images[i]; biggestSize = actualSize; } } } return biggest; }
From source file:com.romeikat.datamessie.core.processing.task.documentProcessing.redirecting.DocumentRedirector.java
private String applyHardCodedRedirectingRule(final RawContent rawContent) { // Parse raw content final org.jsoup.nodes.Document jsoupDocument = Jsoup.parse(rawContent.getContent()); final String title = jsoupDocument.title(); final boolean documentTitleMatches = title.equalsIgnoreCase("advertisment"); if (documentTitleMatches) { // Map link target -> number of occurrence final Map<String, Integer> linkCounts = new HashMap<String, Integer>(); // Count link occurrences final Elements links = jsoupDocument.select("a[href]"); for (final Element link : links) { final String linkTarget = link.attr("href"); Integer linkCount = linkCounts.get(linkTarget); if (linkCount == null) { linkCount = 0;/*from w ww.j a va 2s . co m*/ } linkCount++; linkCounts.put(linkTarget, linkCount); } // Get most frequent link (for multiple highest link counts, use the "lower" link URL) String mostFrequentLinkTarget = null; int mostFrequentLinkCount = 0; for (final String linkTarget : linkCounts.keySet()) { final int linkCount = linkCounts.get(linkTarget); if (linkCount > mostFrequentLinkCount || linkCount == mostFrequentLinkCount && linkTarget.toLowerCase().compareTo(mostFrequentLinkTarget.toLowerCase()) < 0) { mostFrequentLinkTarget = linkTarget; mostFrequentLinkCount = linkCount; } } // Use most frequent link, if one was found if (mostFrequentLinkTarget != null) { return mostFrequentLinkTarget; } } // No redirecting return null; }
From source file:org.commonjava.indy.ftest.core.urls.StoreOneAndVerifyInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; client.content().store(hosted, STORE, path, stream); final IndyClientHttp http = getHttp(); final HttpGet request = http.newRawGet(client.content().contentUrl(hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.item-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE, root, fname); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }//from w w w . j av a 2 s .c o m }
From source file:org.commonjava.aprox.folo.ftest.urls.StoreOneAndVerifyInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; final String track = "track"; content.store(track, hosted, STORE, path, stream); final AproxClientHttp http = getHttp(); final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.item-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE, root, fname); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }/* w ww . ja va2 s.c o m*/ }
From source file:org.commonjava.indy.folo.ftest.urls.StoreOneAndVerifyInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; final String track = "track"; content.store(track, hosted, STORE, path, stream); final IndyClientHttp http = getHttp(); final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.item-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE, root, fname); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }//w w w .j av a 2 s . c o m }
From source file:org.commonjava.indy.ftest.core.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; client.content().store(hosted, STORE, path, stream); final IndyClientHttp http = getHttp(); final HttpGet request = http.newRawGet(client.content().contentUrl(hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.source-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }/* www . j a v a 2 s . c o m*/ }