List of usage examples for the org.jsoup.nodes.Element.text() method
public String text()
From source file:namedatabasescraper.PageScraper.java
/**
 * Builds a scraper for one locally stored HTML page and collects its names.
 *
 * <p>The file is decoded with {@code charset}, parsed with jsoup, and the text of
 * every element matching {@code selector} is appended to {@code names} in the
 * order the matches are returned.
 *
 * @param file     HTML file to scrape; its absolute path is kept in {@code filename}
 * @param dirname  stored unchanged in {@code this.dirname}
 * @param selector jsoup CSS selector choosing the elements whose text is collected
 * @param charset  charset name used to decode {@code file}
 * @throws IOException if reading {@code file} fails
 */
@SuppressWarnings("OverridableMethodCallInConstructor")
public PageScraper(File file, String dirname, String selector, String charset) throws IOException {
    this.filename = file.getAbsolutePath();
    this.dirname = dirname;
    // createScraperId() is overridable and presumably reads filename/dirname,
    // so it must run after both are assigned (TODO confirm).
    this.id = this.createScraperId();

    final String markup = FileUtils.readFileToString(file, charset);
    this.names = new ArrayList<>();

    final Document page = Jsoup.parse(markup);
    for (Element match : page.select(selector)) {
        this.names.add(match.text());
    }

    logger.log(Level.INFO, "Scraped " + this.names.size() + " names from page {0}", file.getName());
}
From source file:com.example.amazon.mw.exempli.ExempliClient.java
/**
 * Fetches the IMDb parental-guide paragraphs for a video.
 *
 * <p>Looks up the IMDb id for {@code videoAsin} in {@code asinToImdbId}, downloads
 * that title's parental-guide page and collects the text of every element
 * matching {@code .display p}.
 *
 * @param videoAsin ASIN used as the key into {@code asinToImdbId}
 * @return the guide texts in match order; empty when the ASIN has no mapped
 *         IMDb id or the page could not be downloaded
 */
public ArrayList<String> getImdbData(String videoAsin) {
    ArrayList<String> guides = new ArrayList<String>();

    String imdbId = asinToImdbId.get(videoAsin);
    Log.d("Asin: ", videoAsin);
    if (imdbId == null) {
        // A missing mapping would otherwise produce a request for
        // ".../title/null/parentalguide".
        Log.w("Asin: ", "No IMDb id known for " + videoAsin);
        return guides;
    }

    String guideUrl = "http://www.imdb.com/title/" + imdbId + "/parentalguide";
    Log.d("Parent guide page: ", guideUrl);

    try {
        Document doc = Jsoup.connect(guideUrl).get();
        for (Element element : doc.select(".display p")) {
            String guide = element.text();
            guides.add(guide);
            Log.d("retrieved guide: ", guide);
        }
    } catch (IOException e) {
        // Best effort: report the failure and return whatever was collected.
        Log.e("exception", "Client failure: ", e);
    }
    return guides;
}
From source file:org.commonjava.aprox.folo.ftest.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; final String track = "track"; content.store(track, hosted, STORE, path, stream); final AproxClientHttp http = getHttp(); final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.source-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }// w w w. j ava2 s . c o m }
From source file:org.commonjava.indy.folo.ftest.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java
@Test public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception { final byte[] data = "this is a test".getBytes(); final ByteArrayInputStream stream = new ByteArrayInputStream(data); final String root = "/path/to/"; final String path = root + "foo.txt"; final String track = "track"; content.store(track, hosted, STORE, path, stream); final IndyClientHttp http = getHttp(); final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root)); request.addHeader("Accept", "text/html"); final CloseableHttpClient hc = http.newClient(); final CloseableHttpResponse response = hc.execute(request); final InputStream listing = response.getEntity().getContent(); final String html = IOUtils.toString(listing); // TODO: Charset!! final Document doc = Jsoup.parse(html); for (final Element item : doc.select("a.source-link")) { final String fname = item.text(); System.out.printf("Listing contains: '%s'\n", fname); final String href = item.attr("href"); final String expected = client.content().contentUrl(hosted, STORE); assertThat(fname + " does not have a href", href, notNullValue()); assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName() + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected)); }//from ww w .ja v a2 s. co m }
From source file:de.dlopes.stocks.facilitator.services.impl.FinanznachrichtenOrderbuchExtractorImpl.java
@Override public List<String> getFinanceData(String url, FinanceDataType dataType) { List<String> list = new ArrayList<String>(); try {//from w ww . j a va2 s .c o m Document doc = null; if (url.startsWith("file://")) { File input = new File(url.replaceFirst("file://", "")); doc = Jsoup.parse(input, "UTF-8"); } else { URL input = new URL(url); doc = Jsoup.parse(input, 30000); } Elements elements = doc.body().select("span[id^=productid] > span"); for (Element e : elements) { String text = e.text(); // Guard: move on when the text is empty if (StringUtils.isEmpty(text)) { continue; } text = StringUtils.trimAllWhitespace(text); // Guard: move on when the text does not contain the ISIN or WKN if (!text.startsWith(dataType.name() + ":")) { continue; } text = text.replace(dataType.name() + ":", ""); list.add(text); } } catch (IOException e) { e.printStackTrace(); } return list; }
From source file:com.clonescriptscrapper.crawler.CloneScriptScrapper.java
public void crawledCategories() throws URISyntaxException, IOException, InterruptedException, Exception { String url = "http://www.clonescriptdirectory.com/"; // Document doc = Jsoup.parse(fetchPage(new URI(url))); String response = ""; response = new GetRequestHandler().doGetRequest(new URL(url)); Document doc = Jsoup.parse(response); // System.out.println("---" + doc); Elements ele = doc.select("table[class=categories] tbody tr td a"); for (Element ele1 : ele) { objCategories = new Categories(); String title = ele1.text(); String href = ele1.attr("href"); System.out.println("Title : " + ele1.text()); System.out.println("Href : " + ele1.attr("href")); objCategories.setCategoryName(title); objCategories.setCategoryUrl(href); objCloneScriptDirectoryDaoImpl.insertCategoriesData(objCategories); }//w ww. j ava 2 s .c o m List<Future<String>> list = new ArrayList<Future<String>>(); ExecutorService executor = Executors.newFixedThreadPool(5); List<Categories> listCatogories = objCloneScriptDirectoryDaoImpl.getCategoriesDataList(); for (Categories listCatogory : listCatogories) { try { objCloneScriptDirectoryDaoImpl.updateCategoriesData(objCategories); Callable worker = new CrawlingEachUrlData(listCatogory, objCloneScriptDirectoryDaoImpl); Future<String> future = executor.submit(worker); list.add(future); } catch (Exception exx) { System.out.println(exx); } } for (Future<String> fut : list) { try { //print the return value of Future, notice the output delay in console // because Future.get() waits for task to get completed System.out.println(new Date() + "::" + fut.get()); } catch (InterruptedException | ExecutionException ep) { ep.printStackTrace(); } } //shut down the executor service now executor.shutdown(); // objcrawlingUrlData.crawlingUrlData(href); }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
/**
 * Extracts the "last change" text from a mon_head table.
 *
 * <p>Prefers a {@code dd.MM.yyyy HH:mm} shaped timestamp found in the first
 * right-aligned cell. If that cell holds none but the table text contains
 * {@code "Stand: "}, everything after the first {@code "Stand:"} is returned, trimmed.
 *
 * @param monHead the mon_head table element
 * @return the timestamp or trailing text, or {@code null} when the table has no
 *         right-aligned cell or neither form is present
 */
private static String findLastChangeFromMonHeadTable(Element monHead) {
    // first() yields null exactly when the selection is empty.
    final Element rightCell = monHead.select("td[align=right]").first();
    if (rightCell == null) {
        return null;
    }

    final Matcher timestamp = Pattern.compile("\\d\\d\\.\\d\\d\\.\\d\\d\\d\\d \\d\\d:\\d\\d")
            .matcher(rightCell.text());
    if (timestamp.find()) {
        return timestamp.group();
    }

    final String headText = monHead.text();
    if (!headText.contains("Stand: ")) {
        return null;
    }

    final String marker = "Stand:";
    final int valueStart = headText.indexOf(marker) + marker.length();
    return headText.substring(valueStart).trim();
}
From source file:fr.arlefebvre.pronostics.controller.EuroMatchListController.java
@RequestMapping("/euro2016/matches") public List<Match> matches() { if (pseudoCache != null && !pseudoCache.isEmpty()) return pseudoCache; ArrayList<Match> result = new ArrayList<Match>(); String uri = "http://www.lequipe.fr/Football/Euro/Saison-2016/calendrier-resultats.html"; //On se connecte au site et on charge le document html Document doc;/*from w w w . j a v a2s. co m*/ try { doc = Jsoup.connect(uri).get(); Elements elements = doc.getElementsByClass("mainDate"); for (Element element : elements) { Element title = element.getElementsByClass("title").first(); String date = title.text(); Element tbody = element.getElementsByTag("tbody").first(); for (Element matchElement : tbody.children()) { String groupe = matchElement.getElementsByClass("date").first().text(); String home = matchElement.getElementsByClass("domicile").first().text(); String away = matchElement.getElementsByClass("exterieur").first().text(); Match m = new Match(); m.setDate(date); m.setHomeTeamId(home); m.setAwayTeamId(away); m.setGroup(groupe); result.add(m); } } } catch (IOException e) { e.printStackTrace(); } if (pseudoCache == null) pseudoCache = result; return result; }
From source file:com.example.muzei.muzeiapod.ApodNasaArtSource.java
@Override protected void onTryUpdate(int reason) throws RetryException { URI topUri;//from ww w . j a v a 2 s.c o m try { topUri = new URI("http://apod.nasa.gov/"); } catch (URISyntaxException e) { return; } URI mainUri = topUri.resolve("/apod/astropix.html"); String bodyStr = getURLContent(mainUri.toString()); /* TODO code below should go to a separate method/class */ /* start parsing page */ Document doc = Jsoup.parse(bodyStr); Element body = doc.body(); /* get image URI */ Element firstCenterTag = body.child(0); Element imgAnchor = firstCenterTag.getElementsByTag("a").last(); Element img = imgAnchor.getElementsByTag("img").first(); URI bigImageUri = topUri.resolve("/apod/" + img.attr("src")); String uri = bigImageUri.toString(); /* get title */ Element secondCenterTag = body.child(1); Element titleElem = secondCenterTag.child(0); String title = titleElem.text(); /* get byline */ String secondCenterText = secondCenterTag.text(); /* byline: everything after 'title' above */ int idx = secondCenterText.lastIndexOf(title) + title.length(); String byline = secondCenterText.substring(idx).trim(); /* TODO figure out the permanent link */ String link = "http://apod.nasa.gov/apod/astropix.html"; publishArtwork(new Artwork.Builder().title(title).byline(byline).imageUri(Uri.parse(uri)).token(title) .viewIntent(new Intent(Intent.ACTION_VIEW, Uri.parse(link))).build()); scheduleUpdate(System.currentTimeMillis() + ROTATE_TIME_MILLIS); }
From source file:com.clonephpscrapper.crawler.ClonePhpScrapper.java
public void crawledCategories() throws URISyntaxException, IOException, InterruptedException, Exception { String url = "http://clonephp.com/"; // Document doc = Jsoup.parse(fetchPage(new URI(url))); String response = ""; response = new GetRequestHandler().doGetRequest(new URL(url)); Document doc = Jsoup.parse(response); Elements ele = doc.select("table[class=dir] tbody tr td table[class=dir_cat] tbody tr th a");//.first(); for (Element ele1 : ele) { objCategories = new Categories(); String categoryName = ele1.text(); String categoryUrl = "http://clonephp.com/" + ele1.attr("href"); System.out.println("CATEGORY_NAME : " + categoryName); System.out.println("CATEGORY_URL : " + categoryUrl); objCategories.setCategoryName(categoryName); objCategories.setCategoryUrl(categoryUrl); objClonePhpDaoImpl.insertCategoriesData(objCategories); // objCrawlingEachUrlData.crawlingUrlData(categoryUrl); }//from ww w . jav a 2 s.c o m List<Future<String>> list = new ArrayList<Future<String>>(); ExecutorService executor = Executors.newFixedThreadPool(5); List<Categories> listCatogories = objClonePhpDaoImpl.getCategoriesDataList(); for (Categories listCatogory : listCatogories) { try { Callable worker = new CrawlingEachUrlData(listCatogory, objClonePhpDaoImpl); Future<String> future = executor.submit(worker); list.add(future); } catch (Exception exx) { System.out.println(exx); } } for (Future<String> fut : list) { try { //print the return value of Future, notice the output delay in console // because Future.get() waits for task to get completed System.out.println(new Date() + "::" + fut.get()); } catch (InterruptedException | ExecutionException ep) { ep.printStackTrace(); } } //shut down the executor service now executor.shutdown(); }