Example usage for org.jsoup.nodes.Element text()

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.text().

Prototype

public String text()

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:namedatabasescraper.PageScraper.java

/**
 * Creates a scraper for a single HTML file and extracts its names right away.
 *
 * <p>The file is read completely using {@code charset}, parsed with Jsoup, and the text of
 * every element matched by {@code selector} is appended to {@code names}. The number of
 * collected names is logged at INFO level together with the file name.
 *
 * @param file     HTML file to scrape; its absolute path is stored as the scraper's filename
 * @param dirname  directory name, stored unchanged
 * @param selector CSS selector for the elements holding a name
 *                 (for example {@code "a.nom"} or {@code "div > a:not(.n1)"})
 * @param charset  name of the charset used to decode the file
 * @throws IOException if the file cannot be read
 */
@SuppressWarnings("OverridableMethodCallInConstructor")
public PageScraper(File file, String dirname, String selector, String charset) throws IOException {
    this.filename = file.getAbsolutePath();
    this.dirname = dirname;
    // The id is derived after filename and dirname are set, as in the original ordering.
    this.id = this.createScraperId();

    final String pageSource = FileUtils.readFileToString(file, charset);
    this.names = new ArrayList<>();

    // Collect the visible text of each matched element.
    for (Element match : Jsoup.parse(pageSource).select(selector)) {
        this.names.add(match.text());
    }

    final String message = "Scraped " + this.names.size() + " names from page {0}";
    logger.log(Level.INFO, message, file.getName());
}

From source file:com.example.amazon.mw.exempli.ExempliClient.java

/**
 * Fetches the parental-guide paragraphs for a video from its IMDb parental guide page.
 *
 * <p>The ASIN is translated to an IMDb id through {@code asinToImdbId}. When no id is
 * mapped, no request is made and an empty list is returned (previously the page for the
 * literal id "null" was requested).
 *
 * @param videoAsin ASIN of the video to look up
 * @return the text of every element matching {@code ".display p"} on the guide page, in
 *         document order; empty when the ASIN is unknown or the page cannot be fetched
 */
public ArrayList<String> getImdbData(String videoAsin) {
    ArrayList<String> guides = new ArrayList<String>();

    String imdbId = asinToImdbId.get(videoAsin);
    // NOTE(review): informational messages are logged at error level; kept as in the original.
    Log.e("Asin: ", videoAsin);
    if (imdbId == null) {
        Log.e("exception", "No IMDb id known for ASIN " + videoAsin);
        return guides;
    }

    // Build the URL once so the logged and the requested address cannot drift apart.
    String guideUrl = "http://www.imdb.com/title/" + imdbId + "/parentalguide";
    Log.e("Parent guide page: ", guideUrl);

    try {
        Document doc = Jsoup.connect(guideUrl).get();
        for (Element element : doc.select(".display p")) {
            String guide = element.text();
            guides.add(guide);
            Log.e("retrieved guide: ", guide);
        }
    } catch (IOException e) {
        // Best effort: a failed fetch is logged and whatever was collected so far is returned.
        Log.e("exception", "Client failure: ", e);
    }
    return guides;
}

From source file:org.commonjava.aprox.folo.ftest.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java

/**
 * Stores one file through the tracked content API, requests the HTML listing of its parent
 * directory, and checks that every {@code a.source-link} anchor in that listing points at
 * the content URL of the hosted store.
 *
 * <p>The HTTP client, the response and the entity stream are closed once the listing has
 * been read, so the test does not leak connections.
 */
@Test
public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception {
    final byte[] data = "this is a test".getBytes();
    final ByteArrayInputStream stream = new ByteArrayInputStream(data);
    final String root = "/path/to/";
    final String path = root + "foo.txt";
    final String track = "track";

    content.store(track, hosted, STORE, path, stream);

    final AproxClientHttp http = getHttp();

    final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root));

    // Ask for the HTML rendering of the directory listing.
    request.addHeader("Accept", "text/html");

    final String html;
    try (CloseableHttpClient hc = http.newClient();
            CloseableHttpResponse response = hc.execute(request);
            InputStream listing = response.getEntity().getContent()) {
        // TODO: Charset!!
        html = IOUtils.toString(listing);
    }

    final Document doc = Jsoup.parse(html);
    for (final Element item : doc.select("a.source-link")) {
        final String fname = item.text();
        System.out.printf("Listing contains: '%s'\n", fname);
        final String href = item.attr("href");
        final String expected = client.content().contentUrl(hosted, STORE);

        assertThat(fname + " does not have a href", href, notNullValue());
        assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName()
                + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected));
    }
}

From source file:org.commonjava.indy.folo.ftest.urls.StoreOneAndSourceStoreUrlInHtmlListingTest.java

/**
 * Stores one file through the tracked content API, requests the HTML listing of its parent
 * directory, and checks that every {@code a.source-link} anchor in that listing points at
 * the content URL of the hosted store.
 *
 * <p>The HTTP client, the response and the entity stream are closed once the listing has
 * been read, so the test does not leak connections.
 */
@Test
public void storeOneFileAndVerifyItInParentDirectoryListing() throws Exception {
    final byte[] data = "this is a test".getBytes();
    final ByteArrayInputStream stream = new ByteArrayInputStream(data);
    final String root = "/path/to/";
    final String path = root + "foo.txt";
    final String track = "track";

    content.store(track, hosted, STORE, path, stream);

    final IndyClientHttp http = getHttp();

    final HttpGet request = http.newRawGet(content.contentUrl(track, hosted, STORE, root));

    // Ask for the HTML rendering of the directory listing.
    request.addHeader("Accept", "text/html");

    final String html;
    try (CloseableHttpClient hc = http.newClient();
            CloseableHttpResponse response = hc.execute(request);
            InputStream listing = response.getEntity().getContent()) {
        // TODO: Charset!!
        html = IOUtils.toString(listing);
    }

    final Document doc = Jsoup.parse(html);
    for (final Element item : doc.select("a.source-link")) {
        final String fname = item.text();
        System.out.printf("Listing contains: '%s'\n", fname);
        final String href = item.attr("href");
        final String expected = client.content().contentUrl(hosted, STORE);

        assertThat(fname + " does not have a href", href, notNullValue());
        assertThat(fname + " has incorrect link: '" + href + "' (" + href.getClass().getName()
                + ")\nshould be: '" + expected + "' (String)", href, equalTo(expected));
    }
}

From source file:de.dlopes.stocks.facilitator.services.impl.FinanznachrichtenOrderbuchExtractorImpl.java

/**
 * Extracts the values of one data type (the enum constant's name, e.g. ISIN or WKN per the
 * original comments) from a page.
 *
 * <p>A {@code file://} URL is read as a local UTF-8 file; anything else is fetched as a URL
 * with a 30 second timeout. Every {@code span[id^=productid] > span} element is inspected:
 * its text has all whitespace removed, and when it starts with {@code "<TYPE>:"} the
 * remainder after that label is added to the result.
 *
 * @param url      location of the page, either {@code file://...} or a fetchable URL
 * @param dataType data type whose {@code name()} forms the label to look for
 * @return the extracted values in document order; empty when the page cannot be loaded
 */
@Override
public List<String> getFinanceData(String url, FinanceDataType dataType) {

    List<String> list = new ArrayList<String>();

    try {

        Document doc = null;
        if (url.startsWith("file://")) {
            // Plain prefix removal; no regular expression is needed for a literal prefix.
            File input = new File(url.substring("file://".length()));
            doc = Jsoup.parse(input, "UTF-8");
        } else {
            URL input = new URL(url);
            doc = Jsoup.parse(input, 30000);
        }

        Elements elements = doc.body().select("span[id^=productid] > span");

        for (Element element : elements) {
            String text = element.text();

            // Guard: move on when the text is empty
            if (StringUtils.isEmpty(text)) {
                continue;
            }

            text = StringUtils.trimAllWhitespace(text);

            // Guard: move on when the text does not contain the ISIN or WKN
            String label = dataType.name() + ":";
            if (!text.startsWith(label)) {
                continue;
            }

            // Strip only the leading label; String.replace would also remove any later
            // occurrence of the label inside the value.
            list.add(text.substring(label.length()));

        }

    } catch (IOException e) {
        // Loading failed before any element was read, so the empty list is returned.
        e.printStackTrace();
    }

    return list;

}

From source file:com.clonescriptscrapper.crawler.CloneScriptScrapper.java

/**
 * Crawls the category links of clonescriptdirectory.com, stores them, and then crawls each
 * stored category on a pool of five worker threads.
 *
 * <p>Every anchor matching {@code table[class=categories] tbody tr td a} is inserted as a
 * category (link text as name, {@code href} as URL). Afterwards all categories returned by
 * the DAO are submitted as {@code CrawlingEachUrlData} tasks and the method waits for each
 * task, printing its result. The executor is always shut down, even when an exception
 * escapes, and an interrupt while waiting stops the wait and restores the interrupt flag.
 */
public void crawledCategories() throws URISyntaxException, IOException, InterruptedException, Exception {

    String url = "http://www.clonescriptdirectory.com/";

    String response = new GetRequestHandler().doGetRequest(new URL(url));

    Document doc = Jsoup.parse(response);

    Elements ele = doc.select("table[class=categories] tbody tr td a");
    for (Element ele1 : ele) {

        objCategories = new Categories();

        String title = ele1.text();
        String href = ele1.attr("href");
        System.out.println("Title : " + title);
        System.out.println("Href : " + href);

        objCategories.setCategoryName(title);
        objCategories.setCategoryUrl(href);

        objCloneScriptDirectoryDaoImpl.insertCategoriesData(objCategories);
    }

    List<Future<String>> list = new ArrayList<Future<String>>();
    ExecutorService executor = Executors.newFixedThreadPool(5);

    try {
        List<Categories> listCatogories = objCloneScriptDirectoryDaoImpl.getCategoriesDataList();

        for (Categories listCatogory : listCatogories) {

            try {
                // NOTE(review): this updates objCategories (the last category inserted above),
                // not listCatogory -- looks unintended; behavior kept, confirm with the DAO.
                objCloneScriptDirectoryDaoImpl.updateCategoriesData(objCategories);
                Callable worker = new CrawlingEachUrlData(listCatogory, objCloneScriptDirectoryDaoImpl);
                Future<String> future = executor.submit(worker);
                list.add(future);
            } catch (Exception exx) {
                System.out.println(exx);
            }

        }

        for (Future<String> fut : list) {
            try {
                //print the return value of Future, notice the output delay in console
                // because Future.get() waits for task to get completed
                System.out.println(new Date() + "::" + fut.get());
            } catch (ExecutionException ep) {
                ep.printStackTrace();
            } catch (InterruptedException ep) {
                ep.printStackTrace();
                // Restore the flag for callers and stop waiting on the remaining tasks.
                Thread.currentThread().interrupt();
                break;
            }
        }
    } finally {
        //shut down the executor service now
        executor.shutdown();
    }
}

From source file:me.vertretungsplan.parser.UntisCommonParser.java

/**
 * Extracts the "last change" text from a mon_head table element.
 *
 * <p>The first right-aligned cell ({@code td[align=right]}) is searched for a timestamp of
 * the form {@code dd.dd.dddd dd:dd}. If none is found there, but the whole element's text
 * contains {@code "Stand: "}, everything after the first {@code "Stand:"} is returned,
 * trimmed.
 *
 * @param monHead element to inspect
 * @return the matched timestamp, the trimmed text following "Stand:", or {@code null} when
 *         there is no right-aligned cell or neither form is present
 */
private static String findLastChangeFromMonHeadTable(Element monHead) {
    // first() yields null exactly when the selection is empty.
    Element rightAlignedCell = monHead.select("td[align=right]").first();
    if (rightAlignedCell == null) {
        return null;
    }

    Matcher timestamp = Pattern.compile("\\d\\d\\.\\d\\d\\.\\d\\d\\d\\d \\d\\d:\\d\\d")
            .matcher(rightAlignedCell.text());
    if (timestamp.find()) {
        return timestamp.group();
    }

    String headText = monHead.text();
    if (!headText.contains("Stand: ")) {
        return null;
    }
    int valueStart = headText.indexOf("Stand:") + "Stand:".length();
    return headText.substring(valueStart).trim();
}

From source file:fr.arlefebvre.pronostics.controller.EuroMatchListController.java

/**
 * Returns the Euro 2016 match list scraped from the lequipe.fr calendar page.
 *
 * <p>A non-empty cached list is returned directly. Otherwise the page is fetched and every
 * {@code mainDate} block is read: its {@code title} text becomes the match date and each
 * row of its first {@code tbody} yields one match from the {@code date} (stored as group),
 * {@code domicile} (home) and {@code exterieur} (away) cells. Blocks or rows lacking one of
 * these elements are skipped instead of failing the whole request.
 *
 * <p>The result replaces the cache whenever the cache is missing or empty, so an empty list
 * cached after a failed fetch no longer blocks later successful results from being cached.
 *
 * @return the scraped matches; empty when the page cannot be fetched
 */
@RequestMapping("/euro2016/matches")
public List<Match> matches() {
    if (pseudoCache != null && !pseudoCache.isEmpty())
        return pseudoCache;
    ArrayList<Match> result = new ArrayList<Match>();
    String uri = "http://www.lequipe.fr/Football/Euro/Saison-2016/calendrier-resultats.html";

    // Connect to the site and load the HTML document.
    try {
        Document doc = Jsoup.connect(uri).get();

        Elements elements = doc.getElementsByClass("mainDate");
        for (Element element : elements) {
            Element title = element.getElementsByClass("title").first();
            Element tbody = element.getElementsByTag("tbody").first();
            if (title == null || tbody == null) {
                // Not a complete date block; nothing to extract.
                continue;
            }
            String date = title.text();

            for (Element matchElement : tbody.children()) {
                Element groupCell = matchElement.getElementsByClass("date").first();
                Element homeCell = matchElement.getElementsByClass("domicile").first();
                Element awayCell = matchElement.getElementsByClass("exterieur").first();
                if (groupCell == null || homeCell == null || awayCell == null) {
                    // Row without the expected cells; skip it.
                    continue;
                }

                Match m = new Match();
                m.setDate(date);
                m.setHomeTeamId(homeCell.text());
                m.setAwayTeamId(awayCell.text());
                m.setGroup(groupCell.text());
                result.add(m);
            }
        }

    } catch (IOException e) {
        // Fetch failed before any parsing; the empty result is returned.
        e.printStackTrace();
    }

    // Replace a missing or empty cache; a populated cache is left untouched.
    if (pseudoCache == null || pseudoCache.isEmpty())
        pseudoCache = result;
    return result;
}

From source file:com.example.muzei.muzeiapod.ApodNasaArtSource.java

/**
 * Publishes the current picture from the APOD front page as artwork and schedules the next
 * update.
 *
 * <p>The page at {@code /apod/astropix.html} is parsed positionally: the first child of the
 * body is searched for its last anchor and that anchor's first {@code img}; the second
 * child of the body supplies the title (its first child) and the byline (its remaining
 * text). When the page does not have that shape -- for example when no linked image is
 * present -- nothing is published and only the next regular update is scheduled, instead
 * of failing with a {@code NullPointerException} or {@code IndexOutOfBoundsException}.
 *
 * @param reason update reason passed in by the framework; not used here
 * @throws RetryException if no page content was obtained
 */
@Override
protected void onTryUpdate(int reason) throws RetryException {
    // The base address is a constant, so URI.create is used rather than a checked
    // constructor whose failure was silently swallowed.
    URI topUri = URI.create("http://apod.nasa.gov/");

    URI mainUri = topUri.resolve("/apod/astropix.html");
    String bodyStr = getURLContent(mainUri.toString());
    if (bodyStr == null) {
        // NOTE(review): assumes getURLContent signals failure with null -- confirm.
        throw new RetryException();
    }

    /* TODO code below should go to a separate method/class */

    /* start parsing page */
    Document doc = Jsoup.parse(bodyStr);
    Element body = doc.body();

    /* the layout relies on two leading children: image block, then title block */
    if (body.children().size() >= 2 && !body.child(1).children().isEmpty()) {
        /* get image URI */
        Element firstCenterTag = body.child(0);
        Element imgAnchor = firstCenterTag.getElementsByTag("a").last();
        Element img = imgAnchor == null ? null : imgAnchor.getElementsByTag("img").first();

        if (img != null) {
            URI bigImageUri = topUri.resolve("/apod/" + img.attr("src"));
            String uri = bigImageUri.toString();

            /* get title */
            Element secondCenterTag = body.child(1);
            Element titleElem = secondCenterTag.child(0);
            String title = titleElem.text();

            /* get byline */
            String secondCenterText = secondCenterTag.text();
            /* byline: everything after 'title' above */
            int idx = secondCenterText.lastIndexOf(title) + title.length();
            String byline = secondCenterText.substring(idx).trim();

            /* TODO figure out the permanent link */
            String link = "http://apod.nasa.gov/apod/astropix.html";

            publishArtwork(new Artwork.Builder().title(title).byline(byline).imageUri(Uri.parse(uri))
                    .token(title).viewIntent(new Intent(Intent.ACTION_VIEW, Uri.parse(link))).build());
        }
    }

    scheduleUpdate(System.currentTimeMillis() + ROTATE_TIME_MILLIS);
}

From source file:com.clonephpscrapper.crawler.ClonePhpScrapper.java

/**
 * Crawls the category links of clonephp.com, stores them, and then crawls each stored
 * category on a pool of five worker threads.
 *
 * <p>Every anchor matching the nested {@code dir} / {@code dir_cat} table selector is
 * inserted as a category (link text as name, site root plus {@code href} as URL).
 * Afterwards all categories returned by the DAO are submitted as
 * {@code CrawlingEachUrlData} tasks and the method waits for each task, printing its
 * result. The executor is always shut down, even when an exception escapes, and an
 * interrupt while waiting stops the wait and restores the interrupt flag.
 */
public void crawledCategories() throws URISyntaxException, IOException, InterruptedException, Exception {

    String url = "http://clonephp.com/";

    String response = new GetRequestHandler().doGetRequest(new URL(url));

    Document doc = Jsoup.parse(response);

    Elements ele = doc.select("table[class=dir] tbody tr td table[class=dir_cat] tbody tr th a");

    for (Element ele1 : ele) {
        objCategories = new Categories();

        String categoryName = ele1.text();
        String categoryUrl = "http://clonephp.com/" + ele1.attr("href");

        System.out.println("CATEGORY_NAME : " + categoryName);
        System.out.println("CATEGORY_URL  : " + categoryUrl);

        objCategories.setCategoryName(categoryName);
        objCategories.setCategoryUrl(categoryUrl);

        objClonePhpDaoImpl.insertCategoriesData(objCategories);
    }

    List<Future<String>> list = new ArrayList<Future<String>>();
    ExecutorService executor = Executors.newFixedThreadPool(5);

    try {
        List<Categories> listCatogories = objClonePhpDaoImpl.getCategoriesDataList();

        for (Categories listCatogory : listCatogories) {

            try {
                Callable worker = new CrawlingEachUrlData(listCatogory, objClonePhpDaoImpl);
                Future<String> future = executor.submit(worker);
                list.add(future);
            } catch (Exception exx) {
                System.out.println(exx);
            }

        }

        for (Future<String> fut : list) {
            try {
                //print the return value of Future, notice the output delay in console
                // because Future.get() waits for task to get completed
                System.out.println(new Date() + "::" + fut.get());
            } catch (ExecutionException ep) {
                ep.printStackTrace();
            } catch (InterruptedException ep) {
                ep.printStackTrace();
                // Restore the flag for callers and stop waiting on the remaining tasks.
                Thread.currentThread().interrupt();
                break;
            }
        }
    } finally {
        //shut down the executor service now
        executor.shutdown();
    }

}