Example usage for org.jsoup.nodes Element text

List of usage examples for org.jsoup.nodes Element text

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.text().

Prototype

public String text() 

Source Link

Documentation

Gets the combined text of this element and all its children.

Usage

From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java

/**
 * Parses all episode rows of an episode list page and appends the episodes found to
 * {@code episodes}.
 * <p>
 * Rows are the elements with class {@code list_item} inside the elements with class
 * {@code eplist}. For {@code season == 0} only rows whose text contains "Unknown" are used and
 * the episode number is a running counter; for any other season the season/episode numbers are
 * read from the "S&lt;n&gt;, Ep&lt;n&gt;" marker in the row text.
 *
 * @param season   the season which is expected on this page (0 is handled as described above)
 * @param episodes the list which receives the parsed episodes
 * @param doc      the parsed episode list page
 * @return {@code false} as soon as a row reports a season different from the requested one
 *         (only checked for {@code season > 0}); {@code true} otherwise
 */
private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) {
    Pattern unknownPattern = Pattern.compile("Unknown");
    Pattern seasonEpisodePattern = Pattern.compile("S([0-9]*), Ep([0-9]*)");
    // running episode number; only used for season 0
    int episodeCounter = 0;

    // parse episodes
    Elements tables = doc.getElementsByClass("eplist");
    for (Element table : tables) {
        Elements rows = table.getElementsByClass("list_item");
        for (Element row : rows) {
            Matcher matcher = season == 0 ? unknownPattern.matcher(row.text())
                    : seasonEpisodePattern.matcher(row.text());
            // NOTE: groupCount() reports the number of capturing groups of the pattern (not of the
            // current match), so for seasonEpisodePattern the second half is always true
            if (matcher.find() && (season == 0 || matcher.groupCount() >= 2)) {
                // any exception while parsing this row is logged below and the row is skipped
                try {
                    // we found a row containing episode data
                    MediaEpisode ep = new MediaEpisode(providerInfo.getId());

                    // parse season and ep number
                    if (season == 0) {
                        ep.season = season;
                        ep.episode = ++episodeCounter;
                    } else {
                        // the groups may be empty ("[0-9]*"); parseInt then throws and the row is skipped
                        ep.season = Integer.parseInt(matcher.group(1));
                        ep.episode = Integer.parseInt(matcher.group(2));
                    }

                    // check if we have still valid data
                    if (season > 0 && season != ep.season) {
                        return false;
                    }

                    // get ep title and id
                    Elements anchors = row.getElementsByAttributeValueStarting("href", "/title/tt");
                    for (Element anchor : anchors) {
                        // the title is taken from the first anchor carrying itemprop="name"
                        if ("name".equals(anchor.attr("itemprop"))) {
                            ep.title = anchor.text();
                            break;
                        }
                    }

                    String id = "";
                    // NOTE(review): if no anchor matched, get(0) is expected to throw; the outer catch
                    // then drops this row - confirm that this is intended
                    Matcher idMatcher = IMDB_ID_PATTERN.matcher(anchors.get(0).attr("href"));
                    // keeps the last non-null group(1) found in the href
                    while (idMatcher.find()) {
                        if (idMatcher.group(1) != null) {
                            id = idMatcher.group(1);
                        }
                    }

                    if (StringUtils.isNotBlank(id)) {
                        ep.ids.put(providerInfo.getId(), id);
                    }

                    // plot
                    Element plot = row.getElementsByClass("item_description").first();
                    if (plot != null) {
                        ep.plot = plot.ownText();
                    }

                    // rating and rating count
                    Element ratingElement = row.getElementsByClass("ipl-rating-star__rating").first();
                    if (ratingElement != null) {
                        // accept a decimal comma as well as a decimal point
                        String ratingAsString = ratingElement.ownText().replace(",", ".");
                        try {
                            ep.rating = Float.valueOf(ratingAsString);
                        } catch (Exception ignored) {
                            // best effort: the rating is left untouched if it cannot be parsed
                        }

                        Element votesElement = row.getElementsByClass("ipl-rating-star__total-votes").first();
                        if (votesElement != null) {
                            // strip separators and the surrounding parentheses before parsing
                            String countAsString = votesElement.ownText().replaceAll("[.,()]", "").trim();
                            try {
                                ep.voteCount = Integer.parseInt(countAsString);
                            } catch (Exception ignored) {
                                // best effort: the vote count is left untouched if it cannot be parsed
                            }
                        }
                    }

                    // release date
                    Element releaseDate = row.getElementsByClass("airdate").first();
                    if (releaseDate != null) {
                        ep.firstAired = releaseDate.ownText();
                    }

                    // poster
                    Element image = row.getElementsByTag("img").first();
                    if (image != null) {
                        String posterUrl = image.attr("src");
                        // remove the "UX<n>_", "UY<n>_" and "CR<a>,<b>,<c>,<d>_" parts of the image url
                        // (presumably resize/crop directives - TODO confirm)
                        posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "");
                        posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "");
                        posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");

                        if (StringUtils.isNotBlank(posterUrl)) {
                            // the same url is used as preview and as default url
                            MediaArtwork ma = new MediaArtwork(ImdbMetadataProvider.providerInfo.getId(),
                                    MediaArtwork.MediaArtworkType.THUMB);
                            ma.setPreviewUrl(posterUrl);
                            ma.setDefaultUrl(posterUrl);
                            ep.artwork.add(ma);
                        }
                    }

                    episodes.add(ep);
                } catch (Exception e) {
                    LOGGER.warn("failed parsing: " + row.text() + " for ep data; " + e.getMessage());
                }
            }
        }
    }
    return true;
}

From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java

/**
 * Scrapes the metadata of a single movie from ofdb.de.
 * <p>
 * The detail page is located via the ofdb id in the options, via the search result attached to
 * the options or - if only an IMDb id is available - via a search for that IMDb id. IMDb id,
 * title, year, original title, genres and rating are read from the detail page; plot and cast
 * are read from two further pages. A failure on one of those two pages is logged and leaves the
 * corresponding data unset.
 *
 * @param options the scrape options; only {@code MediaType.MOVIE} is supported
 * @return the metadata collected from ofdb.de
 * @throws UnsupportedMediaTypeException if the options are not for a movie
 * @throws Exception if no detail url could be determined or the detail page could not be
 *                   fetched or parsed
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());

    if (options.getType() != MediaType.MOVIE) {
        throw new UnsupportedMediaTypeException(options.getType());
    }

    // we have 3 entry points here
    // a) getMetadata has been called with an ofdbId
    // b) getMetadata has been called with an imdbId
    // c) getMetadata has been called from a previous search

    String detailUrl = "";

    // case a) and c)
    if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) {

        if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) {
            detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId());
        } else {
            detailUrl = options.getResult().getUrl();
        }
    }

    // case b)
    if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) {
        MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE);
        searchOptions.setImdbId(options.getId(MediaMetadata.IMDB));
        try {
            List<MediaSearchResult> results = search(searchOptions);
            if (results != null && !results.isEmpty()) {
                // the first search result wins
                options.setResult(results.get(0));
                detailUrl = options.getResult().getUrl();
            }
        } catch (Exception e) {
            LOGGER.warn("failed IMDB search: " + e.getMessage());
        }
    }

    // we can only work further if we got a search result on ofdb.de
    if (StringUtils.isBlank(detailUrl)) {
        throw new Exception("We did not get any useful movie url");
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    // generic Elements used all over
    Elements el = null;
    // the ofdb id is either part of the path ("film/<id>,") or a query parameter ("fid=<id>")
    String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),");
    if (StringUtils.isBlank(ofdbId)) {
        ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)");
    }

    try {
        LOGGER.trace("get details page");
        Document doc = fetchOfdbDocument(detailUrl);

        if (doc.getAllElements().size() < 10) {
            throw new Exception("meh - we did not receive a valid web page");
        }

        // parse details

        // IMDB ID "http://www.imdb.com/Title?1194173"
        el = doc.getElementsByAttributeValueContaining("href", "imdb.com");
        if (!el.isEmpty()) {
            md.setId(MediaMetadata.IMDB, "tt" + StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)"));
        }

        // title / year
        // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" />
        el = doc.getElementsByAttributeValue("property", "og:title");
        if (!el.isEmpty()) {
            String[] ty = parseTitle(el.first().attr("content"));
            md.setTitle(StrgUtils.removeCommonSortableName(ty[0]));
            try {
                md.setYear(Integer.parseInt(ty[1]));
            } catch (Exception ignored) {
                // best effort: the year is looked up at another position below
            }
        }
        // another year position
        if (md.getYear() == 0) {
            // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a>
            el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr");
            try {
                md.setYear(Integer.parseInt(el.first().text()));
            } catch (Exception ignored) {
                // best effort: no such link or no number in it - the year stays unset
            }
        }

        // original title (has to be searched with a regexp)
        // <tr valign="top">
        // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif"
        // size="2">Originaltitel:</font></td>
        // <td>&nbsp;&nbsp;</td>
        // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif"
        // size="2"><b>Brave</b></font></td>
        // </tr>
        String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>");
        if (!originalTitle.isEmpty()) {
            md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle));
        }

        // Genre: <a href="view.php?page=genre&Genre=Action">Action</a>
        el = doc.getElementsByAttributeValueContaining("href", "page=genre");
        for (Element g : el) {
            md.addGenre(getTmmGenre(g.text()));
        }

        // rating
        // <div itemtype="http://schema.org/AggregateRating" itemscope
        // itemprop="aggregateRating">Note: <span
        // itemprop="ratingValue">6.73</span><meta
        // itemprop="worstRating" content="1" />
        el = doc.getElementsByAttributeValue("itemprop", "ratingValue");
        if (!el.isEmpty()) {
            String r = el.text();
            if (!r.isEmpty()) {
                try {
                    md.setRating(Float.parseFloat(r));
                } catch (Exception e) {
                    LOGGER.debug("could not parse rating");
                }
            }
        }

        // get PlotLink; open url and parse
        // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a>
        LOGGER.trace("parse plot");
        el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,");
        if (!el.isEmpty()) {
            String plotUrl = BASE_URL + "/" + el.first().attr("href");
            try {
                Document plot = fetchOfdbDocument(plotUrl);
                // first Blocksatz is plot
                Elements block = plot.getElementsByClass("Blocksatz");
                String p = block.first().text(); // remove all html stuff

                // remove the "header", which ends with "Mal gelesen" followed by one more character;
                // if the header is missing the text is kept as it is (a plain
                // substring(indexOf(..) + 12) would cut off the first 11 characters of the plot)
                final String plotHeaderEnd = "Mal gelesen";
                int headerPos = p.indexOf(plotHeaderEnd);
                if (headerPos >= 0) {
                    p = p.substring(Math.min(p.length(), headerPos + plotHeaderEnd.length() + 1));
                }
                md.setPlot(p);
            } catch (Exception e) {
                LOGGER.error("failed to get plot page: " + e.getMessage());
            }
        }

        // http://www.ofdb.de/view.php?page=film_detail&fid=226745
        LOGGER.debug("parse actor detail");
        String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId;
        Document castDoc = null;
        try {
            castDoc = fetchOfdbDocument(movieDetail);
        } catch (Exception e) {
            LOGGER.error("failed to get detail page: " + e.getMessage());
        }

        if (castDoc != null) {
            parseCast(castDoc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR,
                    md);
            parseCast(castDoc.getElementsContainingOwnText("Synchronstimme (deutsch)"),
                    MediaCastMember.CastType.ACTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Drehbuchautor(in)"),
                    MediaCastMember.CastType.WRITER, md);
            parseCast(castDoc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER,
                    md);
        }
    } catch (Exception e) {
        LOGGER.error("Error parsing " + detailUrl);
        throw e;
    }

    return md;
}

/**
 * Downloads the given page and parses it as UTF-8; the stream is closed even if parsing fails.
 *
 * @param pageUrl the url of the page to fetch
 * @return the parsed page
 * @throws Exception if the page could not be fetched or parsed
 */
private Document fetchOfdbDocument(String pageUrl) throws Exception {
    InputStream in = new Url(pageUrl).getInputStream();
    try {
        return Jsoup.parse(in, "UTF-8", "");
    } finally {
        if (in != null) {
            in.close();
        }
    }
}

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Searches zelluloid.de for movies.
 * <p>
 * The search term is taken from the QUERY parameter of the options or, if that is empty, from
 * the TITLE parameter. If the response contains no "hit.php" links and its title does not
 * contain "Suche nach", the response is treated as the detail page of a single movie.
 * Several links pointing to the same movie id are merged into one result.
 *
 * @param options the search options
 * @return the results, sorted and then reversed; an empty list if there is no search term, the
 *         page could not be fetched or nothing was found
 * @throws Exception if the search term could not be url encoded
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception {
    LOGGER.debug("search() " + options.toString());
    List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>();
    String searchUrl = "";
    String searchTerm = "";
    // NOTE: no IMDb id is ever determined in this method, so this always stays empty
    String imdb = "";

    // only title search
    if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search for everything: " + searchTerm);
    } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search with title: " + searchTerm);
    } else {
        LOGGER.debug("empty searchString");
        return resultList;
    }

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    Document doc = null;
    try {
        Url url = new CachedUrl(searchUrl);
        InputStream in = url.getInputStream();
        try {
            doc = Jsoup.parse(in, PAGE_ENCODING, "");
        } finally {
            // close the stream even if parsing failed
            if (in != null) {
                in.close();
            }
        }
    } catch (Exception e) {
        LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage());

        // clear cache
        CachedUrl.removeCachedFileForUrl(searchUrl);
    }

    if (doc == null) {
        return resultList;
    }

    // only look for movie links
    Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php");
    LOGGER.debug("found " + filme.size() + " search results");
    if (filme.isEmpty()) {
        if (!doc.getElementsByTag("title").text().contains("Suche nach")) {
            // redirected to detail page
            MediaSearchResult msr = new MediaSearchResult(providerInfo.getId());
            Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id=");
            if (el.size() > 0) {
                msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)"));
            }
            msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim());
            el = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            // the year is only taken over if it is unambiguous
            if (el.size() == 1) {
                msr.setYear(el.get(0).text());
            }
            resultList.add(msr);
        }
        return resultList;
    }

    // <a
    // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1"
    // class="normLight">Avatar - Aufbruch nach Pandora</B>
    // <nobr>(2009)</nobr><br /><span class="smallLight"
    // style="color:#ccc;">Avatar</span></a>

    // map to merge 2 results :/
    Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>();

    for (Element a : filme) {
        try {
            String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-");
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            if (res.containsKey(id)) {
                LOGGER.debug("dupe found; merging with previous searchresult");
                sr = res.get(id);
            }

            // only fill values which have not been set by a previous link to the same movie
            if (StringUtils.isNotEmpty(imdb)) {
                sr.setIMDBId(imdb);
            }
            if (StringUtils.isEmpty(sr.getId())) {
                sr.setId(id);
            }
            if (StringUtils.isEmpty(sr.getTitle())) {
                // with a <nobr> (year) inside the link only the link's own text is the title
                if (a.html().contains("nobr")) {
                    sr.setTitle(a.ownText());
                } else {
                    sr.setTitle(a.text());
                }
            }
            LOGGER.debug("found movie " + sr.getTitle());
            if (StringUtils.isEmpty(sr.getOriginalTitle())) {
                sr.setOriginalTitle(a.getElementsByTag("span").text());
            }
            if (StringUtils.isEmpty(sr.getYear())) {
                // any 4 digit
                sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*"));
            }
            sr.setMediaType(MediaType.MOVIE);
            sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id);

            // an empty imdb id must never count as a match: without this guard every result
            // whose own IMDb id is an empty string would be rated as a perfect match
            if (StringUtils.isNotEmpty(imdb) && imdb.equals(sr.getIMDBId())) {
                // perfect match
                sr.setScore(1);
            } else {
                // compare score based on names
                sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle()));
            }

            // populate extra args
            MetadataUtil.copySearchQueryToSearchResult(options, sr);
            res.put(id, sr);
        } catch (Exception e) {
            LOGGER.warn("error parsing movie result: " + e.getMessage());
        }
    }

    resultList.addAll(res.values());
    // sort, then reverse the sorted order
    Collections.sort(resultList);
    Collections.reverse(resultList);
    return resultList;
}

From source file:org.tomstools.web.service.SolrService.java

/**
 * Regenerates the stored hot words of the given flag.
 * <p>
 * The page configured for the flag is fetched and every element matching the configured
 * selector becomes one word; the i-th element (0-based) is created with the value
 * {@code 100 / (i + 1)}. The heat of each word is then increased by the number of documents
 * counted by {@code solrTool} for the AND-combination of the word's terms between
 * {@code beginTime} and {@code endTime}. Finally the stored hot words of the flag are replaced.
 *
 * @param flag      the hot flag; nothing is done if it is empty
 * @param solrTool  used to count the matching documents
 * @param beginTime begin of the counted period
 * @param endTime   end of the counted period
 * @return the number of stored words; 0 if the flag is empty or no word was found
 */
private int doGenerateHot(String flag, SolrTools solrTool, Date beginTime, Date endTime) {
    if (StringUtils.isEmpty(flag)) {
        return 0;
    }

    // selector, url and charset are configured per flag
    final String selector = userService.getConfig(-1, "SELECTOR_4_HOT_FLAG_" + flag);
    final String pageUrl = userService.getConfig(-1, "URL_4_HOT_FLAG_" + flag);
    final String charsetName = userService.getConfig(-1, "CHARSET_4_HOT_FLAG_" + flag);

    String html = new PageFetcher(charsetName).fetchPageContent(pageUrl);
    Elements hits = Jsoup.parse(html).select(selector);
    if (hits == null) {
        return 0;
    }

    // one word per selected element; the value shrinks with the position on the page
    List<Word> words = new ArrayList<Word>();
    int rank = 1;
    for (Element hit : hits) {
        words.add(new Word(hit.text(), 100 / rank));
        rank++;
    }

    // add the number of matching documents to the heat of every word
    for (Word word : words) {
        List<String> terms = analyzer.getWords(word.word);
        if (!terms.isEmpty()) {
            StringBuilder query = new StringBuilder("text:");
            String separator = "";
            for (String term : terms) {
                query.append(separator).append(term);
                separator = " AND ";
            }
            try {
                word.heat += solrTool.count(query.toString(), beginTime, endTime);
            } catch (Exception ex) {
                LOG.error(ex.getMessage(), ex);
            }
        }
    }

    if (words.isEmpty()) {
        return 0;
    }

    // replace the stored hot words of this flag
    siteMapper.deleteHot(flag);
    for (Word word : words) {
        siteMapper.saveHot(flag, word.word, word.heat);
    }
    return words.size();
}

From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java

/**
 * Adds every {@code word} element found below the {@code side} elements of the given node as
 * a similar recommendation. The language of a recommendation is resolved from the {@code lang}
 * attribute of the enclosing {@code side} element.
 *
 * @param similarityNode           the node containing the sides; if {@code null} a warning is
 *                                 logged and nothing is added
 * @param engineQueryResultBuilder the builder which receives the recommendations
 */
private void processSimilarities(@Nullable Element similarityNode,
        @NotNull EngineQueryResultBuilder engineQueryResultBuilder) {
    if (similarityNode == null) {
        LOGGER.warn("Couldn't find similarity node");
        return;
    }

    for (Element side : similarityNode.getElementsByTag("side")) {
        // resolved once per side and shared by all words of that side
        Language sideLanguage = Language.getExistingLanguageById(side.attr("lang"));
        Elements words = side.getElementsByTag("word");

        for (Element word : words) {
            engineQueryResultBuilder.addSimilarRecommendation(
                    new DictionaryObjectBuilder()
                            .setLanguage(sideLanguage)
                            .setGeneralForm(word.text())
                            .build());
        }
    }
}

From source file:org.xlrnet.metadict.engines.nobordbok.OrdbokEngine.java

/**
 * Reads the text of the oppslagsord-node and stores it in the builder twice: with all
 * syllable separators removed as general form, and split at the syllable separators as
 * syllabification.
 */
private void extractGeneralForm(DictionaryObjectBuilder objectBuilder, Element oppslagsord) {
    final String separatedForm = oppslagsord.text();

    objectBuilder.setGeneralForm(StringUtils.remove(separatedForm, SYLLABLE_SEPARATOR_CHAR));
    objectBuilder.setSyllabification(StringUtils.split(separatedForm, SYLLABLE_SEPARATOR_CHAR));
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Collects the recommendations from all nodes matching {@code div.cc > p > *}.
 * <p>
 * A {@code span} with the class {@code flagicon} switches the language used for the following
 * entries to the first of its remaining class names that resolves to a known language; every
 * {@code a} element is added as a recommendation in the language that is current at that point
 * ({@code null} if no language has been detected yet).
 *
 * @param doc           the document to scan
 * @param resultBuilder the builder which receives the recommendations
 */
private void findRecommendations(@NotNull Document doc, @NotNull BilingualQueryResultBuilder resultBuilder) {
    Language currentLanguage = null;

    for (Element node : doc.select("div.cc > p > *")) {
        String tag = node.tagName();

        if ("span".equals(tag) && node.hasClass("flagicon")) {
            // language marker: look for a class name (other than the marker class) naming a language
            Set<String> classNames = node.classNames();
            classNames.remove("flagicon");
            for (String className : classNames) {
                Language candidate = Language.getExistingLanguageById(className);
                if (candidate == null) {
                    continue;
                }
                currentLanguage = candidate;
                break;
            }
            continue;
        }

        if ("a".equals(tag)) {
            // recommendation entry
            DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder();
            objectBuilder.setLanguage(currentLanguage).setGeneralForm(node.text());
            resultBuilder.addSimilarRecommendation(objectBuilder.build());
        }
    }
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Builds a dictionary object from a single result node.
 * <p>
 * The general form is the text of the first {@code a} element of the node, prefixed with the
 * node text that precedes it. Description and gender are added by
 * {@code extractDescription} and {@code extractGender}.
 * <p>
 * NOTE(review): the node is expected to contain at least one {@code a} element; without one
 * the lookup of the first anchor presumably yields {@code null} and this method fails with a
 * {@link NullPointerException}.
 *
 * @param element     the node to process
 * @param language    the language of the resulting object
 * @param queryString the original query; passed on to the description extraction
 * @return the built dictionary object
 */
@NotNull
private DictionaryObject processSingleNode(@NotNull Element element, @NotNull Language language,
        String queryString) {
    DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder();
    objectBuilder.setLanguage(language);

    // Extract entry text:
    String linkText = element.getElementsByTag("a").first().text();
    String leadingContext = StringUtils.substringBefore(element.text(), linkText);
    objectBuilder.setGeneralForm(StringUtils.strip(leadingContext + linkText));

    // Extract description and gender:
    extractDescription(element, queryString, objectBuilder);
    extractGender(element, objectBuilder);

    return objectBuilder.build();
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Sets the description of the entry, if the node provides one that differs from the query.
 * <p>
 * The description is read from the first child with the description class or, if there is
 * none, from the first child with the extra info class (necessary for synonyms). The
 * description prefix and suffix are removed before the text is compared (ignoring case) with
 * the query string.
 *
 * @param element       the node to inspect
 * @param queryString   the original query
 * @param objectBuilder the builder which receives the description
 */
private void extractDescription(@NotNull Element element, String queryString,
        DictionaryObjectBuilder objectBuilder) {
    Element descriptionNode = element.getElementsByClass(CLASS_DESCRIPTION).first();
    if (descriptionNode == null) {
        // Fall back to the alternative class (necessary for synonyms)
        descriptionNode = element.getElementsByClass(CLASS_EXTRA_INFO).first();
    }
    if (descriptionNode == null) {
        return;
    }

    String description = StringUtils.removeEnd(
            StringUtils.removeStart(descriptionNode.text(), DESCRIPTION_BEGIN), DESCRIPTION_END);

    // A description which only repeats the request string is not stored
    if (StringUtils.equalsIgnoreCase(description, queryString)) {
        return;
    }
    objectBuilder.setDescription(StringUtils.strip(description));
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Sets the grammatical gender of the entry if the node has a child with the gender class whose
 * text is a key of {@code GENDER_MAP}.
 *
 * @param element       the node to inspect
 * @param objectBuilder the builder which receives the gender
 */
private void extractGender(@NotNull Element element, DictionaryObjectBuilder objectBuilder) {
    Element genderNode = element.getElementsByClass(CLASS_GENDER).first();
    if (genderNode == null) {
        return;
    }

    String genderLabel = genderNode.text();
    if (GENDER_MAP.containsKey(genderLabel)) {
        objectBuilder.setGrammaticalGender(GENDER_MAP.get(genderLabel));
    }
}