Example usage for org.jsoup.nodes Element getElementsByClass

List of usage examples for org.jsoup.nodes Element getElementsByClass

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.getElementsByClass.

Prototype

public Elements getElementsByClass(String className) 

Source Link

Document

Find elements that have this class, including or under this element.

Usage

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Scrapes the metadata of a single movie.
 * <p>
 * If the given options already carry a search result with attached metadata, that metadata is returned as is.
 * Otherwise the IMDb id is taken from the search result or, failing that, from the scrape options; without a
 * valid id an (almost) empty metadata object is returned. The /reference, /plotsummary and /releaseinfo pages
 * are fetched via {@code ImdbWorker}s and parsed; optionally the result is enriched with data from TMDB
 * (config keys {@code useTmdb} / {@code scrapeCollectionInfo}).
 *
 * @param options
 *          the scrape options (search result, IMDb id, language, country)
 * @return the scraped metadata, never {@code null}
 * @throws Exception
 *           if fetching or parsing one of the IMDb pages fails; failures of the TMDB request are only logged
 */
MediaMetadata getMovieMetadata(MediaScrapeOptions options) throws Exception {
    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    // check if there is a md in the result
    if (options.getResult() != null && options.getResult().getMediaMetadata() != null) {
        LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult());
        return options.getResult().getMediaMetadata();
    }

    String imdbId = "";

    // imdbId from searchResult
    if (options.getResult() != null) {
        imdbId = options.getResult().getIMDBId();
    }

    // imdbid from scraper option
    if (!MetadataUtil.isValidImdbId(imdbId)) {
        imdbId = options.getImdbId();
    }

    if (!MetadataUtil.isValidImdbId(imdbId)) {
        return md;
    }

    LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId);
    md.setId(providerInfo.getId(), imdbId);

    ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<>(executor);
    ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<>(executor);

    // worker for imdb request (/reference) (everytime from www.imdb.com)
    String referenceUrl = ImdbSiteDefinition.IMDB_COM.getSite() + "title/" + imdbId + "/reference";
    Callable<Document> worker = new ImdbWorker(referenceUrl, options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    Future<Document> futureReference = compSvcImdb.submit(worker);

    // worker for imdb request (/plotsummary) (from chosen site)
    String plotsummaryUrl = imdbSite.getSite() + "title/" + imdbId + "/plotsummary";
    worker = new ImdbWorker(plotsummaryUrl, options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    Future<Document> futurePlotsummary = compSvcImdb.submit(worker);

    // worker for tmdb request
    Future<MediaMetadata> futureTmdb = null;
    if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")
            || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo")) {
        Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry());
        futureTmdb = compSvcTmdb.submit(worker2);
    }

    Document doc = futureReference.get();
    parseReferencePage(doc, options, md);

    // plot from /plotsummary
    doc = futurePlotsummary.get();
    parsePlotsummaryPage(doc, options, md);

    // title also from chosen site if we are not scraping akas.imdb.com
    if (imdbSite != ImdbSiteDefinition.IMDB_COM) {
        Element title = doc.getElementById("tn15title");
        if (title != null) {
            Elements elements = title.getElementsByClass("main");
            if (elements.size() > 0) {
                md.setTitle(cleanString(elements.first().ownText()));
            }
        }
    }

    // get the release info page
    String releaseinfoUrl = imdbSite.getSite() + "title/" + imdbId + "/releaseinfo";
    worker = new ImdbWorker(releaseinfoUrl, options.getLanguage().getLanguage(),
            options.getCountry().getAlpha2(), imdbSite);
    Future<Document> futureReleaseinfo = compSvcImdb.submit(worker);
    doc = futureReleaseinfo.get();
    // parse original title here!!
    parseReleaseinfoPageAKAs(doc, options, md);

    // did we get a release date?
    if (md.getReleaseDate() == null
            || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("localReleaseDate")) {
        parseReleaseinfoPage(doc, options, md);
    }

    // get data from tmdb?
    if (futureTmdb != null && (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")
            || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo"))) {
        try {
            MediaMetadata tmdbMd = futureTmdb.get();
            // the worker may deliver no result at all; guard once instead of relying on a swallowed NPE
            if (tmdbMd != null) {
                if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) {
                    // tmdbid
                    md.setId(MediaMetadata.TMDB, tmdbMd.getId(MediaMetadata.TMDB));
                    // title
                    if (StringUtils.isNotBlank(tmdbMd.getTitle())) {
                        md.setTitle(tmdbMd.getTitle());
                    }
                    // original title
                    if (StringUtils.isNotBlank(tmdbMd.getOriginalTitle())) {
                        md.setOriginalTitle(tmdbMd.getOriginalTitle());
                    }
                    // tagline
                    if (StringUtils.isNotBlank(tmdbMd.getTagline())) {
                        md.setTagline(tmdbMd.getTagline());
                    }
                    // plot
                    if (StringUtils.isNotBlank(tmdbMd.getPlot())) {
                        md.setPlot(tmdbMd.getPlot());
                    }
                    // collection info
                    if (StringUtils.isNotBlank(tmdbMd.getCollectionName())) {
                        md.setCollectionName(tmdbMd.getCollectionName());
                        md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET));
                    }
                }
                if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo")) {
                    md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET));
                    md.setCollectionName(tmdbMd.getCollectionName());
                }
                md.setId(tmdbMd.getProviderId(), tmdbMd.getId(tmdbMd.getProviderId()));
            }
        } catch (InterruptedException e) {
            // TMDB data is optional: keep what we scraped from IMDb, but do not lose the interrupt
            Thread.currentThread().interrupt();
            LOGGER.debug("IMDB: interrupted while waiting for the TMDB result", e);
        } catch (Exception e) {
            // TMDB data is optional (best effort) - log the failure instead of hiding it
            LOGGER.debug("IMDB: could not get data from TMDB", e);
        }
    }

    // if we have still no original title, take the title
    if (StringUtils.isBlank(md.getOriginalTitle())) {
        md.setOriginalTitle(md.getTitle());
    }

    // populate id
    md.setId(ImdbMetadataProvider.providerInfo.getId(), imdbId);

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Parses the release date out of the IMDb release info page and stores it in the given metadata.
 * <p>
 * Lookup order:
 * <ol>
 * <li>old layout: rows of the table with id {@code release_dates}, date in class {@code release_date}</li>
 * <li>new layout: elements with class {@code release-date-item}, date in class {@code release-date-item__date}</li>
 * <li>fallback: the first date found on the page, regardless of the country</li>
 * </ol>
 * In steps 1 and 2 only rows whose calendar link carries the country of the scrape options are considered.
 *
 * @param doc
 *          the parsed release info page
 * @param options
 *          the scrape options (the country is used to pick the local release date)
 * @param md
 *          the metadata to store the release date in; left untouched if no date could be parsed
 * @return the passed metadata object
 */
private MediaMetadata parseReleaseinfoPage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    Date releaseDate = null;
    Pattern pattern = Pattern.compile("/calendar/\\?region=(.{2})");

    // old way
    Element tableReleaseDates = doc.getElementById("release_dates");
    if (tableReleaseDates != null) {
        releaseDate = findReleaseDateForCountry(tableReleaseDates.getElementsByTag("tr"), "release_date", pattern,
                options);
    }

    // new way; iterating over class name items
    if (releaseDate == null) {
        releaseDate = findReleaseDateForCountry(doc.getElementsByClass("release-date-item"),
                "release-date-item__date", pattern, options);
    }

    // no matching local release date found; take the first one
    if (releaseDate == null) {
        Element column = doc.getElementsByClass("release_date").first();
        if (column == null) {
            column = doc.getElementsByClass("release-date-item__date").first();
        }
        if (column != null) {
            releaseDate = parseReleaseinfoDate(column.text());
        }
    }

    if (releaseDate != null) {
        md.setReleaseDate(releaseDate);
    }
    return md;
}

/**
 * Searches the given rows for the first parseable release date belonging to the country of the scrape options.
 *
 * @param rows
 *          the candidate rows, each expected to contain a link starting with {@code /calendar/}
 * @param dateClass
 *          the CSS class of the element holding the date text inside a row
 * @param regionPattern
 *          the pattern extracting the region code (group 1) from the calendar link
 * @param options
 *          the scrape options providing the wanted country
 * @return the parsed date or {@code null} if no row matched or no date could be parsed
 */
private Date findReleaseDateForCountry(Elements rows, String dateClass, Pattern regionPattern,
        MediaScrapeOptions options) {
    for (Element row : rows) {
        Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first();
        if (anchor == null) {
            continue;
        }

        Matcher matcher = regionPattern.matcher(anchor.attr("href"));
        if (!matcher.find()) {
            // calendar link without a region: there is no group to read (Matcher#group would throw here)
            continue;
        }

        String ourCountry = options.getCountry().getAlpha2();
        if (!ourCountry.equalsIgnoreCase(matcher.group(1))) {
            LOGGER.trace("country {} does not match ours {}", matcher.group(1), ourCountry);
            continue;
        }

        Element column = row.getElementsByClass(dateClass).first();
        if (column != null) {
            Date parsed = parseReleaseinfoDate(column.text());
            if (parsed != null) {
                return parsed;
            }
            // unparseable date: keep looking, there may be another row for the same country
        }
    }
    return null;
}

/**
 * Parses a date as printed on the release info page; tries {@code d MMMM yyyy} first and {@code MMMM yyyy}
 * second (both with {@link Locale#US}).
 *
 * @param text
 *          the date text
 * @return the parsed date or {@code null} if the text matches neither format
 */
private Date parseReleaseinfoDate(String text) {
    try {
        return new SimpleDateFormat("d MMMM yyyy", Locale.US).parse(text);
    } catch (ParseException otherformat) {
        try {
            return new SimpleDateFormat("MMMM yyyy", Locale.US).parse(text);
        } catch (ParseException ignored) {
            // neither of the known formats - the caller decides what to do without a date
            return null;
        }
    }
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Parses the original title out of the "also known as" section of the IMDb release info page and stores it in
 * the given metadata.
 * <p>
 * Two page layouts are supported: the old {@code <table id="akas">} layout and, if that yields no original
 * title, the newer layout based on the classes {@code aka-item}, {@code aka-item__name} and
 * {@code aka-item__title}.
 *
 * @param doc
 *          the parsed release info page
 * @param options
 *          the scrape options (not used by this method)
 * @param md
 *          the metadata to store the original title in
 * @return the passed metadata object
 */
private MediaMetadata parseReleaseinfoPageAKAs(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    // <table id="akas" class="subpage_data spEven2Col">
    // <tr class="even">
    // <td>(original title)</td>
    // <td>Intouchables</td>
    // </tr>
    // need to search all tables for correct ID, since the UNIQUE id is used multiple times - thanks for nothing :p
    for (Element table : doc.getElementsByTag("table")) {
        if (!table.id().equalsIgnoreCase("akas")) {
            continue;
        }
        for (Element row : table.getElementsByTag("tr")) {
            Elements cells = row.getElementsByTag("td");
            if (cells.size() < 2) {
                // header or malformed row: no label/title pair to read
                continue;
            }
            if (cells.get(0).text().toLowerCase(Locale.ROOT).contains("original title")) {
                md.setOriginalTitle(cells.get(1).text());
                break;
            }
        }
    }

    // alternative; new way with table classes
    // <tr class="ipl-zebra-list__item aka-item">
    // <td class="aka-item__name">Germany</td>
    // <td class="aka-item__title">Avatar - Aufbruch nach Pandora</td>
    // </tr>
    String originalTitle = md.getOriginalTitle();
    if (originalTitle == null || originalTitle.isEmpty()) {
        for (Element row : doc.getElementsByClass("aka-item")) {
            Element country = row.getElementsByClass("aka-item__name").first();
            Element title = row.getElementsByClass("aka-item__title").first();
            if (country != null && title != null
                    && country.text().toLowerCase(Locale.ROOT).contains("original title")) {
                md.setOriginalTitle(title.text());
                break;
            }
        }
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Runs a search against the IMDb "find" page and converts the response into search results.
 * <p>
 * The search term is the IMDb id of the query if present, otherwise the query string; with neither an empty
 * list is returned. If the response carries a canonical link containing an IMDb id (i.e. we have been
 * redirected to a title page), the full metadata is scraped and returned as the single result with score 1.
 * Otherwise the rows with class "findResult" are parsed and scored, and at most 40 results are returned.
 * 
 * @param query
 *          the search params
 * @return the found results (sorted and then reversed, see the end of the method); empty if there is no
 *         search term or the search page could not be fetched
 * @throws Exception
 *           declared for errors raised while scraping the full metadata; fetch errors of the search page are
 *           caught and logged
 */
protected List<MediaSearchResult> search(MediaSearchOptions query) throws Exception {
    List<MediaSearchResult> result = new ArrayList<>();

    /*
     * IMDb matches seem to come in several "flavours".
     * 
     * Firstly, if there is one exact match it returns the matching IMDb page.
     * 
     * If that fails to produce a unique hit then a list of possible matches are returned categorised as: Popular Titles (Displaying ? Results) Titles
     * (Exact Matches) (Displaying ? Results) Titles (Partial Matches) (Displaying ? Results)
     * 
     * We should check the Exact match section first, then the popular titles and finally the partial matches.
     * 
     * Note: That even with exact matches there can be more than 1 hit, for example "Star Trek"
     */
    String searchTerm = "";

    // prefer the IMDb id as search term ...
    if (StringUtils.isNotEmpty(query.getImdbId())) {
        searchTerm = query.getImdbId();
    }

    // ... and fall back to the plain query string
    if (StringUtils.isEmpty(searchTerm)) {
        searchTerm = query.getQuery();
    }

    // nothing to search for
    if (StringUtils.isEmpty(searchTerm)) {
        return result;
    }

    // parse out language and country from the scraper query
    String language = query.getLanguage().getLanguage();
    int myear = query.getYear();
    String country = query.getCountry().getAlpha2(); // for passing the country to the scrape

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    // build the search url: <site>find?q=<encoded term><category>
    StringBuilder sb = new StringBuilder(getImdbSite().getSite());
    sb.append("find?q=");
    try {
        // search site was everytime in UTF-8
        sb.append(URLEncoder.encode(searchTerm, "UTF-8"));
    } catch (UnsupportedEncodingException ex) {
        // Failed to encode the movie name for some reason! Fall back to the raw (unencoded) term
        getLogger().debug("Failed to encode search term: " + searchTerm);
        sb.append(searchTerm);
    }

    // we need to search for all - otherwise we do not find TV movies
    sb.append(getSearchCategory());

    getLogger().debug("========= BEGIN IMDB Scraper Search for: " + sb.toString());
    Document doc;
    try {
        Url url = new Url(sb.toString());
        url.addHeader("Accept-Language", getAcceptLanguage(language, country));
        // NOTE(review): the stream returned by url.getInputStream() is not closed here - confirm whether
        // Url or Jsoup.parse takes care of that
        doc = Jsoup.parse(url.getInputStream(), "UTF-8", "");
    } catch (Exception e) {
        // any fetch/parse problem ends the search with an empty result
        getLogger().debug("tried to fetch search response", e);
        return result;
    }

    // check if it was directly redirected to the site
    Elements elements = doc.getElementsByAttributeValue("rel", "canonical");
    for (Element element : elements) {
        MediaMetadata md = null;
        // we have been redirected to the movie site
        String movieName = null;
        String movieId = null;

        // take the IMDb id out of the canonical link (the last match wins)
        String href = element.attr("href");
        Matcher matcher = IMDB_ID_PATTERN.matcher(href);
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                movieId = matcher.group(1);
            }
        }

        // get full information
        if (!StringUtils.isEmpty(movieId)) {
            MediaScrapeOptions options = new MediaScrapeOptions(type);
            options.setImdbId(movieId);
            options.setLanguage(query.getLanguage());
            // NOTE(review): round trip country -> alpha2 -> CountryCode; presumably equal to query.getCountry()
            options.setCountry(CountryCode.valueOf(country));
            md = getMetadata(options);
            if (!StringUtils.isEmpty(md.getTitle())) {
                movieName = md.getTitle();
            }
        }

        // if a movie name/id was found - return it
        // (md is non-null here: movieName is only set after md has been assigned)
        if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) {
            MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(),
                    query.getMediaType());
            sr.setTitle(movieName);
            sr.setIMDBId(movieId);
            sr.setYear(md.getYear());
            sr.setMetadata(md);
            sr.setScore(1);

            // and parse out the poster
            String posterUrl = "";
            Elements posters = doc.getElementsByClass("poster");
            if (posters != null && !posters.isEmpty()) {
                Elements imgs = posters.get(0).getElementsByTag("img");
                // the last img wins; rewrite the size parameters (UX/UY -> 200) and drop the crop parameter
                for (Element img : imgs) {
                    posterUrl = img.attr("src");
                    posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_");
                    posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_");
                    posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
                }
            }
            if (StringUtils.isNotBlank(posterUrl)) {
                sr.setPosterUrl(posterUrl);
            }

            // a direct hit is the only result
            result.add(sr);
            return result;
        }
    }

    // parse results
    // elements = doc.getElementsByClass("result_text");
    elements = doc.getElementsByClass("findResult");
    for (Element tr : elements) {
        // we only want the tr's
        if (!"tr".equalsIgnoreCase(tr.tagName())) {
            continue;
        }

        // find the id / name
        String movieName = "";
        String movieId = "";
        int year = 0;
        Elements tds = tr.getElementsByClass("result_text");
        for (Element element : tds) {
            // we only want the td's
            if (!"td".equalsIgnoreCase(element.tagName())) {
                continue;
            }

            // filter out unwanted results
            Pattern unwantedSearchResultPattern = getUnwantedSearchResultPattern();
            if (unwantedSearchResultPattern != null) {
                Matcher matcher = unwantedSearchResultPattern.matcher(element.text());
                if (matcher.find()) {
                    continue;
                }
            }

            // is there a localized name? (aka)
            String localizedName = "";
            Elements italics = element.getElementsByTag("i");
            if (italics.size() > 0) {
                localizedName = italics.text().replace("\"", "");
            }

            // get the name inside the link (only the first anchor with text is used)
            Elements anchors = element.getElementsByTag("a");
            for (Element a : anchors) {
                if (StringUtils.isNotEmpty(a.text())) {
                    // movie name
                    if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) {
                        // take AKA as title, but only if not EN
                        movieName = localizedName;
                    } else {
                        movieName = a.text();
                    }

                    // parse id (the last match wins)
                    String href = a.attr("href");
                    Matcher matcher = IMDB_ID_PATTERN.matcher(href);
                    while (matcher.find()) {
                        if (matcher.group(1) != null) {
                            movieId = matcher.group(1);
                        }
                    }

                    // try to parse out the year
                    // NOTE(review): the alternation splits the whole pattern into "(dddd" OR "/)"; group 1 is
                    // null for the second alternative, hence the null check below - confirm this is intended
                    Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)");
                    matcher = yearPattern.matcher(element.text());
                    while (matcher.find()) {
                        if (matcher.group(1) != null) {
                            try {
                                year = Integer.parseInt(matcher.group(1));
                                break;
                            } catch (Exception ignored) {
                            }
                        }
                    }
                    break;
                }
            }
        }

        // parse the poster image (done before the name/id check below)
        String posterUrl = "";
        tds = tr.getElementsByClass("primary_photo");
        for (Element element : tds) {
            Elements imgs = element.getElementsByTag("img");
            // the last img wins; same size/crop rewriting as for the direct hit above
            for (Element img : imgs) {
                posterUrl = img.attr("src");
                posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_");
                posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_");
                posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
            }
        }

        // if no movie name/id was found - continue
        if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) {
            continue;
        }

        MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(),
                query.getMediaType());
        sr.setTitle(movieName);
        sr.setIMDBId(movieId);
        sr.setYear(year);
        sr.setPosterUrl(posterUrl);

        if (movieId.equals(query.getImdbId())) {
            // perfect match
            sr.setScore(1);
        } else {
            // compare score based on names
            float score = MetadataUtil.calculateScore(searchTerm, movieName);
            if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) {
                getLogger().debug("no poster - downgrading score by 0.01");
                score = score - 0.01f;
            }
            if (yearDiffers(myear, year)) {
                // penalty of 0.01 per year of difference
                float diff = (float) Math.abs(year - myear) / 100;
                getLogger()
                        .debug("parsed year does not match search result year - downgrading score by " + diff);
                score -= diff;
            }
            sr.setScore(score);
        }

        result.add(sr);

        // only get 40 results
        if (result.size() >= 40) {
            break;
        }
    }
    // natural order of MediaSearchResult, reversed (presumably best score first - verify against compareTo)
    Collections.sort(result);
    Collections.reverse(result);

    return result;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    /*//from  w  ww .j a v  a  2 s  .  c  o  m
     * title and year have the following structure
     * 
     * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span
     * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div>
     */

    // title
    Element title = doc.getElementsByAttributeValue("name", "title").first();
    if (title != null) {
        String movieTitle = cleanString(title.attr("content"));
        int yearStart = movieTitle.lastIndexOf("(");
        if (yearStart > 0) {
            movieTitle = movieTitle.substring(0, yearStart - 1).trim();
            md.setTitle(movieTitle);
        }
    }

    // original title and year
    Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first();
    if (originalTitleYear != null) {
        String content = originalTitleYear.attr("content");
        int startOfYear = content.lastIndexOf("(");
        if (startOfYear > 0) {
            // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page...
            // String originalTitle = content.substring(0, startOfYear - 1).trim();
            // md.setOriginalTitle(originalTitle);

            String yearText = content.substring(startOfYear);

            // search year
            Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}");
            Matcher matcher = yearPattern.matcher(yearText);
            while (matcher.find()) {
                if (matcher.group(0) != null) {
                    String movieYear = matcher.group(0);
                    try {
                        md.setYear(Integer.parseInt(movieYear));
                        break;
                    } catch (Exception ignored) {
                    }
                }
            }
        }
    }

    // poster
    Element poster = doc.getElementsByAttributeValue("property", "og:image").first();
    if (poster != null) {
        String posterUrl = poster.attr("content");

        int fileStart = posterUrl.lastIndexOf("/");
        if (fileStart > 0) {
            int parameterStart = posterUrl.indexOf("_", fileStart);
            if (parameterStart > 0) {
                int startOfExtension = posterUrl.lastIndexOf(".");
                if (startOfExtension > parameterStart) {
                    posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension);

                }
            }
        }
        processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl);
    }

    /*
     * <div class="starbar-meta"> <b>7.4/10</b> &nbsp;&nbsp;<a href="ratings" class="tn15more">52,871 votes</a>&nbsp;&raquo; </div>
     */

    // rating and rating count
    Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first();
    if (ratingElement != null) {
        String ratingAsString = ratingElement.ownText().replace(",", ".");
        try {
            md.setRating(Float.valueOf(ratingAsString));
        } catch (Exception ignored) {
        }

        Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first();
        if (votesElement != null) {
            String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim();
            try {
                md.setVoteCount(Integer.parseInt(countAsString));
            } catch (Exception ignored) {
            }
        }
    }
    // top250
    Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first();
    if (topRatedElement != null) {
        Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})");
        Matcher matcher = topPattern.matcher(topRatedElement.ownText());
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                try {
                    String top250Text = matcher.group(1);
                    md.setTop250(Integer.parseInt(top250Text));
                } catch (Exception ignored) {
                }
            }
        }
    }

    // releasedate
    Element releaseDateElement = doc
            .getElementsByAttributeValue("href", "/title/" + options.getImdbId().toLowerCase() + "/releaseinfo")
            .first();
    if (releaseDateElement != null) {
        String releaseDateText = releaseDateElement.ownText();
        int startOfCountry = releaseDateText.indexOf("(");
        if (startOfCountry > 0) {
            releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim();
        }
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US);
            Date parsedDate = sdf.parse(releaseDateText);
            md.setReleaseDate(parsedDate);
        } catch (ParseException otherformat) {
            try {
                SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US);
                Date parsedDate = sdf.parse(releaseDateText);
                md.setReleaseDate(parsedDate);
            } catch (ParseException ignored) {
            }
        }
    }

    Elements elements = doc.getElementsByClass("ipl-zebra-list__label");
    for (Element element : elements) {
        // only parse tds
        if (!"td".equals(element.tag().getName())) {
            continue;
        }

        String elementText = element.ownText();

        if (elementText.equals("Taglines")) {
            if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) {
                Element taglineElement = element.nextElementSibling();
                if (taglineElement != null) {
                    String tagline = cleanString(taglineElement.ownText().replaceAll("", ""));
                    md.setTagline(tagline);
                }
            }
        }

        if (elementText.equals("Genres")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/");

                for (Element genreElement : genreElements) {
                    String genreText = genreElement.ownText();
                    md.addGenre(getTmmGenre(genreText));
                }
            }
        }

        /*
         * Old HTML, but maybe the same content formart <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition)
         * | 178 min (extended cut)</div></div>
         */
        if (elementText.equals("Runtime")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first();
                if (runtimeElement != null) {
                    String first = runtimeElement.ownText().split("\\|")[0];
                    String runtimeAsString = cleanString(first.replaceAll("min", ""));
                    int runtime = 0;
                    try {
                        runtime = Integer.parseInt(runtimeAsString);
                    } catch (Exception e) {
                        // try to filter out the first number we find
                        Pattern runtimePattern = Pattern.compile("([0-9]{2,3})");
                        Matcher matcher = runtimePattern.matcher(runtimeAsString);
                        if (matcher.find()) {
                            runtime = Integer.parseInt(matcher.group(0));
                        }
                    }
                    md.setRuntime(runtime);
                }
            }
        }

        if (elementText.equals("Country")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/");
                Pattern pattern = Pattern.compile("/country/(.*)");

                for (Element countryElement : countryElements) {
                    Matcher matcher = pattern.matcher(countryElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addCountry(LanguageUtils.getLocalizedCountryForLanguage(
                                    options.getLanguage().getLanguage(), countryElement.text(),
                                    matcher.group(1)));
                        } else {
                            md.addCountry(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Language")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements languageElements = nextElement.getElementsByAttributeValueStarting("href",
                        "/language/");
                Pattern pattern = Pattern.compile("/language/(.*)");

                for (Element languageElement : languageElements) {
                    Matcher matcher = pattern.matcher(languageElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString(
                                    options.getLanguage(), languageElement.text(), matcher.group(1)));
                        } else {
                            md.addSpokenLanguage(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Certification")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                String languageCode = options.getCountry().getAlpha2();
                Elements certificationElements = nextElement.getElementsByAttributeValueStarting("href",
                        "/search/title?certificates=" + languageCode);
                boolean done = false;
                for (Element certificationElement : certificationElements) {
                    String certText = certificationElement.ownText();
                    int startOfCert = certText.indexOf(":");
                    if (startOfCert > 0 && certText.length() > startOfCert + 1) {
                        certText = certText.substring(startOfCert + 1);
                    }

                    Certification certification = Certification.getCertification(options.getCountry(),
                            certText);
                    if (certification != null) {
                        md.addCertification(certification);
                        done = true;
                        break;
                    }
                }

                if (!done && languageCode.equals("DE")) {
                    certificationElements = nextElement.getElementsByAttributeValueStarting("href",
                            "/search/title?certificates=XWG");
                    for (Element certificationElement : certificationElements) {
                        String certText = certificationElement.ownText();
                        int startOfCert = certText.indexOf(":");
                        if (startOfCert > 0 && certText.length() > startOfCert + 1) {
                            certText = certText.substring(startOfCert + 1);
                        }

                        Certification certification = Certification.getCertification(options.getCountry(),
                                certText);
                        if (certification != null) {
                            md.addCertification(certification);
                            break;
                        }
                    }
                }

            }
        }
    }

    // director
    Element directorsElement = doc.getElementById("directors");
    while (directorsElement != null && directorsElement.tag().getName() != "header") {
        directorsElement = directorsElement.parent();
    }
    if (directorsElement != null) {
        directorsElement = directorsElement.nextElementSibling();
    }
    if (directorsElement != null) {
        for (Element directorElement : directorsElement.getElementsByClass("name")) {
            String director = directorElement.text().trim();

            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR);
            cm.setName(director);
            md.addCastMember(cm);
        }
    }

    // actors
    Element castTableElement = doc.getElementsByClass("cast_list").first();
    if (castTableElement != null) {
        Elements tr = castTableElement.getElementsByTag("tr");
        for (Element row : tr) {
            MediaCastMember cm = parseCastMember(row);
            if (cm != null && StringUtils.isNotEmpty(cm.getName())
                    && StringUtils.isNotEmpty(cm.getCharacter())) {
                cm.setType(MediaCastMember.CastType.ACTOR);
                md.addCastMember(cm);
            }
        }
    }

    // writers
    Element writersElement = doc.getElementById("writers");
    while (writersElement != null && writersElement.tag().getName() != "header") {
        writersElement = writersElement.parent();
    }
    if (writersElement != null) {
        writersElement = writersElement.nextElementSibling();
    }
    if (writersElement != null) {
        Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element writerElement : writersElements) {
            String writer = cleanString(writerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER);
            cm.setName(writer);
            md.addCastMember(cm);
        }
    }

    // producers
    Element producersElement = doc.getElementById("producers");
    while (producersElement != null && producersElement.tag().getName() != "header") {
        producersElement = producersElement.parent();
    }
    if (producersElement != null) {
        producersElement = producersElement.nextElementSibling();
    }
    if (producersElement != null) {
        Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element producerElement : producersElements) {
            String producer = cleanString(producerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER);
            cm.setName(producer);
            md.addCastMember(cm);
        }
    }

    // producers
    Elements prodCompHeaderElements = doc.getElementsByClass("ipl-list-title");
    Element prodCompHeaderElement = null;

    for (Element possibleProdCompHeaderEl : prodCompHeaderElements) {
        if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) {
            prodCompHeaderElement = possibleProdCompHeaderEl;
            break;
        }
    }

    while (prodCompHeaderElement != null && prodCompHeaderElement.tag().getName() != "header") {
        prodCompHeaderElement = prodCompHeaderElement.parent();
    }
    if (prodCompHeaderElement != null) {
        prodCompHeaderElement = prodCompHeaderElement.nextElementSibling();
    }
    if (prodCompHeaderElement != null) {
        Elements prodCompElements = prodCompHeaderElement.getElementsByAttributeValueStarting("href",
                "/company/");

        for (Element prodCompElement : prodCompElements) {
            String prodComp = prodCompElement.ownText();
            md.addProductionCompany(prodComp);
        }
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Extracts the plot from a plot summary page and stores it in the given metadata.
 * <p>
 * For {@link ImdbSiteDefinition#IMDB_COM} the first {@code ipl-zebra-list__item} inside the
 * {@code plot-summaries-content} element is used, for example:
 *
 * <pre>
 * &lt;li class="ipl-zebra-list__item" id="summary-ps21700000"&gt;
 *   &lt;p&gt;text text text text&lt;/p&gt;
 *   &lt;div class="author-container"&gt;...&lt;/div&gt;
 * &lt;/li&gt;
 * </pre>
 *
 * The first {@code author-container} is detached from that item before its text is read. The
 * synopsis section is deliberately not used because it contains spoilers. For every other site
 * the own text of the element with id {@code swiki.2.1} is taken.
 *
 * @param doc
 *          the parsed plot summary page
 * @param options
 *          the scrape options (not read by this method)
 * @param md
 *          the metadata to put the plot into
 * @return the same metadata instance that was passed in
 */
protected MediaMetadata parsePlotsummaryPage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    // every site except imdb.com keeps the plot in a wiki element
    if (getImdbSite() != ImdbSiteDefinition.IMDB_COM) {
        Element wikiElement = doc.getElementById("swiki.2.1");
        if (wikiElement != null) {
            md.setPlot(cleanString(wikiElement.ownText()));
        }
        return md;
    }

    // imdb.com: just take the first summary
    Element summariesContainer = doc.getElementById("plot-summaries-content");
    if (summariesContainer == null) {
        return md;
    }

    Elements summaryItems = summariesContainer.getElementsByClass("ipl-zebra-list__item");
    if (summaryItems.isEmpty()) {
        return md;
    }
    Element firstSummary = summaryItems.first();

    // drop the author block so it does not end up in the plot text
    Elements authorBlocks = firstSummary.getElementsByClass("author-container");
    if (!authorBlocks.isEmpty()) {
        authorBlocks.first().remove();
    }

    // the placeholder item shown when no summary exists must not become the plot
    if (!"no-summary-content".equals(firstSummary.id())) {
        md.setPlot(cleanString(firstSummary.text()));
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Builds a cast member from a single row of a cast table.
 * <p>
 * The name is the own text of the first element whose {@code itemprop} attribute starts with
 * {@code name}. The character comes from the first element with class {@code character}, with a
 * trailing parenthesised remark such as "(120 episodes, 2006-2014)" stripped. The image URL is
 * read from the {@code loadlate} attribute of the first {@code img} tag.
 *
 * @param row
 *          the table row to parse
 * @return the parsed cast member, or {@code null} if the row contains no name element
 */
protected MediaCastMember parseCastMember(Element row) {
    Element nameNode = row.getElementsByAttributeValueStarting("itemprop", "name").first();
    if (nameNode == null) {
        return null;
    }
    String actorName = cleanString(nameNode.ownText());

    String role = "";
    Element roleNode = row.getElementsByClass("character").first();
    if (roleNode != null) {
        // strip off trailing commentaries like - (120 episodes, 2006-2014)
        role = cleanString(roleNode.text()).replaceAll("\\(.*?\\)$", "").trim();
    }

    String imageUrl = "";
    Element imgNode = row.getElementsByTag("img").first();
    if (imgNode != null) {
        String lazySrc = imgNode.attr("loadlate");
        if (!StringUtils.isEmpty(lazySrc)) {
            imageUrl = lazySrc;

            // locate the rescale/crop params, which sit between "._" and the file extension
            int lastSlash = lazySrc.lastIndexOf("/");
            int paramsAt = lastSlash > 0 ? lazySrc.indexOf("._", lastSlash) : -1;
            if (paramsAt > 0) {
                int extensionAt = lazySrc.lastIndexOf(".");
                if (extensionAt > paramsAt) {
                    // rebuild the path - scaled to 632 px height as in tmdb scraper
                    imageUrl = lazySrc.substring(0, paramsAt) + "._UY632" + lazySrc.substring(extensionAt);
                }
            }
        }
    }

    MediaCastMember member = new MediaCastMember();
    member.setCharacter(role);
    member.setName(actorName);
    member.setImageUrl(imageUrl);
    return member;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java

/**
 * Parses the episode rows of an episode list page and appends them to the given list.
 * <p>
 * Rows are the {@code list_item} elements inside every {@code eplist} element. For
 * {@code season == 0} only rows whose text contains "Unknown" are taken and episode numbers are
 * assigned from a running counter; for any other season the numbers are read from a
 * "S&lt;n&gt;, Ep&lt;m&gt;" marker in the row text. Any exception raised while processing a row is
 * logged as a warning and that row is skipped.
 *
 * @param season
 *          the season being parsed; 0 selects the "Unknown" rows
 * @param episodes
 *          the list to which parsed episodes are added
 * @param doc
 *          the parsed episode list page
 * @return {@code false} as soon as a row reports a season different from the requested one
 *         (episodes added before that point stay in the list), {@code true} otherwise
 */
private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) {
    Pattern unknownPattern = Pattern.compile("Unknown");
    Pattern seasonEpisodePattern = Pattern.compile("S([0-9]*), Ep([0-9]*)");
    // running episode number, only used for season 0
    int episodeCounter = 0;

    // parse episodes
    Elements tables = doc.getElementsByClass("eplist");
    for (Element table : tables) {
        Elements rows = table.getElementsByClass("list_item");
        for (Element row : rows) {
            // season 0 rows are recognised by the "Unknown" marker, all others by "S.., Ep.."
            Matcher matcher = season == 0 ? unknownPattern.matcher(row.text())
                    : seasonEpisodePattern.matcher(row.text());
            if (matcher.find() && (season == 0 || matcher.groupCount() >= 2)) {
                try {
                    // we found a row containing episode data
                    MediaEpisode ep = new MediaEpisode(providerInfo.getId());

                    // parse season and ep number
                    if (season == 0) {
                        ep.season = season;
                        ep.episode = ++episodeCounter;
                    } else {
                        // an empty digit group makes parseInt throw; the catch below skips the row
                        ep.season = Integer.parseInt(matcher.group(1));
                        ep.episode = Integer.parseInt(matcher.group(2));
                    }

                    // check if we have still valid data
                    if (season > 0 && season != ep.season) {
                        return false;
                    }

                    // get ep title and id
                    Elements anchors = row.getElementsByAttributeValueStarting("href", "/title/tt");
                    for (Element anchor : anchors) {
                        // the title is the first anchor carrying itemprop="name"
                        if ("name".equals(anchor.attr("itemprop"))) {
                            ep.title = anchor.text();
                            break;
                        }
                    }

                    // the id is taken from the first title anchor; if there is none, get(0) throws
                    // and the row is skipped by the catch below
                    String id = "";
                    Matcher idMatcher = IMDB_ID_PATTERN.matcher(anchors.get(0).attr("href"));
                    while (idMatcher.find()) {
                        // the last non-null capture wins
                        if (idMatcher.group(1) != null) {
                            id = idMatcher.group(1);
                        }
                    }

                    if (StringUtils.isNotBlank(id)) {
                        ep.ids.put(providerInfo.getId(), id);
                    }

                    // plot
                    Element plot = row.getElementsByClass("item_description").first();
                    if (plot != null) {
                        ep.plot = plot.ownText();
                    }

                    // rating and rating count
                    Element ratingElement = row.getElementsByClass("ipl-rating-star__rating").first();
                    if (ratingElement != null) {
                        // accept a decimal comma as well as a decimal point
                        String ratingAsString = ratingElement.ownText().replace(",", ".");
                        try {
                            ep.rating = Float.valueOf(ratingAsString);
                        } catch (Exception ignored) {
                            // an unparsable rating leaves ep.rating untouched
                        }

                        Element votesElement = row.getElementsByClass("ipl-rating-star__total-votes").first();
                        if (votesElement != null) {
                            // remove separators and surrounding parentheses before parsing
                            String countAsString = votesElement.ownText().replaceAll("[.,()]", "").trim();
                            try {
                                ep.voteCount = Integer.parseInt(countAsString);
                            } catch (Exception ignored) {
                                // an unparsable vote count leaves ep.voteCount untouched
                            }
                        }
                    }

                    // release date
                    Element releaseDate = row.getElementsByClass("airdate").first();
                    if (releaseDate != null) {
                        ep.firstAired = releaseDate.ownText();
                    }

                    // poster
                    Element image = row.getElementsByTag("img").first();
                    if (image != null) {
                        String posterUrl = image.attr("src");
                        // remove the UX../UY../CR.. tokens from the URL
                        // NOTE(review): presumably these are resize/crop parameters - confirm
                        posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "");
                        posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "");
                        posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");

                        if (StringUtils.isNotBlank(posterUrl)) {
                            MediaArtwork ma = new MediaArtwork(ImdbMetadataProvider.providerInfo.getId(),
                                    MediaArtwork.MediaArtworkType.THUMB);
                            ma.setPreviewUrl(posterUrl);
                            ma.setDefaultUrl(posterUrl);
                            ep.artwork.add(ma);
                        }
                    }

                    episodes.add(ep);
                } catch (Exception e) {
                    // best effort: a broken row must not abort the whole list
                    LOGGER.warn("failed parsing: " + row.text() + " for ep data; " + e.getMessage());
                }
            }
        }
    }
    return true;
}

From source file:org.xlrnet.metadict.engines.nobordbok.OrdbokEngine.java

/**
 * Converts one row of the result table into a monolingual dictionary entry.
 * <p>
 * The general form is read from the first {@code oppslagsord} element and the word class from the
 * first {@code oppsgramordklasse} element. If either element is missing, a warning is logged and
 * the row is skipped. Meanings are collected from {@code .artikkelinnhold > .utvidet > .tyding},
 * falling back to {@code .artikkelinnhold > .utvidet} when no such element exists.
 *
 * @param tableRow
 *          the table row to process
 * @param language
 *          the language to set on the resulting dictionary object
 * @return the built entry, or an empty optional if the row lacks the main element or the word class
 */
@NotNull
private Optional<MonolingualEntry> processTableRow(@NotNull Element tableRow, @NotNull Language language) {
    MonolingualEntryBuilder entryBuilder = ImmutableMonolingualEntry.builder();
    DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder().setLanguage(language);

    // Extract general form
    Element oppslagsord = tableRow.getElementsByClass("oppslagsord").first();
    if (oppslagsord == null) {
        LOGGER.warn("Unable to find main element - skipping entry.");
        return Optional.empty();
    }
    extractGeneralForm(objectBuilder, oppslagsord);

    // Extract wordclass and determine entrytype; first() yields null when nothing matches, so
    // guard it the same way as the main element instead of failing with a NullPointerException
    Element wordClassElement = tableRow.getElementsByClass("oppsgramordklasse").first();
    if (wordClassElement == null) {
        LOGGER.warn("Unable to find word class element - skipping entry.");
        return Optional.empty();
    }
    entryBuilder.setEntryType(resolveEntryTypeWithWordClass(wordClassElement.text()));

    // Get meanings
    Elements meaningCandidates = tableRow.select(".artikkelinnhold > .utvidet > .tyding");
    if (meaningCandidates.isEmpty()) {
        meaningCandidates = tableRow.select(".artikkelinnhold > .utvidet");
    }

    for (Element candidate : meaningCandidates) {
        String meaning = candidate.childNodes().stream().filter(node -> {
            if (node instanceof TextNode) {
                return true;
            }
            if (!(node instanceof Element)) {
                // other node kinds (e.g. HTML comments) carry no meaning text and must not be
                // cast to Element
                return false;
            }
            Element child = (Element) node;
            return !child.hasClass("doemeliste") && !child.hasAttr("style") && !child.hasClass("utvidet")
                    && !child.hasClass("artikkelinnhold") && !child.hasClass("kompakt");
        }).map((Node n) -> {
            if (n instanceof Element) {
                return ((Element) n).text();
            }
            // text nodes are rendered as (escaped) HTML here and unescaped below
            return n.toString();
        }).collect(Collectors.joining());

        meaning = StringEscapeUtils.unescapeHtml4(meaning);
        meaning = StringUtils.strip(meaning);
        if (StringUtils.isNotBlank(meaning)) {
            objectBuilder.addMeaning(meaning);
        }
    }

    entryBuilder.setContent(objectBuilder.build());

    return Optional.of(entryBuilder.build());
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Reads the synonym table of a bilingual result page and adds one synonym entry to the result.
 * <p>
 * Only rows without a {@code th} cell are treated as synonym rows. Each synonym is built from the
 * first {@code CLASS_TRANSLATION} element of its row and grouped by its description, or by the
 * query string when it has no description. The entry title is the text of the first
 * {@code span.hl} element in the table. Nothing is added when the table has no synonym rows.
 *
 * @param queryString
 *          the original query, also used as fallback group name
 * @param synonymsTable
 *          the table element containing the synonyms
 * @param resultBuilder
 *          the builder that receives the assembled synonym entry
 * @param sourceLanguage
 *          the language of the synonyms
 */
private void extractBilingualSynonyms(@NotNull String queryString, @NotNull Element synonymsTable,
        @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage) {
    List<Element> dataRows = synonymsTable.select("tr").stream()
            .filter(tableRow -> tableRow.getElementsByTag("th").isEmpty())
            .collect(Collectors.toList());
    if (dataRows.isEmpty()) {
        LOGGER.debug("No synonym entries found");
        return;
    }

    String entryTitle = synonymsTable.select("span.hl").first().text();
    Map<String, SynonymGroupBuilder> buildersByGroup = new HashMap<>();

    for (Element dataRow : dataRows) {
        // Extract only information from the "from"-node (i.e. source language)
        Element sourceCell = dataRow.getElementsByClass(CLASS_TRANSLATION).get(0);
        DictionaryObject synonym = processSingleNode(sourceCell, sourceLanguage, queryString);

        String groupName = synonym.getDescription();
        if (groupName == null) {
            groupName = queryString;
        }
        if (groupName == null) {
            LOGGER.warn("Synonym group is null");
            continue;
        }

        buildersByGroup
                .computeIfAbsent(groupName, name -> ImmutableSynonymGroup.builder()
                        .setBaseMeaning(ImmutableDictionaryObject.createSimpleObject(sourceLanguage, name)))
                .addSynonym(synonym);
    }

    SynonymEntryBuilder entryBuilder = ImmutableSynonymEntry.builder()
            .setBaseObject(ImmutableDictionaryObject.createSimpleObject(sourceLanguage, entryTitle));
    buildersByGroup.values().forEach(groupBuilder -> entryBuilder.addSynonymGroup(groupBuilder.build()));

    resultBuilder.addSynonymEntry(entryBuilder.build());
}