Example usage for org.jsoup.nodes Document getElementsByClass

List of usage examples for org.jsoup.nodes Document getElementsByClass

Introduction

On this page you can find usage examples for the getElementsByClass method of org.jsoup.nodes.Document.

Prototype

public Elements getElementsByClass(String className) 

Source Link

Document

Find elements that have this class, including or under this element.

Usage

From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java

/**
 * Extracts the original title from IMDb's "release info" page and stores it in the given metadata.
 *
 * <p>
 * Two page layouts are tried: first the legacy {@code <table id="akas">} layout and - if the original title of the metadata is still empty
 * afterwards - the newer layout which marks the rows with the {@code aka-item} CSS class.
 *
 * @param doc
 *          the parsed release info page
 * @param options
 *          the scrape options (not read by this method)
 * @param md
 *          the metadata which receives the original title
 * @return the given metadata instance
 */
private MediaMetadata parseReleaseinfoPageAKAs(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    // <table id="akas" class="subpage_data spEven2Col">
    // <tr class="even">
    // <td>(original title)</td>
    // <td>Intouchables</td>
    // </tr>
    // need to search all tables for correct ID, since the UNIQUE id is used multiple times - thanks for nothing :p
    for (Element table : doc.getElementsByTag("table")) {
        if (!table.id().equalsIgnoreCase("akas")) {
            continue;
        }
        for (Element row : table.getElementsByTag("tr")) {
            Elements cells = row.getElementsByTag("td");
            // header rows or malformed rows do not carry a label/title pair; Elements#get would throw for them
            if (cells.size() < 2) {
                continue;
            }
            if (cells.get(0).text().toLowerCase(Locale.ROOT).contains("original title")) {
                md.setOriginalTitle(cells.get(1).text());
                break;
            }
        }
    }

    // alternative; new way with table classes
    // <tr class="ipl-zebra-list__item aka-item">
    // <td class="aka-item__name">Germany</td>
    // <td class="aka-item__title">Avatar - Aufbruch nach Pandora</td>
    // </tr>
    if (md.getOriginalTitle().isEmpty()) {
        for (Element row : doc.getElementsByClass("aka-item")) {
            Element country = row.getElementsByClass("aka-item__name").first();
            Element title = row.getElementsByClass("aka-item__title").first();
            // first() returns null when the cell is missing - skip incomplete rows instead of failing
            if (country == null || title == null) {
                continue;
            }
            if (country.text().toLowerCase(Locale.ROOT).contains("original title")) {
                md.setOriginalTitle(title.text());
                break;
            }
        }
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Searches IMDb for the given query and returns the found results.
 *
 * <p>
 * The IMDb id of the query is preferred as search term; the free text query is only used when no id is set. When neither is available, or
 * when fetching the search page fails, an empty list is returned.
 *
 * <p>
 * If the response contains a canonical link from which an IMDb id can be extracted and metadata with a title can be scraped for that id, a
 * single result with score 1 is returned. Otherwise the rows of the result list are parsed (at most 40), scored and sorted.
 *
 * @param query
 *          the search params
 * @return the found results
 * @throws Exception
 *           declared by the signature; see the called helpers (e.g. getMetadata) for possible causes
 */
protected List<MediaSearchResult> search(MediaSearchOptions query) throws Exception {
    List<MediaSearchResult> result = new ArrayList<>();

    /*
     * IMDb matches seem to come in several "flavours".
     * 
     * Firstly, if there is one exact match it returns the matching IMDb page.
     * 
     * If that fails to produce a unique hit then a list of possible matches is returned, categorised as: Popular Titles (Displaying ? Results),
     * Titles (Exact Matches) (Displaying ? Results), Titles (Partial Matches) (Displaying ? Results)
     * 
     * We should check the exact match section first, then the popular titles and finally the partial matches.
     * 
     * Note: even with exact matches there can be more than 1 hit, for example "Star Trek"
     */
    String searchTerm = "";

    // prefer the IMDb id as search term
    if (StringUtils.isNotEmpty(query.getImdbId())) {
        searchTerm = query.getImdbId();
    }

    // fall back to the free text query
    if (StringUtils.isEmpty(searchTerm)) {
        searchTerm = query.getQuery();
    }

    // nothing to search for
    if (StringUtils.isEmpty(searchTerm)) {
        return result;
    }

    // parse out language and country from the scraper query
    String language = query.getLanguage().getLanguage();
    int myear = query.getYear();
    String country = query.getCountry().getAlpha2(); // for passing the country to the scrape

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    // build the search url: <site>find?q=<encoded term><category>
    StringBuilder sb = new StringBuilder(getImdbSite().getSite());
    sb.append("find?q=");
    try {
        // the search site has always used UTF-8
        sb.append(URLEncoder.encode(searchTerm, "UTF-8"));
    } catch (UnsupportedEncodingException ex) {
        // Failed to encode the movie name for some reason! Append the raw term as a best effort
        getLogger().debug("Failed to encode search term: " + searchTerm);
        sb.append(searchTerm);
    }

    // we need to search for all - otherwise we do not find TV movies
    sb.append(getSearchCategory());

    getLogger().debug("========= BEGIN IMDB Scraper Search for: " + sb.toString());
    Document doc;
    try {
        Url url = new Url(sb.toString());
        url.addHeader("Accept-Language", getAcceptLanguage(language, country));
        doc = Jsoup.parse(url.getInputStream(), "UTF-8", "");
    } catch (Exception e) {
        // any failure while fetching/parsing the page ends the search with the (empty) result
        getLogger().debug("tried to fetch search response", e);
        return result;
    }

    // check if it was directly redirected to the site
    Elements elements = doc.getElementsByAttributeValue("rel", "canonical");
    for (Element element : elements) {
        MediaMetadata md = null;
        // we have been redirected to the movie site
        String movieName = null;
        String movieId = null;

        // extract the IMDb id from the canonical link; the last match wins
        String href = element.attr("href");
        Matcher matcher = IMDB_ID_PATTERN.matcher(href);
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                movieId = matcher.group(1);
            }
        }

        // get full information
        if (!StringUtils.isEmpty(movieId)) {
            MediaScrapeOptions options = new MediaScrapeOptions(type);
            options.setImdbId(movieId);
            options.setLanguage(query.getLanguage());
            options.setCountry(CountryCode.valueOf(country));
            md = getMetadata(options);
            if (!StringUtils.isEmpty(md.getTitle())) {
                movieName = md.getTitle();
            }
        }

        // if a movie name/id was found - return it
        // (movieName is only set after md has been assigned, so md is not null in here)
        if (StringUtils.isNotEmpty(movieName) && StringUtils.isNotEmpty(movieId)) {
            MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(),
                    query.getMediaType());
            sr.setTitle(movieName);
            sr.setIMDBId(movieId);
            sr.setYear(md.getYear());
            sr.setMetadata(md);
            sr.setScore(1);

            // and parse out the poster
            String posterUrl = "";
            Elements posters = doc.getElementsByClass("poster");
            if (posters != null && !posters.isEmpty()) {
                Elements imgs = posters.get(0).getElementsByTag("img");
                for (Element img : imgs) {
                    // rewrite the UX/UY size tokens to 200 and drop the CR token; the last img wins
                    posterUrl = img.attr("src");
                    posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_");
                    posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_");
                    posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
                }
            }
            if (StringUtils.isNotBlank(posterUrl)) {
                sr.setPosterUrl(posterUrl);
            }

            result.add(sr);
            return result;
        }
    }

    // parse results
    // elements = doc.getElementsByClass("result_text");
    elements = doc.getElementsByClass("findResult");
    for (Element tr : elements) {
        // we only want the tr's
        if (!"tr".equalsIgnoreCase(tr.tagName())) {
            continue;
        }

        // find the id / name
        String movieName = "";
        String movieId = "";
        int year = 0;
        Elements tds = tr.getElementsByClass("result_text");
        for (Element element : tds) {
            // we only want the td's
            if (!"td".equalsIgnoreCase(element.tagName())) {
                continue;
            }

            // filter out unwanted results
            Pattern unwantedSearchResultPattern = getUnwantedSearchResultPattern();
            if (unwantedSearchResultPattern != null) {
                Matcher matcher = unwantedSearchResultPattern.matcher(element.text());
                if (matcher.find()) {
                    continue;
                }
            }

            // is there a localized name? (aka)
            String localizedName = "";
            Elements italics = element.getElementsByTag("i");
            if (italics.size() > 0) {
                localizedName = italics.text().replace("\"", "");
            }

            // get the name inside the link; only the first anchor with a text is evaluated
            Elements anchors = element.getElementsByTag("a");
            for (Element a : anchors) {
                if (StringUtils.isNotEmpty(a.text())) {
                    // movie name
                    if (StringUtils.isNotBlank(localizedName) && !language.equals("en")) {
                        // take AKA as title, but only if not EN
                        movieName = localizedName;
                    } else {
                        movieName = a.text();
                    }

                    // parse id; the last match wins
                    String href = a.attr("href");
                    Matcher matcher = IMDB_ID_PATTERN.matcher(href);
                    while (matcher.find()) {
                        if (matcher.group(1) != null) {
                            movieId = matcher.group(1);
                        }
                    }

                    // try to parse out the year; the first "(dddd" occurrence in the cell text wins
                    Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)");
                    matcher = yearPattern.matcher(element.text());
                    while (matcher.find()) {
                        if (matcher.group(1) != null) {
                            try {
                                year = Integer.parseInt(matcher.group(1));
                                break;
                            } catch (Exception ignored) {
                            }
                        }
                    }
                    break;
                }
            }
        }

        // parse the poster image (done before the name/id check below)
        String posterUrl = "";
        tds = tr.getElementsByClass("primary_photo");
        for (Element element : tds) {
            Elements imgs = element.getElementsByTag("img");
            for (Element img : imgs) {
                // same url rewriting as for the direct hit above; the last img wins
                posterUrl = img.attr("src");
                posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "UX200_");
                posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "UY200_");
                posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");
            }
        }

        // if no movie name/id was found - continue
        if (StringUtils.isEmpty(movieName) || StringUtils.isEmpty(movieId)) {
            continue;
        }

        MediaSearchResult sr = new MediaSearchResult(ImdbMetadataProvider.providerInfo.getId(),
                query.getMediaType());
        sr.setTitle(movieName);
        sr.setIMDBId(movieId);
        sr.setYear(year);
        sr.setPosterUrl(posterUrl);

        if (movieId.equals(query.getImdbId())) {
            // perfect match
            sr.setScore(1);
        } else {
            // compare score based on names
            float score = MetadataUtil.calculateScore(searchTerm, movieName);
            if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) {
                getLogger().debug("no poster - downgrading score by 0.01");
                score = score - 0.01f;
            }
            if (yearDiffers(myear, year)) {
                // the penalty is 0.01 per year of difference
                float diff = (float) Math.abs(year - myear) / 100;
                getLogger()
                        .debug("parsed year does not match search result year - downgrading score by " + diff);
                score -= diff;
            }
            sr.setScore(score);
        }

        result.add(sr);

        // only get 40 results
        if (result.size() >= 40) {
            break;
        }
    }
    // natural ordering of MediaSearchResult, reversed afterwards
    // NOTE(review): presumably this puts the best score first - confirm against MediaSearchResult#compareTo
    Collections.sort(result);
    Collections.reverse(result);

    return result;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java

/**
 * Parses the IMDb "reference" page of a title and copies the data found there into the given metadata.
 *
 * <p>
 * The following data is read when it is present on the page: title, year, poster, rating and vote count, top 250 rank, release date, tagline
 * (only if the provider config value "useTmdb" is not set), genres, runtime, countries, spoken languages, certification, directors, actors,
 * writers, producers and production companies.
 *
 * @param doc
 *          the parsed reference page
 * @param options
 *          the scrape options; IMDb id, language and country are read from them
 * @param md
 *          the metadata to fill
 * @return the given metadata instance
 */
protected MediaMetadata parseReferencePage(Document doc, MediaScrapeOptions options, MediaMetadata md) {
    /*
     * title and year have the following structure
     * 
     * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span
     * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div>
     */

    // title: the meta content has the form "<title> (<year>)"; everything before the last "(" is the title
    Element title = doc.getElementsByAttributeValue("name", "title").first();
    if (title != null) {
        String movieTitle = cleanString(title.attr("content"));
        int yearStart = movieTitle.lastIndexOf("(");
        if (yearStart > 0) {
            movieTitle = movieTitle.substring(0, yearStart - 1).trim();
            md.setTitle(movieTitle);
        }
    }

    // original title and year
    Element originalTitleYear = doc.getElementsByAttributeValue("property", "og:title").first();
    if (originalTitleYear != null) {
        String content = originalTitleYear.attr("content");
        int startOfYear = content.lastIndexOf("(");
        if (startOfYear > 0) {
            // noo - this is NOT the original title!!! (seems always english?) parse from AKAs page...
            // String originalTitle = content.substring(0, startOfYear - 1).trim();
            // md.setOriginalTitle(originalTitle);

            String yearText = content.substring(startOfYear);

            // search year: the first 4 digit number starting with 1 or 2 wins
            Pattern yearPattern = Pattern.compile("[1-2][0-9]{3}");
            Matcher matcher = yearPattern.matcher(yearText);
            while (matcher.find()) {
                if (matcher.group(0) != null) {
                    String movieYear = matcher.group(0);
                    try {
                        md.setYear(Integer.parseInt(movieYear));
                        break;
                    } catch (Exception ignored) {
                    }
                }
            }
        }
    }

    // poster: strip the parameter part between the first "_" of the file name and the file extension
    Element poster = doc.getElementsByAttributeValue("property", "og:image").first();
    if (poster != null) {
        String posterUrl = poster.attr("content");

        int fileStart = posterUrl.lastIndexOf("/");
        if (fileStart > 0) {
            int parameterStart = posterUrl.indexOf("_", fileStart);
            if (parameterStart > 0) {
                int startOfExtension = posterUrl.lastIndexOf(".");
                if (startOfExtension > parameterStart) {
                    posterUrl = posterUrl.substring(0, parameterStart) + posterUrl.substring(startOfExtension);
                }
            }
        }
        processMediaArt(md, MediaArtwork.MediaArtworkType.POSTER, posterUrl);
    }

    /*
     * <div class="starbar-meta"> <b>7.4/10</b> &nbsp;&nbsp;<a href="ratings" class="tn15more">52,871 votes</a>&nbsp;&raquo; </div>
     */

    // rating and rating count (the vote count is only looked up when a rating element exists)
    Element ratingElement = doc.getElementsByClass("ipl-rating-star__rating").first();
    if (ratingElement != null) {
        String ratingAsString = ratingElement.ownText().replace(",", ".");
        try {
            md.setRating(Float.valueOf(ratingAsString));
        } catch (Exception ignored) {
        }

        Element votesElement = doc.getElementsByClass("ipl-rating-star__total-votes").first();
        if (votesElement != null) {
            // remove separators, brackets and non breaking spaces before parsing the number
            String countAsString = votesElement.ownText().replaceAll("[.,()\\u00a0]", "").trim();
            try {
                md.setVoteCount(Integer.parseInt(countAsString));
            } catch (Exception ignored) {
            }
        }
    }

    // top250
    Element topRatedElement = doc.getElementsByAttributeValue("href", "/chart/top").first();
    if (topRatedElement != null) {
        Pattern topPattern = Pattern.compile("Top Rated Movies: #([0-9]{1,3})");
        Matcher matcher = topPattern.matcher(topRatedElement.ownText());
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                try {
                    String top250Text = matcher.group(1);
                    md.setTop250(Integer.parseInt(top250Text));
                } catch (Exception ignored) {
                }
            }
        }
    }

    // releasedate; Locale.ROOT keeps the lower casing of the id independent of the default locale
    Element releaseDateElement = doc
            .getElementsByAttributeValue("href",
                    "/title/" + options.getImdbId().toLowerCase(Locale.ROOT) + "/releaseinfo")
            .first();
    if (releaseDateElement != null) {
        String releaseDateText = releaseDateElement.ownText();
        // cut off a trailing "(<country>)"
        int startOfCountry = releaseDateText.indexOf("(");
        if (startOfCountry > 0) {
            releaseDateText = releaseDateText.substring(0, startOfCountry - 1).trim();
        }
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US);
            Date parsedDate = sdf.parse(releaseDateText);
            md.setReleaseDate(parsedDate);
        } catch (ParseException otherformat) {
            // second try: the date without a day
            try {
                SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US);
                Date parsedDate = sdf.parse(releaseDateText);
                md.setReleaseDate(parsedDate);
            } catch (ParseException ignored) {
            }
        }
    }

    // the details table: every label cell is followed by a sibling cell containing the values
    Elements elements = doc.getElementsByClass("ipl-zebra-list__label");
    for (Element element : elements) {
        // only parse tds
        if (!"td".equals(element.tag().getName())) {
            continue;
        }

        String elementText = element.ownText();

        if (elementText.equals("Taglines")) {
            if (!ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb")) {
                Element taglineElement = element.nextElementSibling();
                if (taglineElement != null) {
                    // the former replaceAll("", "") did not change the text and has been dropped
                    String tagline = cleanString(taglineElement.ownText());
                    md.setTagline(tagline);
                }
            }
        }

        if (elementText.equals("Genres")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements genreElements = nextElement.getElementsByAttributeValueStarting("href", "/genre/");

                for (Element genreElement : genreElements) {
                    String genreText = genreElement.ownText();
                    md.addGenre(getTmmGenre(genreText));
                }
            }
        }

        /*
         * Old HTML, but maybe the same content format <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special
         * edition) | 178 min (extended cut)</div></div>
         */
        if (elementText.equals("Runtime")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Element runtimeElement = nextElement.getElementsByClass("ipl-inline-list__item").first();
                if (runtimeElement != null) {
                    // only the first of several "|" separated runtimes is used
                    String first = runtimeElement.ownText().split("\\|")[0];
                    String runtimeAsString = cleanString(first.replaceAll("min", ""));
                    int runtime = 0;
                    try {
                        runtime = Integer.parseInt(runtimeAsString);
                    } catch (Exception e) {
                        // try to filter out the first number we find
                        Pattern runtimePattern = Pattern.compile("([0-9]{2,3})");
                        Matcher matcher = runtimePattern.matcher(runtimeAsString);
                        if (matcher.find()) {
                            runtime = Integer.parseInt(matcher.group(0));
                        }
                    }
                    md.setRuntime(runtime);
                }
            }
        }

        if (elementText.equals("Country")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements countryElements = nextElement.getElementsByAttributeValueStarting("href", "/country/");
                Pattern pattern = Pattern.compile("/country/(.*)");

                for (Element countryElement : countryElements) {
                    Matcher matcher = pattern.matcher(countryElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addCountry(LanguageUtils.getLocalizedCountryForLanguage(
                                    options.getLanguage().getLanguage(), countryElement.text(),
                                    matcher.group(1)));
                        } else {
                            md.addCountry(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Language")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                Elements languageElements = nextElement.getElementsByAttributeValueStarting("href",
                        "/language/");
                Pattern pattern = Pattern.compile("/language/(.*)");

                for (Element languageElement : languageElements) {
                    Matcher matcher = pattern.matcher(languageElement.attr("href"));
                    if (matcher.matches()) {
                        if (ImdbMetadataProvider.providerInfo.getConfig()
                                .getValueAsBool("scrapeLanguageNames")) {
                            md.addSpokenLanguage(LanguageUtils.getLocalizedLanguageNameFromLocalizedString(
                                    options.getLanguage(), languageElement.text(), matcher.group(1)));
                        } else {
                            md.addSpokenLanguage(matcher.group(1));
                        }
                    }
                }
            }
        }

        if (elementText.equals("Certification")) {
            Element nextElement = element.nextElementSibling();
            if (nextElement != null) {
                String languageCode = options.getCountry().getAlpha2();
                boolean done = addFirstCertification(nextElement.getElementsByAttributeValueStarting("href",
                        "/search/title?certificates=" + languageCode), options, md);

                // for DE a second lookup with the certificate code XWG is done when nothing was found
                if (!done && languageCode.equals("DE")) {
                    addFirstCertification(nextElement.getElementsByAttributeValueStarting("href",
                            "/search/title?certificates=XWG"), options, md);
                }
            }
        }
    }

    // director
    Element directorsElement = findSectionAfterHeader(doc.getElementById("directors"));
    if (directorsElement != null) {
        for (Element directorElement : directorsElement.getElementsByClass("name")) {
            String director = directorElement.text().trim();

            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR);
            cm.setName(director);
            md.addCastMember(cm);
        }
    }

    // actors: only rows providing both a name and a character are taken
    Element castTableElement = doc.getElementsByClass("cast_list").first();
    if (castTableElement != null) {
        Elements tr = castTableElement.getElementsByTag("tr");
        for (Element row : tr) {
            MediaCastMember cm = parseCastMember(row);
            if (cm != null && StringUtils.isNotEmpty(cm.getName())
                    && StringUtils.isNotEmpty(cm.getCharacter())) {
                cm.setType(MediaCastMember.CastType.ACTOR);
                md.addCastMember(cm);
            }
        }
    }

    // writers
    Element writersElement = findSectionAfterHeader(doc.getElementById("writers"));
    if (writersElement != null) {
        Elements writersElements = writersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element writerElement : writersElements) {
            String writer = cleanString(writerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER);
            cm.setName(writer);
            md.addCastMember(cm);
        }
    }

    // producers
    Element producersElement = findSectionAfterHeader(doc.getElementById("producers"));
    if (producersElement != null) {
        Elements producersElements = producersElement.getElementsByAttributeValueStarting("href", "/name/");

        for (Element producerElement : producersElements) {
            String producer = cleanString(producerElement.ownText());
            MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.PRODUCER);
            cm.setName(producer);
            md.addCastMember(cm);
        }
    }

    // production companies: this section has no id, so its title element is searched by text
    Element prodCompHeaderElement = null;
    for (Element possibleProdCompHeaderEl : doc.getElementsByClass("ipl-list-title")) {
        if (possibleProdCompHeaderEl.ownText().equals("Production Companies")) {
            prodCompHeaderElement = possibleProdCompHeaderEl;
            break;
        }
    }

    Element prodCompSection = findSectionAfterHeader(prodCompHeaderElement);
    if (prodCompSection != null) {
        Elements prodCompElements = prodCompSection.getElementsByAttributeValueStarting("href", "/company/");

        for (Element prodCompElement : prodCompElements) {
            String prodComp = prodCompElement.ownText();
            md.addProductionCompany(prodComp);
        }
    }

    return md;
}

/**
 * Walks up from the given element to the closest {@code header} element (the element itself included) and returns the element following it.
 *
 * @param start
 *          the element to start from; may be {@code null}
 * @return the next sibling element of the found header, or {@code null} if there is no header or no following sibling
 */
private Element findSectionAfterHeader(Element start) {
    Element current = start;
    // compare tag names with equals(); a reference comparison only works by accident for interned strings
    while (current != null && !"header".equals(current.tag().getName())) {
        current = current.parent();
    }
    return current != null ? current.nextElementSibling() : null;
}

/**
 * Adds the first certification which can be resolved from the given certificate links to the metadata.
 *
 * @param certificationElements
 *          the certificate links; the text after the first ":" of their own text is used as certification text
 * @param options
 *          the scrape options providing the country for the certification lookup
 * @param md
 *          the metadata to add the certification to
 * @return {@code true} if a certification has been added
 */
private boolean addFirstCertification(Elements certificationElements, MediaScrapeOptions options,
        MediaMetadata md) {
    for (Element certificationElement : certificationElements) {
        String certText = certificationElement.ownText();
        int startOfCert = certText.indexOf(":");
        if (startOfCert > 0 && certText.length() > startOfCert + 1) {
            certText = certText.substring(startOfCert + 1);
        }

        Certification certification = Certification.getCertification(options.getCountry(), certText);
        if (certification != null) {
            md.addCertification(certification);
            return true;
        }
    }
    return false;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java

/**
 * get the episode metadata./*www.ja va2  s  .c om*/
 * 
 * @param options
 *          the scrape options
 * @return the MediaMetaData
 * @throws Exception
 */
MediaMetadata getEpisodeMetadata(MediaScrapeOptions options) throws Exception {
    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    String imdbId = options.getImdbId();
    if (StringUtils.isBlank(imdbId)) {
        return md;
    }

    // get episode number and season number
    int seasonNr = -1;
    int episodeNr = -1;

    try {
        seasonNr = Integer.parseInt(options.getId(MediaMetadata.SEASON_NR));
        episodeNr = Integer.parseInt(options.getId(MediaMetadata.EPISODE_NR));
    } catch (Exception e) {
        LOGGER.warn("error parsing season/episode number");
    }

    if (seasonNr == -1 || episodeNr == -1) {
        return md;
    }

    // first get the base episode metadata which can be gathered via
    // getEpisodeList()
    List<MediaEpisode> episodes = getEpisodeList(options);

    MediaEpisode wantedEpisode = null;
    for (MediaEpisode episode : episodes) {
        if (episode.season == seasonNr && episode.episode == episodeNr) {
            wantedEpisode = episode;
            break;
        }
    }

    // we did not find the episode; return
    if (wantedEpisode == null) {
        return md;
    }

    md.setId(providerInfo.getId(), wantedEpisode.ids.get(providerInfo.getId()));
    md.setEpisodeNumber(wantedEpisode.episode);
    md.setSeasonNumber(wantedEpisode.season);
    md.setTitle(wantedEpisode.title);
    md.setPlot(wantedEpisode.plot);
    md.setRating(wantedEpisode.rating);
    md.setVoteCount(wantedEpisode.voteCount);

    try {
        SimpleDateFormat sdf = new SimpleDateFormat("d MMM. yyyy", Locale.US);
        md.setReleaseDate(sdf.parse(wantedEpisode.firstAired));
    } catch (ParseException e) {
        try {
            SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy", Locale.US); // without "dot" - "May" for example
            md.setReleaseDate(sdf.parse(wantedEpisode.firstAired));
        } catch (ParseException ign) {
            LOGGER.warn("Could not parse date format: {}", wantedEpisode.firstAired);
        }
    }

    // and finally the cast which needed to be fetched from the fullcredits page
    if (wantedEpisode.ids.get(providerInfo.getId()) instanceof String
            && StringUtils.isNotBlank((String) wantedEpisode.ids.get(providerInfo.getId()))) {
        Url url = new Url(
                imdbSite.getSite() + "/title/" + wantedEpisode.ids.get(providerInfo.getId()) + "/fullcredits");
        url.addHeader("Accept-Language", "en"); // force EN for parsing by HTMl texts
        Document doc = Jsoup.parse(url.getInputStream(), imdbSite.getCharset().displayName(), "");

        // director & writer
        Element fullcredits = doc.getElementById("fullcredits_content");
        if (fullcredits != null) {
            Elements tables = fullcredits.getElementsByTag("table");

            // first table are directors
            if (tables.get(0) != null) {
                for (Element director : tables.get(0).getElementsByClass("name")) {
                    MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.DIRECTOR);
                    cm.setName(director.text());
                    md.addCastMember(cm);
                }
            }

            // second table are writers
            if (tables.get(1) != null) {
                for (Element writer : tables.get(1).getElementsByClass("name")) {
                    MediaCastMember cm = new MediaCastMember(MediaCastMember.CastType.WRITER);
                    cm.setName(writer.text());
                    md.addCastMember(cm);
                }
            }
        }

        // actors
        Element castTableElement = doc.getElementsByClass("cast_list").first();
        if (castTableElement != null) {
            Elements tr = castTableElement.getElementsByTag("tr");
            for (Element row : tr) {
                MediaCastMember cm = parseCastMember(row);
                if (cm != null && StringUtils.isNotEmpty(cm.getName())
                        && StringUtils.isNotEmpty(cm.getCharacter())) {
                    cm.setType(MediaCastMember.CastType.ACTOR);
                    md.addCastMember(cm);
                }
            }
        }
    }

    return md;
}

From source file:org.tinymediamanager.scraper.imdb.ImdbTvShowParser.java

/**
 * Reads the episode rows out of the given episode-list document and appends one {@link MediaEpisode} per
 * recognised row to {@code episodes}.
 * <p>
 * Rows are the elements of class {@code list_item} below elements of class {@code eplist}. When {@code season} is
 * 0, rows whose text contains "Unknown" are taken and numbered sequentially; otherwise rows whose text contains
 * "S&lt;n&gt;, Ep&lt;n&gt;" are taken and season/episode are parsed from that text. A row that fails to parse is
 * logged and skipped.
 *
 * @param season
 *          the season being parsed (0 selects the "Unknown" rows)
 * @param episodes
 *          the list receiving the parsed episodes
 * @param doc
 *          the parsed episode-list page
 * @return {@code false} as soon as a row reports a season different from the requested (positive) season,
 *         {@code true} once all rows have been visited
 */
private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) {
    final Pattern unnumberedRow = Pattern.compile("Unknown");
    final Pattern numberedRow = Pattern.compile("S([0-9]*), Ep([0-9]*)");
    final boolean unknownSeason = season == 0;
    int unnumberedCount = 0;

    for (Element listing : doc.getElementsByClass("eplist")) {
        for (Element item : listing.getElementsByClass("list_item")) {
            Pattern rowPattern = unknownSeason ? unnumberedRow : numberedRow;
            Matcher rowMatcher = rowPattern.matcher(item.text());

            // skip everything which does not look like an episode row
            if (!rowMatcher.find()) {
                continue;
            }
            if (!unknownSeason && rowMatcher.groupCount() < 2) {
                continue;
            }

            try {
                MediaEpisode episode = new MediaEpisode(providerInfo.getId());

                // season and episode number
                if (unknownSeason) {
                    episode.season = season;
                    unnumberedCount++;
                    episode.episode = unnumberedCount;
                } else {
                    episode.season = Integer.parseInt(rowMatcher.group(1));
                    episode.episode = Integer.parseInt(rowMatcher.group(2));
                }

                // the page started to list another season -> stop here
                if (season > 0 && season != episode.season) {
                    return false;
                }

                // title: the first title link flagged with itemprop="name"
                Elements titleLinks = item.getElementsByAttributeValueStarting("href", "/title/tt");
                for (Element link : titleLinks) {
                    if ("name".equals(link.attr("itemprop"))) {
                        episode.title = link.text();
                        break;
                    }
                }

                // id: last non-null match of the id pattern in the first title link
                String imdbId = "";
                Matcher idMatcher = IMDB_ID_PATTERN.matcher(titleLinks.get(0).attr("href"));
                while (idMatcher.find()) {
                    String candidate = idMatcher.group(1);
                    if (candidate != null) {
                        imdbId = candidate;
                    }
                }
                if (StringUtils.isNotBlank(imdbId)) {
                    episode.ids.put(providerInfo.getId(), imdbId);
                }

                // plot
                Element description = item.getElementsByClass("item_description").first();
                if (description != null) {
                    episode.plot = description.ownText();
                }

                // rating; the vote count is only looked up when a rating element exists
                Element stars = item.getElementsByClass("ipl-rating-star__rating").first();
                if (stars != null) {
                    String ratingText = stars.ownText().replace(",", ".");
                    try {
                        episode.rating = Float.valueOf(ratingText);
                    } catch (Exception ignored) {
                    }

                    Element votes = item.getElementsByClass("ipl-rating-star__total-votes").first();
                    if (votes != null) {
                        String votesText = votes.ownText().replaceAll("[.,()]", "").trim();
                        try {
                            episode.voteCount = Integer.parseInt(votesText);
                        } catch (Exception ignored) {
                        }
                    }
                }

                // release date
                Element airdate = item.getElementsByClass("airdate").first();
                if (airdate != null) {
                    episode.firstAired = airdate.ownText();
                }

                // thumb: strip the scaling/cropping directives from the image url
                Element thumb = item.getElementsByTag("img").first();
                if (thumb != null) {
                    String thumbUrl = thumb.attr("src").replaceAll("UX[0-9]{2,4}_", "")
                            .replaceAll("UY[0-9]{2,4}_", "")
                            .replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");

                    if (StringUtils.isNotBlank(thumbUrl)) {
                        MediaArtwork artwork = new MediaArtwork(ImdbMetadataProvider.providerInfo.getId(),
                                MediaArtwork.MediaArtworkType.THUMB);
                        artwork.setPreviewUrl(thumbUrl);
                        artwork.setDefaultUrl(thumbUrl);
                        episode.artwork.add(artwork);
                    }
                }

                episodes.add(episode);
            } catch (Exception e) {
                LOGGER.warn("failed parsing: " + item.text() + " for ep data; " + e.getMessage());
            }
        }
    }
    return true;
}

From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java

/**
 * Scrapes the metadata of a movie from the ofdb.de detail pages.
 * <p>
 * The detail url is resolved from (a) the ofdb id in the options, (b) an ofdb search by the IMDB id in the options
 * or (c) the search result attached to the options. From the detail page the IMDB id, title, year, original title,
 * genres and rating are read; the plot and the cast are fetched from two further pages. Failures on those two
 * further pages are logged and do not abort the scrape.
 *
 * @param options
 *          the scrape options carrying the ids and/or a previous search result
 * @return the metadata gathered from the pages
 * @throws Exception
 *           if the media type is not MOVIE, if no detail url could be determined, or if fetching/parsing the main
 *           detail page fails
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());

    if (options.getType() != MediaType.MOVIE) {
        throw new UnsupportedMediaTypeException(options.getType());
    }

    // we have 3 entry points here
    // a) getMetadata has been called with an ofdbId
    // b) getMetadata has been called with an imdbId
    // c) getMetadata has been called from a previous search

    String detailUrl = "";

    // case a) and c)
    if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) {
        detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId());
    } else if (options.getResult() != null) {
        detailUrl = options.getResult().getUrl();
    }

    // case b)
    if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) {
        MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE);
        searchOptions.setImdbId(options.getId(MediaMetadata.IMDB));
        try {
            List<MediaSearchResult> results = search(searchOptions);
            if (results != null && !results.isEmpty()) {
                options.setResult(results.get(0));
                detailUrl = options.getResult().getUrl();
            }
        } catch (Exception e) {
            LOGGER.warn("failed IMDB search: " + e.getMessage());
        }
    }

    // we can only work further if we got a search result on ofdb.de
    if (StringUtils.isBlank(detailUrl)) {
        throw new Exception("We did not get any useful movie url");
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    // generic Elements used all over
    Elements el;
    String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),");
    if (StringUtils.isBlank(ofdbId)) {
        ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)");
    }

    try {
        LOGGER.trace("get details page");
        Document doc;
        // try-with-resources: the stream is closed even when parsing fails
        try (InputStream in = new Url(detailUrl).getInputStream()) {
            doc = Jsoup.parse(in, "UTF-8", "");
        }

        if (doc.getAllElements().size() < 10) {
            throw new Exception("meh - we did not receive a valid web page");
        }

        // parse details

        // IMDB ID "http://www.imdb.com/Title?1194173"
        el = doc.getElementsByAttributeValueContaining("href", "imdb.com");
        if (!el.isEmpty()) {
            String imdbNumber = StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)");
            // do not store a bare "tt" when the link carries no number
            // NOTE(review): assumes substr yields a blank result on no match (the ofdbId lookup above relies on
            // the same) - confirm
            if (StringUtils.isNotBlank(imdbNumber)) {
                md.setId(MediaMetadata.IMDB, "tt" + imdbNumber);
            }
        }

        // title / year
        // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" />
        el = doc.getElementsByAttributeValue("property", "og:title");
        if (!el.isEmpty()) {
            String[] ty = parseTitle(el.first().attr("content"));
            md.setTitle(StrgUtils.removeCommonSortableName(ty[0]));
            try {
                md.setYear(Integer.parseInt(ty[1]));
            } catch (Exception ignored) {
                // no usable year in the title; the fallback below is tried instead
            }
        }
        // another year position
        if (md.getYear() == 0) {
            // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a>
            el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr");
            if (!el.isEmpty()) {
                try {
                    md.setYear(Integer.parseInt(el.first().text()));
                } catch (NumberFormatException ignored) {
                    // link text is not a plain number; leave the year unset
                }
            }
        }

        // original title (has to be searched with a regexp)
        // <tr valign="top">
        // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif"
        // size="2">Originaltitel:</font></td>
        // <td>&nbsp;&nbsp;</td>
        // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif"
        // size="2"><b>Brave</b></font></td>
        // </tr>
        String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>");
        if (!originalTitle.isEmpty()) {
            md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle));
        }

        // Genre: <a href="view.php?page=genre&Genre=Action">Action</a>
        el = doc.getElementsByAttributeValueContaining("href", "page=genre");
        for (Element g : el) {
            md.addGenre(getTmmGenre(g.text()));
        }

        // rating
        // <div itemtype="http://schema.org/AggregateRating" itemscope
        // itemprop="aggregateRating">Note: <span
        // itemprop="ratingValue">6.73</span><meta
        // itemprop="worstRating" content="1" />
        el = doc.getElementsByAttributeValue("itemprop", "ratingValue");
        if (!el.isEmpty()) {
            String r = el.text();
            if (!r.isEmpty()) {
                try {
                    md.setRating(Float.parseFloat(r));
                } catch (Exception e) {
                    LOGGER.debug("could not parse rating");
                }
            }
        }

        // get PlotLink; open url and parse
        // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a>
        LOGGER.trace("parse plot");
        el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,");
        if (!el.isEmpty()) {
            String plotUrl = BASE_URL + "/" + el.first().attr("href");
            try {
                Document plotDoc;
                try (InputStream in = new Url(plotUrl).getInputStream()) {
                    plotDoc = Jsoup.parse(in, "UTF-8", "");
                }
                // first Blocksatz is plot
                Element plotBlock = plotDoc.getElementsByClass("Blocksatz").first();
                if (plotBlock != null) {
                    String p = plotBlock.text(); // remove all html stuff
                    // remove the "header" only when it is really there; otherwise the start of the plot would be
                    // cut off
                    int header = p.indexOf("Mal gelesen");
                    if (header >= 0) {
                        p = p.substring(Math.min(header + 12, p.length()));
                    }
                    md.setPlot(p);
                }
            } catch (Exception e) {
                LOGGER.error("failed to get plot page: " + e.getMessage());
            }
        }

        // http://www.ofdb.de/view.php?page=film_detail&fid=226745
        LOGGER.debug("parse actor detail");
        String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId;
        Document castDoc = null;
        try (InputStream in = new Url(movieDetail).getInputStream()) {
            castDoc = Jsoup.parse(in, "UTF-8", "");
        } catch (Exception e) {
            LOGGER.error("failed to get detail page: " + e.getMessage());
        }

        if (castDoc != null) {
            parseCast(castDoc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Synchronstimme (deutsch)"),
                    MediaCastMember.CastType.ACTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Drehbuchautor(in)"), MediaCastMember.CastType.WRITER,
                    md);
            parseCast(castDoc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER, md);
        }
    } catch (Exception e) {
        LOGGER.error("Error parsing " + detailUrl);
        throw e;
    }

    return md;
}

From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java

/**
 * Collects the trailers listed on the ofdb.de detail page of the movie identified by the IMDB id in the options.
 * <p>
 * The movie is looked up via an IMDB-id search; the first hit's detail page is then scanned for trailers in two
 * markup flavours ("old style" JavaScript return strings and "new style" {@code clips} blocks). Every link found
 * yields one trailer with provider "filmtrailer".
 *
 * @param options
 *          the scrape options; only the IMDB id is used
 * @return the trailers found; empty when the IMDB id is invalid or the search has no hit
 * @throws Exception
 *           if fetching or parsing one of the pages fails
 */
@Override
public List<MediaTrailer> getTrailers(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getTrailers() " + options.toString());
    List<MediaTrailer> trailers = new ArrayList<>();
    if (!MetadataUtil.isValidImdbId(options.getImdbId())) {
        LOGGER.debug("IMDB id not found");
        return trailers;
    }

    Url url = null;
    String searchString = BASE_URL + "/view.php?page=suchergebnis&Kat=IMDb&SText=" + options.getImdbId();
    try {
        // search with IMDB
        url = new Url(searchString);
        Document doc;
        // try-with-resources: the stream is closed even when parsing fails
        try (InputStream in = url.getInputStream()) {
            doc = Jsoup.parse(in, "UTF-8", "");
        }
        Elements filme = doc.getElementsByAttributeValueMatching("href", "film\\/\\d+,");
        if (filme.isEmpty()) {
            LOGGER.debug("found no search results");
            return trailers;
        }
        LOGGER.debug("found " + filme.size() + " search results"); // hopefully only one

        LOGGER.debug("get (trailer) details page");
        url = new Url(BASE_URL + "/" + StrgUtils.substr(filme.first().toString(), "href=\\\"(.*?)\\\""));
        try (InputStream in = url.getInputStream()) {
            doc = Jsoup.parse(in, "UTF-8", "");
        }

        // url + format; the same link pattern serves both markup flavours, so compile it once
        Pattern linkPattern = Pattern.compile("<a href=\"(.*?)\">(.*?)</a>");

        // OLD STYLE
        // the page embeds a JavaScript function getTrailerData(ci) with one case per clip, e.g.
        // case 'http://de.clip-1.filmtrailer.com/9507_31566_a_5.flv?log_var=72|491100001-1|-' : return
        // '<b>Trailer 1</b><br><i>(xxlarge)</i><br><br>&raquo; 640px<br><br>Download:<br>&raquo; <a href=
        // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.wmv?log_var=72|491100001-1|-" >wmv</a><br>&raquo; <a href=
        // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.mp4?log_var=72|491100001-1|-" >mp4</a><br>&raquo; <a href=
        // "http://de.clip-1.filmtrailer.com/9507_31566_a_5.webm?log_var=72|491100001-1|-" >webm</a><br>';
        // (sizes small/medium/large/xlarge/xxlarge = 160/240/320/400/640px, repeated per trailer)
        Matcher m = Pattern.compile("return '(.*?)';").matcher(doc.toString());
        while (m.find()) {
            String s = m.group(1);
            String tname = StrgUtils.substr(s, "<b>(.*?)</b>");
            String tpix = StrgUtils.substr(s, "raquo; (.*?)x<br>");

            Matcher lm = linkPattern.matcher(s);
            while (lm.find()) {
                MediaTrailer trailer = createFilmtrailer(tname, tpix, lm.group(1));
                LOGGER.debug(trailer.toString());
                trailers.add(trailer);
            }
        }

        // NEW STYLE (additional!)
        // <div class="clips" id="clips2" style="display: none;">
        // <img src="images/flag_de.gif" align="left" vspace="3" width="18" height="12">&nbsp;
        // <img src="images/trailer_6.gif" align="top" vspace="1" width="16" height="16" alt="freigegeben ab 6 Jahren">&nbsp;
        // <i>Trailer 1:</i>
        // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_1.flv?log_var=67|491100001-1|-">&nbsp;small&nbsp;</a> &nbsp;
        // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_2.flv?log_var=67|491100001-1|-">&nbsp;medium&nbsp;</a> &nbsp;
        // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_3.flv?log_var=67|491100001-1|-">&nbsp;large&nbsp;</a> &nbsp;
        // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_4.flv?log_var=67|491100001-1|-">&nbsp;xlarge&nbsp;</a> &nbsp;
        // <a href="http://de.clip-1.filmtrailer.com/2845_6584_a_5.flv?log_var=67|491100001-1|-">&nbsp;xxlarge&nbsp;</a> &nbsp;
        // <br>
        // ... (the img/i/a/br group repeats for "Trailer 2:", "Trailer 3:", ...)
        // <br>
        // </div>

        // get them as single trailer line
        m = Pattern.compile("<i>(.*?)</i>(.*?)<br>", Pattern.DOTALL)
                .matcher(doc.getElementsByClass("clips").html());
        while (m.find()) {
            // parse each line with 5 qualities
            String tname = m.group(1).trim().replaceFirst(":$", ""); // replace ending colon

            Matcher lm = linkPattern.matcher(m.group(2));
            while (lm.find()) {
                String tformat = lm.group(2).replaceAll("&nbsp;", "").trim();
                MediaTrailer trailer = createFilmtrailer(tname, pixelHeightForSize(tformat), lm.group(1));
                LOGGER.debug(trailer.toString());
                trailers.add(trailer);
            }
        }
    } catch (Exception e) {
        if (url != null) {
            LOGGER.error("Error parsing {}", url.toString());
        } else {
            LOGGER.error("Error parsing {}", searchString);
        }

        throw e;
    }
    return trailers;
}

/** Builds a trailer of provider "filmtrailer" with the given name, quality and url. */
private MediaTrailer createFilmtrailer(String name, String quality, String trailerUrl) {
    MediaTrailer trailer = new MediaTrailer();
    trailer.setName(name);
    trailer.setQuality(quality);
    trailer.setProvider("filmtrailer");
    trailer.setUrl(trailerUrl);
    return trailer;
}

/**
 * Maps a "new style" size name to its quality label; unknown names map to the empty string.
 * <p>
 * new style sizes: 1 = 160 x 90 = small, 2 = 240 x 136 = medium, 3 = 320 x 180 = large, 4 = 400 x 226 = xlarge,
 * 5 = 640 x 360 = xxlarge
 */
private String pixelHeightForSize(String sizeName) {
    switch (sizeName) {
    case "small":
        return "90p";

    case "medium":
        return "136p";

    case "large":
        return "180p";

    case "xlarge":
        return "226p";

    case "xxlarge":
        return "360p";

    default:
        return "";
    }
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Processes the translation table for the given language pair, if the document contains one.
 * <p>
 * The table is looked up by the element id {@code dictionary-<source>-<target>} (lower-cased identifiers). Every
 * table row without a {@code th} cell is handed to {@code processEntry}; afterwards the first element of class
 * {@code dictionary-synonyms-table}, if any, is handed to {@code extractBilingualSynonyms}. When the table is
 * missing, only a debug message is logged.
 *
 * @param queryString
 *          the original query
 * @param document
 *          the parsed result page
 * @param resultBuilder
 *          the builder receiving entries and synonyms
 * @param sourceLanguage
 *          the language translated from
 * @param targetLanguage
 *          the language translated to
 */
private void processTranslationTable(@NotNull String queryString, @NotNull Document document,
        @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage,
        @NotNull Language targetLanguage) {
    // Find main table (german to X)
    // Locale.ROOT keeps the generated element id independent of the JVM's default locale
    String languageIdentifier = sourceLanguage.getIdentifier().toLowerCase(java.util.Locale.ROOT) + "-"
            + targetLanguage.getIdentifier().toLowerCase(java.util.Locale.ROOT);

    Element translationTable = document.getElementById("dictionary-" + languageIdentifier);

    if (translationTable == null) {
        LOGGER.debug("Translation table for {} -> {} with query \"{}\" is null", sourceLanguage.getIdentifier(),
                targetLanguage.getIdentifier(), queryString);
        return;
    }

    // Process the main table with its entries: header rows (those containing th cells) are skipped
    translationTable.getElementsByTag("tr").stream().filter(e -> e.getElementsByTag("th").isEmpty())
            .forEach(e -> processEntry(queryString, e, resultBuilder, sourceLanguage, targetLanguage));

    // Extract synonyms
    Elements synonymTableCandidates = document.getElementsByClass("dictionary-synonyms-table");
    if (!synonymTableCandidates.isEmpty()) {
        extractBilingualSynonyms(queryString, synonymTableCandidates.get(0), resultBuilder, sourceLanguage);
    }
}

From source file:poe.trade.assist.SearchForm.java

/**
 * Rewrites the given page so that it shows the search form with locally packed assets.
 * <p>
 * All child elements of {@code <head>} are replaced by a fixed set (charset, viewport, title, packed script and
 * stylesheet taken from {@code htmlDirectory()}), the element with id {@code search-form} gets an empty style
 * attribute, the element with id {@code search} gets its action pointed to poe.trade, and the first element of
 * class {@code search-results-block} is removed.
 *
 * @param html
 *          the html of the original page
 * @return the html of the rewritten document
 */
private String removeAllExceptSearchForm(String html) {
    String assetBase = htmlDirectory();
    Document page = Jsoup.parse(html);

    // Replace everything in the <head>
    Element headElement = page.head();
    for (Element child : headElement.children()) {
        child.remove();
    }
    headElement.appendElement("meta").attr("charset", "utf-8");
    headElement.appendElement("meta").attr("name", "viewport").attr("content", "width=device-width");
    headElement.appendElement("title").text("poe.trade.assist");
    headElement.appendElement("script").attr("type", "text/javascript").attr("src", assetBase + "packed.js");
    headElement.appendElement("link").attr("rel", "stylesheet").attr("href", assetBase + "packed_dark.css");

    // Show search form
    Element searchFormWrapper = page.getElementById("search-form");
    if (searchFormWrapper != null) {
        searchFormWrapper.attr("style", "");
    }

    Element searchForm = page.getElementById("search");
    if (searchForm != null) {
        searchForm.attr("action", "http://poe.trade/search");
    }

    // Remove search results
    Elements resultBlocks = page.getElementsByClass("search-results-block");
    if (!resultBlocks.isEmpty()) {
        resultBlocks.get(0).remove();
    }

    return page.toString();
}

From source file:Search.DataManipulation.DataParser.java

/**
 * Downloads the cover image referenced in the given document and returns it Base64-encoded.
 * <p>
 * The image is the first {@code img.cover-image[alt=Cover art]} below an element of class
 * {@code cover-container}; its {@code src} attribute is passed to the image downloader.
 *
 * @param dom
 *          the parsed page
 * @return the Base64-encoded image bytes, or {@code null} when the page has no such image or the download
 *         yielded no data
 * @throws IOException
 *           declared for the image download
 */
public String getIcon(Document dom) throws IOException {
    Elements coverImages = dom.getElementsByClass("cover-container").select("img.cover-image[alt=Cover art]");
    if (coverImages.isEmpty()) {
        // first() would be null here; report it like an invalid icon instead of failing with a NullPointerException
        log.warn("No cover image found in document, not adding icon to appData");
        return null;
    }

    String iconUrl = coverImages.first().attr("src");
    byte[] iconByte = dataHandler.imageDownloader(iconUrl);

    // a missing or empty download result is treated as an invalid icon
    if (iconByte == null || iconByte.length == 0) {
        log.warn("Invalid Icon url found by Search.DataManipulation.DataValidator, not adding to appData");
        return null;
    }
    return Base64.getEncoder().encodeToString(iconByte);
}