Java tutorial
/* * Copyright 2012 - 2016 Manuel Laggner * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.tinymediamanager.scraper.imdb; import static org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.CAT_TITLE; import static org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.cleanString; import static org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.executor; import static org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.providerInfo; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Locale; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorCompletionService; import java.util.concurrent.Future; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.tinymediamanager.scraper.MediaMetadata; import org.tinymediamanager.scraper.MediaScrapeOptions; import org.tinymediamanager.scraper.entities.MediaType; import org.tinymediamanager.scraper.util.MetadataUtil; /** * The class ImdbMovieParser is used to parse the movie sites at imdb.com * * @author Manuel Laggner */ public class ImdbMovieParser extends ImdbParser { private static final Logger LOGGER = LoggerFactory.getLogger(ImdbMovieParser.class); private static final Pattern UNWANTED_SEARCH_RESULTS = Pattern .compile(".*\\((TV Series|TV Episode|Short|Video Game)\\).*"); private ImdbSiteDefinition imdbSite; public ImdbMovieParser(ImdbSiteDefinition imdbSite) { super(MediaType.MOVIE); this.imdbSite = imdbSite; } @Override protected Pattern getUnwantedSearchResultPattern() { if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("filterUnwantedCategories")) { return UNWANTED_SEARCH_RESULTS; } return null; } @Override protected Logger getLogger() { return LOGGER; } @Override protected ImdbSiteDefinition getImdbSite() { return imdbSite; } @Override protected MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { return getMovieMetadata(options); } @Override protected String getSearchCategory() { return CAT_TITLE; } MediaMetadata getMovieMetadata(MediaScrapeOptions options) throws Exception { MediaMetadata md = new MediaMetadata(providerInfo.getId()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMediaMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMediaMetadata(); } String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(providerInfo.getId(), imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<>(executor); // worker for imdb request (/reference) (everytime from www.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/reference"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); Future<Document> futureReference = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo")) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureReference.get(); parseReferencePage(doc, options, md); /* * plot from /plotsummary */ // build the url doc = futurePlotsummary.get(); parsePlotsummaryPage(doc, options, md); // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { Element title = doc.getElementById("tn15title"); if (title != null) { Element element; // title Elements elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.setTitle(movieTitle); } } } // get the release info page Future<Document> futureReleaseinfo; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/releaseinfo"); worker = new ImdbWorker(sb.toString(), options.getLanguage().getLanguage(), options.getCountry().getAlpha2(), imdbSite); futureReleaseinfo = compSvcImdb.submit(worker); doc = futureReleaseinfo.get(); // parse original title here!! parseReleaseinfoPageAKAs(doc, options, md); // did we get a release date? if (md.getReleaseDate() == null || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("localReleaseDate")) { parseReleaseinfoPage(doc, options, md); } // get data from tmdb? if (futureTmdb != null && (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") || ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo"))) { try { MediaMetadata tmdbMd = futureTmdb.get(); if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("useTmdb") && tmdbMd != null) { // tmdbid md.setId(MediaMetadata.TMDB, tmdbMd.getId(MediaMetadata.TMDB)); // title if (StringUtils.isNotBlank(tmdbMd.getTitle())) { md.setTitle(tmdbMd.getTitle()); } // original title if (StringUtils.isNotBlank(tmdbMd.getOriginalTitle())) { md.setOriginalTitle(tmdbMd.getOriginalTitle()); } // tagline if (StringUtils.isNotBlank(tmdbMd.getTagline())) { md.setTagline(tmdbMd.getTagline()); } // plot if (StringUtils.isNotBlank(tmdbMd.getPlot())) { md.setPlot(tmdbMd.getPlot()); } // collection info if (StringUtils.isNotBlank(tmdbMd.getCollectionName())) { md.setCollectionName(tmdbMd.getCollectionName()); md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET)); } } if (ImdbMetadataProvider.providerInfo.getConfig().getValueAsBool("scrapeCollectionInfo") && tmdbMd != null) { md.setId(MediaMetadata.TMDB_SET, tmdbMd.getId(MediaMetadata.TMDB_SET)); md.setCollectionName(tmdbMd.getCollectionName()); } md.setId(tmdbMd.getProviderId(), tmdbMd.getId(tmdbMd.getProviderId())); } catch (Exception ignored) { } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getOriginalTitle())) { md.setOriginalTitle(md.getTitle()); } // populate id md.setId(ImdbMetadataProvider.providerInfo.getId(), imdbId); return md; } private MediaMetadata parseReleaseinfoPage(Document doc, MediaScrapeOptions options, MediaMetadata md) { Date releaseDate = null; Pattern pattern = Pattern.compile("/calendar/\\?region=(.{2})"); // old way Element tableReleaseDates = doc.getElementById("release_dates"); if (tableReleaseDates != null) { Elements rows = tableReleaseDates.getElementsByTag("tr"); // first round: check the release date for the first one with the requested country for (Element row : rows) { // get the anchor Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first(); if (anchor != null) { Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.find() && options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) { Element column = row.getElementsByClass("release_date").first(); if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException ignored) { } } } } } } } // new way; iterating over class name items if (releaseDate == null) { Elements rows = doc.getElementsByClass("release-date-item"); for (Element row : rows) { Element anchor = row.getElementsByAttributeValueStarting("href", "/calendar/").first(); if (anchor != null) { Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.find() && options.getCountry().getAlpha2().equalsIgnoreCase(matcher.group(1))) { Element column = row.getElementsByClass("release-date-item__date").first(); if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); break; } catch (ParseException ignored) { } } } } else { LOGGER.trace("country {} does not match ours {}", matcher.group(1), options.getCountry().getAlpha2()); } } } } // no matching local release date found; take the first one if (releaseDate == null) { Element column = doc.getElementsByClass("release_date").first(); if (column == null) { column = doc.getElementsByClass("release-date-item__date").first(); } if (column != null) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); } catch (ParseException otherformat) { try { SimpleDateFormat sdf = new SimpleDateFormat("MMMM yyyy", Locale.US); releaseDate = sdf.parse(column.text()); } catch (ParseException ignored) { } } } } if (releaseDate != null) { md.setReleaseDate(releaseDate); } return md; } // AKAs and original title private MediaMetadata parseReleaseinfoPageAKAs(Document doc, MediaScrapeOptions options, MediaMetadata md) { // <table id="akas" class="subpage_data spEven2Col"> // <tr class="even"> // <td>(original title)</td> // <td>Intouchables</td> // </tr> // need to search all tables for correct ID, since the UNIQUE id is used multiple times - thanks for nothing :p for (Element table : doc.getElementsByTag("table")) { if (table.id().equalsIgnoreCase("akas")) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { Element c1 = row.getElementsByTag("td").get(0); Element c2 = row.getElementsByTag("td").get(1); if (c1 != null && c1.text().toLowerCase(Locale.ROOT).contains("original title")) { md.setOriginalTitle(c2.text()); break; } } } } // alternative; new way with table classes // <tr class="ipl-zebra-list__item aka-item"> // <td class="aka-item__name">Germany</td> // <td class="aka-item__title">Avatar - Aufbruch nach Pandora</td> // </tr> if (md.getOriginalTitle().isEmpty()) { Elements rows = doc.getElementsByClass("aka-item"); for (Element row : rows) { Element country = row.getElementsByClass("aka-item__name").first(); Element title = row.getElementsByClass("aka-item__title").first(); if (country != null && country.text().toLowerCase(Locale.ROOT).contains("original title")) { md.setOriginalTitle(title.text()); break; } } } return md; } }