Java tutorial
/*
 * Copyright 2012 - 2016 Manuel Laggner
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tinymediamanager.scraper.imdb;

import static org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.CAT_TV;
import static org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.providerInfo;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tinymediamanager.scraper.MediaMetadata;
import org.tinymediamanager.scraper.MediaScrapeOptions;
import org.tinymediamanager.scraper.entities.MediaArtwork;
import org.tinymediamanager.scraper.entities.MediaCastMember;
import org.tinymediamanager.scraper.entities.MediaEpisode;
import org.tinymediamanager.scraper.entities.MediaType;
import org.tinymediamanager.scraper.http.CachedUrl;
import org.tinymediamanager.scraper.http.Url;
import org.tinymediamanager.scraper.util.MetadataUtil;

/**
 * The class ImdbTvShowParser is used to parse TV show site of imdb.com
 *
 * @author Manuel Laggner
 */
public class ImdbTvShowParser extends ImdbParser {
  private static final Logger   LOGGER                  = LoggerFactory.getLogger(ImdbTvShowParser.class);

  // search result categories which are stripped out when the filter option is enabled
  private static final Pattern  UNWANTED_SEARCH_RESULTS = Pattern.compile(".*\\((TV Movies|TV Episode|Short|Video Game)\\).*");

  // row markers on the episode list pages; compiled once instead of on every parse call
  private static final Pattern  UNKNOWN_PATTERN         = Pattern.compile("Unknown");
  private static final Pattern  SEASON_EPISODE_PATTERN  = Pattern.compile("S([0-9]*), Ep([0-9]*)");

  // air date formats tried in order; the second one has no "dot" after the month ("May" for example)
  private static final String[] AIR_DATE_FORMATS        = { "d MMM. yyyy", "d MMM yyyy" };

  private final ImdbSiteDefinition imdbSite;

  public ImdbTvShowParser(ImdbSiteDefinition imdbSite) {
    super(MediaType.TV_SHOW);
    this.imdbSite = imdbSite;
  }

  /**
   * @return the pattern of unwanted search results if the provider option "filterUnwantedCategories" is set; {@code null} otherwise
   */
  @Override
  protected Pattern getUnwantedSearchResultPattern() {
    if (providerInfo.getConfig().getValueAsBool("filterUnwantedCategories")) {
      return UNWANTED_SEARCH_RESULTS;
    }
    return null;
  }

  @Override
  protected Logger getLogger() {
    return LOGGER;
  }

  @Override
  protected ImdbSiteDefinition getImdbSite() {
    return imdbSite;
  }

  /**
   * dispatch the scrape to the TV show or the episode scraper, depending on the type in the options
   *
   * @param options
   *          the scrape options
   * @return the scraped MediaMetadata; an empty one (only carrying the provider id) for any other type
   * @throws Exception
   *           any exception raised while fetching/parsing the pages
   */
  @Override
  protected MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    switch (options.getType()) {
      case TV_SHOW:
        return getTvShowMetadata(options);

      case TV_EPISODE:
        return getEpisodeMetadata(options);

      default:
        break;
    }
    return new MediaMetadata(providerInfo.getId());
  }

  @Override
  protected String getSearchCategory() {
    return CAT_TV;
  }

  /**
   * get the TV show metadata
   *
   * @param options
   *          the scrape options
   * @return the MediaMetadata; left empty if no valid imdb id could be obtained from the options
   * @throws Exception
   *           any exception raised while fetching/parsing the pages
   */
  MediaMetadata getTvShowMetadata(MediaScrapeOptions options) throws Exception {
    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    String imdbId = "";

    // imdbId from searchResult
    if (options.getResult() != null) {
      imdbId = options.getResult().getIMDBId();
    }

    // imdbid from scraper option
    if (!MetadataUtil.isValidImdbId(imdbId)) {
      imdbId = options.getImdbId();
    }

    if (!MetadataUtil.isValidImdbId(imdbId)) {
      return md;
    }

    LOGGER.debug("IMDB: getMetadata(imdbId): {}", imdbId);

    // get reference data
    CachedUrl url = new CachedUrl(imdbSite.getSite() + "/title/" + imdbId + "/reference");
    url.addHeader("Accept-Language", acceptLanguageFor(options));
    Document doc = loadDocument(url.getInputStream());
    parseReferencePage(doc, options, md);

    // get plot
    url = new CachedUrl(imdbSite.getSite() + "/title/" + imdbId + "/plotsummary");
    url.addHeader("Accept-Language", acceptLanguageFor(options));
    doc = loadDocument(url.getInputStream());
    parsePlotsummaryPage(doc, options, md);

    // populate id
    md.setId(providerInfo.getId(), imdbId);

    return md;
  }

  /**
   * get the episode metadata.
   *
   * @param options
   *          the scrape options; must carry the imdb id of the show plus the season and episode number
   * @return the MediaMetaData; left empty if the ids are missing/unparseable or the episode is not in the episode list
   * @throws Exception
   *           any exception raised while fetching/parsing the pages
   */
  MediaMetadata getEpisodeMetadata(MediaScrapeOptions options) throws Exception {
    MediaMetadata md = new MediaMetadata(providerInfo.getId());

    String imdbId = options.getImdbId();
    if (StringUtils.isBlank(imdbId)) {
      return md;
    }

    // get episode number and season number
    int seasonNr = -1;
    int episodeNr = -1;

    try {
      seasonNr = Integer.parseInt(options.getId(MediaMetadata.SEASON_NR));
      episodeNr = Integer.parseInt(options.getId(MediaMetadata.EPISODE_NR));
    }
    catch (Exception e) {
      LOGGER.warn("error parsing season/episode number");
    }

    if (seasonNr == -1 || episodeNr == -1) {
      return md;
    }

    // first get the base episode metadata which can be gathered via
    // getEpisodeList()
    List<MediaEpisode> episodes = getEpisodeList(options);

    MediaEpisode wantedEpisode = null;
    for (MediaEpisode episode : episodes) {
      if (episode.season == seasonNr && episode.episode == episodeNr) {
        wantedEpisode = episode;
        break;
      }
    }

    // we did not find the episode; return
    if (wantedEpisode == null) {
      return md;
    }

    md.setId(providerInfo.getId(), wantedEpisode.ids.get(providerInfo.getId()));
    md.setEpisodeNumber(wantedEpisode.episode);
    md.setSeasonNumber(wantedEpisode.season);
    md.setTitle(wantedEpisode.title);
    md.setPlot(wantedEpisode.plot);
    md.setRating(wantedEpisode.rating);
    md.setVoteCount(wantedEpisode.voteCount);

    Date releaseDate = parseAirDate(wantedEpisode.firstAired);
    if (releaseDate != null) {
      md.setReleaseDate(releaseDate);
    }
    else {
      LOGGER.warn("Could not parse date format: {}", wantedEpisode.firstAired);
    }

    // and finally the cast which needed to be fetched from the fullcredits page
    Object episodeId = wantedEpisode.ids.get(providerInfo.getId());
    if (episodeId instanceof String && StringUtils.isNotBlank((String) episodeId)) {
      Url url = new Url(imdbSite.getSite() + "/title/" + episodeId + "/fullcredits");
      url.addHeader("Accept-Language", "en"); // force EN for parsing by HTMl texts
      Document doc = loadDocument(url.getInputStream());

      // director & writer
      Element fullcredits = doc.getElementById("fullcredits_content");
      if (fullcredits != null) {
        Elements tables = fullcredits.getElementsByTag("table");

        // first table are directors; Elements#get throws on a missing index, so check the size
        if (tables.size() > 0) {
          addCastMembers(md, tables.get(0), MediaCastMember.CastType.DIRECTOR);
        }

        // second table are writers
        if (tables.size() > 1) {
          addCastMembers(md, tables.get(1), MediaCastMember.CastType.WRITER);
        }
      }

      // actors
      Element castTableElement = doc.getElementsByClass("cast_list").first();
      if (castTableElement != null) {
        Elements tr = castTableElement.getElementsByTag("tr");
        for (Element row : tr) {
          MediaCastMember cm = parseCastMember(row);
          if (cm != null && StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) {
            cm.setType(MediaCastMember.CastType.ACTOR);
            md.addCastMember(cm);
          }
        }
      }
    }

    return md;
  }

  /**
   * parse the episode list from the ratings overview
   *
   * @param options
   *          the scrape options
   * @return the episode list (specials first, then season by season); empty if the options carry no imdb id
   * @throws Exception
   *           any exception raised while fetching/parsing the specials page
   */
  List<MediaEpisode> getEpisodeList(MediaScrapeOptions options) throws Exception {
    List<MediaEpisode> episodes = new ArrayList<>();

    // parse the episodes from the ratings overview page (e.g.
    // http://www.imdb.com/title/tt0491738/epdate )
    String imdbId = options.getImdbId();
    if (StringUtils.isBlank(imdbId)) {
      return episodes;
    }

    // we need to parse every season for its own _._
    // first the specials
    CachedUrl url = new CachedUrl(imdbSite.getSite() + "/title/" + imdbId + "/epdate");
    url.addHeader("Accept-Language", acceptLanguageFor(options));
    Document doc = loadDocument(url.getInputStream());
    parseEpisodeList(0, episodes, doc);

    // then parse every season
    for (int season = 1;; season++) {
      try {
        url = new CachedUrl(imdbSite.getSite() + "/title/" + imdbId + "/epdate?season=" + season);
        url.addHeader("Accept-Language", acceptLanguageFor(options));
        doc = loadDocument(url.getInputStream());

        // if the given season number and the parsed one does not match, break here
        if (!parseEpisodeList(season, episodes, doc)) {
          break;
        }
      }
      catch (Exception e) {
        LOGGER.warn("problem parsing ep list: {}", e.getMessage());
        // the loop has no upper bound - without leaving it here a persistent failure
        // (e.g. network down) would retry season after season forever
        break;
      }
    }

    return episodes;
  }

  /**
   * parse all episodes of the given season out of the document and append them to the episode list
   *
   * @param season
   *          the season which has been requested; 0 for the specials (rows marked with "Unknown")
   * @param episodes
   *          the list to add the parsed episodes to
   * @param doc
   *          the parsed episode list page
   * @return false if the page does not belong to the requested season (a row with another season number, or - for seasons &gt; 0 - no
   *         season/episode row at all); true otherwise
   */
  private boolean parseEpisodeList(int season, List<MediaEpisode> episodes, Document doc) {
    int episodeCounter = 0;
    boolean rowMatched = false;

    // parse episodes
    Elements tables = doc.getElementsByClass("eplist");
    for (Element table : tables) {
      Elements rows = table.getElementsByClass("list_item");
      for (Element row : rows) {
        Matcher matcher = season == 0 ? UNKNOWN_PATTERN.matcher(row.text()) : SEASON_EPISODE_PATTERN.matcher(row.text());
        if (!matcher.find() || (season != 0 && matcher.groupCount() < 2)) {
          continue;
        }
        rowMatched = true;

        try {
          // we found a row containing episode data
          MediaEpisode ep = new MediaEpisode(providerInfo.getId());

          // parse season and ep number
          if (season == 0) {
            ep.season = season;
            ep.episode = ++episodeCounter;
          }
          else {
            ep.season = Integer.parseInt(matcher.group(1));
            ep.episode = Integer.parseInt(matcher.group(2));
          }

          // check if we have still valid data
          if (season > 0 && season != ep.season) {
            return false;
          }

          // get ep title and id
          Elements anchors = row.getElementsByAttributeValueStarting("href", "/title/tt");
          for (Element anchor : anchors) {
            if ("name".equals(anchor.attr("itemprop"))) {
              ep.title = anchor.text();
              break;
            }
          }

          // a row without any title link throws here; the row is then skipped by the catch below
          String id = "";
          Matcher idMatcher = IMDB_ID_PATTERN.matcher(anchors.get(0).attr("href"));
          while (idMatcher.find()) {
            if (idMatcher.group(1) != null) {
              id = idMatcher.group(1);
            }
          }

          if (StringUtils.isNotBlank(id)) {
            ep.ids.put(providerInfo.getId(), id);
          }

          // plot
          Element plot = row.getElementsByClass("item_description").first();
          if (plot != null) {
            ep.plot = plot.ownText();
          }

          // rating and rating count
          Element ratingElement = row.getElementsByClass("ipl-rating-star__rating").first();
          if (ratingElement != null) {
            String ratingAsString = ratingElement.ownText().replace(",", ".");
            try {
              ep.rating = Float.valueOf(ratingAsString);
            }
            catch (Exception ignored) {
              // best effort: an unparseable rating leaves the default value
            }

            Element votesElement = row.getElementsByClass("ipl-rating-star__total-votes").first();
            if (votesElement != null) {
              String countAsString = votesElement.ownText().replaceAll("[.,()]", "").trim();
              try {
                ep.voteCount = Integer.parseInt(countAsString);
              }
              catch (Exception ignored) {
                // best effort: an unparseable vote count leaves the default value
              }
            }
          }

          // release date
          Element releaseDate = row.getElementsByClass("airdate").first();
          if (releaseDate != null) {
            ep.firstAired = releaseDate.ownText();
          }

          // poster
          Element image = row.getElementsByTag("img").first();
          if (image != null) {
            // remove the UX/UY/CR tokens from the image url
            String posterUrl = image.attr("src");
            posterUrl = posterUrl.replaceAll("UX[0-9]{2,4}_", "");
            posterUrl = posterUrl.replaceAll("UY[0-9]{2,4}_", "");
            posterUrl = posterUrl.replaceAll("CR[0-9]{1,3},[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}_", "");

            if (StringUtils.isNotBlank(posterUrl)) {
              MediaArtwork ma = new MediaArtwork(providerInfo.getId(), MediaArtwork.MediaArtworkType.THUMB);
              ma.setPreviewUrl(posterUrl);
              ma.setDefaultUrl(posterUrl);
              ep.artwork.add(ma);
            }
          }

          episodes.add(ep);
        }
        catch (Exception e) {
          LOGGER.warn("failed parsing: {} for ep data; {}", row.text(), e.getMessage());
        }
      }
    }

    // a season page without a single season/episode row cannot belong to the requested season;
    // reporting it as a mismatch keeps the caller's open ended season loop from running forever
    return season == 0 || rowMatched;
  }

  /**
   * build the Accept-Language header value out of the language and country of the scrape options
   */
  private String acceptLanguageFor(MediaScrapeOptions options) {
    return getAcceptLanguage(options.getLanguage().getLanguage(), options.getCountry().getAlpha2());
  }

  /**
   * parse the given stream with the charset of the imdb site and close the stream afterwards
   */
  private Document loadDocument(InputStream source) throws IOException {
    try (InputStream in = source) {
      return Jsoup.parse(in, imdbSite.getCharset().displayName(), "");
    }
  }

  /**
   * add every element with the class "name" inside the given credits table as cast member of the given type
   */
  private static void addCastMembers(MediaMetadata md, Element creditsTable, MediaCastMember.CastType type) {
    for (Element person : creditsTable.getElementsByClass("name")) {
      MediaCastMember cm = new MediaCastMember(type);
      cm.setName(person.text());
      md.addCastMember(cm);
    }
  }

  /**
   * parse the air date with the known formats (US locale); returns null if none of them matches
   */
  private static Date parseAirDate(String firstAired) {
    for (String format : AIR_DATE_FORMATS) {
      try {
        // SimpleDateFormat is not thread safe, so a fresh instance is created per attempt
        return new SimpleDateFormat(format, Locale.US).parse(firstAired);
      }
      catch (ParseException ignored) {
        // try the next format
      }
    }
    return null;
  }
}