com.moviejukebox.plugin.ImdbPlugin.java Source code

Introduction

Here is the source code for com.moviejukebox.plugin.ImdbPlugin.java
Source

/*
 *      Copyright (c) 2004-2016 YAMJ Members
 *      https://github.com/orgs/YAMJ/people
 *
 *      This file is part of the Yet Another Movie Jukebox (YAMJ) project.
 *
 *      YAMJ is free software: you can redistribute it and/or modify
 *      it under the terms of the GNU General Public License as published by
 *      the Free Software Foundation, either version 3 of the License, or
 *      any later version.
 *
 *      YAMJ is distributed in the hope that it will be useful,
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *      GNU General Public License for more details.
 *
 *      You should have received a copy of the GNU General Public License
 *      along with YAMJ.  If not, see <http://www.gnu.org/licenses/>.
 *
 *      Web: https://github.com/YAMJ/yamj-v2
 *
 */
package com.moviejukebox.plugin;

import static com.moviejukebox.model.Movie.UNKNOWN;
import static com.moviejukebox.tools.PropertiesUtil.FALSE;
import static com.moviejukebox.tools.PropertiesUtil.TRUE;
import static com.moviejukebox.tools.StringTools.isNotValidString;
import static com.moviejukebox.tools.StringTools.isValidString;
import static com.moviejukebox.tools.StringTools.trimToLength;

import com.moviejukebox.model.*;
import com.moviejukebox.scanner.artwork.FanartScanner;
import com.moviejukebox.tools.*;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ImdbPlugin implements MovieDatabasePlugin {

    // Identifier under which this plugin stores and reads IDs, ratings and overrides on a movie
    public static final String IMDB_PLUGIN_ID = "imdb";
    private static final Logger LOG = LoggerFactory.getLogger(ImdbPlugin.class);
    // Country whose values are preferred; read from "imdb.preferredCountry" in the constructor
    protected String preferredCountry;
    // Release date pattern for the preferred country; compiled in the constructor
    protected Pattern pRelease;
    // Value of "imdb.plot"; "long" makes getPlot() try the plot summary page first
    private final String imdbPlot;
    protected YamjHttpClient httpClient;
    // Value of "fanart.movie.download"
    protected boolean downloadFanart;
    // Value of "imdb.getCertificationFromMPAA"
    private final boolean extractCertificationFromMPAA;
    // Value of "imdb.full.info"; when set, "combined" is appended to the title URL
    private final boolean fullInfo;
    // Token and extension used to build the fanart filename
    protected String fanartToken;
    protected String fanartExtension;
    // People related limits, read from the "plugin.*" / "movie.*" properties in the constructor
    private final int preferredBiographyLength;
    private final int preferredFilmographyMax;
    protected int actorMax;
    protected int directorMax;
    protected int writerMax;
    // Value of "plugin.trivia.maxCount"
    private final int triviaMax;
    protected ImdbInfo imdbInfo;
    protected AspectRatioTools aspectTools;
    // Values of the "plugin.people.skip.*" properties
    private final boolean skipFaceless;
    private final boolean skipVG;
    private final boolean skipTV;
    private final boolean skipV;
    // Comma separated "plugin.filmography.jobsInclude" property split into a list
    private final List<String> jobsInclude;
    // Should we scrape the award information
    private final boolean scrapeAwards;
    // Should we scrape the won awards only
    private final boolean scrapeWonAwards;
    // Should we scrape the business information
    private final boolean scrapeBusiness;
    // Should we scrape the trivia information
    private final boolean scrapeTrivia;
    // Site Literals
    private static final String IMDB_TITLE = "title/";
    private static final String IMDB_NAME = "name/";
    // Suffix Literals
    private static final String SUFFIX_FILMOYEAR = "/filmoyear";
    private static final String SUFFIX_BIO = "/bio";
    private static final String SUFFIX_PARENTALGUIDE = "/parentalguide#certification";
    private static final String SUFFIX_RELEASEINFO = "/releaseinfo";
    private static final String SUFFIX_AWARDS = "/awards";
    private static final String SUFFIX_FULLCREDITS = "/fullcredits";
    private static final String SUFFIX_PLOTSUMMARY = "/plotsummary";
    private static final String SUFFIX_TRIVIA = "/trivia";
    private static final String SUFFIX_BUSINESS = "/business";
    // Literals (HTML fragments used as start/end markers when extracting text from the pages)
    private static final String HTML_H5_END = ":</h5>";
    private static final String HTML_H5_START = "<h5>";
    private static final String HTML_DIV_END = "</div>";
    private static final String HTML_A_END = "</a>";
    private static final String HTML_A_START = "<a ";
    private static final String HTML_TABLE_END = "</table>";
    private static final String HTML_TD_END = "</td>";
    private static final String HTML_H4_END = ":</h4>";
    private static final String HTML_BREAK = "<br/>";
    private static final String HTML_SPAN_END = "</span>";
    private static final String HTML_GT = ">";
    // Patterns for the name searching
    // 1: Text of the first paragraph following a "Mini Bio" heading
    private static final Pattern PATTERN_BIO = Pattern.compile("<h\\d.*?>Mini Bio.*?</h\\d>.*?<p>(.*?)</p>");

    // Patterns for filmography
    // 1: Section title (job), 2: Number of credits
    private static final Pattern P_JOB_SELECTION = Pattern
            .compile("<a name=\".*?\">(.*?)</a> \\((\\d*?) credit.?\\)");
    // 1: Single video
    private static final Pattern P_JOB_ITEMS = Pattern
            .compile("(?s)(<div class=\"filmo-row.*?>(.*?)(?:</div>\\s*)+)");
    // 1: Release Year (for acting roles)
    private static final Pattern P_JOB_YEAR = Pattern.compile("\\\"year_column\\\">(?:&nbsp;){0,1}(\\d{4})</span>");
    // 1: IMDB ID, 2: Title
    private static final Pattern P_JOB_ID_TITLE = Pattern.compile("/title/(tt\\d*?)/.*?>(.*?)<br");

    // Pattern for DOB
    // 1: first number, 2: second number of a "n-n" pair (presumably month and day - confirm against the caller)
    private static final Pattern PATTERN_DOB = Pattern.compile("(\\d{1,2})-(\\d{1,2})");

    // AKA scraping
    // Value of "imdb.aka.scrape.title"
    private final boolean akaScrapeTitle;
    // Preferred country followed by any "imdb.aka.fallback.countries" entries
    private final String[] akaMatchingCountries;
    // Comma separated "imdb.aka.ignore.versions" property split into an array
    private final String[] akaIgnoreVersions;

    /**
     * Create the plugin and load all of its settings from the jukebox properties.
     *
     * Helper objects and the HTTP client are created first, then the scraping options, the people limits,
     * the award/business/trivia switches and finally the AKA settings and the release date pattern for
     * the preferred country.
     */
    public ImdbPlugin() {
        // Helpers and HTTP access
        this.imdbInfo = new ImdbInfo();
        this.aspectTools = new AspectRatioTools();

        this.httpClient = YamjHttpClientBuilder.getHttpClient();

        // General scraping options
        this.preferredCountry = PropertiesUtil.getProperty("imdb.preferredCountry", "USA");
        this.imdbPlot = PropertiesUtil.getProperty("imdb.plot", "short");
        this.downloadFanart = PropertiesUtil.getBooleanProperty("fanart.movie.download", Boolean.FALSE);
        this.fanartToken = PropertiesUtil.getProperty("mjb.scanner.fanartToken", ".fanart");
        this.fanartExtension = PropertiesUtil.getProperty("fanart.format", "jpg");
        this.extractCertificationFromMPAA = PropertiesUtil.getBooleanProperty("imdb.getCertificationFromMPAA",
                Boolean.TRUE);
        this.fullInfo = PropertiesUtil.getBooleanProperty("imdb.full.info", Boolean.FALSE);

        // Limits and filters used when scraping people
        this.preferredBiographyLength = PropertiesUtil.getIntProperty("plugin.biography.maxlength", 500);
        this.preferredFilmographyMax = PropertiesUtil.getIntProperty("plugin.filmography.max", 20);
        this.actorMax = PropertiesUtil.getReplacedIntProperty("movie.actor.maxCount",
                "plugin.people.maxCount.actor", 10);
        this.directorMax = PropertiesUtil.getReplacedIntProperty("movie.director.maxCount",
                "plugin.people.maxCount.director", 2);
        this.writerMax = PropertiesUtil.getReplacedIntProperty("movie.writer.maxCount",
                "plugin.people.maxCount.writer", 3);
        this.skipFaceless = PropertiesUtil.getBooleanProperty("plugin.people.skip.faceless", Boolean.FALSE);
        this.skipVG = PropertiesUtil.getBooleanProperty("plugin.people.skip.VG", Boolean.TRUE);
        this.skipTV = PropertiesUtil.getBooleanProperty("plugin.people.skip.TV", Boolean.FALSE);
        this.skipV = PropertiesUtil.getBooleanProperty("plugin.people.skip.V", Boolean.FALSE);
        String includedJobs = PropertiesUtil.getProperty("plugin.filmography.jobsInclude",
                "Director,Writer,Actor,Actress");
        this.jobsInclude = Arrays.asList(includedJobs.split(","));

        // Maximum number of trivia items
        this.triviaMax = PropertiesUtil.getIntProperty("plugin.trivia.maxCount", 15);

        // Awards: the property is either a boolean or the literal "won"
        String awardsSetting = PropertiesUtil.getProperty("mjb.scrapeAwards", FALSE);
        this.scrapeWonAwards = "won".equalsIgnoreCase(awardsSetting);
        this.scrapeAwards = awardsSetting.equalsIgnoreCase(TRUE) || this.scrapeWonAwards;

        // Business information switch
        this.scrapeBusiness = PropertiesUtil.getBooleanProperty("mjb.scrapeBusiness", Boolean.FALSE);

        // Trivia switch
        this.scrapeTrivia = PropertiesUtil.getBooleanProperty("mjb.scrapeTrivia", Boolean.FALSE);

        // AKA handling
        this.akaScrapeTitle = PropertiesUtil.getBooleanProperty("imdb.aka.scrape.title", Boolean.FALSE);
        this.akaIgnoreVersions = PropertiesUtil.getProperty("imdb.aka.ignore.versions", "").split(",");

        // The preferred country always comes first, followed by any configured fallbacks
        String fallbackCountries = PropertiesUtil.getProperty("imdb.aka.fallback.countries", "");
        this.akaMatchingCountries = isNotValidString(fallbackCountries)
                ? new String[] { preferredCountry }
                : (preferredCountry + "," + fallbackCountries).split(",");

        // Release date pattern, with the preferred country quoted literally
        String releaseRegex = "(?:.*?)\\Q" + preferredCountry
                + "\\E(?:.*?)\\Qrelease_date\">\\E(.*?)(?:<.*?>)(.*?)(?:</a>.*)";
        this.pRelease = Pattern.compile(releaseRegex);
    }

    /**
     * Get the identifier of this plugin.
     *
     * @return the constant {@link #IMDB_PLUGIN_ID}
     */
    @Override
    public String getPluginID() {
        return IMDB_PLUGIN_ID;
    }

    /**
     * Scan IMDb for the given movie.
     *
     * If the movie has no valid IMDb id yet, one is looked up from its title, year and TV show flag and
     * stored on the movie (even when the lookup result is not valid).
     *
     * @param movie the movie to scan
     * @return the result of the page scrape, or {@code false} when no valid IMDb id is available
     */
    @Override
    public boolean scan(Movie movie) {
        String id = movie.getId(IMDB_PLUGIN_ID);

        if (isNotValidString(id)) {
            // No id stored yet, so search for one and remember whatever was found
            id = imdbInfo.getImdbId(movie.getTitle(), movie.getYear(), movie.isTVShow());
            movie.setId(IMDB_PLUGIN_ID, id);
        }

        if (!isValidString(id)) {
            // Nothing to scrape without an id
            return false;
        }

        return updateImdbMediaInfo(movie);
    }

    /**
     * Pick the preferred value from a list of entries of the form {@code value},
     * {@code country:value} or {@code country:value (note)}.
     *
     * The text after the first ':' is the value and anything from the first '(' onwards is dropped.
     * The first entry without a country is kept as a fallback; the first entry whose country equals
     * {@link #preferredCountry} wins and stops the scan.
     *
     * The supplied list is never modified: when {@code useLast} is set the scan runs over a reversed copy.
     *
     * @param values the candidate entries; {@code null} is treated as an empty list
     * @param useLast scan the entries from last to first instead of first to last
     * @return the chosen value with HTML tags stripped; derived from UNKNOWN when nothing matched
     */
    protected String getPreferredValue(List<String> values, boolean useLast) {
        String value = UNKNOWN;

        List<String> candidates;
        if (values == null) {
            candidates = Collections.emptyList();
        } else if (useLast) {
            // Reverse a private copy so the caller's list keeps its order
            // (and unmodifiable lists do not cause an exception)
            candidates = new ArrayList<>(values);
            Collections.reverse(candidates);
        } else {
            candidates = values;
        }

        for (String text : candidates) {
            String country = null;

            // Split "country:value" into its two parts
            int pos = text.indexOf(':');
            if (pos != -1) {
                country = text.substring(0, pos);
                text = text.substring(pos + 1);
            }
            // Drop any trailing "(note)"
            pos = text.indexOf('(');
            if (pos != -1) {
                text = text.substring(0, pos).trim();
            }

            if (country == null) {
                // Only the first country-less entry is used as the fallback
                if (value.equals(UNKNOWN)) {
                    value = text;
                }
            } else if (country.equals(preferredCountry)) {
                value = text;
                // No need to continue scanning
                break;
            }
        }
        return HTMLTools.stripTags(value);
    }

    /**
     * Scan the IMDb HTML page for the specified movie and copy the scraped values into it.
     *
     * The stored IMDb id is corrected to start with "tt" if necessary. The title page is then loaded
     * (the "combined" variant when full info is enabled) and used to set the year, title and original
     * title, followed by the remaining details, and optionally awards, business data, trivia and fanart.
     *
     * @param movie the movie to update
     * @return {@code false} when the page identifies a TV series but the movie was not typed as a TV show
     *         (its type is switched to TV show and nothing else is scraped); {@code true} otherwise
     */
    private boolean updateImdbMediaInfo(Movie movie) {
        String imdbID = movie.getId(IMDB_PLUGIN_ID);
        if (!imdbID.startsWith("tt")) {
            imdbID = "tt" + imdbID;
            // Correct the ID if it's wrong
            movie.setId(IMDB_PLUGIN_ID, imdbID);
        }

        String xml = getImdbUrl(movie);

        // Add the combined tag to the end of the request if required
        if (fullInfo) {
            xml += "combined";
        }

        xml = getImdbData(xml);

        // The page carries TV markers but the movie is not a TV show: fix the type and stop
        if (!Movie.TYPE_TVSHOW.equals(movie.getMovieType())
                && (xml.contains("\"tv-extra\"") || xml.contains("\"tv-series-series\""))) {
            movie.setMovieType(Movie.TYPE_TVSHOW);
            return false;
        }

        // The page title also reveals a TV series that was not detected as a TV show
        String title = HTMLTools.extractTag(xml, "<title>");
        if (!Movie.TYPE_TVSHOW.equals(movie.getMovieType()) && title.contains("(TV Series")) {
            movie.setMovieType(Movie.TYPE_TVSHOW);
            return false;
        }

        // Correct the title if "imdb" found (both markers are 7 characters long)
        if (StringUtils.endsWithIgnoreCase(title, " - imdb")) {
            title = title.substring(0, title.length() - 7);
        } else if (StringUtils.startsWithIgnoreCase(title, "imdb - ")) {
            title = title.substring(7);
        }

        // Remove the (VG) or (V) tags from the title.
        // An alternation is required here: a character class such as [VG|V] only matches a single
        // character and therefore never matches "(VG)".
        title = title.replaceAll(" \\((?:VG|V)\\)$", "");

        // 1: the four digit year following an opening bracket, optionally prefixed by "TV" or "VIDEO"
        String yearPattern = "(?i).\\((?:TV.|VIDEO.)?(\\d{4})";
        Pattern pattern = Pattern.compile(yearPattern, Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(title);
        if (matcher.find()) {
            // If we've found a year, set it in the movie
            if (OverrideTools.checkOverwriteYear(movie, IMDB_PLUGIN_ID)) {
                movie.setYear(matcher.group(1), IMDB_PLUGIN_ID);
            }

            // Remove the year (and everything after it) from the title
            title = title.substring(0, matcher.start());
        }

        if (OverrideTools.checkOverwriteTitle(movie, IMDB_PLUGIN_ID)) {
            movie.setTitle(title, IMDB_PLUGIN_ID);
        }

        if (OverrideTools.checkOverwriteOriginalTitle(movie, IMDB_PLUGIN_ID)) {
            // Default to the scraped title unless the page explicitly marks an original title
            String originalTitle = title;
            if (xml.contains("<span class=\"title-extra\">")) {
                originalTitle = HTMLTools.extractTag(xml, "<span class=\"title-extra\">", "</span>");
                if (originalTitle.contains("(original title)")) {
                    originalTitle = originalTitle.replace(" <i>(original title)</i>", "");
                } else {
                    originalTitle = title;
                }
            }
            movie.setOriginalTitle(originalTitle, IMDB_PLUGIN_ID);
        }

        // Update the movie information
        updateInfo(movie, xml);

        // update common values
        updateInfoCommon(movie, xml);

        if (scrapeAwards) {
            updateAwards(movie); // Issue 1901: Awards
        }

        if (scrapeBusiness) {
            updateBusiness(movie); // Issue 2012: Financial information about movie
        }

        if (scrapeTrivia) {
            updateTrivia(movie); // Issue 2013: Add trivia
        }

        // TODO: Move this check out of here, it doesn't belong.
        if (downloadFanart && isNotValidString(movie.getFanartURL())) {
            movie.setFanartURL(getFanartURL(movie));
            if (isValidString(movie.getFanartURL())) {
                movie.setFanartFilename(movie.getBaseName() + fanartToken + "." + fanartExtension);
            }
        }

        // always true
        return true;
    }

    /**
     * Process the new IMDb format web page.
     *
     * Each value (rating, top 250, runtime, country, company, genres, quote, outline, plot,
     * certification, year and tagline) is only written when the matching override check allows it
     * (the rating is only added when the movie has no IMDb rating yet). The certification is read
     * from the separately loaded parental guide page. TV show specific details are processed last
     * for TV shows.
     *
     * @param movie the movie to update
     * @param xml the HTML of the IMDb title page
     */
    private void updateInfo(Movie movie, String xml) {
        // RATING
        if (movie.getRating(IMDB_PLUGIN_ID) == -1) {
            String srtRating = HTMLTools.extractTag(xml, "starbar-meta\">", HTML_DIV_END).replace(",", ".");
            int intRating = parseRating(HTMLTools.stripTags(srtRating));

            // Try another format for the rating
            if (intRating == -1) {
                srtRating = HTMLTools.extractTag(xml, "ratingValue\">", HTML_SPAN_END).replace(",", ".");
                intRating = parseRating(HTMLTools.stripTags(srtRating));
            }

            movie.addRating(IMDB_PLUGIN_ID, intRating);
        }

        // TOP250
        if (OverrideTools.checkOverwriteTop250(movie, IMDB_PLUGIN_ID)) {
            // The combined page and the normal page label the ranking differently
            String top250;
            if (fullInfo) {
                top250 = HTMLTools.extractTag(xml, "Top 250: #");
            } else {
                top250 = HTMLTools.extractTag(xml, "Top Rated Movies #");
            }
            movie.setTop250(top250, IMDB_PLUGIN_ID);
        }

        // RUNTIME
        if (OverrideTools.checkOverwriteRuntime(movie, IMDB_PLUGIN_ID)) {
            String runtime = "Runtime" + HTML_H4_END;
            List<String> runtimes = HTMLTools.extractTags(xml, runtime, HTML_DIV_END, null, "|", Boolean.FALSE);
            runtime = getPreferredValue(runtimes, false);

            // Strip any extraneous characters from the runtime
            int pos = runtime.indexOf("min");
            if (pos > 0) {
                runtime = runtime.substring(0, pos + 3);
            }
            movie.setRuntime(runtime, IMDB_PLUGIN_ID);
        }

        // COUNTRY
        if (OverrideTools.checkOverwriteCountry(movie, IMDB_PLUGIN_ID)) {
            List<String> countries = new ArrayList<>();
            // Try the <h5> heading first and fall back to the <h4> heading
            String startTag = "Country" + HTML_H5_END;
            if (!xml.contains(startTag)) {
                startTag = "Country" + HTML_H4_END;
            }
            for (String country : HTMLTools.extractTags(xml, startTag, HTML_DIV_END, "<a href=\"", HTML_A_END)) {
                countries.add(HTMLTools.removeHtmlTags(country));
            }
            movie.setCountries(countries, IMDB_PLUGIN_ID);
        }

        // COMPANY
        if (OverrideTools.checkOverwriteCompany(movie, IMDB_PLUGIN_ID)) {
            // Three different headings are tried in turn
            String startTag = "Company" + HTML_H5_END;
            if (!xml.contains(startTag)) {
                startTag = "Production Co" + HTML_H4_END;
            }
            if (!xml.contains(startTag)) {
                startTag = "<h3>Company";
            }

            for (String company : HTMLTools.extractTags(xml, startTag, HTML_DIV_END, "<a href", HTML_A_END)) {
                company = HTMLTools.stripTags(company, false);
                if (company != null) {
                    // TODO Save more than one company
                    movie.setCompany(company, IMDB_PLUGIN_ID);
                    break;
                }
            }
        }

        // GENRES
        if (OverrideTools.checkOverwriteGenres(movie, IMDB_PLUGIN_ID)) {
            String startTag = "Genre" + HTML_H5_END;
            if (!xml.contains(startTag)) {
                startTag = "Genres" + HTML_H4_END;
            }
            movie.setGenres(HTMLTools.extractTags(xml, startTag, HTML_DIV_END, "<a href", HTML_A_END),
                    IMDB_PLUGIN_ID);
        }

        // QUOTE
        if (OverrideTools.checkOverwriteQuote(movie, IMDB_PLUGIN_ID)) {
            // Only the first non-null quote is used
            for (String quote : HTMLTools.extractTags(xml, "<h4>Quotes</h4>", "<span class=\"", "<br", "<br")) {
                if (quote != null) {
                    quote = HTMLTools.stripTags(quote);
                    movie.setQuote(cleanStringEnding(quote), IMDB_PLUGIN_ID);
                    break;
                }
            }
        }

        // OUTLINE
        if (OverrideTools.checkOverwriteOutline(movie, IMDB_PLUGIN_ID)) {
            // The new outline is at the end of the review section with no preceding text
            String imdbOutline = HTMLTools.extractTag(xml, "<div class=\"summary_text\" itemprop=\"description\">",
                    HTML_DIV_END);
            imdbOutline = cleanStringEnding(HTMLTools.removeHtmlTags(imdbOutline)).trim();

            if (isNotValidString(imdbOutline)) {
                // ensure the outline is set to unknown if it's blank or null
                imdbOutline = UNKNOWN;
            }

            movie.setOutline(imdbOutline, IMDB_PLUGIN_ID);
        }

        // PLOT
        if (OverrideTools.checkOverwritePlot(movie, IMDB_PLUGIN_ID)) {
            getPlot(movie, xml);
        }

        // CERTIFICATION
        if (OverrideTools.checkOverwriteCertification(movie, IMDB_PLUGIN_ID)) {
            String certification = movie.getCertification();
            // Use the default site definition for the certification, because the local versions don't have the parentalguide page
            String certXML = getImdbData(getImdbUrl(movie, SUFFIX_PARENTALGUIDE));
            if (extractCertificationFromMPAA) {
                String mpaa = HTMLTools.extractTag(certXML, "<h5><a href=\"/mpaa\">MPAA</a>:</h5>", 1);
                if (!mpaa.equals(UNKNOWN)) {
                    // Expected text: "Rated <certification> [on appeal] for <reason>"
                    String key = "Rated ";
                    int pos = mpaa.indexOf(key);
                    if (pos != -1) {
                        // The certification starts right after the key, wherever the key was found
                        int start = pos + key.length();
                        pos = mpaa.indexOf(" on appeal for ", start);
                        if (pos == -1) {
                            pos = mpaa.indexOf(" for ", start);
                        }
                        if (pos != -1) {
                            certification = mpaa.substring(start, pos);
                        }
                    }
                }
            }

            // Fall back to the certification list, scanning from the last entry
            if (isNotValidString(certification)) {
                certification = getPreferredValue(
                        HTMLTools.extractTags(certXML, HTML_H5_START + "Certification" + HTML_H5_END, HTML_DIV_END,
                                "<a href=\"/search/title?certificates=", HTML_A_END),
                        true);
            }

            if (isNotValidString(certification)) {
                certification = Movie.NOTRATED;
            }

            movie.setCertification(certification, IMDB_PLUGIN_ID);
        }

        // YEAR
        if (OverrideTools.checkOverwriteYear(movie, IMDB_PLUGIN_ID)) {
            // 1: year in brackets, e.g. "(2010)" or "(2010/I)", 2: year taken from a "/year/" link
            Pattern getYear = Pattern.compile("(?:\\s*" + "\\((\\d{4})(?:/[^\\)]+)?\\)|<a href=\"/year/(\\d{4}))");
            Matcher m = getYear.matcher(xml);
            if (m.find()) {
                String year = m.group(1);
                if (isNotValidString(year)) {
                    year = m.group(2);
                }
                movie.setYear(year, IMDB_PLUGIN_ID);
            }

            // second approach
            if (isNotValidString(movie.getYear())) {
                movie.setYear(HTMLTools.extractTag(xml, "<a href=\"/year/", 1), IMDB_PLUGIN_ID);
                if (isNotValidString(movie.getYear())) {
                    String fullReleaseDate = HTMLTools.getTextAfterElem(xml,
                            HTML_H5_START + "Original Air Date" + HTML_H5_END, 0);
                    if (isValidString(fullReleaseDate)) {
                        // The year is taken from the third space separated token;
                        // shorter dates are skipped instead of failing with an index error
                        String[] dateParts = fullReleaseDate.split(" ");
                        if (dateParts.length > 2) {
                            movie.setYear(dateParts[2], IMDB_PLUGIN_ID);
                        }
                    }
                }
            }
        }

        // TAGLINE
        if (OverrideTools.checkOverwriteTagline(movie, IMDB_PLUGIN_ID)) {
            movie.setTagline(extractTagline(xml), IMDB_PLUGIN_ID);
        }

        // TV SHOW
        if (movie.isTVShow()) {
            updateTVShowInfo(movie);
        }
    }

    /**
     * Look for the tagline in the XML.
     *
     * The tagline heading is located first ("Tagline" followed by up to three characters and a closing
     * heading tag, case insensitive). The tagline text then runs up to whichever of "&lt;span" or
     * "&lt;/div&gt;" appears first after the heading; a marker that does not appear at all is never chosen
     * over one that does.
     *
     * @param xml The source XML
     * @return The tagline found (or UNKNOWN)
     */
    private static String extractTagline(String xml) {
        String tagline = UNKNOWN;

        // Look for the tagline with upto 3 characters after the sitedef to ensure we get any plurals on the end
        Pattern pTagline = Pattern.compile("(Tagline.{0,3}?:</h\\d>)", Pattern.CASE_INSENSITIVE);
        Matcher m = pTagline.matcher(xml);

        if (m.find()) {
            int beginIndex = m.start();
            // We need to work out which of the two formats to use, this is dependent on which comes first "<span" or "</div"
            int spanIndex = StringUtils.indexOf(xml, "<span", beginIndex);
            int divIndex = StringUtils.indexOf(xml, HTML_DIV_END, beginIndex);

            // An index of -1 means "not found" and must not be mistaken for "comes first"
            String endMarker;
            if (spanIndex != -1 && (divIndex == -1 || spanIndex < divIndex)) {
                endMarker = "<span";
            } else {
                endMarker = HTML_DIV_END;
            }

            // Now look for the right string
            tagline = HTMLTools.stripTags(HTMLTools.extractTag(xml, m.group(1), endMarker), false);
            tagline = cleanStringEnding(tagline);
        }

        return tagline;
    }

    /**
     * Determine the plot for the movie and store it.
     *
     * When the plot setting is "long" the plot summary page is tried first. If that gives nothing valid,
     * several known page layouts are tried in order until one yields a valid plot, which is then cleaned
     * of the Metacritic prefix and the "Written by" suffix. If no plot is found the outline is used.
     *
     * @param movie the movie to update
     * @param xml the HTML of the IMDb title page
     */
    private void getPlot(Movie movie, String xml) {
        // Get the long plot from the summary page if requested
        String plotText = "long".equalsIgnoreCase(imdbPlot) ? getPlotSummary(movie) : UNKNOWN;

        if (isNotValidString(plotText)) {
            // Start/end markers of the known plot layouts, in order of preference
            final String[][] layouts = {
                { "<h5>Plot" + HTML_H5_END, HTML_DIV_END },
                { "<h5>Plot</h5>", "<span class=\"" },
                { "<h5>Plot</h5>", "<p>" },
                { "<div class=\"summary_text\" itemprop=\"description\">", HTML_DIV_END },
                { "<div class=\"inline canwrap\" itemprop=\"description\">", HTML_DIV_END }
            };

            for (String[] layout : layouts) {
                // Only try the next layout while no valid plot has been found
                if (isNotValidString(plotText)) {
                    String extracted = HTMLTools.extractTag(xml, layout[0], layout[1]);
                    plotText = HTMLTools.removeHtmlTags(extracted).trim();
                }
            }

            // Drop everything up to and including the "metacritic" text, if present
            final String metacritic = "Metacritic.com)";
            int cut = plotText.indexOf(metacritic);
            if (cut > 0) {
                plotText = plotText.substring(cut + metacritic.length());
            }

            // Drop the "Written by" annotation and whatever follows it
            cut = plotText.indexOf("Written by");
            if (cut > 0) {
                plotText = plotText.substring(0, cut);
            }

            // A blank or null plot becomes UNKNOWN, anything else gets its ending cleaned
            plotText = isValidString(plotText) ? cleanStringEnding(plotText) : UNKNOWN;
        }

        // Store the found plot, or the outline if there is none
        if (!isValidString(plotText)) {
            movie.setPlot(movie.getOutline(), IMDB_PLUGIN_ID);
            return;
        }
        movie.setPlot(plotText, IMDB_PLUGIN_ID);
    }

    /**
     * Load the plot summary page and extract the long plot from it.
     *
     * Two page sections are examined in order; a later section with a valid text replaces the result of
     * an earlier one. Sections containing the "no synopsis" placeholder text are ignored.
     *
     * @param movie the item whose plot summary page is loaded
     * @return the long plot with tags stripped, or UNKNOWN when none was found
     */
    private String getPlotSummary(Identifiable movie) {
        final String noSynopsis = "It looks like we don't have any Synopsis for this title yet.";
        // Start/end markers of the sections that may hold the plot; the second one is the other site layout
        final String[][] sections = {
            { "<p class=\"plotSummary\">", "</p>" },
            { "<div id=\"swiki.2.1\">", HTML_DIV_END }
        };

        String page = getImdbData(getImdbUrl(movie, SUFFIX_PLOTSUMMARY));

        String summary = Movie.UNKNOWN;
        for (String[] section : sections) {
            String fragment = HTMLTools.extractTag(page, section[0], section[1]);
            boolean usable = isValidString(fragment) && !fragment.contains(noSynopsis);
            if (usable) {
                summary = HTMLTools.stripTags(fragment);
            }
        }

        return summary;
    }

    /**
     * Scrape info which is common for old and new IMDb.
     *
     * @param movie the movie to update
     * @param xml the HTML of the IMDb title page for the movie
     */
    private void updateInfoCommon(Movie movie, String xml) {
        // Store the release info page for release info & AKAs
        String releaseInfoXML = UNKNOWN;
        // Store the aka list
        Map<String, String> akas = null;

        if (OverrideTools.checkOverwriteAspectRatio(movie, IMDB_PLUGIN_ID)) {
            // determine start and end string
            String startString;
            String endString;
            if (fullInfo) {
                startString = HTML_H5_START + "Aspect Ratio" + HTML_H5_END + "<div class=\"info-content\">";
                endString = "<a class";
            } else {
                startString = "<h4 class=\"inline\">Aspect Ratio" + HTML_H4_END;
                endString = HTML_DIV_END;
            }

            // find unclean aspect ratio
            String uncleanAspectRatio = HTMLTools.extractTag(xml, startString, endString).trim();

            if (StringTools.isValidString(uncleanAspectRatio)) {
                // remove spaces and replace , with .
                uncleanAspectRatio = uncleanAspectRatio.replace(" ", "").replace(",", ".");
                // set aspect ratio
                movie.setAspectRatio(aspectTools.cleanAspectRatio(uncleanAspectRatio), IMDB_PLUGIN_ID);
            }
        }

        // RELEASE DATE
        if (OverrideTools.checkOverwriteReleaseDate(movie, IMDB_PLUGIN_ID)) {
            // Load the release page from IMDB
            if (StringTools.isNotValidString(releaseInfoXML)) {
                releaseInfoXML = getImdbData(getImdbUrl(movie, SUFFIX_RELEASEINFO));
            }

            Matcher mRelease = pRelease.matcher(releaseInfoXML);

            // "contains" is a quick match before the slower find() is triggered.
            if (releaseInfoXML.contains(preferredCountry) && mRelease.find()) {
                String releaseDate = mRelease.group(1) + " " + mRelease.group(2);
                movie.setReleaseDate(releaseDate, IMDB_PLUGIN_ID);
            }
        }

        // ORIGINAL TITLE / AKAS
        if (OverrideTools.checkOverwriteOriginalTitle(movie, IMDB_PLUGIN_ID)) {
            // Load the AKA page from IMDb
            if (StringTools.isNotValidString(releaseInfoXML)) {
                releaseInfoXML = getImdbData(getImdbUrl(movie, SUFFIX_RELEASEINFO));
            }

            // The AKAs are stored in the format "title", "country"
            // therefore we need to look for the preferredCountry and then work backwards
            List<String> akaList = HTMLTools.extractTags(releaseInfoXML, "<a id=\"akas\" name=\"akas\">",
                    HTML_TABLE_END, "<td>", HTML_TD_END, Boolean.FALSE);
            akas = buildAkaMap(akaList);

            String foundValue = null;
            for (Map.Entry<String, String> aka : akas.entrySet()) {
                if (aka.getKey().contains("original title")) {
                    foundValue = aka.getValue().trim();
                    break;
                }
            }
            movie.setOriginalTitle(foundValue, IMDB_PLUGIN_ID);
        }

        // TITLE for preferred country from AKAS
        if (akaScrapeTitle && OverrideTools.checkOverwriteTitle(movie, IMDB_PLUGIN_ID)) {
            // Load the AKA page from IMDb
            if (StringTools.isNotValidString(releaseInfoXML)) {
                releaseInfoXML = getImdbData(getImdbUrl(movie, SUFFIX_RELEASEINFO));
            }

            // The AKAs are stored in the format "title", "country"
            // therefore we need to look for the preferredCountry and then work backwards
            if (akas == null) {
                // Just extract the AKA section from the page
                List<String> akaList = HTMLTools.extractTags(releaseInfoXML, "<a id=\"akas\" name=\"akas\">",
                        HTML_TABLE_END, "<td>", HTML_TD_END, Boolean.FALSE);
                akas = buildAkaMap(akaList);
            }

            String foundValue = null;
            // NOTE: First matching country is the preferred country
            for (String matchCountry : akaMatchingCountries) {

                if (StringUtils.isBlank(matchCountry)) {
                    // must be a valid country setting
                    continue;
                }

                for (Map.Entry<String, String> aka : akas.entrySet()) {
                    int startIndex = aka.getKey().indexOf(matchCountry);
                    if (startIndex > -1) {
                        String extracted = aka.getKey().substring(startIndex);
                        int endIndex = extracted.indexOf('/');
                        if (endIndex > -1) {
                            extracted = extracted.substring(0, endIndex);
                        }

                        boolean valid = Boolean.TRUE;
                        for (String ignore : akaIgnoreVersions) {
                            if (StringUtils.isNotBlank(ignore)
                                    && StringUtils.containsIgnoreCase(extracted, ignore.trim())) {
                                valid = Boolean.FALSE;
                                break;
                            }
                        }

                        if (valid) {
                            foundValue = aka.getValue().trim();
                            break;
                        }
                    }
                }

                if (foundValue != null) {
                    // we found a title for the country matcher
                    break;
                }
            }
            movie.setTitle(foundValue, IMDB_PLUGIN_ID);
        }

        // holds the full credits page
        String fullcreditsXML = UNKNOWN;

        // DIRECTOR(S)
        boolean overrideNormal = OverrideTools.checkOverwriteDirectors(movie, IMDB_PLUGIN_ID);
        boolean overridePeople = OverrideTools.checkOverwritePeopleDirectors(movie, IMDB_PLUGIN_ID);
        if (overrideNormal || overridePeople) {
            boolean found = Boolean.FALSE;

            // get from combined page (same layout as full credits)
            if (fullInfo) {
                found = extractDirectorsFromFullCredits(movie, xml, overrideNormal, overridePeople);
            }

            // get from full credits
            if (!found) {
                if (isNotValidString(fullcreditsXML)) {
                    fullcreditsXML = getImdbData(getImdbUrl(movie, SUFFIX_FULLCREDITS));
                }
                extractDirectorsFromFullCredits(movie, fullcreditsXML, overrideNormal, overridePeople);
            }
        }

        // WRITER(S)
        overrideNormal = OverrideTools.checkOverwriteWriters(movie, IMDB_PLUGIN_ID);
        overridePeople = OverrideTools.checkOverwritePeopleWriters(movie, IMDB_PLUGIN_ID);
        if (overrideNormal || overridePeople) {
            boolean found = Boolean.FALSE;

            // get from combined page (same layout as full credits)
            if (fullInfo) {
                found = extractWritersFromFullCredits(movie, xml, overrideNormal, overridePeople);
            }

            // get from full credits
            if (!found) {
                if (isNotValidString(fullcreditsXML)) {
                    fullcreditsXML = getImdbData(getImdbUrl(movie, SUFFIX_FULLCREDITS));
                }
                extractWritersFromFullCredits(movie, fullcreditsXML, overrideNormal, overridePeople);
            }
        }

        // CAST
        overrideNormal = OverrideTools.checkOverwriteActors(movie, IMDB_PLUGIN_ID);
        overridePeople = OverrideTools.checkOverwritePeopleActors(movie, IMDB_PLUGIN_ID);
        if (overrideNormal || overridePeople) {
            boolean found = Boolean.FALSE;

            // get from combined page (same layout as full credits)
            if (fullInfo) {
                found = extractCastFromFullCredits(movie, xml, overrideNormal, overridePeople);
            }

            // get from full credits
            if (!found) {
                if (isNotValidString(fullcreditsXML)) {
                    fullcreditsXML = getImdbData(getImdbUrl(movie, SUFFIX_FULLCREDITS));
                }
                extractCastFromFullCredits(movie, fullcreditsXML, overrideNormal, overridePeople);
            }
        }
    }

    /**
     * Extract the cast from a credits page and store it on the movie.
     *
     * Each row of the <code>cast_list</code> table is parsed for the person ID, the name and the character.
     * Rows that do not carry a parsable <code>/nm.../</code> person link are skipped instead of producing a
     * garbage ID or failing the whole scan.
     *
     * @param movie the movie to add the cast to
     * @param fullcreditsXML the HTML of the page holding the cast table
     * @param overrideNormal true if the plain actor list should be replaced
     * @param overridePeople true if the detailed people cast list should be replaced
     * @return true if at least one cast member was processed
     */
    private boolean extractCastFromFullCredits(Movie movie, String fullcreditsXML, boolean overrideNormal,
            boolean overridePeople) {
        // number of cast members processed so far
        int count = 0;
        // the existing lists are only cleared once we know there is something to replace them with
        boolean clearCast = true;
        boolean clearPeopleCast = true;
        // flag to indicate if a match has been found
        boolean found = false;

        for (String actorBlock : HTMLTools.extractTags(fullcreditsXML, "<table class=\"cast_list\">",
                HTML_TABLE_END, "<td class=\"primary_photo\"", "</tr>")) {
            // skip faceless persons ("loadlate hidden" is present for actors with photos)
            if (skipFaceless && !actorBlock.contains("loadlate hidden")) {
                continue;
            }

            // the person ID is the path segment starting with "nm" and ending at the next slash
            int nmPosition = actorBlock.indexOf("/nm");
            int idEnd = nmPosition < 0 ? -1 : actorBlock.indexOf('/', nmPosition + 1);
            if (idEnd < 0) {
                LOG.debug("Skipping cast entry without a person ID");
                continue;
            }
            String personID = actorBlock.substring(nmPosition + 1, idEnd);

            String name = HTMLTools
                    .stripTags(HTMLTools.extractTag(actorBlock, "itemprop=\"name\">", HTML_SPAN_END));
            String character = HTMLTools
                    .stripTags(HTMLTools.extractTag(actorBlock, "<td class=\"character\">", HTML_TD_END));

            if (overrideNormal) {
                // clear cast if not already done
                if (clearCast) {
                    movie.clearCast();
                    clearCast = false;
                }
                movie.addActor(name, IMDB_PLUGIN_ID);
            }

            if (overridePeople) {
                // clear people cast if not already done
                if (clearPeopleCast) {
                    movie.clearPeopleCast();
                    clearPeopleCast = false;
                }
                movie.addActor(IMDB_PLUGIN_ID + ":" + personID, name, character,
                        imdbInfo.getImdbSite() + IMDB_NAME + personID + "/", UNKNOWN, IMDB_PLUGIN_ID);
            }

            found = true;
            count++;
            if (count == actorMax) {
                break;
            }
        }

        return found;
    }

    /**
     * Extract the directors from a credits page and store them on the movie.
     *
     * The section headings "Directed by", "Director" and "Directors" are tried in that order; the search
     * stops at the first heading that yields at least one director.
     *
     * A director counts as found when it was added to either the plain list or the people list, so the
     * result and the <code>directorMax</code> limit behave the same regardless of which override is active.
     *
     * @param movie the movie to add the directors to
     * @param fullcreditsXML the HTML of the page holding the credits
     * @param overrideNormal true if the plain director list should be replaced
     * @param overridePeople true if the detailed people director list should be replaced
     * @return true if at least one director was added
     */
    private boolean extractDirectorsFromFullCredits(Movie movie, String fullcreditsXML, boolean overrideNormal,
            boolean overridePeople) {
        // number of directors added so far
        int count = 0;
        // the existing lists are only cleared once we know there is something to replace them with
        boolean clearDirectors = true;
        boolean clearPeopleDirectors = true;
        // flag to indicate if a match has been found
        boolean found = false;

        for (String directorMatch : new String[] { "Directed by", "Director", "Directors" }) {
            String sectionStart = HTML_GT + directorMatch + "&nbsp;</h4>";
            if (fullcreditsXML.contains(sectionStart)) {
                for (String member : HTMLTools.extractTags(fullcreditsXML, sectionStart,
                        HTML_TABLE_END, HTML_A_START, HTML_A_END, Boolean.FALSE)) {
                    int beginIndex = member.indexOf("href=\"/name/");
                    if (beginIndex < 0) {
                        continue;
                    }

                    // 12 is the length of href="/name/ ; the ID runs up to the next slash
                    int idStart = beginIndex + 12;
                    int idEnd = member.indexOf('/', idStart);
                    if (idEnd < 0) {
                        // malformed link, no terminated person ID
                        continue;
                    }
                    String personID = member.substring(idStart, idEnd);
                    String director = member.substring(member.indexOf(HTML_GT, beginIndex) + 1).trim();

                    boolean added = false;

                    if (overrideNormal) {
                        // clear directors if not already done
                        if (clearDirectors) {
                            movie.clearDirectors();
                            clearDirectors = false;
                        }
                        movie.addDirector(director, IMDB_PLUGIN_ID);
                        added = true;
                    }

                    if (overridePeople) {
                        // clear people directors if not already done
                        if (clearPeopleDirectors) {
                            movie.clearPeopleDirectors();
                            clearPeopleDirectors = false;
                        }
                        // check that there are no invalid characters in the name which may indicate a bad scrape
                        if (StringUtils.containsNone(director, "<>:/")) {
                            movie.addDirector(IMDB_PLUGIN_ID + ":" + personID, director,
                                    imdbInfo.getImdbSite() + IMDB_NAME + personID + "/", IMDB_PLUGIN_ID);
                            added = true;
                        } else {
                            LOG.debug("Invalid director name found: '{}'", director);
                        }
                    }

                    if (added) {
                        found = true;
                        count++;
                    }

                    if (count == directorMax) {
                        break;
                    }
                }
            }
            if (found) {
                // We found a match, so stop search.
                break;
            }
        }

        return found;
    }

    /**
     * Extract the writers from a credits page and store them on the movie.
     *
     * The section headings "Writing credits", "Writer" and "Writers" are tried in that order (ignoring
     * case); the search stops at the first heading that yields at least one writer. Links whose text
     * contains "more credit" are ignored.
     *
     * @param movie the movie to add the writers to
     * @param fullcreditsXML the HTML of the page holding the credits
     * @param overrideNormal true if the plain writer list should be replaced
     * @param overridePeople true if the detailed people writer list should be replaced
     * @return true if at least one writer was processed
     */
    private boolean extractWritersFromFullCredits(Movie movie, String fullcreditsXML, boolean overrideNormal,
            boolean overridePeople) {
        // existing lists are wiped lazily, just before the first replacement is stored
        boolean writersCleared = false;
        boolean peopleWritersCleared = false;
        // number of writers processed; anything above zero means a match was found
        int added = 0;

        for (String writerMatch : new String[] { "Writing credits", "Writer", "Writers" }) {
            if (StringUtils.indexOfIgnoreCase(fullcreditsXML, HTML_GT + writerMatch) < 0) {
                // heading not present on this page, try the next one
                continue;
            }

            for (String link : HTMLTools.extractTags(fullcreditsXML, HTML_GT + writerMatch, HTML_TABLE_END,
                    HTML_A_START, HTML_A_END, Boolean.FALSE)) {
                int hrefPos = link.indexOf("href=\"/name/");
                if (hrefPos < 0) {
                    continue;
                }

                // 12 is the length of href="/name/ ; the ID runs up to the next slash
                int idStart = hrefPos + 12;
                String personID = link.substring(idStart, link.indexOf("/", idStart));
                String linkText = link.substring(link.indexOf(HTML_GT, hrefPos) + 1);
                String name = StringUtils.trimToEmpty(linkText);
                if (name.contains("more credit")) {
                    continue;
                }

                if (overrideNormal) {
                    if (!writersCleared) {
                        movie.clearWriters();
                        writersCleared = true;
                    }
                    movie.addWriter(name, IMDB_PLUGIN_ID);
                }

                if (overridePeople) {
                    if (!peopleWritersCleared) {
                        movie.clearPeopleWriters();
                        peopleWritersCleared = true;
                    }
                    movie.addWriter(IMDB_PLUGIN_ID + ":" + personID, name,
                            imdbInfo.getImdbSite() + IMDB_NAME + personID + "/", IMDB_PLUGIN_ID);
                }

                added++;
                if (added == writerMax) {
                    break;
                }
            }

            if (added > 0) {
                // this heading produced writers, no need to look at the alternatives
                break;
            }
        }

        return added > 0;
    }

    /**
     * Process the awards in the IMDb awards web page.
     *
     * Each award block becomes one {@link AwardEvent} holding a single {@link Award}. When
     * <code>scrapeWonAwards</code> is set, only awards with at least one win are kept.
     *
     * @param movie the movie to add the awards to
     * @return always true, also when the page holds no awards
     */
    private boolean updateAwards(Movie movie) {
        String imdbId = movie.getId(IMDB_PLUGIN_ID);

        String awardXML = getImdbData(getImdbUrl(imdbId, IMDB_TITLE, SUFFIX_AWARDS));
        // null-safe check: a missing page is treated the same as a page without awards
        if (!StringUtils.contains(awardXML, "<h1 class=\"header\">Awards</h1>")) {
            LOG.debug("No awards found for {}", movie.getBaseName());
            return Boolean.TRUE;
        }

        List<String> awardHtmlList = HTMLTools.extractTags(awardXML, "<h1 class=\"header\">Awards</h1>",
                "<div class=\"article\"", "<h3>", "</table>", false);

        Collection<AwardEvent> awardList = new ArrayList<>();
        for (String awardBlock : awardHtmlList) {
            // the event name is the text before the first tag; use the whole block if there is no tag
            int firstTag = awardBlock.indexOf('<');
            String awardEvent = (firstTag < 0 ? awardBlock : awardBlock.substring(0, firstTag)).trim();

            AwardEvent aEvent = new AwardEvent();
            aEvent.setName(awardEvent);

            String tmpString = HTMLTools.extractTag(awardBlock, "<a href=", HTML_A_END).trim();
            tmpString = tmpString.substring(tmpString.indexOf('>') + 1).trim();
            // toInt falls back to -1 for anything that is not a plain integer, instead of throwing
            int awardYear = NumberUtils.toInt(tmpString, -1);

            tmpString = StringUtils.trimToEmpty(
                    HTMLTools.extractTag(awardBlock, "<span class=\"award_category\">", "</span>"));
            Award aAward = new Award();
            aAward.setName(tmpString);
            aAward.setYear(awardYear);

            // the outcome is only stated on the first row of a group, so it carries over to following rows
            boolean awardOutcomeWon = true;
            for (String outcomeBlock : HTMLTools.extractHtmlTags(awardBlock, "<table class=", null, "<tr>",
                    "</tr>")) {
                String outcome = HTMLTools.extractTag(outcomeBlock, "<b>", "</b>");
                if (StringTools.isValidString(outcome)) {
                    awardOutcomeWon = "won".equalsIgnoreCase(outcome);
                }

                String awardDescription = StringUtils.trimToEmpty(
                        HTMLTools.extractTag(outcomeBlock, "<td class=\"award_description\">", "<br />"));
                // Check to see if there was a missing title and just the name in the result
                if (awardDescription.contains("href=\"/name/")) {
                    awardDescription = StringUtils.trimToEmpty(
                            HTMLTools.extractTag(outcomeBlock, "<span class=\"award_category\">", "</span>"));
                }

                if (awardOutcomeWon) {
                    aAward.addWon(awardDescription);
                } else {
                    aAward.addNomination(awardDescription);
                }
            }

            if (!scrapeWonAwards || (aAward.getWon() > 0)) {
                LOG.debug("{} - Adding award: {}", movie.getBaseName(), aAward);
                aEvent.addAward(aAward);
            }

            if (!aEvent.getAwards().isEmpty()) {
                awardList.add(aEvent);
            }
        }

        if (!awardList.isEmpty()) {
            movie.setAwards(awardList);
        }
        return Boolean.TRUE;
    }

    /**
     * Process financial information in the IMDb business web page.
     *
     * Sets the budget and then, per country, the opening weekend and gross figures. Opening weekend
     * values are added to a value already stored for the country; for gross the larger value is kept.
     * Entries whose amount cannot be parsed are skipped.
     *
     * @param movie the movie to add the financial information to
     * @return true if the business page was retrieved, false otherwise
     */
    private boolean updateBusiness(Movie movie) {
        String imdbId = movie.getId(IMDB_PLUGIN_ID);
        String xml = getImdbData(getImdbUrl(imdbId, IMDB_TITLE, SUFFIX_BUSINESS));

        if (!isValidString(xml)) {
            return Boolean.FALSE;
        }

        String budget = HTMLTools.extractTag(xml, "<h5>Budget</h5>", HTML_BREAK).replaceAll("\\s.*", "");
        movie.setBudget(budget);

        // Locale.US, not new Locale("US"): the one-argument constructor takes a language code
        NumberFormat moneyFormat = NumberFormat.getNumberInstance(Locale.US);

        // pass 0 handles the "Opening Weekend" section, pass 1 the "Gross" section
        for (int i = 0; i < 2; i++) {
            boolean openingWeekend = (i == 0);
            for (String oWeek : HTMLTools.extractTags(xml,
                    HTML_H5_START + (openingWeekend ? "Opening Weekend" : "Gross") + "</h5", HTML_H5_START, "",
                    "<br/")) {
                if (!isValidString(oWeek)) {
                    continue;
                }

                // everything before the first digit is the currency, the first number is the amount
                String currency = oWeek.replaceAll("\\d+.*", "");
                long value = NumberUtils.toLong(
                        oWeek.replaceAll("^\\D*\\s*", "").replaceAll("\\s.*", "").replaceAll(",", ""), -1L);
                if (value < 0) {
                    // no parsable amount, storing it would only produce garbage
                    LOG.debug("Skipping business entry without a valid amount: '{}'", oWeek);
                    continue;
                }

                String country = HTMLTools.extractTag(oWeek, "(", ")");
                if ("Worldwide".equals(country) && !"$".equals(currency)) {
                    continue;
                }

                String money = openingWeekend ? movie.getOpenWeek(country) : movie.getGross(country);
                if (isValidString(money)) {
                    long m = NumberUtils.toLong(money.replaceAll("^\\D*\\s*", "").replaceAll(",", ""), -1L);
                    if (m >= 0) {
                        value = openingWeekend ? value + m : Math.max(value, m);
                    }
                }

                if (openingWeekend) {
                    movie.setOpenWeek(country, currency + moneyFormat.format(value));
                } else {
                    movie.setGross(country, currency + moneyFormat.format(value));
                }
            }
        }
        return Boolean.TRUE;
    }

    /**
     * Process trivia in the IMDb trivia web page.
     *
     * Adds up to <code>triviaMax</code> entries to the movie; a <code>triviaMax</code> of -1 means no
     * limit and 0 disables the scan.
     *
     * @param movie the movie to add the trivia to
     * @return true if the trivia page was retrieved, false if trivia is disabled or the page is invalid
     */
    private boolean updateTrivia(Movie movie) {
        if (triviaMax == 0) {
            return false;
        }

        String xml = getImdbData(getImdbUrl(movie, SUFFIX_TRIVIA));
        if (!isValidString(xml)) {
            return false;
        }

        int added = 0;
        for (String entry : HTMLTools.extractTags(xml, "<div class=\"list\">", "<div class=\"list\">",
                "<div class=\"sodatext\"", HTML_DIV_END)) {
            // stop once the configured limit is reached (-1 = unlimited)
            if (triviaMax != -1 && added >= triviaMax) {
                break;
            }
            movie.addDidYouKnow(HTMLTools.removeHtmlTags(entry).trim());
            added++;
        }
        return true;
    }

    /**
     * Parse the rating from the first token of the given text.
     *
     * The text is split on slash, space and parentheses; only the first token is handed to
     * {@link StringTools#parseRating(String)}.
     *
     * @param rating the text holding the rating
     * @return the value returned by StringTools.parseRating for the first token
     */
    private static int parseRating(String rating) {
        String firstToken = new StringTokenizer(rating, "/ ()").nextToken();
        return StringTools.parseRating(firstToken);
    }

    /**
     * Get the fanart URL for the movie.
     *
     * Delegates directly to {@link FanartScanner#getFanartURL(Movie)}.
     *
     * @param movie the movie to get the fanart for
     * @return the value returned by the FanartScanner
     */
    protected String getFanartURL(Movie movie) {
        return FanartScanner.getFanartURL(movie);
    }

    /**
     * Scan the IMDb episode list of the movie's season and fill in the title, plot and first aired date
     * of every episode covered by the movie files, as far as the override settings allow.
     *
     * Nothing is done unless the movie is a TV show with new files and a valid IMDb ID.
     *
     * @param movie the TV show to update
     */
    @Override
    public void scanTVShowTitles(Movie movie) {
        String imdbId = movie.getId(IMDB_PLUGIN_ID);

        if (!movie.isTVShow() || !movie.hasNewMovieFiles() || isNotValidString(imdbId)) {
            return;
        }

        String xml = getImdbData(getImdbUrl(movie, "/episodes?season=" + movie.getSeason()));

        if (StringUtils.isBlank(xml)) {
            return;
        }

        for (MovieFile file : movie.getMovieFiles()) {

            for (int episode = file.getFirstPart(); episode <= file.getLastPart(); ++episode) {

                int beginIndex = xml.indexOf("<meta itemprop=\"episodeNumber\" content=\"" + episode + "\"/>");
                if (beginIndex == -1) {
                    // episode is not listed on the page
                    continue;
                }

                // the episode section ends at the next "clear" div; if that marker is missing,
                // use the rest of the page instead of failing on an invalid substring range
                int endIndex = xml.indexOf("<div class=\"clear\"", beginIndex);
                String episodeXml = endIndex == -1 ? xml.substring(beginIndex)
                        : xml.substring(beginIndex, endIndex);

                if (OverrideTools.checkOverwriteEpisodeTitle(file, episode, IMDB_PLUGIN_ID)) {
                    String episodeName = HTMLTools.extractTag(episodeXml, "itemprop=\"name\">", HTML_A_END);
                    file.setTitle(episode, episodeName, IMDB_PLUGIN_ID);
                }

                if (OverrideTools.checkOverwriteEpisodePlot(file, episode, IMDB_PLUGIN_ID)) {
                    String plot = HTMLTools.extractTag(episodeXml, "itemprop=\"description\">", HTML_DIV_END);
                    file.setPlot(episode, plot, IMDB_PLUGIN_ID);
                }

                if (OverrideTools.checkOverwriteEpisodeFirstAired(file, episode, IMDB_PLUGIN_ID)) {
                    String firstAired = HTMLTools.extractTag(episodeXml, "<div class=\"airdate\">", "</div>");
                    file.setFirstAired(episode, firstAired, IMDB_PLUGIN_ID);
                }
            }
        }
    }

    /**
     * Get the TV show information from IMDb.
     *
     * Currently this only delegates to {@link #scanTVShowTitles(Movie)}.
     *
     * @param movie the TV show to update
     */
    protected void updateTVShowInfo(Movie movie) {
        scanTVShowTitles(movie);
    }

    /**
     * Scan the NFO text for an IMDb ID and store it on the movie.
     *
     * The lookup order is: the title/folder based search of searchIMDB, then the first token after a
     * "/tt" occurrence, then the first token after a "/Title?" occurrence (prefixed with "tt").
     *
     * @param nfo the content of the NFO file
     * @param movie the movie to set the ID on
     * @return true if the movie already had an ID or one was found, false otherwise
     */
    @Override
    public boolean scanNFO(String nfo, Movie movie) {
        // If we already have the ID, skip the scanning of the NFO file
        if (StringTools.isValidString(movie.getId(IMDB_PLUGIN_ID))) {
            return true;
        }

        LOG.debug("Scanning NFO for Imdb Id");
        String id = searchIMDB(nfo, movie);

        if (!isValidString(id)) {
            // fall back to looking for URL fragments in the NFO
            id = null;
            int ttIndex = nfo.indexOf("/tt");
            if (ttIndex != -1) {
                id = new StringTokenizer(nfo.substring(ttIndex + 1), "/ \n,:!&\"'(--_)=$").nextToken();
            } else {
                int titleIndex = nfo.indexOf("/Title?");
                if (titleIndex != -1 && titleIndex + 7 < nfo.length()) {
                    id = "tt" + new StringTokenizer(nfo.substring(titleIndex + 7), "/ \n,:!&\"'(--_)=$")
                            .nextToken();
                }
            }

            if (id == null) {
                LOG.debug("No IMDb Id found in nfo !");
                return false;
            }
        }

        movie.setId(IMDB_PLUGIN_ID, id);
        LOG.debug("IMDb Id found in nfo: {}", movie.getId(IMDB_PLUGIN_ID));
        return true;
    }

    /**
     * Search for the IMDb ID in the NFO file.
     *
     * The ID is looked for after, in this order: the movie title, the parent folder name, the title with
     * all non-word characters removed and the folder name with all non-word characters removed. The two
     * stripped variants are matched against an NFO that had the same characters removed.
     *
     * @param nfo the content of the NFO file
     * @param movie the movie providing the title and the file location
     * @return the trimmed IMDb ID, or UNKNOWN if none was found or an error occurred
     */
    private static String searchIMDB(String nfo, Movie movie) {
        // keeps word characters and line breaks, removes everything else
        final String nonWord = "(?is)[^\\w\\r\\n]";
        String id = UNKNOWN;

        try {
            String title = movie.getTitle();
            String match = findImdbIdAfterName(title, nfo);

            String dir = null;
            if (match == null) {
                dir = FileTools.getParentFolderName(movie.getFile());
                match = findImdbIdAfterName(dir, nfo);
            }

            if (match == null) {
                String strippedNfo = nfo.replaceAll(nonWord, "");
                // strip the raw names, not regex-quoted ones, otherwise the quote markers leak into the name
                match = findImdbIdAfterName(title == null ? null : title.replaceAll(nonWord, ""), strippedNfo);
                if (match == null) {
                    match = findImdbIdAfterName(dir == null ? null : dir.replaceAll(nonWord, ""), strippedNfo);
                }
            }

            if (match != null) {
                id = match;
            }
        } catch (Exception error) {
            LOG.error("Error locating the IMDb ID in the nfo file for {}", movie.getBaseFilename(), error);
        }

        return StringUtils.trim(id);
    }

    /**
     * Find the first IMDb title ID that follows the given name in the text.
     *
     * The name is matched literally (Issue 1912: special regex characters are escaped), ignoring case,
     * and must be followed by a non-word character. IDs of seven or more digits are accepted.
     *
     * @param name the literal text that must precede the ID, may be null
     * @param text the text to search
     * @return the ID, or null if the name is null or no ID follows it
     */
    private static String findImdbIdAfterName(String name, String text) {
        if (name == null) {
            return null;
        }
        Pattern pattern = Pattern.compile("(" + Pattern.quote(name) + ")[\\W].*?(tt\\d{7,})",
                Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group(2) : null;
    }

    /**
     * Remove the "see more", "full summary" or "more" link text from the end of a string.
     *
     * The text is cut at the first occurrence of "see more" or "full summary" (ignoring case). If neither
     * is present, a trailing "more" is removed, but only when it is a word of its own, so that words which
     * merely end in "more" are left intact.
     *
     * @param uncleanString the text to clean, must not be null
     * @return the cleaned and trimmed text
     */
    protected static String cleanStringEnding(String uncleanString) {
        // cut at whichever marker comes first; this must happen before the plain "more" check,
        // otherwise "... See more" would be reduced to "... See"
        int cut = -1;
        for (String marker : new String[] { "see more", "full summary" }) {
            int pos = indexOfEndingMarker(uncleanString, marker);
            if (pos >= 0 && (cut < 0 || pos < cut)) {
                cut = pos;
            }
        }
        if (cut >= 0) {
            return uncleanString.substring(0, cut).trim();
        }

        // strip a trailing stand-alone "more"
        int stemLength = uncleanString.length() - 4;
        if (stemLength > 0 && uncleanString.endsWith("more")
                && !Character.isLetterOrDigit(uncleanString.charAt(stemLength - 1))) {
            return uncleanString.substring(0, stemLength).trim();
        }

        return uncleanString.trim();
    }

    /**
     * Locale-independent, case-insensitive indexOf.
     *
     * @param text the text to search
     * @param marker the text to look for
     * @return the index of the first occurrence, or -1 if there is none
     */
    private static int indexOfEndingMarker(String text, String marker) {
        int last = text.length() - marker.length();
        for (int i = 0; i <= last; i++) {
            if (text.regionMatches(true, i, marker, 0, marker.length())) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Scan IMDb for information about a person.
     *
     * If the person has no valid IMDb ID yet, one is looked up by name, using the IMDb ID of one of the
     * person's movies when available, and stored on the person.
     *
     * @param person the person to scan
     * @return the result of the person update, or true if no IMDb ID could be determined
     */
    @Override
    public boolean scan(Person person) {
        String imdbId = person.getId(IMDB_PLUGIN_ID);

        if (isNotValidString(imdbId)) {
            LOG.debug("Looking for IMDB ID for {}", person.getName());

            // use the first movie that carries a valid IMDb ID to narrow the search
            String movieId = UNKNOWN;
            for (Movie candidate : person.getMovies()) {
                movieId = candidate.getId(IMDB_PLUGIN_ID);
                if (isValidString(movieId)) {
                    break;
                }
            }

            imdbId = imdbInfo.getImdbPersonId(person.getName(), movieId);
            person.setId(IMDB_PLUGIN_ID, imdbId);
        }

        if (isValidString(imdbId)) {
            return updateImdbPersonInfo(person);
        }

        LOG.debug("IMDB ID not found for {}", person.getName());
        return true;
    }

    /**
     * Scan the IMDb HTML page for the specified person.
     *
     * An ID without the "nm" prefix is corrected on the person before the page is requested. The person's
     * name is taken from the page title with the " - IMDb" suffix and "IMDb - " prefix removed.
     *
     * @param person the person to update, must have an IMDb ID set
     * @return the result of updateInfo for the retrieved page
     */
    private boolean updateImdbPersonInfo(Person person) {
        String imdbID = person.getId(IMDB_PLUGIN_ID);
        if (!imdbID.startsWith("nm")) {
            imdbID = "nm" + imdbID;
            // Correct the ID if it's wrong; imdbID already carries the prefix at this point
            person.setId(IMDB_PLUGIN_ID, imdbID);
        }

        LOG.info("Getting information for {} ({})", person.getName(), imdbID);
        String xml = getImdbData(getImdbUrl(person));

        // We can work out if this is the new site by looking for " - IMDb" at the end of the title
        String title = HTMLTools.extractTag(xml, "<title>");
        // Locale.ROOT keeps the comparison independent of the default locale (e.g. Turkish dotless i)
        if (title.toLowerCase(Locale.ROOT).endsWith(" - imdb")) {
            title = title.substring(0, title.length() - 7);
        }
        if (title.toLowerCase(Locale.ROOT).startsWith("imdb - ")) {
            title = title.substring(7);
        }
        person.setName(title);

        return updateInfo(person, xml);
    }

    /**
     * Process the new IMDb format web page for a person.
     *
     * Reads alternate names and the photo from the main page, then fetches the bio page for birth/death
     * dates, birth place, birth name, nicknames and biography, then fetches the filmography-by-year page
     * for the number of known movies, and finally processes the filmography from the main page.
     *
     * Most values are located by searching for fixed marker strings and reading at fixed offsets from
     * them, so the statements depend on the exact layout of the pages.
     *
     * @param person the person to update
     * @param xml the HTML of the person's main IMDb page
     * @return always true
     */
    private boolean updateInfo(Person person, String xml) {
        person.setUrl(ImdbPlugin.this.getImdbUrl(person));

        // alternate names are separated by <span>|</span> elements
        if (xml.contains("Alternate Names:")) {
            String name = HTMLTools.extractTag(xml, "Alternate Names:</h4>", HTML_DIV_END);
            if (isValidString(name)) {
                for (String item : name.split("<span>\\|</span>")) {
                    person.addAka(StringUtils.trimToEmpty(item));
                }
            }
        }

        if (xml.contains("id=\"img_primary\"")) {
            LOG.debug("Looking for image on webpage for {}", person.getName());
            String photoURL = HTMLTools.extractTag(xml, "id=\"img_primary\"", HTML_TD_END);

            // only images served from this exact host/path prefix are accepted
            if (photoURL.contains("http://ia.media-imdb.com/images")) {
                photoURL = "http://ia.media-imdb.com/images"
                        + HTMLTools.extractTag(photoURL, "src=\"http://ia.media-imdb.com/images", "\"");
                if (isValidString(photoURL)) {
                    person.setPhotoURL(photoURL);
                    person.setPhotoFilename();
                }
            }
        } else {
            LOG.debug("No image found on webpage for {}", person.getName());
        }

        // get personal information
        String xmlInfo = getImdbData(getImdbUrl(person, SUFFIX_BIO));

        // collects the birth date and, after a "/", the death date
        StringBuilder date = new StringBuilder();
        int endIndex;
        int beginIndex = xmlInfo.indexOf(">Date of Birth</td>");

        if (beginIndex > -1) {
            // birth values are only accepted when they appear before the death section (if there is one)
            endIndex = xmlInfo.indexOf(">Date of Death</td>");
            beginIndex = xmlInfo.indexOf("birth_monthday=", beginIndex);
            if (beginIndex > -1 && (endIndex == -1 || beginIndex < endIndex)) {
                // the 5 characters following "birth_monthday=" (15 characters long) are matched;
                // NOTE(review): the groups are appended as group(2)-group(1) - confirm their meaning
                // against PATTERN_DOB
                Matcher m = PATTERN_DOB.matcher(xmlInfo.substring(beginIndex + 15, beginIndex + 20));
                if (m.find()) {
                    date.append(m.group(2)).append("-").append(m.group(1));
                }
            }

            // if the previous search failed beginIndex is -1 and this search starts from the beginning
            beginIndex = xmlInfo.indexOf("birth_year=", beginIndex);
            if (beginIndex > -1 && (endIndex == -1 || beginIndex < endIndex)) {
                if (date.length() > 0) {
                    date.append("-");
                }
                // the 4 characters following "birth_year=" (11 characters long)
                date.append(xmlInfo.substring(beginIndex + 11, beginIndex + 15));
            }

            beginIndex = xmlInfo.indexOf("birth_place=", beginIndex);
            String place;
            if (beginIndex > -1) {
                // note: the tag is extracted from the start of the page, not from beginIndex
                place = HTMLTools.extractTag(xmlInfo, "birth_place=", HTML_A_END);
                // keep only the link text after the closing '>' of the anchor tag
                int start = place.indexOf('>');
                if (start > -1 && start < place.length()) {
                    place = place.substring(start + 1);
                }
                if (isValidString(place)) {
                    person.setBirthPlace(place);
                }
            }
        }

        beginIndex = xmlInfo.indexOf(">Date of Death</td>");
        if (beginIndex > -1) {
            // death values are only accepted when they appear before the mini bio heading (if present)
            endIndex = xmlInfo.indexOf(">Mini Bio (1)</h4>", beginIndex);
            beginIndex = xmlInfo.indexOf("death_monthday=", beginIndex);
            StringBuilder dDate = new StringBuilder();
            if (beginIndex > -1 && (endIndex == -1 || beginIndex < endIndex)) {
                // the 5 characters following "death_monthday=" (15 characters long)
                Matcher m = PATTERN_DOB.matcher(xmlInfo.substring(beginIndex + 15, beginIndex + 20));
                if (m.find()) {
                    dDate.append(m.group(2));
                    dDate.append("-");
                    dDate.append(m.group(1));
                }
            }
            beginIndex = xmlInfo.indexOf("death_date=", beginIndex);
            if (beginIndex > -1 && (endIndex == -1 || beginIndex < endIndex)) {
                if (dDate.length() > 0) {
                    dDate.append("-");
                }
                // the 4 characters following "death_date=" (11 characters long)
                dDate.append(xmlInfo.substring(beginIndex + 11, beginIndex + 15));
            }
            if (dDate.length() > 0) {
                // appended even when no birth date was found, which yields a value starting with "/"
                date.append("/").append(dDate);
            }
        }

        if (StringUtils.isNotBlank(date)) {
            person.setYear(date.toString());
        }

        beginIndex = xmlInfo.indexOf(">Birth Name</td>");
        if (beginIndex > -1) {
            // the marker is 16 characters long; the extra 4 presumably skip the markup that opens the
            // value cell - TODO confirm against the page layout
            beginIndex += 20;
            String name = xmlInfo.substring(beginIndex, xmlInfo.indexOf(HTML_TD_END, beginIndex));
            if (isValidString(name)) {
                person.setBirthName(HTMLTools.decodeHtml(name));
            }
        }

        beginIndex = xmlInfo.indexOf(">Nickname</td>");
        if (beginIndex > -1) {
            // the marker is 14 characters long, the value is read from offset 17
            String name = xmlInfo.substring(beginIndex + 17, xmlInfo.indexOf(HTML_TD_END, beginIndex + 17));
            if (isValidString(name)) {
                person.addAka(name);
            }
        } else {
            beginIndex = xmlInfo.indexOf(">Nicknames</td>");
            if (beginIndex > -1) {
                // the marker is 15 characters long, the value is read from offset 19;
                // multiple nicknames are separated by <br>
                String name = xmlInfo.substring(beginIndex + 19, xmlInfo.indexOf(HTML_TD_END, beginIndex + 19));
                for (String n : name.split("<br>")) {
                    person.addAka(n.trim());
                }
            }
        }

        // the biography is the first group of PATTERN_BIO, cut to the preferred length
        Matcher m = PATTERN_BIO.matcher(xmlInfo);
        if (m.find()) {
            String bio = HTMLTools.stripTags(m.group(1), true);
            if (isValidString(bio)) {
                bio = trimToLength(bio, preferredBiographyLength);
                person.setBiography(bio);
            }
        }

        // get known movies
        xmlInfo = getImdbData(getImdbUrl(person, SUFFIX_FILMOYEAR));
        if (xmlInfo.contains("<div id=\"tn15content\">")) {
            // the number of list items in the content div is used as the known movie count
            int count = HTMLTools.extractTags(xmlInfo, "<div id=\"tn15content\">", HTML_DIV_END, "<li>", "</li>")
                    .size();
            person.setKnownMovies(count);
        }

        // get filmography (from the main page passed in, not from the pages fetched above)
        processFilmography(person, xml);

        // bump the person's version by one
        int version = person.getVersion();
        person.setVersion(++version);
        return Boolean.TRUE;
    }

    /**
     * Process the person's filmography from the source XML.
     *
     * <p>
     * Every job section matched by {@code P_JOB_SELECTION} is located in the
     * source; sections whose job name is not in {@code jobsInclude} are
     * ignored. Within a section at most {@code preferredFilmographyMax} items
     * are examined. Films are collected by their ID, so a film already
     * recorded for an earlier job is not recorded again. The collected films
     * are finally handed to {@code updateFilmography}.
     *
     * <p>
     * If the source does not contain the filmography heading nothing is
     * processed.
     *
     * @param person The person the filmography belongs to
     * @param sourceXml The page source holding the filmography sections
     */
    protected void processFilmography(Person person, String sourceXml) {
        int beginIndex, endIndex;

        if (!sourceXml.contains("<h2>Filmography</h2>")) {
            LOG.info("No filmography found for {}", person.getName());
            return;
        }

        // List of films for the person, keyed (and therefore ordered) by ID
        Map<String, Filmography> filmography = new TreeMap<>();

        Matcher mJobList = P_JOB_SELECTION.matcher(sourceXml);

        // Loop around the jobs
        while (mJobList.find()) {
            // The current job type we are processing
            String currentJob = mJobList.group(1);
            // Save the start of the section
            beginIndex = mJobList.start();
            // Find the end of the section
            endIndex = sourceXml.indexOf("<div id=\"filmo-", beginIndex);
            if (endIndex < 0) {
                // This might be the last section, so search for the end
                endIndex = sourceXml.indexOf("<h2>Related Videos</h2>", beginIndex);
                if (endIndex < 0) {
                    LOG.warn("Failed to locate the end of the job list for {} - '{}'", person.getName(),
                            currentJob);
                    break;
                }
            }

            if (jobsInclude.contains(currentJob)) {
                LOG.trace("Job: '{}' with '{}' credits is required", currentJob, mJobList.group(2));
            } else {
                LOG.trace("Job: '{}' with '{}' credits is NOT required", currentJob, mJobList.group(2));
                // Skip this job
                continue;
            }

            String videoList = sourceXml.substring(beginIndex, endIndex);
            Matcher mJobs = P_JOB_ITEMS.matcher(videoList);

            Matcher mJob;
            // Counts every item examined in this section, including skipped ones
            int count = 1;
            while (mJobs.find() && count <= preferredFilmographyMax) {
                String jobItem = mJobs.group(1).trim();
                LOG.trace("{} #{}: {}", currentJob, count, jobItem);
                count++;

                if (checkSkips(jobItem)) {
                    continue;
                }

                String title;
                String id;

                /*
                 * Generic stuff to all jobs
                 */
                mJob = P_JOB_ID_TITLE.matcher(jobItem);
                if (mJob.find()) {
                    id = mJob.group(1);
                    title = mJob.group(2);
                    // Strip out anything after the (
                    if (title.contains("(")) {
                        title = title.substring(0, title.indexOf('('));
                    }

                    // Remove any HTML tags
                    title = HTMLTools.stripTags(title, true);
                } else {
                    LOG.warn("No ID and Title found");
                    continue;
                }

                // Create the filmography
                Filmography film = filmography.get(id);
                if (film == null) {
                    film = new Filmography();
                    film.setId(id);
                    film.setName(title);
                    film.setJob(currentJob);
                    film.setUrl(getImdbUrl(id, IMDB_TITLE, null));
                    filmography.put(id, film);
                } else {
                    LOG.debug("Film '{}' already exists for {} as '{}', skipping '{}'", film.getTitle(),
                            person.getName(), film.getJob(), currentJob);
                    continue;
                }

                // YEAR
                mJob = P_JOB_YEAR.matcher(jobItem);
                if (mJob.find()) {
                    film.setYear(mJob.group(1));
                } else {
                    LOG.debug("No year found for {} in '{}'", person.getName(), title);
                }

                /*
                 * Specific job processing
                 *
                 * Locale.ROOT keeps the lower-casing independent of the
                 * default locale, so the case labels match on every platform.
                 */
                switch (currentJob.toLowerCase(Locale.ROOT)) {
                case "actor":
                case "actress":
                    processActorItem(film, jobItem);
                    break;
                case "producer":
                    film.setJob(Filmography.JOB_PRODUCER);
                    break;
                case "writer":
                    film.setJob(Filmography.JOB_WRITER);
                    break;
                case "director":
                    film.setJob(Filmography.JOB_DIRECTOR);
                    break;
                default:
                    film.setJob(currentJob);
                    break;
                }
                film.setDepartment();
            }
        }

        // Add the information about the film
        updateFilmography(person, filmography);
    }

    /**
     * Add the collected films to the person, first trying to fill in the
     * character for acting credits that do not have one.
     *
     * <p>
     * For a film in the actors department without a valid character, the
     * film's full credits page is fetched and the character cell of the row
     * linking to the person is used. At most {@code preferredFilmographyMax}
     * films are added.
     *
     * @param person Person to check
     * @param filmography The filmography to scan
     */
    private void updateFilmography(Person person, Map<String, Filmography> filmography) {
        int added = 0;
        for (Filmography film : filmography.values()) {
            if (added >= preferredFilmographyMax) {
                break;
            }

            LOG.trace("Updating '{}' {}: {} - {}", film.getTitle(), film.getDepartment(), film.getJob(),
                    film.getCharacter());

            boolean characterMissing = Filmography.DEPT_ACTORS.equals(film.getDepartment())
                    && isNotValidString(film.getCharacter());
            if (characterMissing) {
                String creditsPage = getImdbData(getImdbUrl(film.getId(), IMDB_TITLE, SUFFIX_FULLCREDITS));

                // NOTE(review): a blank credits page means the film is neither
                // added to the person nor counted - confirm this is intended
                if (StringUtils.isBlank(creditsPage)) {
                    continue;
                }

                int castStart = creditsPage.indexOf("(in credits order)");
                if (castStart < 0) {
                    // Fall back to the cast anchor
                    castStart = creditsPage.indexOf("name=\"cast\" id=\"cast\"");
                }

                String character = Movie.UNKNOWN;
                if (castStart > -1) {
                    // The cast list runs up to the producers, or to the end of the page
                    int castEnd = creditsPage.indexOf(">Produced by", castStart);
                    if (castEnd < 0) {
                        castEnd = creditsPage.length();
                    }

                    String castSection = creditsPage.substring(castStart, castEnd);
                    String personRow = HTMLTools.extractTag(castSection, "<a href=\"/name/" + person.getId(),
                            "</tr>");
                    String characterCell = HTMLTools.extractTag(personRow, "<td class=\"character\">", "</td>");
                    character = HTMLTools.stripTags(characterCell);

                    // Drop anything from the first bracket onwards
                    int bracketPos = character.indexOf('(');
                    if (bracketPos > -1) {
                        character = character.substring(0, bracketPos);
                    }
                }

                if (isValidString(character)) {
                    LOG.trace("Found character '{}' for {}", character, person.getName());
                    film.setCharacter(character);
                }
            }

            person.addFilm(film);
            added++;
        }
    }

    /**
     * Decide whether a filmography item should be ignored, based on the
     * {@code skipTV}, {@code skipVG} and {@code skipV} settings.
     *
     * @param jobItem The source of the single filmography item
     * @return True if the item should be skipped
     */
    private boolean checkSkips(final String jobItem) {
        if (skipTV) {
            // NOTE(review): the bare "Video" marker also matches "(Video)" and
            // "(Video Game)" items - confirm this is intended
            boolean looksLikeTv = jobItem.contains("filmo-episodes")
                    || jobItem.contains("TV Series")
                    || jobItem.contains("Video")
                    || jobItem.contains("TV Special");
            if (looksLikeTv) {
                LOG.trace("Skipping because it's a TV Show");
                return true;
            }
        }

        if (skipVG) {
            if (jobItem.contains("(Video Game)")) {
                LOG.trace("Skipping because it's a video game");
                return true;
            }
        }

        if (skipV) {
            if (jobItem.contains("(Video)")) {
                LOG.trace("Skipping because it's a video");
                return true;
            }
        }

        return false;
    }

    /**
     * Find the character from the XML item string.
     *
     * <p>
     * The character link ({@code /character/}) is used when the item has one;
     * otherwise the text between the last {@code <br/>} and the last
     * {@code </div>} is taken. Bracketed text is removed from the result.
     *
     * @param item The source of a single filmography item
     * @return The character name, or {@code UNKNOWN} if none could be found
     */
    private static String getCharacter(final String item) {
        LOG.trace("Looking for character in '{}'", item);

        final String charBegin = "<a href=\"/character/";
        final String charEnd = "</a>";

        String character = UNKNOWN;
        int beginIndex = item.indexOf(charBegin);
        if (beginIndex > -1) {
            int endIndex = item.indexOf(charEnd, beginIndex);
            // No closing tag: take the rest of the item
            endIndex = endIndex < 0 ? item.length() : endIndex + charEnd.length();

            character = HTMLTools.stripTags(item.substring(beginIndex, endIndex), true);

            // Remove any text in brackets
            endIndex = character.indexOf('(');
            if (endIndex > -1) {
                character = StringUtils.trimToEmpty(character.substring(0, endIndex));
            }
        } else {
            // Try an alternative method to get the character
            // It's usually at the end of the string between <br/> and </div>
            beginIndex = item.lastIndexOf("<br/>");
            int endIndex = item.lastIndexOf("</div>");
            // Both markers must be present and in order; without the
            // beginIndex check a missing <br/> would give substring(-1, ...)
            if (beginIndex > -1 && endIndex > beginIndex) {
                character = HTMLTools.stripTags(item.substring(beginIndex, endIndex), true);
                // Remove anything in () and the whitespace that leaves behind
                character = StringUtils.trimToEmpty(character.replaceAll("\\([^\\)]*\\)", ""));
            }
        }

        final String result = StringTools.isValidString(character) ? character : UNKNOWN;
        LOG.trace("Returning character: '{}'", result);
        return result;
    }

    /**
     * Apply the acting-specific details of a job item to a film: the
     * character extracted from the item, and the actor job.
     *
     * @param film The Filmography to add the information to
     * @param jobItem The source XML to process
     */
    private static void processActorItem(Filmography film, final String jobItem) {
        final String character = getCharacter(jobItem);
        film.setCharacter(character);
        film.setJob(Filmography.JOB_ACTOR);
    }

    /**
     * Create a map of the AKA values.
     *
     * <p>
     * The list is read as consecutive (key, value) pairs and the pairs are
     * stored in list order. A trailing element without a partner is ignored,
     * and a later pair with the same key replaces the earlier value.
     *
     * @param list Alternating keys and values; may be {@code null}
     * @return The map of pairs, empty if the list is {@code null} or holds
     * fewer than two elements
     */
    private static Map<String, String> buildAkaMap(List<String> list) {
        Map<String, String> map = new LinkedHashMap<>();
        if (list == null) {
            return map;
        }

        // Stop before a dangling last key rather than relying on an exception
        for (int i = 0; i + 1 < list.size(); i += 2) {
            map.put(list.get(i), list.get(i + 1));
        }
        return map;
    }

    /**
     * Request a page through {@code httpClient}, using the charset supplied
     * by {@code imdbInfo}.
     *
     * @param url The address of the page to request
     * @return The response of the request, or an empty string if the request
     * failed with an {@link IOException}
     */
    private String getImdbData(String url) {
        try {
            return httpClient.request(url, imdbInfo.getCharset());
        } catch (IOException ex) {
            // Report the failure and carry on with no data
            LOG.warn("Failed to get web page ({}) from IMDB: {}", url, ex.getMessage(), ex);
            return StringUtils.EMPTY;
        }
    }

    /**
     * Build the IMDb URL for an item without any type suffix.
     *
     * @param item An identifiable object to get the ID from
     * @return The URL for the item
     */
    protected String getImdbUrl(Identifiable item) {
        final String noSuffix = null;
        return getImdbUrl(item, noSuffix);
    }

    /**
     * Build the IMDb URL for an item, choosing the URL type from the kind of
     * item: the name type for a {@code Person}, the title type for anything
     * else.
     *
     * @param item An identifiable object to get the ID from
     * @param typeSuffix The suffix, optional
     * @return The URL for the item
     */
    protected String getImdbUrl(Identifiable item, String typeSuffix) {
        final String type = (item instanceof Person) ? IMDB_NAME : IMDB_TITLE;
        final String id = item.getId(IMDB_PLUGIN_ID);
        return getImdbUrl(id, type, typeSuffix);
    }

    /**
     * Build the IMDB URL for the ID.
     *
     * <p>
     * The URL is the site from {@code imdbInfo}, followed by the type, the ID
     * and finally the suffix, or a single "/" when the suffix is blank.
     *
     * @param id The ID, either person or movie
     * @param type The URL type to get; a single leading "/" is dropped
     * @param typeSuffix The suffix, optional
     * @return The assembled URL
     */
    protected String getImdbUrl(String id, String type, String typeSuffix) {
        StringBuilder url = new StringBuilder().append(imdbInfo.getImdbSite());

        // The type is appended without its leading slash, if it has one
        url.append(type.startsWith("/") ? type.substring(1) : type);
        url.append(id);
        // A blank suffix simply closes the path
        url.append(StringUtils.isBlank(typeSuffix) ? "/" : typeSuffix);

        return url.toString();
    }
}