org.yamj.core.service.metadata.online.ImdbSearchEngine.java Source code

Java tutorial

Introduction

Here is the source code for org.yamj.core.service.metadata.online.ImdbSearchEngine.java

Source

/*
 *      Copyright (c) 2004-2015 YAMJ Members
 *      https://github.com/organizations/YAMJ/teams
 *
 *      This file is part of the Yet Another Media Jukebox (YAMJ).
 *
 *      YAMJ is free software: you can redistribute it and/or modify
 *      it under the terms of the GNU General Public License as published by
 *      the Free Software Foundation, either version 3 of the License, or
 *      any later version.
 *
 *      YAMJ is distributed in the hope that it will be useful,
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of
 *      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *      GNU General Public License for more details.
 *
 *      You should have received a copy of the GNU General Public License
 *      along with YAMJ.  If not, see <http://www.gnu.org/licenses/>.
 *
 *      Web: https://github.com/YAMJ/yamj-v3
 *
 */
package org.yamj.core.service.metadata.online;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.PostConstruct;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.yamj.api.common.http.DigestedResponse;
import org.yamj.core.configuration.ConfigService;
import org.yamj.core.tools.web.HTMLTools;
import org.yamj.core.tools.web.PoolingHttpClient;
import org.yamj.core.tools.web.ResponseTools;
import org.yamj.core.tools.web.TemporaryUnavailableException;

@Service("imdbSearchEngine")
public class ImdbSearchEngine {

    private static final Logger LOG = LoggerFactory.getLogger(ImdbSearchEngine.class);
    private static final String OBJECT_MOVIE = "movie";
    private static final String OBJECT_PERSON = "person";
    private static final String CATEGORY_MOVIE = "movie";
    private static final String CATEGORY_TV = "tv";
    private static final String CATEGORY_ALL = "all";
    private static final String SEARCH_FIRST = "first";
    private static final String SEARCH_EXACT = "exact";
    private static final String HTML_SLASH_QUOTE = "/\"";
    private static final Pattern PERSON_REGEX = Pattern
            .compile(Pattern.quote("<link rel=\"canonical\" href=\"http://www.imdb.com/name/(nm\\d+)/\""));
    private static final Pattern TITLE_REGEX = Pattern
            .compile(Pattern.quote("<link rel=\"canonical\" href=\"http://www.imdb.com/title/(tt\\d+)/\""));

    private SearchEngineTools searchEngineTools;

    @Autowired
    private PoolingHttpClient httpClient;
    @Autowired
    private ConfigService configService;

    @PostConstruct
    public void init() {
        LOG.info("Initialize IMDb search engine");

        String country = configService.getProperty("imdb.id.search.country", "us");
        searchEngineTools = new SearchEngineTools(httpClient, country);
    }

    /**
     * Retrieve the IMDb matching the specified movie name and year. This routine is based on a IMDb request.
     *
     * @param title
     * @param year
     * @param isTVShow flag to indicate if the searched movie is a TV show
     * @param throwTempError flag to indicate if error should be thrown if service is temporary not available 
     * @return the IMDb id
     */
    public String getImdbId(String title, int year, boolean isTVShow, boolean throwTempError) {
        return getImdbId(title, year, (isTVShow ? CATEGORY_TV : CATEGORY_MOVIE), throwTempError);
    }

    /**
     * Retrieve the IMDb matching the specified movie name and year. This routine is based on a IMDb request.
     *
     * @param title
     * @param year
     * @param categoryType the type of the category to search within
     * @param throwTempError flag to indicate if error should be thrown if service is temporary not available 
     * @return the IMDb id
     */
    private String getImdbId(String title, int year, String categoryType, boolean throwTempError) {
        String imdbId = getImdbIdFromImdb(title, year, OBJECT_MOVIE, categoryType, throwTempError);
        if (StringUtils.isBlank(imdbId)) {
            // try with search engines
            String imdbUrl;
            if (CATEGORY_TV.equals(categoryType)) {
                // leave out the year
                imdbUrl = searchEngineTools.searchURL(title, -1, "www.imdb.com/title", throwTempError);
            } else {
                imdbUrl = searchEngineTools.searchURL(title, year, "www.imdb.com/title", throwTempError);
            }
            imdbId = getImdbIdFromURL(imdbUrl, categoryType);
        }
        return imdbId;
    }

    /**
     * @param personName
     * @param movieId
     * @return
     */
    public String getImdbPersonId(String personName, String movieId, boolean throwTempError) {
        try {
            if (StringUtils.isNotBlank(movieId)) {
                StringBuilder sb = new StringBuilder("http://www.imdb.com/").append("search/name?name=")
                        .append(URLEncoder.encode(personName, "UTF-8")).append("&role=").append(movieId);

                LOG.debug("Querying IMDB for '{}'", sb.toString());
                DigestedResponse response = httpClient.requestContent(sb.toString());
                if (ResponseTools.isOK(response)) {

                    // Check if this is an exact match (we got a person page instead of a results list)
                    Matcher personMatch = PERSON_REGEX.matcher(response.getContent());
                    if (personMatch.find()) {
                        LOG.debug("IMDb returned one match '{}'", personMatch.group(1));
                        return personMatch.group(1);
                    }

                    String firstPersonId = HTMLTools.extractTag(
                            HTMLTools.extractTag(response.getContent(), "<tr class=\"even detailed\">", "</tr>"),
                            "<a href=\"/name/", HTML_SLASH_QUOTE);
                    if (StringUtils.isNotBlank(firstPersonId)) {
                        return firstPersonId;
                    }
                } else if (throwTempError && ResponseTools.isTemporaryError(response)) {
                    throw new TemporaryUnavailableException(
                            "IMDb service temporary not available: " + response.getStatusCode());
                }
            }

            return getImdbPersonId(personName, throwTempError);
        } catch (IOException ex) {
            LOG.error("Failed retrieving IMDb Id for person '{}': {}", personName, ex.getMessage());
            LOG.trace("IMDb search error", ex);
            return null;
        }
    }

    /**
     * Get the IMDb ID for a person
     *
     * @param personName
     * @return
     */
    public String getImdbPersonId(String personName, boolean throwTempError) {
        String imdbId = getImdbIdFromImdb(personName.toLowerCase(), -1, OBJECT_PERSON, CATEGORY_ALL,
                throwTempError);
        if (StringUtils.isBlank(imdbId)) {
            String imdbUrl = searchEngineTools.searchURL(personName, -1, "www.imdb.com/name", throwTempError);
            imdbId = getImdbIdFromURL(imdbUrl, OBJECT_PERSON);
        }
        return imdbId;
    }

    private static String getImdbIdFromURL(String url, String objectType) {
        if (StringUtils.isBlank(url)) {
            return null;
        }

        String imdbId = StringUtils.EMPTY;
        int beginIndex = url.indexOf(objectType.equals(OBJECT_MOVIE) ? "/title/tt" : "/name/nm");
        if (beginIndex > -1) {
            int index;
            if (objectType.equals(OBJECT_MOVIE)) {
                index = beginIndex + 7;
            } else {
                index = beginIndex + 6;
            }
            StringTokenizer st = new StringTokenizer(url.substring(index), HTML_SLASH_QUOTE);
            imdbId = st.nextToken();
        }

        if (imdbId.startsWith(objectType.equals(OBJECT_MOVIE) ? "tt" : "nm")) {
            LOG.debug("Found IMDb ID '{}'", imdbId);
            return imdbId;
        }
        return null;
    }

    /**
     * Retrieve the IMDb matching the specified movie name and year. This routine is base on a IMDb request.
     */
    private String getImdbIdFromImdb(String title, int year, String objectType, String categoryType,
            boolean throwTempError) {
        String searchMatch = configService.getProperty("imdb.id.search.match", "regular");
        boolean searchVariable = configService.getBooleanProperty("imdb.id.search.variable", Boolean.TRUE);

        StringBuilder sb = new StringBuilder("http://www.imdb.com/");
        sb.append("find?q=");
        try {
            sb.append(URLEncoder.encode(title, "UTF-8"));
        } catch (UnsupportedEncodingException ex) {
            LOG.debug("Failed to encode title '{}'", title);
            sb.append(title);
        }

        if (year > 0) {
            sb.append("+%28").append(year).append("%29");
        }
        sb.append("&s=");
        if (objectType.equals(OBJECT_MOVIE)) {
            sb.append("tt");
            if (searchVariable) {
                if (categoryType.equals(CATEGORY_MOVIE)) {
                    sb.append("&ttype=ft");
                } else if (categoryType.equals(CATEGORY_TV)) {
                    sb.append("&ttype=tv");
                }
            }
        } else {
            sb.append("nm");
        }
        sb.append("&site=aka");

        LOG.debug("Querying IMDb for '{}'", sb.toString());
        String xml;
        try {
            DigestedResponse response = httpClient.requestContent(sb.toString());
            if (throwTempError && ResponseTools.isTemporaryError(response)) {
                throw new TemporaryUnavailableException(
                        "IMDb service temporary not available: " + response.getStatusCode());
            } else if (ResponseTools.isNotOK(response)) {
                LOG.error("Can't find IMDb id due response status {} for '{}'", response.getStatusCode(), title);
                return null;
            }
            xml = response.getContent();
        } catch (IOException ex) {
            LOG.error("Failed retrieving IMDb Id for '{}': {}", title, ex.getMessage());
            LOG.trace("IMDb search error", ex);
            return null;
        }

        // Check if this is an exact match (we got a movie page instead of a results list)
        Pattern titleRegex;
        if (objectType.equals(OBJECT_MOVIE)) {
            titleRegex = TITLE_REGEX;
        } else {
            titleRegex = PERSON_REGEX;
        }

        Matcher titleMatch = titleRegex.matcher(xml);
        if (titleMatch.find()) {
            LOG.debug("IMDb returned one match '{}'", titleMatch.group(1));
            return titleMatch.group(1);
        }

        String searchName = HTMLTools
                .extractTag(HTMLTools.extractTag(xml, ";ttype=ep\">", "\"</a>.</li>"), "<b>", "</b>").toLowerCase();
        final String formattedName;
        final String formattedYear;
        final String formattedExact;

        if (SEARCH_FIRST.equalsIgnoreCase(searchMatch)) {
            // first match so nothing more to check
            formattedName = null;
            formattedYear = null;
            formattedExact = null;
        } else if (StringUtils.isNotBlank(searchName)) {
            if (year > 0 && searchName.endsWith(")") && searchName.contains("(")) {
                searchName = searchName.substring(0, searchName.lastIndexOf('(') - 1);
                formattedName = searchName.toLowerCase();
                formattedYear = "(" + year + ")";
                formattedExact = formattedName + "</a> " + formattedYear;
            } else {
                formattedName = searchName.toLowerCase();
                formattedYear = "</a>";
                formattedExact = formattedName + formattedYear;
            }
        } else {
            sb = new StringBuilder();
            try {
                sb.append(URLEncoder.encode(title, "UTF-8").replace("+", " "));
            } catch (UnsupportedEncodingException ex) {
                LOG.debug("Failed to encode title '{}'", title);
                sb.append(title);
            }
            formattedName = sb.toString().toLowerCase();
            if (year > 0) {
                formattedYear = "(" + year + ")";
                formattedExact = formattedName + "</a> " + formattedYear;
            } else {
                formattedYear = "</a>";
                formattedExact = formattedName + formattedYear;

            }
            searchName = formattedExact;
        }

        for (String searchResult : HTMLTools.extractTags(xml, "<table class=\"findList\">", "</table>",
                "<td class=\"result_text\">", "</td>", false)) {
            boolean foundMatch = false;
            if (SEARCH_FIRST.equalsIgnoreCase(searchMatch)) {
                // first result matches
                foundMatch = true;
            } else if (SEARCH_EXACT.equalsIgnoreCase(searchMatch)) {
                // exact match
                foundMatch = (searchResult.toLowerCase().contains(formattedExact));
            } else {
                // regular match: name and year match independent from each other
                int nameIndex = searchResult.toLowerCase().indexOf(formattedName);
                if (nameIndex != -1) {
                    foundMatch = (searchResult.indexOf(formattedYear) > nameIndex);
                }
            }

            if (foundMatch) {
                return HTMLTools.extractTag(searchResult,
                        "<a href=\"" + (objectType.equals(OBJECT_MOVIE) ? "/title/" : "/name/"), "/");
            }
            for (String otherResult : HTMLTools.extractTags(searchResult, "</';\">", "</p>",
                    "<p class=\"find-aka\">", "</em>", false)) {
                if (otherResult.toLowerCase().contains("\"" + searchName + "\"")) {
                    return HTMLTools.extractTag(searchResult,
                            "/images/b.gif?link=" + (objectType.equals(OBJECT_MOVIE) ? "/title/" : "/name/"),
                            "/';\">");
                }
            }
        }

        // alternate search for person ID
        if (objectType.equals(OBJECT_PERSON)) {
            String firstPersonId = HTMLTools.extractTag(
                    HTMLTools.extractTag(xml, "<table><tr> <td valign=\"top\">", "</td></tr></table>"),
                    "<a href=\"/name/", HTML_SLASH_QUOTE);
            if (StringUtils.isBlank(firstPersonId)) {
                // alternate approach
                int beginIndex = xml.indexOf("<a href=\"/name/nm");
                if (beginIndex > -1) {
                    StringTokenizer st = new StringTokenizer(xml.substring(beginIndex + 15), HTML_SLASH_QUOTE);
                    firstPersonId = st.nextToken();
                }
            }

            if (firstPersonId.startsWith("nm")) {
                LOG.debug("Found IMDb ID '{}'", firstPersonId);
                return firstPersonId;
            }
        }

        LOG.debug("Failed to find a match on IMDb");
        return null;
    }
}