imdbplugin.ImdbParser.java Source code

Java tutorial

Introduction

Here is the source code for imdbplugin.ImdbParser.java

Source

/*
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */
package imdbplugin;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.commons.lang.StringUtils;

import util.io.IOUtilities;
import util.ui.Localizer;
import util.ui.progress.ProgressInputStream;
import devplugin.ProgressMonitor;

public class ImdbParser {
    private static final Localizer mLocalizer = Localizer.getLocalizerFor(ImdbParser.class);

    private static final java.util.logging.Logger mLog = java.util.logging.Logger
            .getLogger(ImdbParser.class.getName());

    private static final Pattern EPISODE_PATTERN = Pattern.compile("^(.*?)(?:\\W\\(\\#.*\\))?$");
    private static final Pattern MOVIE_PREFIX_PATTERN = Pattern.compile("(.*), ([A-Z][a-z']{0,2})");

    private static final int STEPS_TO_REPORT_PROGRESS = 1000;

    private ImdbDatabase mDatabase;
    private String mServer;
    private boolean mRunParser = true;

    public ImdbParser(final ImdbDatabase db, final String server) {
        mDatabase = db;
        mServer = server;
    }

    public void startParsing(final ProgressMonitor monitor) throws IOException {
        int ratingCount = 0;
        mDatabase.deleteDatabase();

        monitor.setMaximum(getFileSize(mServer));

        mRunParser = true;

        BufferedInputStream akaFile = downloadFile(monitor, "aka-titles.list.gz");
        BufferedInputStream ratingsFile = downloadFile(monitor, "ratings.list.gz");

        ProgressInputStream progressInputStream = new ProgressInputStream(akaFile, monitor);

        if (mRunParser) {
            parseAkaTitles(new GZIPInputStream(progressInputStream), monitor);
        }
        mDatabase.close();
        if (mRunParser) {
            mDatabase.openForWriting();
            progressInputStream = new ProgressInputStream(ratingsFile, monitor,
                    progressInputStream.getCurrentPosition());
            ratingCount = parseRatings(new GZIPInputStream(progressInputStream), monitor);
        }

        if (mRunParser) {
            optimizeDatabase(monitor);
            ImdbPlugin.getInstance().setCurrentDatabaseVersion(ratingCount);
        } else {
            // Cancel was pressed, all Files have to be deleted
            mDatabase.deleteDatabase();
        }
        mDatabase.openForReading();
    }

    private BufferedInputStream downloadFile(final ProgressMonitor monitor, final String fileName)
            throws MalformedURLException, IOException, FileNotFoundException {
        monitor.setMessage(mLocalizer.msg("download", "Downloading {0}", fileName));
        final URL url = new URL(mServer + fileName);
        final File tempFile = File.createTempFile("imdb", null);
        IOUtilities.download(url, tempFile);
        final BufferedInputStream stream = new BufferedInputStream(new FileInputStream(tempFile));
        tempFile.deleteOnExit();
        return stream;
    }

    private void optimizeDatabase(final ProgressMonitor monitor) throws IOException {
        monitor.setMessage(mLocalizer.msg("optimize", "Optimizing database"));
        mDatabase.close();
        mDatabase.openForWriting();
        mDatabase.optimizeIndex();
    }

    private int getFileSize(final String server) {
        final String[] fileNames = new String[] { "aka-titles", "ratings" };
        final Integer[] fileSizes = new Integer[] { 5950067, 5067908 };

        try {

            int size = 0;
            final String filesizes = new String(IOUtilities.loadFileFromHttpServer(new URL(server + "filesizes")));

            for (int i = 0; i < fileNames.length; i++) {
                final Matcher m = Pattern.compile("^" + fileNames[i] + "\\.list\\W(\\d*)$", Pattern.MULTILINE)
                        .matcher(filesizes);

                if (m.find()) {
                    size += Integer.parseInt(m.group(1));
                } else {
                    size += fileSizes[i];
                }
            }
            return size;
        } catch (IOException e) {
            e.printStackTrace();
        }

        // if an exception occurred, return the default sizes
        int size = 0;
        for (int i = 0; i < fileSizes.length; i++) {
            size += fileSizes[i];
        }
        return size;
    }

    public void stopParsing() {
        mRunParser = false;
    }

    private void parseAkaTitles(final InputStream inputStream, final ProgressMonitor monitor) throws IOException {
        final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "ISO-8859-15"));
        String line = reader.readLine();

        final Pattern moviePattern = Pattern
                .compile("^(.*)\\((?:(\\d{4,4}).*|\\?\\?\\?\\?)\\)(?:\\W*\\((.*)\\))?(?:\\W\\{(.*)\\})?.*$");
        final Pattern akaPattern = Pattern.compile(
                "^\\(aka (.*) \\((?:(\\d{4,4}).*|\\?\\?\\?\\?)\\)(?:\\W\\((.*)\\))?(?:\\W\\{(.*)\\})?\\).*$");

        String movieId = null;
        boolean startFound = false;
        int count = 0;
        while (line != null && mRunParser) {
            line = line.trim();
            if (!startFound && line.startsWith("==========")) {
                startFound = true;
            } else if (startFound) {

                if (line.length() > 0) {
                    if (line.startsWith("(aka ")) {
                        if (movieId != null) {
                            final Matcher matcher = akaPattern.matcher(line);
                            if (matcher.matches()) {
                                final String title = cleanMovieTitle(matcher.group(1).trim());
                                int year = -1;
                                if (matcher.group(2) != null) {
                                    year = Integer.parseInt(matcher.group(2));
                                }
                                final String episode = cleanEpisodeTitle(matcher.group(4));

                                mDatabase.addAkaTitle(movieId, title, episode, year);
                                if (++count % STEPS_TO_REPORT_PROGRESS == 0 || count == 1) {
                                    monitor.setMessage(mLocalizer.msg("akaTitles", "Alternative title {0}", count));
                                }
                            }
                        } else {
                            mLog.severe("Parse error: movieId unknown for alternative title");
                        }
                    } else {
                        final Matcher matcher = moviePattern.matcher(line);
                        if (matcher.matches()) {
                            final String movieTitle = cleanMovieTitle(matcher.group(1).trim());
                            int year = -1;
                            if (matcher.group(2) != null) {
                                year = Integer.parseInt(matcher.group(2));
                            }
                            final String episode = cleanEpisodeTitle(matcher.group(4));
                            movieId = mDatabase.getOrCreateMovieId(movieTitle, episode, year);
                        }
                    }
                } else {
                    // blank line, prepare for new movie
                    movieId = null;
                }

            }

            line = reader.readLine();
        }
        reader.close();
    }

    private int parseRatings(final InputStream inputStream, final ProgressMonitor monitor) throws IOException {
        final Pattern ratingPattern = Pattern
                .compile("^(.*)(?:\\W\\((\\d{4,4}|\\?\\?\\?\\?).*?\\))(?:\\W\\((.*)\\))?(?:\\W\\{(.*)\\})?$");
        final ImdbHistogram histogram = new ImdbHistogram();

        final BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "ISO-8859-15"));
        String line = reader.readLine();

        boolean startFound = false;
        int count = 0;
        while (line != null && mRunParser) {
            line = line.trim();
            if (!startFound && line.startsWith("MOVIE RATINGS REPORT")) {
                startFound = true;
            } else if (startFound && line.startsWith("-------------")) {
                startFound = false;
            } else if (startFound && line.startsWith("New  Distribution  Votes  Rank  Title")) {
                // Ignore this line!
            } else if (startFound && line.length() > 0) {
                final String distribution = line.substring(0, 10);
                final int votes = Integer.parseInt(line.substring(11, 19).trim());
                final String ratingStr = line.substring(20, 25).trim();
                final int rating = Integer.parseInt(ratingStr.replaceAll("\\.", ""));
                final String title = line.substring(25).trim();

                final Matcher matcher = ratingPattern.matcher(title);
                if (matcher.matches()) {
                    final String movieTitle = cleanMovieTitle(matcher.group(1).trim());
                    int year = -1;
                    String yearString = matcher.group(2);
                    if (yearString != null) {
                        if (!yearString.equals("????")) {
                            try {
                                year = Integer.parseInt(yearString);
                            } catch (NumberFormatException e) {
                                mLog.warning("unexpected year: " + yearString);
                            }
                        }
                    } else {
                        mLog.warning("unexpected null year");
                    }
                    final String episode = cleanEpisodeTitle(matcher.group(4));

                    mDatabase.addRating(mDatabase.getOrCreateMovieId(movieTitle, episode, year), rating, votes,
                            distribution);

                    if (StringUtils.isEmpty(episode)) {
                        histogram.addRating(rating, votes);
                    }
                    if (++count % STEPS_TO_REPORT_PROGRESS == 0 || count == 1) {
                        monitor.setMessage(mLocalizer.msg("ratings", "Rating {0}", count));
                    }
                } else {
                    mLog.warning("Non matching line: " + line);
                }
            }

            line = reader.readLine();
        }

        reader.close();
        ImdbPlugin.getInstance().storeHistogram(histogram);
        return count;
    }

    private String cleanEpisodeTitle(final String episode) {
        if (episode == null) {
            return "";
        }

        final Matcher m = EPISODE_PATTERN.matcher(episode);
        m.find();

        return m.group(1);
    }

    private String cleanMovieTitle(String movieTitle) {
        if (movieTitle.startsWith("\"") && movieTitle.endsWith("\"")) {
            movieTitle = movieTitle.substring(1, movieTitle.length() - 1);
        }

        final Matcher matcher = MOVIE_PREFIX_PATTERN.matcher(movieTitle);
        if (matcher.matches()) {
            movieTitle = matcher.group(2) + " " + matcher.group(1);
        }

        return movieTitle;
    }
}