norbert.mynemo.dataimport.FileImporter.java Source code

Java tutorial

Introduction

Here is the source code for norbert.mynemo.dataimport.FileImporter.java

Source

/*
 * Copyright 2015 Norbert
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package norbert.mynemo.dataimport;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import java.io.File;
import java.io.IOException;
import java.util.Collection;

import norbert.mynemo.dataimport.fileformat.MynemoRating;
import norbert.mynemo.dataimport.fileformat.input.CkRatingImporter;
import norbert.mynemo.dataimport.fileformat.input.MovieLensRatingImporter;
import norbert.mynemo.dataimport.fileformat.input.MynemoRatingImporter;
import norbert.mynemo.dataimport.fileformat.input.RatingImporter;
import norbert.mynemo.dataimport.fileformat.input.TenMillionRatingImporter;
import norbert.mynemo.dataimport.fileformat.output.DuplicateRemover;
import norbert.mynemo.dataimport.fileformat.output.MaxNeighborUserFilter;
import norbert.mynemo.dataimport.fileformat.output.MaxUserFilter;
import norbert.mynemo.dataimport.fileformat.output.MinCommonRatingFilter;
import norbert.mynemo.dataimport.fileformat.output.MinRatingByMovieFilter;
import norbert.mynemo.dataimport.fileformat.output.RatingFileWriter;
import norbert.mynemo.dataimport.fileformat.output.RatingWriter;
import norbert.mynemo.dataimport.fileformat.output.ScaleValueWriter;
import norbert.mynemo.dataimport.fileformat.output.UnivalueRemover;
import norbert.mynemo.dataimport.fileformat.output.UserSimilarityType;

import com.google.common.base.Optional;

/**
 * This importer can merge, convert and filter ratings from files. It produces one output file from
 * several input files. The format of the output file is the Mynemo file format. The format of an
 * input file is automatically detected. The ratings can be filtered.
 *
 * <p>
 * This is a static utility class, it cannot be instantiated.
 */
public class FileImporter {

    /** User id given to the imported ratings that have no user, when the caller provides none. */
    private static final String DEFAULT_USER_ID = Integer.toString(Integer.MAX_VALUE);

    /** Error message used when a requested filter needs a target user but none is given. */
    private static final String MISSING_USER_MESSAGE = "The user parameter is missing.";

    /**
     * Converts the given rating files to the Mynemo format. Handles the files generated from
     * MovieLens, and the 10 million ratings from a MovieLens data set. If the latter is amongst the
     * rating files, a movie file must be provided. Handles also the files generated by the scraping
     * of CK. CK rating files need a mapping file to be imported.
     *
     * <p>
     * The mapping files must contain the equivalence between MovieLens ids and IMDb ids, or the
     * equivalence between CK identifiers and IMDb ids. The files are automatically recognized.
     *
     * <p>
     * If an input file contains ratings without user id, the given <code>user</code> is used. If
     * <code>user</code> is absent, then a default user id is used. No check is done to find if the id
     * already exists.
     *
     * <p>
     * The output file must not exist. At least one existing rating file must be provided. All the
     * arguments are validated before the output file is created, and the writer is closed even if
     * the import fails.
     *
     * @param outputFilepath the file where the imported ratings are written
     * @param inputFilepaths the rating files to convert
     * @param movieFilepath the files containing the equivalences between ids
     * @param user the user id used for the input ratings without user, it is also the target user
     *        handed to the filters that need one
     * @param maxUsers maximum number of users, the output file will contain the ratings of at most
     *        this number of users
     * @param minRatingsByMovie minimum ratings by movie, the output file won't contain movies that
     *        have less than this number of ratings
     * @param minCommonRatings threshold handed to the {@link MinCommonRatingFilter} together with
     *        the target user; when present, <code>user</code> must be present too
     * @param similarityType type of similarity used to find the nearest users of the target user;
     *        when present together with <code>maxUsers</code>, <code>user</code> must be present too
     * @throws IllegalArgumentException if no input file is given, if the output file exists, if an
     *         input or movie file does not exist, or if a requested filter needs a user that is
     *         absent
     * @throws UnsupportedOperationException if the format of an input file is not recognized
     * @throws IOException propagated from the importers and the writers
     */
    public static void convert(String outputFilepath, Collection<String> inputFilepaths,
            Collection<String> movieFilepath, Optional<String> user, Optional<Integer> maxUsers,
            Optional<Integer> minRatingsByMovie, Optional<Integer> minCommonRatings,
            Optional<UserSimilarityType> similarityType) throws IOException {
        checkNotNull(outputFilepath);
        checkNotNull(inputFilepaths);
        checkNotNull(movieFilepath);
        checkArgument(!inputFilepaths.isEmpty(), "At least one input file must be given.");
        checkArgument(!new File(outputFilepath).exists(), "The output file must not exist.");
        for (String filepath : inputFilepaths) {
            checkArgument(new File(filepath).exists(), "The input file must exist.");
        }
        for (String filepath : movieFilepath) {
            checkArgument(new File(filepath).exists(), "The movie file must exist.");
        }

        // The filter parameters are checked here, before the output file is created, so an invalid
        // call leaves neither an output file nor an unclosed writer behind.
        checkNotNull(maxUsers);
        checkNotNull(minRatingsByMovie);
        checkNotNull(minCommonRatings);
        boolean targetUserNeeded = minCommonRatings.isPresent()
                || (maxUsers.isPresent() && similarityType.isPresent());
        checkArgument(!targetUserNeeded || user.isPresent(), MISSING_USER_MESSAGE);

        RatingWriter writer = createFilters(new RatingFileWriter(outputFilepath), maxUsers, minRatingsByMovie,
                similarityType, minCommonRatings, user);

        try {
            for (String ratingFilepath : inputFilepaths) {
                RatingImporter importableFile = getFile(ratingFilepath, movieFilepath, user);
                for (MynemoRating rating : importableFile) {
                    writer.write(rating);
                }
            }
        } finally {
            // the writer must be released even if an importer or a write fails
            writer.close();
        }
    }

    /**
     * Interposes the necessary filters before the last writer, according to the given parameters.
     * Each created writer wraps the previous one, the returned writer is the outermost wrapper.
     *
     * @param lastWriter the writer wrapped by all the created filters
     * @param maxUsers if present, a user filter is added: a {@link MaxNeighborUserFilter} when
     *        <code>similarityType</code> is also present, a {@link MaxUserFilter} otherwise
     * @param minRatingsByMovie if present, a {@link MinRatingByMovieFilter} is added
     * @param similarityType selects the kind of user filter, see <code>maxUsers</code>
     * @param minCommonRatings if present, a {@link MinCommonRatingFilter} is added
     * @param targetUser the user handed to the filters that need one
     * @return the writer where the ratings must be written
     * @throws IllegalArgumentException if a requested filter needs the target user but it is absent
     */
    private static RatingWriter createFilters(RatingWriter lastWriter, Optional<Integer> maxUsers,
            Optional<Integer> minRatingsByMovie, Optional<UserSimilarityType> similarityType,
            Optional<Integer> minCommonRatings, Optional<String> targetUser) {

        RatingWriter nextWriter = lastWriter;

        if (minRatingsByMovie.isPresent()) {
            nextWriter = new MinRatingByMovieFilter(nextWriter, minRatingsByMovie.get());
        }
        if (maxUsers.isPresent()) {
            if (similarityType.isPresent()) {
                // same check as for the common rating filter: report an argument error instead of
                // letting Optional.get() fail on an absent user
                checkArgument(targetUser.isPresent(), MISSING_USER_MESSAGE);
                nextWriter = new MaxNeighborUserFilter(nextWriter, targetUser.get(), maxUsers.get(),
                        similarityType.get());
            } else {
                nextWriter = new MaxUserFilter(nextWriter, maxUsers.get());
            }
        }
        if (minCommonRatings.isPresent()) {
            checkArgument(targetUser.isPresent(), MISSING_USER_MESSAGE);
            nextWriter = new MinCommonRatingFilter(nextWriter, targetUser.get(), minCommonRatings.get());
        }
        // these three writers are always added, whatever the parameters are
        nextWriter = new ScaleValueWriter(nextWriter);
        nextWriter = new DuplicateRemover(nextWriter);
        nextWriter = new UnivalueRemover(nextWriter);

        return nextWriter;
    }

    /**
     * Returns a rating file that can parse the ratings contained in the given file. The importers
     * that need a mapping file are tried first, with each given mapping file in turn. Then the
     * MovieLens importer is tried, and finally the Mynemo importer.
     *
     * @param ratingFilepath the rating file to parse
     * @param mappingFilepaths the candidate mapping files for the importers that need one
     * @param user the user id handed to the MovieLens importer, the default user id is used if it is
     *        absent
     * @return an importer able to parse the given rating file
     * @throws UnsupportedOperationException if the file cannot be parsed
     * @throws IOException propagated from the importers
     */
    private static RatingImporter getFile(String ratingFilepath, Collection<String> mappingFilepaths,
            Optional<String> user) throws IOException {

        for (String mappingFilepath : mappingFilepaths) {
            if (CkRatingImporter.canImport(ratingFilepath, mappingFilepath)) {
                return new CkRatingImporter(ratingFilepath, mappingFilepath);
            }
            if (TenMillionRatingImporter.canImport(ratingFilepath, mappingFilepath)) {
                return new TenMillionRatingImporter(ratingFilepath, mappingFilepath);
            }
        }

        if (MovieLensRatingImporter.canImport(ratingFilepath)) {
            return new MovieLensRatingImporter(ratingFilepath, user.or(DEFAULT_USER_ID));
        }

        // the Mynemo rating file must stay last because it does not use headers, so the parser can
        // parse several file formats.
        if (MynemoRatingImporter.canImport(ratingFilepath)) {
            return new MynemoRatingImporter(ratingFilepath);
        }

        throw new UnsupportedOperationException("Unable to convert the file \"" + ratingFilepath + "\".");
    }

    /**
     * Instantiates a new object. Private to prevent instantiation.
     */
    private FileImporter() {
        throw new AssertionError();
    }
}