nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook.java Source code

Introduction

Here is the source code for nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package nl.gridline.zieook.runners.cf;

import java.io.IOException;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import nl.gridline.zieook.inx.movielens.RowSimilarityZieOok;
import nl.gridline.zieook.inx.movielens.hbase.RecommendationsImportMap;
import nl.gridline.zieook.inx.movielens.hbase.RecommendationsImportReduce;
import nl.gridline.zieook.mapreduce.TaskConfig;
import nl.gridline.zieook.tasks.ZieOokTask;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
import org.apache.mahout.cf.taste.hadoop.MaybePruneRowsMapper;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper;
import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexMapper;
import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexReducer;
import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob;
import org.apache.mahout.cf.taste.hadoop.item.ToUserVectorReducer;
import org.apache.mahout.cf.taste.hadoop.similarity.item.CountUsersKeyWritable;
import org.apache.mahout.cf.taste.hadoop.similarity.item.CountUsersMapper;
import org.apache.mahout.cf.taste.hadoop.similarity.item.CountUsersReducer;
import org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob;
import org.apache.mahout.cf.taste.hadoop.similarity.item.ToItemVectorsReducer;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.math.VarIntWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.hadoop.similarity.SimilarityType;
import org.mortbay.log.Log;

/**
 * <p>
 * Distributed precomputation of the item-item similarities for item-based collaborative filtering
 * </p>
 * <p>
 * Preferences in the input file should look like {@code userID,itemID[,preferencevalue]}
 * </p>
 * <p>
 * Preference value is optional to accommodate applications that have no notion of a preference value (that is, the user
 * simply expresses a preference for an item, but no degree of preference).
 * </p>
 * <p>
 * The preference value is assumed to be parseable as a {@code double}. The user IDs and item IDs are parsed as
 * {@code long}s.
 * </p>
 * <p>
 * Command line arguments specific to this class are:
 * </p>
 * <ol>
 * <li>-Dmapred.input.dir=(path): Directory containing one or more text files with the preference data</li>
 * <li>--outputtable (table name): HBase table where the computed similarity data is written</li>
 * <li>--similarityClassname (classname): Name of distributed similarity class to instantiate or a predefined similarity
 * from {@link SimilarityType}</li>
 * <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (100)</li>
 * <li>--maxCooccurrencesPerItem (integer): Maximum number of cooccurrences considered per item (100)</li>
 * <li>--booleanData (boolean): Treat input data as having no pref values (false)</li>
 * </ol>
 * <p>
 * General command line options are documented in {@link AbstractJob}.
 * </p>
 * <p>
 * Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other arguments.
 * </p>
 * [purpose]
 * <p />
 * Project zieook-runner<br />
 * ItemSimilarityJobZieook.java created 13 dec. 2011
 * <p />
 * Copyright, all rights reserved 2011 GridLine Amsterdam
 * @author <a href="mailto:job@gridline.nl">Job</a>
 * @version $Revision:$, $Date:$
 */
public class ItemSimilarityJobZieook extends AbstractJob {

    static final String ITEM_ID_INDEX_PATH_STR = ItemSimilarityJob.class.getName() + ".itemIDIndexPathStr";
    static final String MAX_SIMILARITIES_PER_ITEM = ItemSimilarityJob.class.getName() + ".maxSimilarItemsPerItem";

    private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
    private static final int DEFAULT_MAX_COOCCURRENCES_PER_ITEM = 100;
    private static final int DEFAULT_MIN_PREFS_PER_USER = 1;

    private final ZieOokTask task;

    public ItemSimilarityJobZieook(ZieOokTask task) {
        this.task = task;
    }

    @Override
    public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        addInputOption();

        // addOutputOption(); // no output path, we use a table!
        addOption("outputtable", "ot", "Output table name");

        addOption("similarityClassname", "s",
                "Name of distributed similarity class to instantiate, alternatively use "
                        + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
        addOption("maxSimilaritiesPerItem", "m",
                "try to cap the number of similar items per item to this number " + "(default: "
                        + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
                String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
        addOption("maxCooccurrencesPerItem", "mo",
                "try to cap the number of cooccurrences per item to this number " + "(default: "
                        + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
                String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
        addOption("minPrefsPerUser", "mp",
                "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
                String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
        addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

        Map<String, String> parsedArgs = parseArguments(args);
        if (parsedArgs == null) {
            return -1;
        }

        String similarityClassName = parsedArgs.get("--similarityClassname");
        int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
        int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
        int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
        boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

        Path inputPath = getInputPath();
        // Path outputPath = getOutputPath();
        String outputTable = parsedArgs.get("--outputtable");
        Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

        Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
        Path countUsersPath = new Path(tempDirPath, "countUsers");
        Path userVectorPath = new Path(tempDirPath, "userVectors");
        Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
        Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

        AtomicInteger currentPhase = new AtomicInteger();

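        // Map the (long) item IDs onto a compact integer index.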
        if (shouldRunNextPhase(parsedArgs, currentPhase)) {
            Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                    VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                    VarLongWritable.class, SequenceFileOutputFormat.class);
            itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
            task.setCurrentJob(itemIDIndex).waitForCompletion(true);
        }

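        // Turn the preference triples into one vector per user.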
        if (shouldRunNextPhase(parsedArgs, currentPhase)) {
            Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                    VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                    ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                    SequenceFileOutputFormat.class);
            toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
            toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
            task.setCurrentJob(toUserVector).waitForCompletion(true);
        }

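        // Count the distinct users; the count is later passed as the number of columns to the row-similarity step.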
        if (shouldRunNextPhase(parsedArgs, currentPhase)) {
            Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                    CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                    CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
            countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
            countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
            task.setCurrentJob(countUsers).waitForCompletion(true);
        }

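        // Cap the number of cooccurrences per item and transpose the user vectors into an item-user matrix.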
        if (shouldRunNextPhase(parsedArgs, currentPhase)) {
            Job maybePruneAndTranspose = prepareJob(userVectorPath, itemUserMatrixPath,
                    SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                    DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                    VectorWritable.class, SequenceFileOutputFormat.class);
            maybePruneAndTranspose.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                    maxCooccurrencesPerItem);
            task.setCurrentJob(maybePruneAndTranspose).waitForCompletion(true);
        }

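        // Read back the user count written by the countUsers phase.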
        int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

        /*
         * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
         * new DistributedRowMatrix(...).rowSimilarity(...)
         */
        try {
            ToolRunner.run(getConf(), new RowSimilarityZieOok(),
                    new String[] { "-Dmapred.input.dir=" + itemUserMatrixPath,
                            "-Dmapred.output.dir=" + similarityMatrixPath, "--numberOfColumns",
                            String.valueOf(numberOfUsers), "--similarityClassname", similarityClassName,
                            "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1), "--tempDir",
                            tempDirPath.toString() });
        } catch (Exception e) {
            throw new IllegalStateException("item-item-similarity computation failed", e);
        }

        // This step originally wrote the data to a file; we don't want that - the results should go directly into HBase:
        if (shouldRunNextPhase(parsedArgs, currentPhase)) {
            Job mostSimilarItems = prepareMostSimilarItems(similarityMatrixPath, outputTable);

            // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();

            // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
            // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);

            // mostSimilarItems.waitForCompletion(true);

            task.setCurrentJob(mostSimilarItems).waitForCompletion(Log.isDebugEnabled());

            // Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath, SequenceFileInputFormat.class,
            // MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
            // MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
            // TextOutputFormat.class);
            // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
            // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
            // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
            // mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
            // mostSimilarItems.waitForCompletion(true);
        }

        return 0;
    }

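    /**
     * Sets up the job that reads the item-item similarity matrix (sequence file) and writes the
     * resulting recommendations as {@link Put}s directly into the given HBase table.
     */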
    protected Job prepareMostSimilarItems(Path inputFile, String outputTable) throws IOException {

        Job job = new Job(new Configuration(getConf()));
        job.setJobName(getCustomJobName(job, RecommendationsImportMap.class, RecommendationsImportReduce.class));

        job.getConfiguration().set(TaskConfig.COLLECTION, task.getConfig().get(TaskConfig.COLLECTION));
        job.getConfiguration().set(TaskConfig.RECOMMENDER, task.getConfig().get(TaskConfig.RECOMMENDER));

        job.setMapperClass(RecommendationsImportMap.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Put.class);
        job.setNumReduceTasks(4);

        TableMapReduceUtil.initTableReducerJob(outputTable, RecommendationsImportReduce.class, job);

        // job.setCombinerClass(RecommendationsImportReduce.class);

        FileInputFormat.addInputPath(job, inputFile);
        return job;
    }

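    /**
     * Local override of {@link AbstractJob}'s prepareJob that builds each phase's job on a copy of
     * the current configuration and enables map output compression.
     */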
    @Override
    @SuppressWarnings("rawtypes")
    protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
            Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
            Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
            Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
            Class<? extends OutputFormat> outputFormat) throws IOException {

        Job job = new Job(new Configuration(getConf()));
        Configuration jobConf = job.getConfiguration();

        // This is not working - we set the jar directly:
        // if (reducer.equals(Reducer.class))
        // {
        // if (mapper.equals(Mapper.class))
        // {
        // throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        // }
        // job.setJarByClass(mapper);
        // }
        // else
        // {
        // job.setJarByClass(reducer);
        // }

        job.setInputFormatClass(inputFormat);
        jobConf.set("mapred.input.dir", inputPath.toString());

        job.setMapperClass(mapper);
        job.setMapOutputKeyClass(mapperKey);
        job.setMapOutputValueClass(mapperValue);

        jobConf.setBoolean("mapred.compress.map.output", true);

        job.setReducerClass(reducer);
        job.setOutputKeyClass(reducerKey);
        job.setOutputValueClass(reducerValue);

        job.setJobName(getCustomJobName(job, mapper, reducer));

        job.setOutputFormatClass(outputFormat);
        jobConf.set("mapred.output.dir", outputPath.toString());

        return job;
    }

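    /**
     * Builds a job name of the form {@code <baseName>-<MapperName>-<ReducerName>}, falling back to
     * this class' simple name when no job name has been set yet.
     */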
    @SuppressWarnings("rawtypes")
    private String getCustomJobName(JobContext job, Class<? extends Mapper> mapper,
            Class<? extends Reducer> reducer) {
        StringBuilder name = new StringBuilder(100);
        String customJobName = job.getJobName();
        if (customJobName == null || customJobName.trim().length() == 0) {
            name.append(getClass().getSimpleName());
        } else {
            name.append(customJobName);
        }
        name.append('-').append(mapper.getSimpleName());
        name.append('-').append(reducer.getSimpleName());
        return name.toString();
    }

}
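
Usage

The job takes its input location and parameters from the command line, following the argument list in the class Javadoc above (note that the output goes to an HBase table via --outputtable rather than to -Dmapred.output.dir). The sketch below is illustrative only: the paths, table name, and similarity choice are assumptions rather than values taken from the project, and it presumes a fully configured ZieOokTask is available from the surrounding ZieOok application.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook;
import nl.gridline.zieook.tasks.ZieOokTask;

public class ItemSimilarityJobExample {

    // Illustrative driver: runs the item-item similarity computation for a given task.
    // All argument values below are example placeholders.
    public static int runItemSimilarity(ZieOokTask task) throws Exception {
        // Input lines are expected as: userID,itemID[,preferencevalue], e.g. "12,4077,4.5"
        String[] args = {
                "-Dmapred.input.dir=/zieook/input/ratings", // "-D" arguments must come first
                "--outputtable", "recommendations_demo", // HBase table that receives the results
                "--similarityClassname", "SIMILARITY_PEARSON_CORRELATION",
                "--maxSimilaritiesPerItem", "100",
                "--maxCooccurrencesPerItem", "100",
                "--minPrefsPerUser", "1",
                "--booleanData", "false",
                "--tempDir", "/zieook/tmp/item-similarity" };
        return ToolRunner.run(new Configuration(), new ItemSimilarityJobZieook(task), args);
    }
}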