com.skp.experiment.cf.als.hadoop.DistributedParallelALSFactorizationJob.java Source code

Introduction

Here is the source code for com.skp.experiment.cf.als.hadoop.DistributedParallelALSFactorizationJob.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.skp.experiment.cf.als.hadoop;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
import org.apache.mahout.cf.taste.impl.common.RunningAverage;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.mapreduce.MergeVectorsCombiner;
import org.apache.mahout.common.mapreduce.MergeVectorsReducer;
import org.apache.mahout.common.mapreduce.TransposeMapper;
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixSlice;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.skp.experiment.math.als.hadoop.DistributedImplicitFeedbackAlternatingLeastSquaresSolver;
import com.skp.experiment.math.als.hadoop.ImplicitFeedbackAlternatingLeastSquaresSolver;

/**
 * <p>MapReduce implementation of the two factorization algorithms described in</p>
 *
 * <p>"Large-scale Parallel Collaborative Filtering for the Netflix Prize" available at
 * http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf.</p>
 *
 * <p>"Collaborative Filtering for Implicit Feedback Datasets" available at
 * http://research.yahoo.com/pub/2433</p>
 *
 * <p>Command line arguments specific to this class are:</p>
 *
 * <ol>
 * <li>--input (path): directory containing one or more text files with the dataset</li>
 * <li>--output (path): path where output should go</li>
 * <li>--lambda (double): regularization parameter to avoid overfitting</li>
 * <li>--implicitFeedback (boolean): whether the data consists of implicit feedback (default: false)</li>
 * <li>--alpha (double): confidence parameter, only used for implicit feedback (default: 40)</li>
 * <li>--numFeatures (int): dimension of the feature space</li>
 * <li>--numIterations (int): number of iterations</li>
 * <li>--numUsers (int): number of users in the dataset</li>
 * <li>--numItems (int): number of items in the dataset</li>
 * <li>--blockSize (long): DFS block size used to size splits for the distributed solver (optional, default: 64 MB)</li>
 * </ol>
 */
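// Example invocation (illustrative only; the jar name, paths and parameter values below are placeholders):
//
//   hadoop jar skp-experiment.jar com.skp.experiment.cf.als.hadoop.DistributedParallelALSFactorizationJob \
//     --input /path/to/ratings --output /path/to/als --lambda 0.065 \
//     --implicitFeedback true --alpha 40 --numFeatures 20 --numIterations 10 \
//     --numUsers 1000000 --numItems 500000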
public class DistributedParallelALSFactorizationJob extends AbstractJob {

    private static final Logger log = LoggerFactory.getLogger(DistributedParallelALSFactorizationJob.class);
    private static final String LZO_CODEC_CLASS = "org.apache.hadoop.io.compress.LzoCodec";
    private static final int LARGE_MATRIX_MAP_TASKS_NUM = 1000000;
    private static final String SMALL_MATRIX_MEMORY = "-Xmx2g";

    static final String NUM_FEATURES = DistributedParallelALSFactorizationJob.class.getName() + ".numFeatures";
    static final String LAMBDA = DistributedParallelALSFactorizationJob.class.getName() + ".lambda";
    static final String ALPHA = DistributedParallelALSFactorizationJob.class.getName() + ".alpha";
    static final String FEATURE_MATRIX = DistributedParallelALSFactorizationJob.class.getName() + ".featureMatrix";
    static final String FEATURE_MATRIX_TRANSPOSE = DistributedParallelALSFactorizationJob.class.getName()
            + ".featureMatrixTranspose";

    private int numUsers;
    private int numItems;

    private boolean implicitFeedback;
    private int numIterations;
    private int numFeatures;
    private double lambda;
    private double alpha;
    private long dfsBlockSize;

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new DistributedParallelALSFactorizationJob(), args);
    }

    @Override
    public int run(String[] args) throws Exception {

        addInputOption();
        addOutputOption();
        addOption("lambda", null, "regularization parameter", true);
        addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
        addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
        addOption("numFeatures", null, "dimension of the feature space", true);
        addOption("numIterations", null, "number of iterations", true);
        addOption("numUsers", null, "number of users", true);
        addOption("numItems", null, "number of items", true);
        addOption("blockSize", null, "dfs block size.", false);
        //addOption("runIterations", null, "true or false for iterations", true);

        Map<String, String> parsedArgs = parseArguments(args);
        if (parsedArgs == null) {
            return -1;
        }

        numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
        numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
        lambda = Double.parseDouble(parsedArgs.get("--lambda"));
        alpha = Double.parseDouble(parsedArgs.get("--alpha"));
        implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));
        numUsers = Integer.parseInt(parsedArgs.get("--numUsers"));
        numItems = Integer.parseInt(parsedArgs.get("--numItems"));
        dfsBlockSize = getOption("blockSize") == null ? 64 * 1024 * 1024 : Long.parseLong(getOption("blockSize"));
        /*
         * compute the factorization A = U M'
         *
         * where A (users x items)    is the matrix of known ratings
         *       U (users x features) is the representation of users in the feature space
         *       M (items x features) is the representation of items in the feature space
         */

        /* create A' */
        Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(), TextInputFormat.class,
                ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
                IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
        itemRatings.setCombinerClass(VectorSumReducer.class);
        itemRatings.waitForCompletion(true);
        //numItems = 
        //    (int) itemRatings.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue();
        log.info("Number of Items\t{}", numItems);

        /* create A */
        Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
                IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
                VectorWritable.class);
        userRatings.setCombinerClass(MergeVectorsCombiner.class);
        userRatings.waitForCompletion(true);
        //numUsers = 
        //    (int) userRatings.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue();
        log.info("Number of Users\t{}", numUsers);

        /* count item per user */
        Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnts"),
                SequenceFileInputFormat.class, UserItemCntsMapper.class, IntWritable.class, IntWritable.class,
                UserItemCntsReducer.class, IntWritable.class, IntWritable.class, SequenceFileOutputFormat.class);
        userItemCntsJob.setJobName("user ratings count");
        userItemCntsJob.setCombinerClass(UserItemCntsReducer.class);
        userItemCntsJob.waitForCompletion(true);

        //TODO this could be fiddled into one of the upper jobs
        Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
                AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
                IntWritable.class, VectorWritable.class);
        averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
        averageItemRatings.waitForCompletion(true);

        Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());

        /* create an initial M */
        initializeM(averageRatings);

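        // Each iteration alternates two half-steps: re-solve U row by row with M held fixed
        // (M is small enough to load into every mapper), then re-solve M row by row with U held
        // fixed, using the MapFile-backed distributed solver since U is usually too large to broadcast.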
        for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
            DistributedRowMatrix curM = new DistributedRowMatrix(pathToM(currentIteration - 1),
                    getTempPath("Mtemp" + String.valueOf(currentIteration - 1)), numItems, numFeatures);
            curM.setConf(new Configuration());
            DistributedRowMatrix YtransposeY = curM.times(curM);

            // broadcast M, read A row-wise, recompute U row-wise //
            log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
            runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
                    YtransposeY.getRowPath(), numItems);

            DistributedRowMatrix curU = new DistributedRowMatrix(pathToU(currentIteration),
                    getTempPath("Utmp" + String.valueOf(currentIteration)), numUsers, numFeatures);
            curU.setConf(new Configuration());
            DistributedRowMatrix XtransposeX = curU.times(curU);

            // set up index of U //
            CreateMapFileFromSeq.createMapFile(pathToU(currentIteration));

            // broadcast U, read A' row-wise, recompute M row-wise //
            log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
            runDistributedImplicitSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
                    XtransposeX.getRowPath(), numUsers);
        }
        return 0;
    }

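    /**
     * Writes the initial item-feature matrix M: the first feature of every item is set to its
     * average rating, the remaining features are filled with random values.
     */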
    private void initializeM(Vector averageRatings) throws IOException {
        Random random = RandomUtils.getRandom();

        FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf());
        SequenceFile.Writer writer = null;
        //MapFile.Writer writer = null;
        try {
            //writer = new MapFile.Writer(getConf(), fs, pathToM(-1).toString(), IntWritable.class, VectorWritable.class);
            writer = new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"),
                    IntWritable.class, VectorWritable.class);

            Iterator<Vector.Element> averages = averageRatings.iterateNonZero();
            while (averages.hasNext()) {
                Vector.Element e = averages.next();
                Vector row = new DenseVector(numFeatures);
                row.setQuick(0, e.get());
                for (int m = 1; m < numFeatures; m++) {
                    row.setQuick(m, random.nextDouble());
                }
                writer.append(new IntWritable(e.index()), new VectorWritable(row));
            }
        } finally {
            Closeables.closeQuietly(writer);
        }
    }

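    /**
     * Recomputes one side of the factorization while the other side is held fixed. The fixed factor
     * matrix and its transpose product are loaded into memory by every solver mapper, so this path
     * is intended for the smaller of the two factor matrices.
     */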
    private void runSolver(Path ratings, Path output, Path pathToUorI, Path pathToTranspose, int rowNums)
            throws ClassNotFoundException, IOException, InterruptedException {

        @SuppressWarnings("rawtypes")
        Class<? extends Mapper> solverMapper = null;
        if (implicitFeedback) {
            solverMapper = SolveImplicitFeedbackMapper.class;
        } else {
            solverMapper = SolveExplicitFeedbackMapper.class;
        }

        Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, solverMapper,
                IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);

        Configuration solverConf = solverForUorI.getConfiguration();
        solverConf.set(LAMBDA, String.valueOf(lambda));
        solverConf.set(ALPHA, String.valueOf(alpha));
        solverConf.setInt(NUM_FEATURES, numFeatures);
        solverConf.set(FEATURE_MATRIX, pathToUorI.toString());
        solverConf.set(FEATURE_MATRIX_TRANSPOSE, pathToTranspose.toString());
        solverConf.setInt("rowNums", rowNums);
        solverConf.set("mapred.child.java.opts", SMALL_MATRIX_MEMORY);
        solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
        solverConf.setInt("mapred.job.reuse.jvm.num.tasks", -1);
        solverConf.setBoolean("mapred.compress.map.output", true);
        solverConf.set("mapred.map.output.compression.codec", LZO_CODEC_CLASS);
        solverForUorI.waitForCompletion(true);
    }

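    /**
     * Recomputes one side of the factorization against a fixed factor matrix that is too large to
     * load into memory: mappers access it through a MapFile index, and splits are capped at the
     * DFS block size to keep individual tasks small.
     */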
    private void runDistributedImplicitSolver(Path ratings, Path output, Path pathToUorI, Path pathToTranspose,
            int rowNums) throws IOException, InterruptedException, ClassNotFoundException {
        @SuppressWarnings("rawtypes")
        Class<? extends Mapper> solverMapper = DistributedSolveImplicitFeedbackMapper.class;
        Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, solverMapper,
                IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);

        Configuration solverConf = solverForUorI.getConfiguration();

        solverConf.setLong("mapred.min.split.size", dfsBlockSize);
        solverConf.setLong("mapred.max.split.size", dfsBlockSize);
        solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
        solverConf.setInt("mapred.map.tasks", LARGE_MATRIX_MAP_TASKS_NUM);
        solverConf.setLong("mapred.task.timeout", 600000 * 5);
        solverConf.setInt("mapred.job.reuse.jvm.num.tasks", -1);
        solverConf.set("mapred.child.java.opts", SMALL_MATRIX_MEMORY);

        solverConf.set(LAMBDA, String.valueOf(lambda));
        solverConf.set(ALPHA, String.valueOf(alpha));
        solverConf.setInt(NUM_FEATURES, numFeatures);
        solverConf.set(FEATURE_MATRIX, pathToUorI.toString());
        solverConf.set(FEATURE_MATRIX_TRANSPOSE, pathToTranspose.toString());
        solverConf.setInt("rowNums", rowNums);
        solverConf.setBoolean("mapred.compress.map.output", true);
        solverConf.set("mapred.map.output.compression.codec", LZO_CODEC_CLASS);
        solverForUorI.waitForCompletion(true);
    }

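    /** Recomputes one row of U (or M) per input record with the in-memory implicit-feedback solver. */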
    public static class SolveImplicitFeedbackMapper
            extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
        private ImplicitFeedbackAlternatingLeastSquaresSolver solver;

        @Override
        protected void setup(Context ctx) throws IOException, InterruptedException {
            double lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
            double alpha = Double.parseDouble(ctx.getConfiguration().get(ALPHA));
            int numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
            Path YPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
            Path YtransposeYPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX_TRANSPOSE));
            int rowNums = ctx.getConfiguration().getInt("rowNums", -1);
            OpenIntObjectHashMap<Vector> Y = ALSMatrixUtil.readMatrixByRows(YPath, ctx);
            Matrix YtransposeY = ALSMatrixUtil.readDistributedRowMatrix(YtransposeYPath, numFeatures, numFeatures);
            // the solver must be initialized here, otherwise map() fails with a NullPointerException
            solver = new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, Y, YtransposeY);

            Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
        }

        @Override
        protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
                throws IOException, InterruptedException {
            Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());

            Vector uiOrmj = solver.solve(ratings);

            ctx.write(userOrItemID, new VectorWritable(uiOrmj));
        }
    }

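    /**
     * Implicit-feedback solver for the large side of the factorization: rows of the fixed factor
     * matrix are looked up through a MapFile.Reader instead of being loaded into memory.
     */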
    public static class DistributedSolveImplicitFeedbackMapper
            extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
        private DistributedImplicitFeedbackAlternatingLeastSquaresSolver solver;
        private MapFile.Reader reader;

        @Override
        protected void setup(Context ctx) throws IOException, InterruptedException {

            double lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
            double alpha = Double.parseDouble(ctx.getConfiguration().get(ALPHA));
            int numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
            Path YPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
            Path YtransposeYPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX_TRANSPOSE));
            int rowNums = ctx.getConfiguration().getInt("rowNums", -1);

            Matrix YtransposeY = fetchDistributedRowMatrix(YtransposeYPath, numFeatures, numFeatures);
            FileSystem fs = FileSystem.get(ctx.getConfiguration());
            reader = new MapFile.Reader(fs, YPath.toString(), ctx.getConfiguration());
            solver = new DistributedImplicitFeedbackAlternatingLeastSquaresSolver(rowNums, numFeatures, lambda,
                    alpha, reader, YtransposeY);

            Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
        }

        private Matrix fetchDistributedRowMatrix(Path matrixPath, int numRows, int numCols) {
            Matrix result = new DenseMatrix(numRows, numCols);
            DistributedRowMatrix m = new DistributedRowMatrix(matrixPath, new Path(matrixPath.toString() + "_tmp"),
                    numRows, numCols);
            m.setConf(new Configuration());
            Iterator<MatrixSlice> rows = m.iterator();
            while (rows.hasNext()) {
                MatrixSlice row = rows.next();
                result.assignRow(row.index(), row.vector());
            }
            return result;
        }

        @Override
        protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
                throws IOException, InterruptedException {

            Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());

            Vector uiOrmj = solver.solve(ratings);

            ctx.write(userOrItemID, new VectorWritable(uiOrmj));
        }
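
        @Override
        protected void cleanup(Context ctx) throws IOException, InterruptedException {
            // the MapFile reader over the fixed factor matrix was opened in setup(); release it here
            Closeables.closeQuietly(reader);
        }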
    }

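    /** Recomputes one row of U (or M) per input record with the explicit-feedback least-squares solver. */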
    public static class SolveExplicitFeedbackMapper
            extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
        private double lambda;
        private int numFeatures;
        private OpenIntObjectHashMap<Vector> UorM;
        private AlternatingLeastSquaresSolver solver;

        @Override
        protected void setup(Context ctx) throws IOException, InterruptedException {
            lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
            numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
            solver = new AlternatingLeastSquaresSolver();
            Path UOrIPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
            //UorM = ALSMatrixUtil.readMatrixByRows(UOrIPath, ctx.getConfiguration());
            UorM = ALSMatrixUtil.readMatrixByRows(UOrIPath, ctx);
            Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
        }

        @Override
        protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
                throws IOException, InterruptedException {
            Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
            List<Vector> featureVectors = Lists.newArrayList();
            Iterator<Vector.Element> interactions = ratings.iterateNonZero();
            while (interactions.hasNext()) {
                int index = interactions.next().index();
                featureVectors.add(UorM.get(index));
            }
            Vector uiOrmj = solver.solve(featureVectors, ratings, lambda, numFeatures);
            ctx.write(userOrItemID, new VectorWritable(uiOrmj));
        }
    }

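    /** Computes the average rating of each item and packs all averages into a single vector keyed by 0. */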
    public static class AverageRatingMapper
            extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
        @Override
        protected void map(IntWritable r, VectorWritable v, Context ctx) throws IOException, InterruptedException {
            RunningAverage avg = new FullRunningAverage();
            Iterator<Vector.Element> elements = v.get().iterateNonZero();
            while (elements.hasNext()) {
                avg.addDatum(elements.next().get());
            }
            Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
            vector.setQuick(r.get(), avg.getAverage());
            ctx.write(new IntWritable(0), new VectorWritable(vector));
        }
    }

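    /** Parses userID,itemID,rating preference lines into partial per-item rating vectors keyed by item ID. */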
    public static class ItemRatingVectorsMapper extends Mapper<LongWritable, Text, IntWritable, VectorWritable> {
        private static IntWritable outKey = new IntWritable();

        @Override
        protected void map(LongWritable offset, Text line, Context ctx) throws IOException, InterruptedException {
            String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
            int userID = Integer.parseInt(tokens[0]);
            int itemID = Integer.parseInt(tokens[1]);
            float rating = Float.parseFloat(tokens[2]);

            Vector ratings = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
            ratings.set(userID, rating);
            outKey.set(itemID);
            ctx.write(outKey, new VectorWritable(ratings));
        }
    }

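    /** Emits, for each user, the number of items that user has rated. */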
    public static class UserItemCntsMapper extends Mapper<IntWritable, VectorWritable, IntWritable, IntWritable> {
        private static IntWritable result = new IntWritable();

        @Override
        protected void map(IntWritable key, VectorWritable value, Context context)
                throws IOException, InterruptedException {
            // emit the number of items rated by this user rather than a constant 1 per record
            result.set(value.get().getNumNondefaultElements());
            context.write(key, result);
        }
    }

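    /** Sums the per-user counts; also used as the combiner of the "user ratings count" job. */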
    public static class UserItemCntsReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
        private static IntWritable result = new IntWritable();

        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    private Path pathToM(int iteration) {
        return iteration == numIterations - 1 ? getOutputPath("M") : getTempPath("M-" + iteration);
    }

    private Path pathToU(int iteration) {
        return iteration == numIterations - 1 ? getOutputPath("U") : getTempPath("U-" + iteration);
    }

    private Path pathToItemRatings() {
        return getTempPath("itemRatings");
    }

    private Path pathToUserRatings() {
        return getOutputPath("userRatings");
    }

}