Java tutorial: preparing a Mahout preference matrix from HBase with PreparePreferenceMatrixJob
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.cf.taste.hbase.preparation;

import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
import org.apache.mahout.cf.taste.hadoop.ToEntityPrefsMapper;
import org.apache.mahout.cf.taste.hadoop.item.ItemIDIndexReducer;
import org.apache.mahout.cf.taste.hadoop.item.ToUserVectorsReducer;
import org.apache.mahout.cf.taste.hadoop.preparation.ToItemVectorsMapper;
import org.apache.mahout.cf.taste.hadoop.preparation.ToItemVectorsReducer;
import org.apache.mahout.cf.taste.hbase.ToItemPrefsMapper;
import org.apache.mahout.cf.taste.hbase.item.ItemIDIndexMapper;
import org.apache.mahout.cf.taste.hbase.item.RecommenderJob;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.math.VarIntWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.VectorWritable;

public class PreparePreferenceMatrixJob extends AbstractJob {

  public static final String NUM_USERS = "numUsers.bin";
  public static final String ITEMID_INDEX = "itemIDIndex";
  public static final String USER_VECTORS = "userVectors";
  public static final String RATING_MATRIX = "ratingMatrix";

  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new PreparePreferenceMatrixJob(), args);
  }

  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("minPrefsPerUser", "mp", "ignore users with fewer preferences than this "
        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("ratingShift", "rs", "shift ratings by this value", "0.0");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    float ratingShift = Float.parseFloat(getOption("ratingShift"));

    String workingTable = getConf().get(RecommenderJob.PARAM_WORKING_TABLE);
    String cfRatings = getConf().get(RecommenderJob.PARAM_CF_RATINGS);

    // convert items to an internal index
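    // First of three MapReduce passes: scan the HBase working table (workingTable, column
    // family cfRatings) and map each long item ID to a compact int index. The result is
    // written as a SequenceFile under the ITEMID_INDEX output directory for later steps
    // of the recommender pipeline.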
    Configuration mapred_config = HBaseConfiguration.create();
    mapred_config.setBoolean("mapred.compress.map.output", true);
    mapred_config.set(RecommenderJob.PARAM_CF_RATINGS, cfRatings);

    Job itemIDIndex = Job.getInstance(mapred_config);
    itemIDIndex.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), itemIDIndex,
        ItemIDIndexMapper.class, ItemIDIndexReducer.class));
    itemIDIndex.setJarByClass(ItemIDIndexMapper.class); // class that contains mapper and reducer

    Scan scan = new Scan();
    scan.setCaching(500);        // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false);  // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(workingTable, // input table
        scan,                     // Scan instance to control CF and attribute selection
        ItemIDIndexMapper.class,  // mapper class
        VarIntWritable.class,     // mapper output key
        VarLongWritable.class,    // mapper output value
        itemIDIndex);

    itemIDIndex.setReducerClass(ItemIDIndexReducer.class); // reducer class
    itemIDIndex.setOutputKeyClass(VarIntWritable.class);
    itemIDIndex.setOutputValueClass(VarLongWritable.class);
    itemIDIndex.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(itemIDIndex, getOutputPath(ITEMID_INDEX)); // adjust directories as required

    if (!itemIDIndex.waitForCompletion(true)) {
      return -1;
    }

    //////////////////////////////////////////////////////////////////////////
    // convert user preferences into a vector per user

    mapred_config.setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    mapred_config.setInt(ToUserVectorsReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
    mapred_config.set(ToEntityPrefsMapper.RATING_SHIFT, String.valueOf(ratingShift));

    Job toUserVectors_hb = Job.getInstance(mapred_config);
    toUserVectors_hb.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), toUserVectors_hb,
        ToItemPrefsMapper.class, ToUserVectorsReducer.class));
    toUserVectors_hb.setJarByClass(ToItemPrefsMapper.class); // class that contains mapper and reducer

    TableMapReduceUtil.initTableMapperJob(workingTable, // input table
        scan,                     // Scan instance to control CF and attribute selection
        ToItemPrefsMapper.class,  // mapper class
        VarLongWritable.class,    // mapper output key
        booleanData ? VarLongWritable.class : EntityPrefWritable.class, // mapper output value
        toUserVectors_hb);

    toUserVectors_hb.setReducerClass(ToUserVectorsReducer.class); // reducer class
    toUserVectors_hb.setNumReduceTasks(1); // at least one, adjust as required
    toUserVectors_hb.setOutputKeyClass(VarLongWritable.class);
    toUserVectors_hb.setOutputValueClass(VectorWritable.class);
    toUserVectors_hb.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(toUserVectors_hb, getOutputPath(USER_VECTORS)); // adjust directories as required

    if (!toUserVectors_hb.waitForCompletion(true)) {
      return -1;
    }

    //////////////////////////////////////////////////////////////////////////
    // we need the number of users later

    int numberOfUsers = (int) toUserVectors_hb.getCounters()
        .findCounter(ToUserVectorsReducer.Counters.USERS).getValue();
    HadoopUtil.writeInt(numberOfUsers, getOutputPath(NUM_USERS), getConf());

    // build the rating matrix
    Job toItemVectors = prepareJob(getOutputPath(USER_VECTORS), getOutputPath(RATING_MATRIX),
        ToItemVectorsMapper.class, IntWritable.class, VectorWritable.class,
        ToItemVectorsReducer.class, IntWritable.class, VectorWritable.class);
    toItemVectors.setCombinerClass(ToItemVectorsReducer.class);

    if (!toItemVectors.waitForCompletion(true)) {
      return -1;
    }

    return 0;
  }
}
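For completeness, here is a minimal driver sketch showing one way this job might be launched. The table name "ratings", the column family "cf_ratings", and the HDFS paths are hypothetical placeholders, and the command-line options come from Mahout's AbstractJob; adjust everything to your own cluster and data layout.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hbase.item.RecommenderJob;
import org.apache.mahout.cf.taste.hbase.preparation.PreparePreferenceMatrixJob;

public class PrepareDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    // The job reads the table and column family from the Configuration, not the CLI.
    conf.set(RecommenderJob.PARAM_WORKING_TABLE, "ratings");   // hypothetical HBase table
    conf.set(RecommenderJob.PARAM_CF_RATINGS, "cf_ratings");   // hypothetical column family
    int exitCode = ToolRunner.run(conf, new PreparePreferenceMatrixJob(), new String[] {
        "--input", "/tmp/unused-input",      // standard AbstractJob option; the actual data comes from HBase
        "--output", "/tmp/prepare-output",   // hypothetical HDFS output directory
        "--minPrefsPerUser", "1",
        "--booleanData", "false",
        "--ratingShift", "0.0"
    });
    System.exit(exitCode);
  }
}

If the run succeeds, the output directory should contain the itemIDIndex, userVectors, numUsers.bin, and ratingMatrix artifacts named by the constants above, ready for consumption by downstream steps such as the HBase RecommenderJob.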