org.qcri.pca.Norm2Job.java Source code

Java tutorial

Introduction

Here is the source code for org.qcri.pca.Norm2Job.java

Source

/**
 * QCRI, sPCA LICENSE
 * sPCA is a scalable implementation of Principal Component Analysis (PCA) on top of Spark and MapReduce
 *
 * Copyright (c) 2015, Qatar Foundation for Education, Science and Community Development (on
 * behalf of Qatar Computing Research Institute) having its principle place of business in Doha,
 * Qatar with the registered address P.O box 5825 Doha, Qatar (hereinafter referred to as "QCRI")
 *
*/

package org.qcri.pca;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.io.Closeables;

/**
 * MapReduce job that sums the squared entries of the mean-centered input
 * matrix, i.e. the sum over all rows of {@code sum_i (x_i - mean_i)^2}.
 * <p>
 * Note that no square root is taken anywhere in this class: the value written
 * by the job and returned by {@link #computeFNorm} / {@link #loadResult} is
 * the accumulated sum of squares. The job is configured with a single reducer,
 * so the result is a single value stored in {@code part-r-00000} of the output
 * directory.
 *
 * @author maysam yabandeh
 */
public class Norm2Job extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(Norm2Job.class);

    /**
     * Name of the command-line option, and of the configuration key, that
     * carries the path of the file containing the mean and span vectors.
     */
    public static final String MEANSPANOPTION = "meanSpanFile";

    /**
     * Name of the single reducer output file that {@link #loadResult} reads.
     */
    private static final String RESULT_FILE_NAME = "part-r-00000";

    /**
     * Command-line entry point: parses the input, output and mean/span file
     * options and runs the job.
     *
     * @param strings the command-line arguments
     * @return 0 on success, -1 if the arguments could not be parsed or the
     *         mean/span file option is missing
     * @throws Exception if the job cannot be run or fails
     */
    @Override
    public int run(String[] strings) throws Exception {
        addInputOption();
        addOutputOption();
        addOption(MEANSPANOPTION, "mean", "The name of the file that contains mean and span vectors");
        Map<String, List<String>> parsedArgs = parseArguments(strings);
        if (parsedArgs == null) {
            return -1;
        }
        String meanSpanFileName = getOption(MEANSPANOPTION);
        if (meanSpanFileName == null) {
            // Without this check the null would only surface later as an opaque
            // failure when it is stored in the configuration.
            log.error("Missing required option --{}", MEANSPANOPTION);
            return -1;
        }
        run(getConf(), getInputPath(), meanSpanFileName, getOutputPath());
        return 0;
    }

    /**
     * Configures and runs the job, blocking until it completes.
     * <p>
     * The mean/span file name is stored in {@code conf} under
     * {@link #MEANSPANOPTION} so that the mappers can read it in their setup.
     *
     * @param conf the configuration; it is modified by this call
     * @param matrixInputPath the path to the input matrix (sequence file input)
     * @param meanSpanFileName the name of the file that contains mean and span vectors
     * @param matrixOutputPath the directory the single result value is written to
     * @throws IOException if the job cannot be set up or does not complete successfully
     * @throws InterruptedException if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be found
     */
    public void run(Configuration conf, Path matrixInputPath, String meanSpanFileName, Path matrixOutputPath)
            throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(MEANSPANOPTION, meanSpanFileName);
        Job job = new Job(conf);
        job.setJobName("Norm2Job");
        job.setJarByClass(Norm2Job.class);
        FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
        matrixInputPath = fs.makeQualified(matrixInputPath);
        matrixOutputPath = fs.makeQualified(matrixOutputPath);
        FileInputFormat.addInputPath(job, matrixInputPath);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, matrixOutputPath);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // A single reducer so that all partial sums end up in one output file.
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(DoubleWritable.class);
        // waitForCompletion submits the job itself when it has not been
        // submitted yet, and reports failure through its return value.
        if (!job.waitForCompletion(true)) {
            throw new IOException("Norm2Job failed for input " + matrixInputPath
                    + ", output " + matrixOutputPath);
        }
    }

    /**
     * Computes the sum of squared entries of the mean-centered input matrix,
     * reusing a previous result if the output directory already exists.
     *
     * @param conf the configuration file
     * @param inputPath the path to the input matrix
     * @param meanSpanPath the path to the file that contains mean and span (generated by {@link MeanAndSpanJob})
     * @param tmpPath the temporary directory in HDFS
     * @param id the unique id to name files in HDFS
     * @return the value produced by the job (sum of squares; no square root is applied here)
     * @throws IOException if the job fails or its result cannot be read
     * @throws InterruptedException if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be found
     */
    public double computeFNorm(Configuration conf, Path inputPath, Path meanSpanPath, Path tmpPath, String id)
            throws IOException, InterruptedException, ClassNotFoundException {
        Path norm2Path = new Path(tmpPath, "2-norm" + id);
        FileSystem fs = FileSystem.get(norm2Path.toUri(), conf);
        if (!fs.exists(norm2Path)) {
            run(conf, inputPath, meanSpanPath.toString(), norm2Path);
        } else {
            log.warn("---------- Skip Norm2Job - already exists");
        }
        return loadResult(norm2Path, conf);
    }

    /**
     * Reads the single value written by the reducer.
     *
     * @param outputDirPath the output directory of the job
     * @param conf the configuration used to open the file
     * @return the value stored in {@code part-r-00000}
     * @throws IOException if the file cannot be read, or contains zero or more than one value
     */
    public double loadResult(Path outputDirPath, Configuration conf) throws IOException {
        Path finalNumberFile = new Path(outputDirPath, RESULT_FILE_NAME);
        SequenceFileIterator<NullWritable, DoubleWritable> iterator =
                new SequenceFileIterator<NullWritable, DoubleWritable>(finalNumberFile, true, conf);
        // Swallow a failure of close() only when another exception is already
        // propagating, so the primary error is never masked.
        boolean threw = true;
        try {
            if (!iterator.hasNext()) {
                throw new IOException("No value found after norm2Job in " + finalNumberFile);
            }
            Pair<NullWritable, DoubleWritable> next = iterator.next();
            // Read the value before advancing: the iterator reuses its instances.
            double norm2 = next.getSecond().get();
            if (iterator.hasNext()) {
                throw new IOException("More than one value after norm2Job!");
            }
            threw = false;
            return norm2;
        } finally {
            Closeables.close(iterator, threw);
        }
    }

    /**
     * Accumulates, over all rows seen by one mapper task, the sum of squared
     * differences between each row and the mean vector, and emits that single
     * partial sum when the task finishes.
     */
    public static class MyMapper extends Mapper<IntWritable, VectorWritable, NullWritable, DoubleWritable> {
        /**
         * The vector of means loaded in {@link #setup}
         */
        private DenseVector meanVector;
        /**
         * Sum of square of means
         */
        private double meanSquareSum = 0;
        /**
         * Running partial sum for the rows processed so far
         */
        private double norm2Sum = 0;

        /**
         * Loads the mean vector from the file named by {@link #MEANSPANOPTION}
         * in the job configuration and precomputes the sum of its squared entries.
         */
        @Override
        public void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            String meanSpanFileName = conf.get(MEANSPANOPTION);
            Path meanSpanFile = new Path(meanSpanFileName);
            MeanAndSpanJob masJob = new MeanAndSpanJob();
            // NOTE(review): exact effect of this flag is defined by
            // MeanAndSpanJob.loadResults, which is not visible here.
            final boolean normalizeMean = true;
            masJob.loadResults(meanSpanFile, normalizeMean, conf);
            meanVector = (DenseVector) masJob.getMeanVector();
            for (int i = 0; i < meanVector.size(); i++) {
                double mean = meanVector.get(i);
                meanSquareSum += mean * mean;
            }
        }

        /**
         * Adds the contribution of one row to the running partial sum; nothing
         * is written to the context here.
         */
        @Override
        public void map(IntWritable r, VectorWritable v, Context context) throws IOException, InterruptedException {
            Vector row = v.get();
            norm2Sum += norm2OfUncentralizedSparseVector(row, meanVector, meanSquareSum);
        }

        /**
         * Emits the partial sum accumulated by this mapper task.
         */
        @Override
        public void cleanup(Context context) throws IOException, InterruptedException {
            context.write(NullWritable.get(), new DoubleWritable(norm2Sum));
        }
    }

    /**
     * Adds up the partial sums emitted by the mappers and writes the total.
     */
    public static class MyReducer extends Reducer<NullWritable, DoubleWritable, NullWritable, DoubleWritable> {
        @Override
        public void reduce(NullWritable n, Iterable<DoubleWritable> values, Context context)
                throws IOException, InterruptedException {
            double norm2 = 0;
            for (DoubleWritable v : values) {
                norm2 += v.get();
            }
            context.write(n, new DoubleWritable(norm2));
        }
    }

    //utility functions
    /**
     * Computes the sum of squared differences between a row and the mean
     * vector while visiting only the non-zero entries of the row. For each
     * non-zero entry the squared difference is added and the entry's squared
     * mean is removed from {@code meanSquareSum}; what remains of
     * {@code meanSquareSum} is the contribution of the zero entries (for which
     * the squared difference is just the squared mean) and is added at the end.
     *
     * @param sparseVector
     *          the sparse vector of data
     * @param meanVector
     *          the vector of means
     * @param meanSquareSum
     *          sum of the square of all the means, including for zero and
     *          non-zero elements
     * @return the sum over all indices of {@code (value - mean)^2}; no square
     *         root is applied
     */
    static double norm2OfUncentralizedSparseVector(Vector sparseVector, DenseVector meanVector,
            double meanSquareSum) {
        double norm2 = 0;
        double meanSquareSumOfZeroElements = meanSquareSum;
        for (Vector.Element element : sparseVector.nonZeroes()) {
            double mean = meanVector.get(element.index());
            double diff = element.get() - mean;
            norm2 += diff * diff;
            // cancel the effect of the non-zero element in meanSquareSum
            meanSquareSumOfZeroElements -= mean * mean;
        }
        // For all zero items, the following has the sum of mean square
        norm2 += meanSquareSumOfZeroElements;
        return norm2;
    }

}