Java tutorial
/**
 * QCRI, sPCA LICENSE
 * sPCA is a scalable implementation of Principal Component Analysis (PCA) on of Spark and MapReduce
 *
 * Copyright (c) 2015, Qatar Foundation for Education, Science and Community Development (on
 * behalf of Qatar Computing Research Institute) having its principle place of business in Doha,
 * Qatar with the registered address P.O box 5825 Doha, Qatar (hereinafter referred to as "QCRI")
 *
 */
package org.qcri.pca;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.io.Closeables;

/**
 * Obtain Frobenius norm of the input matrix.
 *
 * <p>
 * Each mapper accumulates, over the rows it sees, the sum of squared
 * differences between the row entries and the mean vector; a single reducer
 * adds the per-mapper partial sums and writes one {@link DoubleWritable}.
 *
 * <p>
 * NOTE(review): no square root is applied anywhere in this class, so the value
 * produced is the <em>sum of squares</em> (i.e. the squared Frobenius norm of
 * the matrix after subtracting the mean vector). Confirm that callers expect
 * the squared quantity.
 *
 * @author maysam yabandeh
 */
public class Norm2Job extends AbstractJob {
  private static final Logger log = LoggerFactory.getLogger(Norm2Job.class);

  /**
   * The name of the file that contains the column-wise mean of the matrix
   */
  public static final String MEANSPANOPTION = "meanSpanFile";

  /**
   * Command-line entry point: parses the input, output and mean/span file
   * options and runs the job.
   *
   * @param strings
   *          the command-line arguments
   * @return 0 on success, -1 if the arguments could not be parsed or the
   *         mean/span file option is missing
   * @throws Exception
   *           if the job cannot be configured or fails
   */
  @Override
  public int run(String[] strings) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(MEANSPANOPTION, "mean", "The name of the file that contains mean and span vectors");
    Map<String, List<String>> parsedArgs = parseArguments(strings);
    if (parsedArgs == null) {
      return -1;
    }
    String meanSpanFileName = getOption(MEANSPANOPTION);
    if (meanSpanFileName == null) {
      // The option is registered without a default; a null value cannot be
      // stored in the Configuration, so fail with a clear message instead.
      log.error("Missing required option --{}", MEANSPANOPTION);
      return -1;
    }
    run(getConf(), getInputPath(), meanSpanFileName, getOutputPath());
    return 0;
  }

  /**
   * Configure and run the MapReduce job, blocking until it finishes.
   *
   * @param conf
   *          the configuration; {@link #MEANSPANOPTION} is set on it
   * @param matrixInputPath
   *          the path to the input matrix (sequence file input)
   * @param meanSpanFileName
   *          the name of the file that contains mean and span vectors
   * @param matrixOutputPath
   *          the directory the single result value is written to
   * @throws IOException
   *           if the job cannot be set up, or if it completes unsuccessfully
   * @throws InterruptedException
   *           if waiting for the job is interrupted
   * @throws ClassNotFoundException
   *           if a job class cannot be found
   */
  public void run(Configuration conf, Path matrixInputPath, String meanSpanFileName,
      Path matrixOutputPath) throws IOException, InterruptedException, ClassNotFoundException {
    // Set before constructing the Job so the value is part of the
    // configuration the job is created from (the mapper reads it in setup).
    conf.set(MEANSPANOPTION, meanSpanFileName);
    Job job = new Job(conf);
    job.setJobName("Norm2Job");
    job.setJarByClass(Norm2Job.class);

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // A single reducer so that exactly one total is written.
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    // waitForCompletion submits the job if it has not been submitted yet and
    // reports whether it succeeded; do not let a failed job pass silently.
    if (!job.waitForCompletion(true)) {
      throw new IOException("Norm2Job failed for input " + matrixInputPath);
    }
  }

  /**
   * Compute Frobenius norm of the input matrix. The job is skipped when its
   * output directory already exists; in that case the stored result is reused.
   *
   * @param conf
   *          the configuration file
   * @param inputPath
   *          the path to the input matrix
   * @param meanSpanPath
   *          the path to the file that contains mean and span (generated by
   *          {@link MeanAndSpanJob})
   * @param tmpPath
   *          the temporary directory in HDFS
   * @param id
   *          the unique id to name files in HDFS
   * @return the value produced by the job (the summed squared differences; see
   *         the note on the class regarding the missing square root)
   * @throws IOException
   *           if the job fails or its result cannot be read
   * @throws InterruptedException
   *           if waiting for the job is interrupted
   * @throws ClassNotFoundException
   *           if a job class cannot be found
   */
  public double computeFNorm(Configuration conf, Path inputPath, Path meanSpanPath, Path tmpPath,
      String id) throws IOException, InterruptedException, ClassNotFoundException {
    Path norm2Path = new Path(tmpPath, "2-norm" + id);
    FileSystem fs = FileSystem.get(norm2Path.toUri(), conf);
    if (!fs.exists(norm2Path)) {
      run(conf, inputPath, meanSpanPath.toString(), norm2Path);
    } else {
      log.warn("---------- Skip Norm2Job - already exists");
    }
    return loadResult(norm2Path, conf);
  }

  /**
   * Read the single value written by the reducer.
   *
   * @param outputDirPath
   *          the output directory of the job; the value is read from its
   *          {@code part-r-00000} file
   * @param conf
   *          the configuration used to open the file
   * @return the value stored in the file
   * @throws IOException
   *           if the file cannot be read, or does not contain exactly one
   *           value
   */
  public double loadResult(Path outputDirPath, Configuration conf) throws IOException {
    Path finalNumberFile = new Path(outputDirPath, "part-r-00000");
    SequenceFileIterator<NullWritable, DoubleWritable> iterator =
        new SequenceFileIterator<NullWritable, DoubleWritable>(finalNumberFile, true, conf);
    // Tracks whether the try body failed, so that an exception raised while
    // closing cannot mask the original one.
    boolean threw = true;
    try {
      if (!iterator.hasNext()) {
        throw new IOException("No value found after norm2Job in " + finalNumberFile);
      }
      // Extract the primitive before advancing: the iterator reuses its
      // key/value instances.
      Pair<NullWritable, DoubleWritable> next = iterator.next();
      double norm2 = next.getSecond().get();
      if (iterator.hasNext()) {
        throw new IOException("More than one value after norm2Job!");
      }
      threw = false;
      return norm2;
    } finally {
      Closeables.close(iterator, threw);
    }
  }

  /**
   * Accumulates the per-row results over all rows given to this mapper and
   * emits the accumulated total once, in {@link #cleanup(Context)}.
   */
  public static class MyMapper extends
      Mapper<IntWritable, VectorWritable, NullWritable, DoubleWritable> {
    private DenseVector meanVector;
    /**
     * Sum of square of means
     */
    private double meanSquareSum = 0;
    /**
     * The computing 2-norm
     */
    private double norm2Sum = 0;

    /**
     * Load the mean vector from the file named by {@link #MEANSPANOPTION} and
     * precompute the sum of the squares of its entries.
     */
    @Override
    public void setup(Context context) throws IOException {
      Configuration conf = context.getConfiguration();
      String meanSpanFileName = conf.get(MEANSPANOPTION);
      Path meanSpanFile = new Path(meanSpanFileName);
      MeanAndSpanJob masJob = new MeanAndSpanJob();
      final boolean normalizeMean = true;
      masJob.loadResults(meanSpanFile, normalizeMean, conf);
      // NOTE(review): assumes MeanAndSpanJob hands back a DenseVector - confirm.
      meanVector = (DenseVector) masJob.getMeanVector();
      for (int i = 0; i < meanVector.size(); i++) {
        double v = meanVector.get(i);
        meanSquareSum += v * v;
      }
    }

    /**
     * Add the contribution of one row to the running total; nothing is
     * written to the context here.
     */
    @Override
    public void map(IntWritable r, VectorWritable v, Context context) throws IOException,
        InterruptedException {
      Vector row = v.get();
      double norm2 = norm2OfUncentralizedSparseVector(row, meanVector, meanSquareSum);
      norm2Sum += norm2;
    }

    /**
     * Emit the total accumulated by this mapper as a single record.
     */
    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
      DoubleWritable v = new DoubleWritable(norm2Sum);
      context.write(NullWritable.get(), v);
    }
  }

  /**
   * Adds up the partial sums emitted by the mappers and writes the total.
   */
  public static class MyReducer extends
      Reducer<NullWritable, DoubleWritable, NullWritable, DoubleWritable> {
    @Override
    public void reduce(NullWritable n, Iterable<DoubleWritable> values, Context context)
        throws IOException, InterruptedException {
      double norm2 = 0;
      for (DoubleWritable v : values) {
        norm2 += v.get();
      }
      context.write(n, new DoubleWritable(norm2));
    }
  }

  // utility functions
  /**
   * To compute the norm2 of a sparse matrix, iterate over sparse items and sum
   * square of the difference. After processing each row, add the sum of the
   * meanSquare of the zero-elements that were ignored in the sparse iteration.
   *
   * <p>
   * For a zero element the squared difference is just the square of the mean,
   * so the zero elements together contribute {@code meanSquareSum} minus the
   * squared means at the non-zero positions.
   *
   * @param sparseVector
   *          the sparse vector of data
   * @param meanVector
   *          the vector of means
   * @param meanSquareSum
   *          sum of the square of all the means, including for zero and
   *          non-zero elements
   * @return the sum over all positions of the squared difference between the
   *         vector entry and the corresponding mean (no square root is taken)
   */
  static double norm2OfUncentralizedSparseVector(Vector sparseVector, DenseVector meanVector,
      double meanSquareSum) {
    double norm2 = 0;
    double meanSquareSumOfZeroElements = meanSquareSum;
    Iterator<Vector.Element> iterator = sparseVector.nonZeroes().iterator();
    while (iterator.hasNext()) {
      Vector.Element element = iterator.next();
      double v = element.get();
      double mean = meanVector.get(element.index());
      double diff = v - mean;
      diff *= diff;
      // cancel the effect of the non-zero element in meanSquareSum
      meanSquareSumOfZeroElements -= mean * mean;
      norm2 += diff;
    }
    // For all zero items, the following has the sum of mean square
    norm2 += meanSquareSumOfZeroElements;
    return norm2;
  }
}