// Java tutorial
/**
 * QCRI, sPCA LICENSE
 * sPCA is a scalable implementation of Principal Component Analysis (PCA) on of Spark and MapReduce
 *
 * Copyright (c) 2015, Qatar Foundation for Education, Science and Community Development (on
 * behalf of Qatar Computing Research Institute) having its principle place of business in Doha,
 * Qatar with the registered address P.O box 5825 Doha, Qatar (hereinafter referred to as "QCRI")
 *
 */
package org.qcri.pca;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.DoubleDoubleFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.io.Closeables;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * Normalize a matrix by dividing each value by the span of its column. After
 * normalization, the difference between the values in a column is &lt;= 1.
 *
 * @author maysam yabandeh
 */
public class NormalizeJob extends AbstractJob {
  private static final Logger log = LoggerFactory.getLogger(NormalizeJob.class);

  /** Configuration key / command-line option holding the mean-and-span file name. */
  public static final String MEANSPANOPTION = "meanSpanFile";
  /** Configuration key under which the sample rate is handed to the mappers. */
  public static final String SAMPLERATE = "sampleRate";
  /** Command-line option name for the split factor (registered but not read in this class). */
  public static final String SPLITFACTOR = "splitFactor";

  /**
   * Command-line entry point: parses the arguments and runs the normalization
   * with a sample rate of 1.
   *
   * @param strings
   *          the raw command-line arguments
   * @return -1 if the arguments could not be parsed, 0 otherwise
   * @throws Exception
   *           if the job cannot be set up or does not complete successfully
   */
  @Override
  public int run(String[] strings) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(MEANSPANOPTION, "mean",
        "The name of the file that contains mean and span vectors");
    addOption(SPLITFACTOR, "sf",
        "The factor with which the default split size is divided");
    Map<String, List<String>> parsedArgs = parseArguments(strings);
    if (parsedArgs == null) {
      return -1;
    }
    String meanSpanFileName = getOption(MEANSPANOPTION);
    run(getConf(), getInputPath(), meanSpanFileName, getOutputPath(), 1);
    return 0;
  }

  /**
   * Normalize the input matrix by dividing its elements by the column range.
   * The job is skipped (with a warning) when the target directory already
   * exists.
   *
   * @param conf
   *          the configuration
   * @param inputPath
   *          the path to the input matrix Y
   * @param meanSpanPath
   *          the path to the file that contains mean and span (generated by
   *          MeanAndSpanJob)
   * @param outputPath
   *          the parent directory under which the normalized matrix is stored
   * @param sampleRate
   *          if &lt; 1, a sample of the matrix is taken for normalization
   * @param id
   *          suffix appended to "normalizeY" to form the output directory name
   * @return the path to which the resulting normalized matrix is stored
   * @throws Exception
   *           if the file system cannot be accessed or the job fails
   */
  public Path normalize(Configuration conf, Path inputPath, Path meanSpanPath,
      Path outputPath, double sampleRate, String id) throws Exception {
    Path normalizedYePath = new Path(outputPath, "normalizeY" + id);
    FileSystem fs = FileSystem.get(normalizedYePath.toUri(), conf);
    if (!fs.exists(normalizedYePath)) {
      run(conf, inputPath, meanSpanPath.toString(), normalizedYePath, sampleRate);
    } else {
      log.warn("---------- Skip NormalizeJob - already exists: {}", normalizedYePath);
    }
    return normalizedYePath;
  }

  /**
   * Configures and runs the map-only normalization job and waits for it to
   * finish.
   *
   * @param conf
   *          the configuration; the mean/span file name and the sample rate
   *          are stored in it for the mappers
   * @param matrixInputPath
   *          the path to the input matrix (sequence file format)
   * @param meanSpanFileName
   *          the name of the file that contains the mean and span vectors
   * @param matrixOutputPath
   *          the path the normalized matrix is written to
   * @param sampleRate
   *          the sample rate handed to the mappers
   * @throws IOException
   *           if the job cannot be set up or submitted
   * @throws InterruptedException
   *           if waiting for the job is interrupted
   * @throws ClassNotFoundException
   *           if a job class cannot be resolved
   * @throws IllegalStateException
   *           if the job completes unsuccessfully
   */
  public void run(Configuration conf, Path matrixInputPath,
      String meanSpanFileName, Path matrixOutputPath, double sampleRate)
      throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(MEANSPANOPTION, meanSpanFileName);
    conf.setFloat(SAMPLERATE, (float) sampleRate);
    Job job = new Job(conf);
    job.setJobName("Normalize");
    job.setJarByClass(NormalizeJob.class);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(NormalizeMapper.class);
    job.setNumReduceTasks(0); // map-only job
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    // waitForCompletion submits the job itself when it has not been submitted
    // yet, so no separate submit() call is needed. A failed job must not be
    // reported as success: callers treat the output directory as valid.
    if (!job.waitForCompletion(true)) {
      throw new IllegalStateException("Normalize job failed: input="
          + matrixInputPath + ", output=" + matrixOutputPath);
    }
  }

  /**
   * Mapper that divides every non-zero element of a row by the span of the
   * corresponding column. Rows for which {@code PCACommon.pass(sampleRate)}
   * returns true are dropped.
   */
  public static class NormalizeMapper extends
      Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
    private DenseVector spanVector;
    private double sampleRate = 1;

    /**
     * Reads the sample rate from the configuration and loads the span vector
     * from the mean/span file.
     *
     * @param context
     *          the mapper context providing the job configuration
     * @throws IOException
     *           if the mean/span file is not configured, cannot be read, or
     *           does not contain a span vector
     */
    @Override
    public void setup(Context context) throws IOException {
      Configuration conf = context.getConfiguration();
      sampleRate = conf.getFloat(SAMPLERATE, 1);
      String meanSpanFileName = conf.get(MEANSPANOPTION);
      if (meanSpanFileName == null) {
        throw new IOException("Configuration value '" + MEANSPANOPTION
            + "' is not set");
      }
      Path meanSpanFile = new Path(meanSpanFileName);
      SequenceFileIterator<IntWritable, VectorWritable> iterator =
          new SequenceFileIterator<IntWritable, VectorWritable>(
              meanSpanFile, true, conf);
      try {
        // Scan the records until the span vector is found instead of assuming
        // the file holds exactly two records.
        while (iterator.hasNext()) {
          Pair<IntWritable, VectorWritable> next = iterator.next();
          if (next.getFirst().get() == MeanAndSpanJob.SPANVECTOR) {
            // Copy the vector: the iterator is created with reuse enabled.
            spanVector = new DenseVector(next.getSecond().get());
            break;
          }
        }
      } finally {
        Closeables.close(iterator, false);
      }
      if (spanVector == null) {
        throw new IOException("No span vector found in " + meanSpanFile);
      }
    }

    /**
     * Normalizes one row in place and writes it out under the same key.
     *
     * @param r
     *          the row key
     * @param v
     *          the row vector; its non-zero elements are overwritten with the
     *          normalized values
     * @param context
     *          the mapper context the row is written to
     * @throws IOException
     *           if writing the row fails
     * @throws InterruptedException
     *           if writing the row is interrupted
     */
    @Override
    public void map(IntWritable r, VectorWritable v, Context context)
        throws IOException, InterruptedException {
      if (PCACommon.pass(sampleRate)) {
        return;
      }
      Vector row = v.get();
      sparseVectorAssign(row, spanVector, new DoubleDoubleFunction() {
        @Override
        public double apply(double value, double span) {
          // NaN entries are treated as 0; a zero span leaves the value as is.
          if (Double.isNaN(value)) {
            value = 0;
          }
          return value / (span != 0 ? span : 1);
        }
      });
      context.write(r, v);
    }
  }

  // utility functions

  /**
   * Applies {@code function} to every non-zero element of {@code mainV},
   * paired with the element of {@code otherV} at the same index, and stores
   * the result back into {@code mainV}.
   *
   * @param mainV
   *          the vector that is updated in place
   * @param otherV
   *          the vector supplying the second argument of the function
   * @param function
   *          the function computing the new value of each non-zero element
   */
  static void sparseVectorAssign(Vector mainV, final Vector otherV,
      DoubleDoubleFunction function) {
    List<IndexValue> newZeroElements = new ArrayList<IndexValue>();
    Iterator<Vector.Element> nonZeroElements = mainV.nonZeroes().iterator();
    while (nonZeroElements.hasNext()) {
      Vector.Element e = nonZeroElements.next();
      double res = function.apply(e.get(), otherV.getQuick(e.index()));
      if (res != 0) {
        mainV.setQuick(e.index(), res);
      } else {
        // Zero results are written after the loop so that the iterator is not
        // affected while it is still in use.
        newZeroElements.add(new IndexValue(e.index(), res));
      }
    }
    for (IndexValue iv : newZeroElements) {
      mainV.setQuick(iv.index, iv.value);
    }
  }

  /** Simple holder for a vector index and the value to store at that index. */
  static class IndexValue {
    IndexValue(int index, double value) {
      this.index = index;
      this.value = value;
    }

    int index;
    double value;
  }
}