org.qcri.pca.NormalizeJob.java Source code

Java tutorial

Introduction

Here is the source code for org.qcri.pca.NormalizeJob.java

Source

/**
 * QCRI, sPCA LICENSE
 * sPCA is a scalable implementation of Principal Component Analysis (PCA) on top of Spark and MapReduce
 *
 * Copyright (c) 2015, Qatar Foundation for Education, Science and Community Development (on
 * behalf of Qatar Computing Research Institute) having its principle place of business in Doha,
 * Qatar with the registered address P.O box 5825 Doha, Qatar (hereinafter referred to as "QCRI")
 *
*/

package org.qcri.pca;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.function.DoubleDoubleFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.io.Closeables;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * Normalizes a matrix by dividing each value by the span of its column. After
 * normalization, the difference between any two values in a column is
 * {@code <= 1}.
 * <p>
 * The job is map-only: every input row is read from a sequence file, scaled by
 * {@link NormalizeMapper} and written back out as a sequence file of
 * {@code IntWritable -> VectorWritable} pairs.
 *
 * @author maysam yabandeh
 */
public class NormalizeJob extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(NormalizeJob.class);

    /** Command-line option and configuration key holding the mean/span file name. */
    public static final String MEANSPANOPTION = "meanSpanFile";
    /** Configuration key under which the sample rate is handed to the mapper. */
    public static final String SAMPLERATE = "sampleRate";
    /** Command-line option name; it is registered by {@link #run(String[])} but not read by this job. */
    public static final String SPLITFACTOR = "splitFactor";

    /**
     * Command-line entry point: parses the arguments and runs the normalization
     * over the whole input (sample rate 1).
     *
     * @param strings
     *          the raw command-line arguments
     * @return 0 when the job ran, -1 when the arguments could not be parsed or
     *         the mean/span file option is missing
     * @throws Exception
     *           if argument parsing or the job itself fails
     */
    @Override
    public int run(String[] strings) throws Exception {
        addInputOption();
        addOutputOption();
        addOption(MEANSPANOPTION, "mean", "The name of the file that contains mean and span vectors");
        addOption(SPLITFACTOR, "sf", "The factor with which the default split size is divided");
        Map<String, List<String>> parsedArgs = parseArguments(strings);
        if (parsedArgs == null) {
            return -1;
        }
        String meanSpanFileName = getOption(MEANSPANOPTION);
        if (meanSpanFileName == null) {
            // Fail here with a clear message instead of letting a null value
            // blow up inside Configuration.set(...) further down.
            log.error("Missing required option --{}", MEANSPANOPTION);
            return -1;
        }
        run(getConf(), getInputPath(), meanSpanFileName, getOutputPath(), 1);
        return 0;
    }

    /**
     * Normalizes the input matrix by dividing its elements by the column range,
     * unless the output directory already exists, in which case the job is
     * skipped and the existing path is returned.
     *
     * @param conf
     *          the configuration
     * @param inputPath
     *          the path to the input matrix Y
     * @param meanSpanPath
     *          the path to the file that contains mean and span (generated by
     *          MeanAndSpanJob)
     * @param outputPath
     *          the parent directory under which the result directory
     *          {@code "normalizeY" + id} is created
     * @param sampleRate
     *          if < 1, a sample of the matrix is taken for normalization
     * @param id
     *          suffix appended to the result directory name
     * @return the path to which the resulting normalized matrix is stored
     * @throws Exception
     *           if the file system cannot be accessed or the job fails
     */
    public Path normalize(Configuration conf, Path inputPath, Path meanSpanPath, Path outputPath, double sampleRate,
            String id) throws Exception {
        Path normalizedYePath = new Path(outputPath, "normalizeY" + id);
        FileSystem fs = FileSystem.get(normalizedYePath.toUri(), conf);
        if (fs.exists(normalizedYePath)) {
            log.warn("---------- Skip NormalizeJob - already exists: {}", normalizedYePath);
        } else {
            run(conf, inputPath, meanSpanPath.toString(), normalizedYePath, sampleRate);
        }
        return normalizedYePath;
    }

    /**
     * Configures and runs the map-only normalization job and waits for it to
     * finish.
     * <p>
     * Note that {@code conf} is modified: the mean/span file name and the sample
     * rate are stored in it so that the mapper can read them.
     *
     * @param conf
     *          the configuration; receives {@link #MEANSPANOPTION} and
     *          {@link #SAMPLERATE}
     * @param matrixInputPath
     *          the sequence-file input holding the matrix rows
     * @param meanSpanFileName
     *          the name of the file that contains the mean and span vectors
     * @param matrixOutputPath
     *          the directory the normalized rows are written to
     * @param sampleRate
     *          the sample rate handed to the mapper (stored as a float)
     * @throws IOException
     *           if the job cannot be set up or does not complete successfully
     * @throws InterruptedException
     *           if waiting for the job is interrupted
     * @throws ClassNotFoundException
     *           if a class required by the job cannot be found
     */
    public void run(Configuration conf, Path matrixInputPath, String meanSpanFileName, Path matrixOutputPath,
            double sampleRate) throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(MEANSPANOPTION, meanSpanFileName);
        conf.setFloat(SAMPLERATE, (float) sampleRate);

        Job job = new Job(conf);
        job.setJobName("Normalize");
        job.setJarByClass(NormalizeJob.class);

        FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
        matrixInputPath = fs.makeQualified(matrixInputPath);
        matrixOutputPath = fs.makeQualified(matrixOutputPath);

        FileInputFormat.addInputPath(job, matrixInputPath);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileOutputFormat.setOutputPath(job, matrixOutputPath);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setMapperClass(NormalizeMapper.class);
        job.setNumReduceTasks(0); // map-only job
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(VectorWritable.class);

        // waitForCompletion submits the job if that has not happened yet and
        // reports success as a boolean; a failed job must not pass silently.
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IOException(
                    "Normalize job failed; input=" + matrixInputPath + ", output=" + matrixOutputPath);
        }
    }

    /**
     * Mapper that divides every non-zero entry of a row by the span of its
     * column and emits the row under its original key. Rows may be dropped
     * according to the configured sample rate.
     */
    public static class NormalizeMapper extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
        /**
         * Element-wise normalization: NaN values are treated as 0 and a span of
         * 0 is treated as 1 so that the division is always defined. The function
         * is stateless, so a single shared instance is sufficient.
         */
        private static final DoubleDoubleFunction DIVIDE_BY_SPAN = new DoubleDoubleFunction() {
            @Override
            public double apply(double value, double span) {
                double cleaned = Double.isNaN(value) ? 0 : value;
                return cleaned / (span != 0 ? span : 1);
            }
        };

        private DenseVector spanVector;
        private double sampleRate = 1;

        /**
         * Reads the sample rate and loads the span vector from the mean/span
         * file named in the configuration.
         *
         * @param context
         *          the task context supplying the configuration
         * @throws IOException
         *           if the file cannot be read or contains no span vector
         */
        @Override
        public void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            sampleRate = conf.getFloat(SAMPLERATE, 1);
            String meanSpanFileName = conf.get(MEANSPANOPTION);
            Path meanSpanFile = new Path(meanSpanFileName);
            SequenceFileIterator<IntWritable, VectorWritable> iterator = new SequenceFileIterator<IntWritable, VectorWritable>(
                    meanSpanFile, true, conf);
            try {
                // Scan for the record keyed as the span vector rather than
                // assuming the file holds exactly two records.
                while (iterator.hasNext()) {
                    Pair<IntWritable, VectorWritable> record = iterator.next();
                    if (record.getFirst().get() == MeanAndSpanJob.SPANVECTOR) {
                        // Copy: the iterator reuses its key/value instances.
                        spanVector = new DenseVector(record.getSecond().get());
                        break;
                    }
                }
            } finally {
                Closeables.close(iterator, false);
            }
            if (spanVector == null) {
                throw new IOException("No span vector found in " + meanSpanFile);
            }
        }

        /**
         * Normalizes one row in place and writes it out under the same key.
         *
         * @param r
         *          the row key, passed through unchanged
         * @param v
         *          the row; its underlying vector is modified in place
         * @param context
         *          the task context the result is written to
         * @throws IOException
         *           if writing the output fails
         * @throws InterruptedException
         *           if writing the output is interrupted
         */
        @Override
        public void map(IntWritable r, VectorWritable v, Context context) throws IOException, InterruptedException {
            // The row is dropped when PCACommon.pass returns true (presumably a
            // random sampling decision based on sampleRate - see PCACommon).
            if (PCACommon.pass(sampleRate)) {
                return;
            }
            Vector row = v.get();
            sparseVectorAssign(row, spanVector, DIVIDE_BY_SPAN);
            context.write(r, v);
        }
    }

    //utility functions
    /**
     * Applies {@code function} to every non-zero element of {@code mainV},
     * paired with the element of {@code otherV} at the same index, and stores
     * the result back into {@code mainV}. Elements of {@code mainV} that are
     * zero are not visited.
     *
     * @param mainV
     *          the vector that is updated in place
     * @param otherV
     *          supplies the second argument of the function for each index
     * @param function
     *          computes the new value from (mainV element, otherV element)
     */
    static void sparseVectorAssign(Vector mainV, final Vector otherV, DoubleDoubleFunction function) {
        // Results equal to zero are written only after the iteration has
        // finished so that they do not affect the non-zero iterator.
        List<IndexValue> newZeroElements = new java.util.ArrayList<IndexValue>();
        Iterator<Vector.Element> nonZeroElements = mainV.nonZeroes().iterator();
        while (nonZeroElements.hasNext()) {
            Vector.Element element = nonZeroElements.next();
            int index = element.index();
            double result = function.apply(element.get(), otherV.getQuick(index));
            if (result != 0) {
                mainV.setQuick(index, result);
            } else {
                newZeroElements.add(new IndexValue(index, result));
            }
        }
        for (IndexValue zeroed : newZeroElements) {
            mainV.setQuick(zeroed.index, zeroed.value);
        }
    }

    /** Simple holder pairing a vector index with the value to store at that index. */
    static class IndexValue {
        IndexValue(int index, double value) {
            this.index = index;
            this.value = value;
        }

        int index;
        double value;
    }

}