org.qcri.pca.ReconstructionErrJobTest.java Source code

Introduction

Here is the source code for org.qcri.pca.ReconstructionErrJobTest.java, a JUnit test that verifies the mapper and reducer of the ReconstructionErrJob MapReduce job in the org.qcri.pca package.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.qcri.pca;

import java.io.IOException;
import java.util.List;

import org.junit.Assert;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.junit.Before;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

/**
 * @author maysam yabandeh
 */
public class ReconstructionErrJobTest extends PCATestCase {

    // try different combinations of 0, +, and -
    // it assumes NaNs have already been removed as part of normalization
    private final double[][] inputVectors = { { -0.49, 0, 3, -13.3 }, { 0, -0.48, 0.45, 3.2 },
            { 0.13, -0.06, 4, -0.00003 } };
    private final double[][] y2xVectors = { { -0.49, 0 }, { 0, -0.48 }, { 0, 0.71 }, { 4, -0.00003 } };
    final int xsize = y2xVectors[0].length;
    private final double[][] cVectors = { { 0, 0.000302 }, { 0, 453 }, { -0.0032, -9.9999 }, { -0.00003, 0 } };
    int cols = inputVectors[0].length;
    int rows = inputVectors.length;
    double[] ym; // column-wise means of the input vectors
    private Configuration conf;
    private Path ymPath;
    private Path zmPath;
    private Path y2xMatrixPath;
    private Path cMatrixPath;
    private double reconstructionError;
    private double yNorm;
    private double centralizedYNorm;

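    // Abstraction over the input representation so that the same test can be
    // run against both dense and sequential-access sparse vectors.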
    interface Array2Vector {
        Vector get(double[] array);
    }

    class Array2DenseVector implements Array2Vector {
        public Vector get(double[] array) {
            return new DenseVector(array);
        }
    }

    class Array2SparseVector implements Array2Vector {
        public Vector get(double[] array) {
            return new SequentialAccessSparseVector(new DenseVector(array));
        }
    }

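    /**
     * Prepares the job inputs in a temporary directory: computes the column
     * means ym, the projected means xm = ym * Y2X and zm = xm * C' - ym,
     * writes ym and zm as distributed vectors and Y2X and C as distributed
     * row matrices, and precomputes the expected outputs via computeError().
     */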
    @Before
    public void setup() throws Exception {
        conf = new Configuration();
        long currTime = System.currentTimeMillis();
        Path outputDir = new Path("/tmp/" + currTime);
        FileSystem fs;
        try {
            fs = FileSystem.get(outputDir.toUri(), conf);
            fs.mkdirs(outputDir);
            fs.deleteOnExit(outputDir);
        } catch (IOException e) {
            e.printStackTrace();
            Assert.fail("Error in creating output directory " + outputDir);
            return;
        }
        ym = computeMean(inputVectors);
        double[] xm = new double[xsize];
        times(ym, y2xVectors, xm);
        double[] zm = new double[cols];
        timesTranspose(xm, cVectors, zm);
        for (int c = 0; c < cols; c++)
            zm[c] -= ym[c];
        ymPath = PCACommon.toDistributedVector(new DenseVector(ym), outputDir, "ym", conf);
        zmPath = PCACommon.toDistributedVector(new DenseVector(zm), outputDir, "zm", conf);
        DistributedRowMatrix distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(y2xVectors), outputDir,
                outputDir, "y2xMatrix");
        y2xMatrixPath = distMatrix.getRowPath();
        distMatrix = PCACommon.toDistributedRowMatrix(new DenseMatrix(cVectors), outputDir, outputDir, "cMatrix");
        cMatrixPath = distMatrix.getRowPath();
        computeError(inputVectors);
    }

    @Test
    public void testWithDenseVectors() throws Exception {
        testJob(new Array2DenseVector());
    }

    @Test
    public void testWithSparseVectors() throws Exception {
        testJob(new Array2SparseVector());
    }

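    /**
     * Exercises the job in memory: feeds every input row to MyMapper, groups
     * the map output by key with a DummyRecordWriter, runs MyReducer over the
     * grouped values, and checks the reducer output.
     */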
    void testJob(Array2Vector array2Vector) throws Exception {
        conf.set(ReconstructionErrJob.MATRIXY2X, y2xMatrixPath.toString());
        conf.set(ReconstructionErrJob.RECONSTRUCTIONMATRIX, cMatrixPath.toString());
        conf.set(ReconstructionErrJob.YMPATH, ymPath.toString());
        conf.set(ReconstructionErrJob.ZMPATH, zmPath.toString());
        conf.setInt(ReconstructionErrJob.YCOLS, cols);
        conf.setInt(ReconstructionErrJob.XCOLS, xsize);
        ReconstructionErrJob.MyMapper mapper = new ReconstructionErrJob.MyMapper();
        // construct the writers
        DummyRecordWriter<IntWritable, VectorWritable> mapWriter = new DummyRecordWriter<IntWritable, VectorWritable>();
        Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable>.Context context = DummyRecordWriter
                .build(mapper, conf, mapWriter);
        // perform the mapping
        mapper.setup(context);
        for (int i = 0; i < inputVectors.length; i++) {
            VectorWritable row = new VectorWritable(array2Vector.get(inputVectors[i]));
            mapper.map(new IntWritable(i), row, context);
        }
        mapper.cleanup(context);

        // perform the reducing
        ReconstructionErrJob.MyReducer reducer = new ReconstructionErrJob.MyReducer();
        DummyRecordWriter<IntWritable, DoubleWritable> redWriter = new DummyRecordWriter<IntWritable, DoubleWritable>();
        Reducer<IntWritable, VectorWritable, IntWritable, DoubleWritable>.Context redContext = DummyRecordWriter
                .build(reducer, conf, redWriter, IntWritable.class, VectorWritable.class);
        for (IntWritable key : mapWriter.getKeys()) {
            reducer.reduce(key, mapWriter.getValue(key), redContext);
        }

        verifyReducerOutput(redWriter);
    }

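    /**
     * The reducer is expected to emit exactly one value for each of three
     * keys: 0 = reconstruction error, 1 = yNorm, 2 = centralized yNorm.
     */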
    private void verifyReducerOutput(DummyRecordWriter<IntWritable, DoubleWritable> writer) {
        Assert.assertEquals("The reducer should output three key!", 3, writer.getKeys().size());
        for (IntWritable key : writer.getKeys()) {
            List<DoubleWritable> list = writer.getValue(key);
            assertEquals("reducer produces more than one values per key!", 1, list.size());
            Double value = list.get(0).get();
            switch (key.get()) {
            case 0:
                assertEquals("the computed reconstructionError is incorrect!", reconstructionError, value, EPSILON);
                break;
            case 1:
                assertEquals("the computed yNorm is incorrect!", yNorm, value, EPSILON);
                break;
            case 2:
                assertEquals("the computed centralizedYNorm is incorrect!", centralizedYNorm, value, EPSILON);
                break;
            default:
                fail("Unknown key in reading the results: " + key);
            }
        }
    }

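    /**
     * Reference implementation of what the job should compute: take the norm
     * of Y, centralize Y by subtracting ym, take the norm again, then project
     * each centralized row (xi = yi * Y2X), reconstruct it (newyi = xi * C'),
     * and take the norm of the absolute reconstruction difference. Operates
     * on a deep copy so the caller's data is left untouched.
     */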
    private void computeError(double[][] vectors) {
        double[][] vectorsCopy = vectors.clone();
        for (int i = 0; i < vectors.length; i++)
            vectorsCopy[i] = vectors[i].clone();
        vectors = vectorsCopy;
        yNorm = fnorm(vectors);
        for (int r = 0; r < rows; r++)
            for (int c = 0; c < cols; c++)
                vectors[r][c] -= ym[c];
        centralizedYNorm = fnorm(vectors);
        double[] xi = new double[xsize];
        double[] newyi = new double[cols];
        for (int r = 0; r < rows; r++) {
            double[] yi = vectors[r];
            times(yi, y2xVectors, xi);
            timesTranspose(xi, cVectors, newyi);
            for (int c = 0; c < cols; c++)
                yi[c] = Math.abs(yi[c] - newyi[c]);
        }
        reconstructionError = fnorm(vectors);
    }

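    // Despite the name, this is not the Frobenius norm: it returns the
    // maximum absolute column sum of the matrix (the induced 1-norm), which
    // is the norm used by computeError().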
    private double fnorm(double[][] vectors) {
        double max = 0; // column sums of absolute values are never negative
        for (int c = 0; c < cols; c++) {
            double sum = 0;
            for (int r = 0; r < rows; r++)
                sum += Math.abs(vectors[r][c]);
            max = Math.max(max, sum);
        }
        return max;
    }

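    // Column-wise mean of the input rows.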
    private double[] computeMean(double[][] vectors) {
        double[] meanVector = new double[cols];
        for (int c = 0; c < cols; c++) {
            double sum = 0;
            for (int r = 0; r < rows; r++) {
                double val = vectors[r][c];
                sum += val;
            }
            meanVector[c] = sum / rows;
        }
        return meanVector;
    }

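    // resVector = ym * matrix (row vector times matrix).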
    private void times(double[] ym, double[][] matrix, double[] resVector) {
        int maxC = matrix[0].length;
        int maxR = matrix.length;
        for (int inMemC = 0; inMemC < maxC; inMemC++) {
            double sum = 0;
            for (int r = 0; r < maxR; r++) {
                double val = ym[r];
                double inMemVal = matrix[r][inMemC];
                sum += val * inMemVal;
            }
            resVector[inMemC] = sum;
        }
    }

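    // resVector = vector * transpose(matrix) (row vector times the matrix
    // transpose).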
    private void timesTranspose(double[] vector, double[][] matrix, double[] resVector) {
        int maxC = matrix[0].length;
        int maxR = matrix.length;
        for (int inMemR = 0; inMemR < maxR; inMemR++) {
            double sum = 0;
            for (int c = 0; c < maxC; c++) {
                double val = vector[c];
                double inMemVal = matrix[inMemR][c];
                sum += val * inMemVal;
            }
            resVector[inMemR] = sum;
        }
    }

}