org.qcri.pca.PCATest.java Source code

Java tutorial

Introduction

Here is the source code for org.qcri.pca.PCATest.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.qcri.pca;

import java.io.IOException;
import java.net.URL;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.qcri.pca.SPCADriver.InitialValues;
import org.junit.Before;
import org.junit.Test;
import org.junit.Assert;

/**
 * @author maysam yabandeh
 */
public class PCATest {// extends org.apache.mahout.common.MahoutTestCase {

    final static double EPSILON = 0.00001d;
    SPCADriver ppcaDriver;
    int N;//number of rows
    int D;//number of cols
    int d;//number of principal components
    Configuration conf;
    Path input;
    Path output;
    Path tmp;

    @Before
    public void setup() {
        ppcaDriver = new SPCADriver() {
            public Path getTempPath() {
                return tmp;
            }
        };
        N = 527;
        D = 38;
        d = 8;
        conf = new Configuration();
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        URL inputURL = this.getClass().getResource("/input.water");
        input = new Path(inputURL.toString());
        long currTime = System.currentTimeMillis();
        output = new Path("/tmp/" + currTime + "/output");
        tmp = new Path("/tmp/" + currTime + "/tmp");
        FileSystem fs;
        try {
            fs = FileSystem.get(output.toUri(), conf);
            fs.mkdirs(output);
            fs.mkdirs(tmp);
            fs.deleteOnExit(output);
            fs.deleteOnExit(tmp);
        } catch (IOException e) {
            e.printStackTrace();
            Assert.fail("Error in creating output direcoty " + output);
            return;
        }
    }

    @Test
    public void crossTestSequentialPPCAs() throws Exception {
        double jakobErr = ppcaDriver.runSequential_JacobVersion(conf, input, output, N, D, d);
        PCACommon.random = new Random(0);
        double bishopErr = ppcaDriver.runSequential(conf, input, output, N, D, d);
        Assert.assertEquals(
                "The PPCA error between two sequntial methods is too different: " + jakobErr + "!= " + bishopErr,
                jakobErr, bishopErr, 0.01d);
    }

    @Test
    public void crossTestIterationOfMapReducePPCASequentialPPCA() throws Exception {
        Matrix C_central = PCACommon.randomMatrix(D, d);
        double ss = PCACommon.randSS();
        InitialValues initValSeq = new InitialValues(C_central, ss);
        InitialValues initValMR = new InitialValues(C_central.clone(), ss);

        //1. run sequential
        Matrix Ye_central = new DenseMatrix(N, D);
        int row = 0;
        for (VectorWritable vw : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST, null,
                conf)) {
            Ye_central.assignRow(row, vw.get());
            row++;
        }
        double bishopSeqErr = ppcaDriver.runSequential(conf, Ye_central, initValSeq, 1);

        //2. run mapreduce
        DistributedRowMatrix Ye = new DistributedRowMatrix(input, tmp, N, D);
        Ye.setConf(conf);
        double bishopMRErr = ppcaDriver.runMapReduce(conf, Ye, initValMR, output, N, D, d, 1, 1, 1, 1);

        Assert.assertEquals("ss value is different in sequential and mapreduce PCA", initValSeq.ss, initValMR.ss,
                EPSILON);
        double seqCTrace = PCACommon.trace(initValSeq.C);
        double mrCTrace = PCACommon.trace(initValMR.C);
        Assert.assertEquals("C value is different in sequential and mapreduce PCA", seqCTrace, mrCTrace, EPSILON);
        Assert.assertEquals("The PPCA error between sequntial and mapreduce methods is too different: "
                + bishopSeqErr + "!= " + bishopMRErr, bishopSeqErr, bishopMRErr, EPSILON);
    }

    /* Too slow
    @Test
    public void crossTestMapReducePPCASequentialPPCA() throws Exception {
      double bishopSeqErr = ppcaDriver.runSequential(conf, input, 
    output, N, D, d);
      PCACommon.random = new Random(0);
      double bishopMRErr = ppcaDriver.runMapReduce(conf, input, 
    output, N, D, d, 1);
      Assert.assertEquals(
    "The PPCA error between sequntial and mapreduce methods is too different: "
        + bishopSeqErr + "!= " + bishopMRErr, bishopSeqErr, bishopMRErr, 0.01d);
    }
    */

}