org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDPCASparseTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDPCASparseTest.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.hadoop.stochasticsvd;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.*;
import org.apache.mahout.math.function.DoubleFunction;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.function.VectorFunction;
import org.junit.Test;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.Deque;
import java.util.Iterator;
import java.util.Random;

public class LocalSSVDPCASparseTest extends MahoutTestCase {

    private static final double s_epsilon = 1.0E-10d;

    /**
     * Checks {@code Omega.mutlithreadedTRightMultiply(xi)} against an explicit
     * dense computation of {@code Omega' * xi}.
     * <p>
     * The first {@code n} rows of the implicit {@link Omega} are materialized into
     * a dense {@code n x kp} matrix, a random vector {@code xi} of length
     * {@code n} is generated, and the sum of absolute differences between the two
     * products must not exceed {@code s_epsilon}.
     */
    @Test
    public void testOmegaTRightMultiply() {
        final Random rnd = RandomUtils.getRandom();
        final long seed = rnd.nextLong();
        final int n = 2000;
        final int kp = 100;

        final Omega omega = new Omega(seed, kp);

        // Dense copy of Omega, used only to compute the control value.
        final Matrix materializedOmega = new DenseMatrix(n, kp);
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < kp; j++) {
                materializedOmega.setQuick(i, j, omega.getQuick(i, j));
            }
        }

        // Random input vector; the previous cell value is ignored.
        Vector xi = new DenseVector(n);
        xi.assign(new DoubleFunction() {
            @Override
            public double apply(double x) {
                return rnd.nextDouble() * 100;
            }
        });

        // Value under test.
        Vector s_o = omega.mutlithreadedTRightMultiply(xi);

        // Control value: Omega' * xi computed through a one-column dense matrix.
        Matrix xiVector = new DenseMatrix(n, 1);
        xiVector.assignColumn(0, xi);
        Vector s_o_control = materializedOmega.transpose().times(xiVector).viewColumn(0);

        // Print both vectors BEFORE asserting so they are available when the
        // assertion fails.
        System.out.printf("s_omega=\n%s\n", s_o);
        System.out.printf("s_omega_control=\n%s\n", s_o_control);

        // Sum of absolute element-wise differences must be within tolerance.
        assertEquals("Omega' * xi differs from the dense control computation", 0,
                s_o.minus(s_o_control).aggregate(Functions.PLUS, Functions.ABS), s_epsilon);
    }

    /**
     * Runs the sparse PCA solver scenario of {@link #runSSVDSolver(int)} with
     * {@code q = 1}.
     *
     * @throws IOException propagated from {@link #runSSVDSolver(int)}
     */
    @Test
    public void runPCATest1() throws IOException {
        runSSVDSolver(1);
    }

    // Same scenario as runPCATest1 but with q = 0. The @Test annotation below is
    // commented out, so this method carries no JUnit annotation.
    // NOTE(review): presumably disabled to keep the suite fast -- confirm before
    // re-enabling.
    //  @Test
    public void runPCATest0() throws IOException {
        runSSVDSolver(0);
    }

    /**
     * End-to-end PCA scenario: generates a random sparse input, runs
     * {@link SSVDSolver} on it with a PCA mean vector, and compares the result
     * with an in-memory {@link SingularValueDecomposition} of the mean-subtracted
     * input.
     * <p>
     * Steps, in order:
     * <ol>
     * <li>write {@code m = 2000} named sparse rows of width {@code n = 100} to a
     * local sequence file while accumulating the column mean {@code xi};</li>
     * <li>copy the input under the working directory and save {@code xi};</li>
     * <li>run the solver with {@code k = 40}, {@code p = 60} and the given
     * {@code q}, with U and V output switched off and U*Sigma output switched
     * on;</li>
     * <li>assert that the first {@code k + p} singular values match the reference
     * within {@code s_epsilon}, that the Q output passes
     * {@code SSVDCommonTest.assertOrthonormality}, and that the U*Sigma rows come
     * back as {@link NamedVector}s keyed by {@link Text}.</li>
     * </ol>
     *
     * @param q value handed to {@code SSVDSolver.setQ}; presumably the number of
     *          power iterations -- TODO confirm against SSVDSolver
     * @throws IOException if any of the file-system or solver calls fails
     */
    public void runSSVDSolver(int q) throws IOException {

        // Run everything in-process against the local file system.
        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");

        // conf.set("mapred.job.tracker","localhost:11011");
        // conf.set("fs.default.name","hdfs://localhost:11010/");

        // Resources registered here are closed in the finally block.
        Deque<Closeable> closeables = Lists.newLinkedList();
        try {
            Random rnd = RandomUtils.getRandom();

            File tmpDir = getTestTempDir("svdtmp");
            conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

            Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

            // create distributed row matrix-like struct
            SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath, Text.class,
                    VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
            // Registered so the writer is still closed if row generation throws.
            closeables.addFirst(w);

            int n = 100;
            int m = 2000;
            double percent = 5;

            // Writable holders reused for every appended row.
            VectorWritable vw = new VectorWritable();
            Text rkey = new Text();

            // Accumulates column sums here; scaled to the column mean after the loop.
            Vector xi = new DenseVector(n);

            double muAmplitude = 50.0;
            for (int i = 0; i < m; i++) {
                Vector dv = new SequentialAccessSparseVector(n);
                String rowname = "row-" + i;
                NamedVector namedRow = new NamedVector(dv, rowname);
                // n * percent / 100 = 5 writes per row at random column indices;
                // indices may repeat, so a row holds at most 5 non-zero entries.
                for (int j = 0; j < n * percent / 100; j++) {
                    dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.25));
                }
                // NOTE(review): the key text ("row-i" + i) differs from the vector
                // name ("row-" + i) -- confirm whether that is intentional.
                rkey.set("row-i" + i);
                vw.set(namedRow);
                w.append(rkey, vw);
                xi.assign(dv, Functions.PLUS);
            }
            // Writing is done: unregister the writer and close it right away so the
            // file is complete before it is copied below.
            closeables.remove(w);
            Closeables.close(w, false);

            // Column sums -> column means.
            xi.assign(Functions.mult(1.0 / m));

            FileSystem fs = FileSystem.get(conf);

            // Lay out the solver's working directory: input, mean vector, output.
            Path tempDirPath = getTestTempDirPath("svd-proc");
            Path aPath = new Path(tempDirPath, "A/A.seq");
            fs.copyFromLocalFile(aLocPath, aPath);
            Path xiPath = new Path(tempDirPath, "xi/xi.seq");
            SSVDHelper.saveVector(xi, xiPath, conf);

            Path svdOutPath = new Path(tempDirPath, "SSVD-out");

            // make sure we wipe out previous test results, just a convenience
            fs.delete(svdOutPath, true);

            // Solver starts here:
            System.out.println("Input prepared, starting solver...");

            int ablockRows = 867;
            int p = 60;
            int k = 40;
            // NOTE(review): the trailing constructor argument (3) is presumably the
            // number of reduce tasks -- confirm against SSVDSolver.
            SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
            ssvd.setOuterBlockHeight(500);
            ssvd.setAbtBlockHeight(251);
            ssvd.setPcaMeanPath(xiPath);

            /*
             * The U and V jobs are switched off in this test to reduce running time;
             * per the original author they stay enabled in the dense test.
             *
             * For the PCA test we also request the U*Sigma output so that it can be
             * checked for named-vector propagation below.
             */
            ssvd.setComputeU(false);
            ssvd.setComputeV(false);
            ssvd.setcUSigma(true);

            ssvd.setOverwrite(true);
            ssvd.setQ(q);
            ssvd.setBroadcast(true);
            ssvd.run();

            Vector stochasticSValues = ssvd.getSingularValues();

            // try to run the same thing without stochastic algo
            Matrix a = SSVDHelper.drmLoadAsDense(fs, aPath, conf);

            // Called while 'a' still holds the raw (not yet mean-subtracted) input.
            verifyInternals(svdOutPath, a, new Omega(ssvd.getOmegaSeed(), k + p), k + p, q);

            // subtract pseudo pca mean
            for (int i = 0; i < m; i++) {
                a.viewRow(i).assign(xi, Functions.MINUS);
            }

            // Reference decomposition of the mean-subtracted matrix.
            SingularValueDecomposition svd2 = new SingularValueDecomposition(a);

            Vector svalues2 = new DenseVector(svd2.getSingularValues());

            System.out.println("--SSVD solver singular values:");
            LocalSSVDSolverSparseSequentialTest.dumpSv(stochasticSValues);
            System.out.println("--SVD solver singular values:");
            LocalSSVDSolverSparseSequentialTest.dumpSv(svalues2);

            // The first k + p singular values must agree within s_epsilon.
            for (int i = 0; i < k + p; i++) {
                assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
            }

            // Load the Q blocks written under Bt-job and check them.
            DenseMatrix mQ = SSVDHelper.drmLoadAsDense(fs, new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"),
                    conf);

            // NOTE(review): the meaning of the 'false' flag is defined in
            // SSVDCommonTest.assertOrthonormality -- not visible from here.
            SSVDCommonTest.assertOrthonormality(mQ, false, s_epsilon);

            // assert name propagation
            // 'closeables' is handed to drmIterator, presumably so it can register
            // whatever it opens for the finally block -- confirm in SSVDHelper.
            for (Iterator<Pair<Writable, Vector>> iter = SSVDHelper.drmIterator(fs,
                    new Path(ssvd.getuSigmaPath() + "/*"), conf, closeables); iter.hasNext();) {
                Pair<Writable, Vector> pair = iter.next();
                Writable key = pair.getFirst();
                Vector v = pair.getSecond();

                assertTrue(v instanceof NamedVector);
                assertTrue(key instanceof Text);
            }

        } finally {
            IOUtils.close(closeables);
        }
    }

    /**
     * Recomputes the intermediate PCA quantities of the solver in memory and
     * prints them for manual inspection. Nothing is asserted here.
     * <p>
     * With {@code xi} the column mean of {@code a} and {@code s_o = Omega' * xi},
     * the method forms {@code Y = A * Omega} and subtracts {@code s_o} from every
     * one of its {@code m} rows, takes the Q factor of {@code Y}, and prints
     * {@code s_omega}, the column sums of Q ({@code s_q}) and {@code Q' * A * xi}
     * ({@code s_b}).
     *
     * @param tempDir solver output directory; not used by this method
     * @param a       input matrix (m x n), expected to be the raw input before
     *                mean subtraction
     * @param omega   implicit random matrix to materialize
     * @param kp      number of columns of {@code omega} to materialize
     * @param q       solver parameter of the run being verified; not used by this
     *                method
     */
    private void verifyInternals(Path tempDir, Matrix a, Omega omega, int kp, int q) {
        int m = a.numRows();
        int n = a.numCols();

        // Column mean of A.
        Vector xi = a.aggregateColumns(new VectorFunction() {
            @Override
            public double apply(Vector v) {
                return v.zSum() / v.size();
            }
        });

        // materialize omega
        Matrix momega = new DenseMatrix(n, kp);
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < kp; j++) {
                momega.setQuick(i, j, omega.getQuick(i, j));
            }
        }

        Vector s_o = omega.mutlithreadedTRightMultiply(xi);

        System.out.printf("s_omega=\n%s\n", s_o);

        // Y = A * Omega is m x kp, so the correction must be applied to all m rows.
        // (The loop previously stopped at n, the column count of A, which left
        // rows n..m-1 uncorrected whenever m > n.)
        Matrix y = a.times(momega);
        for (int i = 0; i < m; i++) {
            y.viewRow(i).assign(s_o, Functions.MINUS);
        }

        QRDecomposition qr = new QRDecomposition(y);
        Matrix qm = qr.getQ();

        // Column sums of Q.
        Vector s_q = qm.aggregateColumns(new VectorFunction() {
            @Override
            public double apply(Vector v) {
                return v.zSum();
            }
        });

        System.out.printf("s_q=\n%s\n", s_q);

        Matrix b = qm.transpose().times(a);

        Vector s_b = b.times(xi);

        System.out.printf("s_b=\n%s\n", s_b);

    }

}