// Source listing: nthu.scopelab.tsqr.ssvd.itQJob
// (Non-code text from the original web listing page removed so the file compiles.)

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// modify from mahout-6.0 package org.apache.mahout.math.hadoop.stochasticsvd.itQJob
// 2013 Hsiu-Cheng Yu
package nthu.scopelab.tsqr.ssvd;

import java.io.Closeable;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.lib.MultipleOutputs;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import no.uib.cipr.matrix.Vector;
import no.uib.cipr.matrix.VectorEntry;
import no.uib.cipr.matrix.DenseVector;
import no.uib.cipr.matrix.Matrix;
import no.uib.cipr.matrix.sparse.FlexCompRowMatrix;

import nthu.scopelab.tsqr.QRFirstJob;
import nthu.scopelab.tsqr.math.QRFactorMultiply;
import nthu.scopelab.tsqr.matrix.LMatrixWritable;
import nthu.scopelab.tsqr.matrix.cmDenseMatrix;
import nthu.scopelab.tsqr.TSQRunner.fileGather;
import nthu.scopelab.tsqr.TSQRunner.Checker;
import nthu.scopelab.tsqr.math.QRF;

/**
 * Compute first level of Q.
 * ---
 * part of Modification:
 * 1. Replaced mahout VectorWritable by LMatrixWritable.
 * 2. Extends QRFirstJob in tsqr package to doing factorization.
 * 
 */

@SuppressWarnings("deprecation")
public final class itQJob {

    // Counter/timer categories for splitting elapsed time into compute vs. I/O.
    // NOTE(review): not referenced anywhere in this file — possibly used by callers; confirm before removing.
    public enum QTime {
        computation, communication
    }

    // JobConf keys for the randomized-SVD parameters shared between driver and tasks.
    public static final String PROP_OMEGA_SEED = "ssvd.omegaseed"; // seed for the random projection matrix Omega
    public static final String PROP_K = "prop.k"; // target decomposition rank k
    public static final String PROP_P = "prop.p"; // oversampling parameter p
    public static final String QF_MAT = QRFirstJob.QF_MAT; // named side-output for first-level Q blocks

    final static Logger LOG = LoggerFactory.getLogger(itQJob.class);

    /**
     * Shared base for {@link QMapper} and {@link QReducer}.
     * <p>
     * Per-record path ({@link #collect}): on the map side each incoming block is
     * QR-factorized, its Q streamed to the {@code QF_MAT} side output and its R
     * buffered; on the reduce side the value already is an R factor and is only
     * buffered. At task end ({@link #close}) all buffered R factors are stacked,
     * re-factorized, the merged Q split back per key to {@code QF_MAT}, and the
     * merged R emitted on the main output keyed by this task's id.
     */
    public static class QRMergeJob extends MapReduceBase {
        protected OutputCollector<IntWritable, LMatrixWritable> output; // set by the subclass before collect()
        protected IntWritable okey = new IntWritable();
        protected LMatrixWritable ovalue = new LMatrixWritable();
        protected String Mapred; // task type token from the task id: "m" (map) or "r" (reduce)
        protected int TaskId;
        protected MultipleOutputs qmos; // writer for the QF_MAT named side output
        protected List<cmDenseMatrix> RList = new ArrayList<cmDenseMatrix>();
        protected List<Integer> keyList = new ArrayList<Integer>();

        private boolean isR = false; // true when running inside a reduce task
        private long t1, t2, qt1, qt2, outputTime = 0; // coarse wall-clock instrumentation

        @Override
        public void configure(JobConf job) {
            qt1 = System.currentTimeMillis();
            // mapred.task.id has the shape attempt_<jobid>_<seq>_<m|r>_<task>_<attempt>;
            // index 3 carries the task type and index 4 the numeric task id.
            String[] idParts = job.get("mapred.task.id").split("_");
            Mapred = idParts[3];
            TaskId = Integer.parseInt(idParts[4]);
            qmos = new MultipleOutputs(job);
            if (Mapred.equals("r"))
                isR = true;
        }

        /**
         * Map side: factorize the block, emit Q immediately, buffer R and its key.
         * Reduce side: buffer a copy of the incoming R (the framework reuses the
         * Writable instance between calls, so a defensive copy is required).
         */
        public void collect(IntWritable key, LMatrixWritable value) throws IOException {
            if (!isR) {
                QRF qrf = QRF.factorize(value.getDense());
                RList.add(qrf.getR());
                keyList.add(key.get());

                okey.set(key.get());
                ovalue.set(qrf.getQ());
                t1 = System.currentTimeMillis();
                qmos.getCollector(QF_MAT, null).collect(okey, ovalue);
                t2 = System.currentTimeMillis();
                outputTime += (t2 - t1);
            } else {
                RList.add(value.getDense().copy());
                keyList.add(key.get());
            }
        }

        /**
         * Merges the buffered R factors and emits the split Q plus the merged R.
         * <p>
         * Fix: {@code qmos.close()} used to be reached only when this task had
         * buffered at least one R block, so an empty task leaked its side-output
         * writers and their files were never flushed; it is now always closed.
         */
        public void close() throws IOException {
            try {
                if (!RList.isEmpty()) {
                    // NOTE(review): assumes every buffered R block has the same number
                    // of rows as the first one — confirm with the producers upstream.
                    int subRows = RList.get(0).numRows();
                    int numCols = RList.get(0).numColumns();
                    int numRows = RList.size() * subRows;

                    // Vertically stack every buffered R into one tall matrix mR.
                    cmDenseMatrix mR = new cmDenseMatrix(numRows, numCols);
                    int rowOffset = 0;
                    while (!RList.isEmpty()) {
                        cmDenseMatrix dmat = RList.remove(0);
                        for (int i = 0; i < dmat.numRows(); i++)
                            for (int j = 0; j < dmat.numColumns(); j++) {
                                mR.set(i + rowOffset, j, dmat.get(i, j));
                            }
                        rowOffset += dmat.numRows();
                    }
                    QRF qrf = QRF.factorize(mR);
                    cmDenseMatrix outputQ = qrf.getQ();
                    cmDenseMatrix outputR = qrf.getR();

                    // Split the merged Q back into subRows-tall slices, one per buffered key,
                    // reusing a single output buffer (each collect() serializes immediately).
                    int curRowIndex = 0;
                    cmDenseMatrix outputSplitQ = new cmDenseMatrix(new double[subRows * numCols], subRows, numCols);
                    while (!keyList.isEmpty()) {
                        for (int i = 0; i < subRows; i++)
                            for (int j = 0; j < numCols; j++)
                                outputSplitQ.set(i, j, outputQ.get(i + curRowIndex, j));

                        okey.set(keyList.remove(0));
                        ovalue.set(outputSplitQ);
                        t1 = System.currentTimeMillis();
                        qmos.getCollector(QF_MAT, null).collect(okey, ovalue);
                        t2 = System.currentTimeMillis();
                        outputTime += (t2 - t1);
                        curRowIndex += subRows;
                    }
                    // The merged R goes to the regular job output, keyed by task id.
                    okey.set(TaskId);
                    ovalue.set(outputR);
                    output.collect(okey, ovalue);
                }
            } finally {
                // Always close the side output, even when this task saw no input.
                if (qmos != null) {
                    qmos.close();
                }
            }
            qt2 = System.currentTimeMillis();
            System.out.println("Output Time: " + outputTime);
            System.out.println("Total Time: " + (qt2 - qt1));
        }

    }

    /**
     * First-pass mapper: projects each incoming A block onto the random matrix
     * Omega (Y = A * Omega) and hands the projected block to the shared
     * QR/merge path inherited from {@link QRMergeJob}.
     */
    public static class QMapper extends QRMergeJob
            implements Mapper<IntWritable, LMatrixWritable, IntWritable, LMatrixWritable> {

        private Omega omega;               // random projection, rebuilt identically on every mapper from the seed
        private int kp;                    // k + p: number of projection columns
        private cmDenseMatrix subY = null; // reusable buffer for the projected block Y

        @Override
        public void configure(JobConf job) {
            super.configure(job);
            int rank = Integer.parseInt(job.get(PROP_K));
            int oversample = Integer.parseInt(job.get(PROP_P));
            kp = rank + oversample;
            omega = new Omega(Long.parseLong(job.get(PROP_OMEGA_SEED)), rank, oversample);
        }

        public void map(IntWritable key, LMatrixWritable value,
                OutputCollector<IntWritable, LMatrixWritable> output, Reporter reporter) throws IOException {
            // The A block may arrive in either sparse or dense representation.
            FlexCompRowMatrix sparseBlock = null;
            cmDenseMatrix denseBlock = null;
            int blockRows;
            if (value.isDense()) {
                denseBlock = value.getDense();
                blockRows = denseBlock.numRows();
            } else {
                sparseBlock = value.getSparse();
                blockRows = sparseBlock.numRows();
            }

            // Reallocate the Y buffer only when the current one is too small,
            // over-allocating 2x to reduce churn across differently-sized blocks.
            if (subY == null || subY.getData().length < blockRows * kp) {
                subY = new cmDenseMatrix(new double[blockRows * kp * 2], blockRows, kp);
            } else {
                subY.set(subY.getData(), blockRows, kp);
            }

            // Y = A * Omega for this block.
            subY.zero();
            if (denseBlock != null) {
                omega.computeY(denseBlock, subY);
            } else {
                omega.computeY(sparseBlock, subY);
            }
            ovalue.setLMat(value.getLongArray(), subY);

            // Remember the collector so close() can emit the merged R, then
            // push the projected block through the shared QR/merge path.
            this.output = output;
            collect(key, ovalue);
        }

        @Override
        public void close() throws IOException {
            super.close();
        }
    }

    /**
     * Reducer: buffers every incoming R block via the inherited
     * {@link QRMergeJob#collect}, which stores it for the merged QR
     * factorization performed in {@link QRMergeJob#close}.
     */
    public static class QReducer extends QRMergeJob
            implements Reducer<IntWritable, LMatrixWritable, IntWritable, LMatrixWritable> {

        public void reduce(IntWritable key, Iterator<LMatrixWritable> values,
                OutputCollector<IntWritable, LMatrixWritable> output, Reporter reporter) throws IOException {
            // Remember the collector so close() can emit the merged R block.
            this.output = output;
            for (; values.hasNext(); ) {
                collect(key, values.next());
            }
        }

        @Override
        public void close() throws IOException {
            super.close();
        }
    }

    /**
     * Drives the multi-stage itQ job chain. Stage 0 runs {@link QMapper} to
     * project the input (Y = A * Omega); every later stage runs an identity
     * mapper over the previous stage's merged-R output, with {@link QReducer}
     * re-merging at each level. Stage i writes to {@code outputPath/iter-r-(i+1)}.
     *
     * @param conf           base Hadoop configuration cloned into each stage's JobConf
     * @param inputPaths     input matrix blocks for the first stage
     * @param outputPath     root directory for the per-stage outputs
     * @param reduceSchedule comma-separated reducer counts, one entry per stage
     * @param k              target decomposition rank
     * @param p              oversampling parameter
     * @param seed           seed used to regenerate Omega identically in every task
     * @param mis            requested map input split size (possibly adjusted by Checker)
     */
    public static void run(Configuration conf, Path[] inputPaths, String outputPath, String reduceSchedule, int k,
            int p, long seed, int mis) throws ClassNotFoundException, InterruptedException, IOException {

        String stages[] = reduceSchedule.split(",");
        String rinput = "";
        String routput = outputPath + "/iter-r-";

        for (int i = 0; i < stages.length; i++) {
            String thenumber = Integer.toString(i + 1);
            JobConf job = new JobConf(conf, itQJob.class);
            job.setJobName("itQ-job-" + thenumber);
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setOutputFormat(SequenceFileOutputFormat.class);

            // Only the first stage projects the raw input; later stages just
            // forward the previous stage's R blocks into the reducers.
            if (i == 0)
                job.setMapperClass(QMapper.class);
            else
                job.setMapperClass(IdentityMapper.class);

            job.setReducerClass(QReducer.class);
            job.setOutputKeyClass(IntWritable.class);
            job.setOutputValueClass(LMatrixWritable.class);

            // Gather this stage's input files: original input for stage 0,
            // otherwise the previous stage's output directory.
            FileSystem fs = FileSystem.get(job);
            Path Paths[];
            fileGather fgather = null;
            if (i == 0)
                fgather = new fileGather(inputPaths, "part", fs);
            else
                fgather = new fileGather(new Path(rinput), "part", fs);
            Paths = fgather.getPaths();
            // Sanity-adjust the split size, then derive the map task count from it.
            mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
            job.setNumMapTasks(fgather.recNumMapTasks(mis));

            job.setNumReduceTasks(Integer.parseInt(stages[i]));

            // SSVD parameters for the tasks: column count k+p and Omega seed.
            job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
            job.setLong(PROP_OMEGA_SEED, seed);
            job.setInt(PROP_K, k);
            job.setInt(PROP_P, p);

            // Clear any leftover output from a previous run of this stage.
            fs.delete(new Path(routput + thenumber), true);

            FileInputFormat.setInputPaths(job, Paths);

            FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));

            //FileOutputFormat.setCompressOutput(job, true);
            //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
            //SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);
            //output first level Q
            // Register the QF_MAT named side output that QRMergeJob writes the
            // first-level Q blocks to.
            MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class, IntWritable.class,
                    LMatrixWritable.class);

            // Blocking submit; this stage's output becomes the next stage's input.
            RunningJob rj = JobClient.runJob(job);
            System.out.println("itQJob Job ID: " + rj.getJobID().toString());
            rinput = routput + thenumber;
        }
    }

}