at.illecker.hama.hybrid.examples.matrixmultiplication2.DistributedRowMatrix.java Source code

Java tutorial

Introduction

Here is the source code for at.illecker.hama.hybrid.examples.matrixmultiplication2.DistributedRowMatrix.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package at.illecker.hama.hybrid.examples.matrixmultiplication2;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hama.HamaConfiguration;
import org.apache.hama.bsp.BSPJob;
import org.apache.hama.commons.io.VectorWritable;
import org.apache.hama.commons.math.DenseDoubleMatrix;
import org.apache.hama.commons.math.DenseDoubleVector;
import org.apache.hama.commons.math.DoubleVector;

public class DistributedRowMatrix implements Configurable {
    private static final Log LOG = LogFactory.getLog(DistributedRowMatrix.class);

    private final Path inputPath;
    private final Path outputTmpPath;
    private Configuration conf;
    private Path rowPath;
    private Path outputTmpBasePath;
    private final int numRows;
    private final int numCols;

    /**
     * Creates a matrix handle for the rows stored at {@code inputPath}.
     * Delegates to the five-argument constructor with {@code keepTempFiles}
     * set to {@code false}.
     *
     * @param inputPath path of the stored matrix rows
     * @param outputTmpPath base path for temporary output
     * @param numRows number of rows of the matrix
     * @param numCols number of columns of the matrix
     */
    public DistributedRowMatrix(Path inputPath, Path outputTmpPath, int numRows, int numCols) {
        this(inputPath, outputTmpPath, numRows, numCols, false);
    }

    /**
     * Creates a matrix handle for the rows stored at {@code inputPath}.
     * The paths are stored as given; they are only qualified against a file
     * system once {@link #setConf(Configuration)} is called.
     *
     * @param inputPath path of the stored matrix rows
     * @param outputTmpPath base path for temporary output
     * @param numRows number of rows of the matrix
     * @param numCols number of columns of the matrix
     * @param keepTempFiles accepted for API compatibility; NOTE(review): the
     *        value is not stored or used by this constructor
     */
    public DistributedRowMatrix(Path inputPath, Path outputTmpPath, int numRows, int numCols,
            boolean keepTempFiles) {
        // Dimensions
        this.numCols = numCols;
        this.numRows = numRows;
        // Locations
        this.outputTmpPath = outputTmpPath;
        this.inputPath = inputPath;
    }

    /**
     * Returns the configuration most recently passed to
     * {@link #setConf(Configuration)}, or {@code null} if none has been set.
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Stores the configuration and resolves the input and temporary output
     * paths against the file system that owns the input path.
     *
     * @param conf configuration used to look up the file system
     * @throws IllegalStateException wrapping any {@link IOException} raised
     *         while obtaining the file system or qualifying the paths
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        try {
            // Both paths are qualified against the input path's file system.
            final FileSystem inputFs = FileSystem.get(inputPath.toUri(), conf);
            this.rowPath = inputFs.makeQualified(inputPath);
            this.outputTmpBasePath = inputFs.makeQualified(outputTmpPath);
        } catch (IOException cause) {
            throw new IllegalStateException(cause);
        }
    }

    /**
     * Returns the number of rows given at construction time.
     */
    public int numRows() {
        return numRows;
    }

    /**
     * Returns the number of columns given at construction time.
     */
    public int numCols() {
        return numCols;
    }

    /**
     * Returns the fully qualified input path. The value is assigned in
     * {@link #setConf(Configuration)} and is {@code null} before that call.
     */
    public Path getRowPath() {
        return rowPath;
    }

    /**
     * Returns the fully qualified temporary output base path. The value is
     * assigned by {@link #setConf(Configuration)} or
     * {@link #setOutputTempPathString(String)} and is {@code null} before
     * either has been called.
     */
    public Path getOutputTempPath() {
        return outputTmpBasePath;
    }

    /**
     * Replaces the temporary output base path with {@code outPathString},
     * qualified against the file system of the current configuration.
     * <p>
     * This is best-effort: if the file system cannot be obtained, the failure
     * is logged (including the cause) and the previous path is kept.
     *
     * @param outPathString new temporary output base path
     */
    public void setOutputTempPathString(String outPathString) {
        try {
            outputTmpBasePath = FileSystem.get(conf).makeQualified(new Path(outPathString));
        } catch (IOException ioe) {
            // commons-logging does not substitute "{}" placeholders, so the
            // message is built explicitly and the exception is passed along.
            LOG.error("Unable to set outputBasePath to " + outPathString + ", leaving as " + outputTmpBasePath,
                    ioe);
        }
    }

    /**
     * Multiplies this matrix (A) with {@code other} (B) by delegating to
     * {@link #multiplyBSP(DistributedRowMatrix, Path, int, boolean)}.
     * 
     * @param other a DistributedRowMatrix
     * @param outPath path to write result to
     * @param tileWidth passed through unchanged to the BSP job configuration
     * @param isDebugging passed through unchanged to the BSP job configuration
     * 
     * @return a DistributedRowMatrix containing the product, or {@code null}
     *         if the BSP job did not complete successfully
     */
    public DistributedRowMatrix multiply(DistributedRowMatrix other, Path outPath, int tileWidth,
            boolean isDebugging) throws IOException, ClassNotFoundException, InterruptedException {
        return multiplyBSP(other, outPath, tileWidth, isDebugging);
    }

    /**
     * Multiplies this matrix (A) with {@code other} (B) by running the job
     * built by {@code MatrixMultiplicationHybridBSP}.
     * <p>
     * The job writes into the parent directory of {@code outPath}; after a
     * successful run the first non-trivial {@code part-*} file found there is
     * renamed to {@code outPath}.
     * 
     * @param other a DistributedRowMatrix
     * @param outPath path to write result to
     * @param tileWidth passed through unchanged to the BSP job configuration
     * @param isDebugging passed through unchanged to the BSP job configuration
     * @return a DistributedRowMatrix containing the product, or {@code null}
     *         if the job did not complete successfully
     * @throws IOException if the column count of A differs from the row count
     *         of B, or on file system errors
     */
    public DistributedRowMatrix multiplyBSP(DistributedRowMatrix other, Path outPath, int tileWidth,
            boolean isDebugging) throws IOException, ClassNotFoundException, InterruptedException {

        // Check if cols of MatrixA = rows of MatrixB
        // (l x m) * (m x n) = (l x n)
        if (numCols != other.numRows()) {
            throw new IOException("Cols of MatrixA != rows of MatrixB! (" + numCols + "!=" + other.numRows() + ")");
        }

        Configuration initialConf = (getConf() == null) ? new HamaConfiguration() : getConf();

        // Build MatrixMultiplication job configuration
        BSPJob job = MatrixMultiplicationHybridBSP.createMatrixMultiplicationHybridBSPConf(initialConf,
                this.rowPath, other.rowPath, outPath.getParent(), tileWidth, isDebugging);

        // Multiply Matrix
        if (!job.waitForCompletion(true)) {
            return null;
        }

        // Rename result file to output path
        Configuration jobConf = job.getConfiguration();
        FileSystem fs = outPath.getFileSystem(jobConf);
        FileStatus[] files = fs.listStatus(outPath.getParent());
        boolean resultFound = false;
        if (files != null) { // listStatus may yield null for a missing directory
            for (FileStatus file : files) {
                // 97 bytes is presumably the size of an empty (header-only)
                // sequence file -- TODO confirm; smaller files are skipped.
                if (file.getPath().getName().startsWith("part-") && (file.getLen() > 97)) {
                    resultFound = true;
                    if (!fs.rename(file.getPath(), outPath)) {
                        LOG.error("Unable to rename " + file.getPath() + " to " + outPath);
                    }
                    break;
                }
            }
        }
        if (!resultFound) {
            LOG.warn("No non-empty part file found in " + outPath.getParent());
        }

        // Read resulting Matrix from HDFS
        DistributedRowMatrix out = new DistributedRowMatrix(outPath, outputTmpPath, this.numRows,
                other.numCols());
        out.setConf(jobConf);

        return out;
    }

    /**
     * Multiplies this matrix (A) with {@code other} (B) in plain Java, in the
     * calling thread, and stores the product below {@code outPath}.
     * <p>
     * Both operands are loaded completely into memory.
     * 
     * @param other a DistributedRowMatrix
     * @param outPath path to write result to
     * 
     * @return a DistributedRowMatrix containing the product
     * @throws IOException if the column count of A differs from the row count
     *         of B, if either operand cannot be read, or if writing fails
     */
    public DistributedRowMatrix multiplyJava(DistributedRowMatrix other, Path outPath) throws IOException {
        // Check if cols of MatrixA = rows of MatrixB
        // (l x m) * (m x n) = (l x n)
        if (numCols != other.numRows()) {
            throw new IOException("Cols of MatrixA != rows of MatrixB! (" + numCols + "!=" + other.numRows() + ")");
        }

        // toDoubleArray() yields null when the file is unreadable or empty;
        // fail with a clear message instead of a NullPointerException below.
        final double[][] matrixA = this.toDoubleArray();
        if (matrixA == null) {
            throw new IOException("Unable to read MatrixA from " + this.rowPath);
        }
        final double[][] matrixB = other.toDoubleArray();
        if (matrixB == null) {
            throw new IOException("Unable to read MatrixB from " + other.rowPath);
        }
        final double[][] matrixC = new double[this.numRows][other.numCols];

        int m = this.numRows;
        int n = this.numCols;
        int p = other.numCols;
        for (int k = 0; k < n; k++) {
            for (int i = 0; i < m; i++) {
                for (int j = 0; j < p; j++) {
                    matrixC[i][j] = matrixC[i][j] + matrixA[i][k] * matrixB[k][j];
                }
            }
        }

        // Save resulting Matrix to HDFS (single partition, no GPU share)
        List<Path> matrixCPaths = writeDistributedRowMatrix(this.conf, matrixC, this.numRows, other.numCols,
                outPath, 1, 0, 0);

        // Read resulting Matrix from HDFS
        DistributedRowMatrix out = new DistributedRowMatrix(matrixCPaths.get(0), outputTmpPath, this.numRows,
                other.numCols);
        out.setConf(conf);

        return out;
    }

    /**
     * A single matrix cell: its (row, column) position and its value.
     * <p>
     * Ordering, equality and hash code consider the position only; the value
     * does not take part. Serialization writes row, column and value in that
     * order.
     */
    public static class MatrixEntryWritable implements WritableComparable<MatrixEntryWritable> {
        private int row;
        private int col;
        private double val;

        public int getRow() {
            return this.row;
        }

        public void setRow(int row) {
            this.row = row;
        }

        public int getCol() {
            return this.col;
        }

        public void setCol(int col) {
            this.col = col;
        }

        public double getVal() {
            return this.val;
        }

        public void setVal(double val) {
            this.val = val;
        }

        /** Orders entries by row first, then by column. */
        @Override
        public int compareTo(MatrixEntryWritable o) {
            if (this.row != o.row) {
                return (this.row < o.row) ? -1 : 1;
            }
            if (this.col != o.col) {
                return (this.col < o.col) ? -1 : 1;
            }
            return 0;
        }

        /** Two entries are equal when they address the same cell. */
        @Override
        public boolean equals(Object o) {
            if (o instanceof MatrixEntryWritable) {
                final MatrixEntryWritable that = (MatrixEntryWritable) o;
                return (this.row == that.row) && (this.col == that.col);
            }
            return false;
        }

        @Override
        public int hashCode() {
            return 31 * this.col + this.row;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeInt(this.row);
            out.writeInt(this.col);
            out.writeDouble(this.val);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            this.row = in.readInt();
            this.col = in.readInt();
            this.val = in.readDouble();
        }

        /** Formats the entry as {@code (row,col):val}. */
        @Override
        public String toString() {
            final StringBuilder text = new StringBuilder();
            text.append("(").append(this.row).append(',').append(this.col).append("):").append(this.val);
            return text.toString();
        }
    }

    /**
     * Creates a random matrix and writes it below {@code path} as a single
     * partition. Delegates to the nine-argument overload with one BSP task,
     * zero GPU tasks and a GPU percentage of zero.
     *
     * @param conf configuration used for file system access
     * @param rows number of rows to generate
     * @param columns number of columns to generate
     * @param rand source of random values
     * @param path directory the partition file is written into
     * @param saveTransposed whether the matrix is transposed before saving
     * @return the paths of the written partition files
     */
    public static List<Path> createRandomDistributedRowMatrix(Configuration conf, int rows, int columns,
            Random rand, Path path, boolean saveTransposed) throws Exception {

        return createRandomDistributedRowMatrix(conf, rows, columns, rand, path, saveTransposed, 1, 0, 0);
    }

    /**
     * Creates a {@code rows x columns} matrix of random integer values in the
     * range 1..9 (stored as doubles), optionally transposes it, and writes it
     * below {@code path} via {@code writeDistributedRowMatrix}.
     * <p>
     * Values are drawn from {@code rand} in row-major order of the
     * untransposed matrix, regardless of {@code saveTransposed}.
     *
     * @param conf configuration used for file system access
     * @param rows number of rows to generate
     * @param columns number of columns to generate
     * @param rand source of random values
     * @param path directory the partition files are written into
     * @param saveTransposed whether the matrix is transposed before saving
     * @param numBspTask number of partition files to write
     * @param numGPUBspTask number of those partitions meant for GPU tasks
     * @param GPUPercentage percentage of rows assigned to GPU tasks
     * @return the paths of the written partition files
     */
    public static List<Path> createRandomDistributedRowMatrix(Configuration conf, int rows, int columns,
            Random rand, Path path, boolean saveTransposed, int numBspTask, int numGPUBspTask, int GPUPercentage)
            throws Exception {

        // Dimensions of the matrix as it will be stored.
        final int storedRows = saveTransposed ? columns : rows;
        final int storedCols = saveTransposed ? rows : columns;
        final double[][] stored = new double[storedRows][storedCols];

        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < columns; c++) {
                final double value = rand.nextInt(9) + 1;
                if (saveTransposed) {
                    stored[c][r] = value;
                } else {
                    stored[r][c] = value;
                }
            }
        }

        return writeDistributedRowMatrix(conf, stored, storedRows, storedCols, path, numBspTask, numGPUBspTask,
                GPUPercentage);
    }

    /**
     * Reads all (IntWritable, VectorWritable) records of the sequence file at
     * {@code path} and assembles the vectors, in file order, into a dense
     * matrix. The record keys are read but not used for ordering.
     *
     * @param conf configuration used for file system access
     * @param path sequence file to read
     * @return the matrix, or {@code null} if the file contains no records or
     *         an I/O error occurred (the error is logged)
     */
    public static DenseDoubleMatrix readDistributedRowMatrix(Configuration conf, Path path) {

        List<DoubleVector> matrix = new ArrayList<DoubleVector>();

        SequenceFile.Reader reader = null;
        try {
            // Resolve the file system from the path itself so that paths
            // qualified against a non-default file system can be read.
            FileSystem fs = path.getFileSystem(conf);
            reader = new SequenceFile.Reader(fs, path, conf);

            IntWritable key = new IntWritable();
            VectorWritable vector = new VectorWritable();

            while (reader.next(key, vector)) {
                matrix.add(vector.getVector());
            }
        } catch (IOException e) {
            LOG.error("Unable to read matrix from " + path, e);
            return null;
        } finally {
            // Single close point; the reader is not closed anywhere else.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOG.warn("Unable to close reader for " + path, e);
                }
            }
        }

        if (matrix.isEmpty()) {
            return null;
        }
        DoubleVector[] rows = new DoubleVector[matrix.size()];
        return new DenseDoubleMatrix(matrix.toArray(rows));
    }

    /**
     * Writes the first {@code rows} rows of {@code matrix} into
     * {@code numBspTask} sequence files named {@code part<i>.seq} below
     * {@code path}; each record is (row index, row vector).
     * <p>
     * Partitions are contiguous row ranges. The first
     * {@code numBspTask - numGPUBspTask} partitions share the CPU rows
     * equally; the remaining partitions share the GPU rows
     * ({@code GPUPercentage} percent of all rows) equally. The last partition
     * always extends to {@code rows}, so it also receives any remainder.
     * NOTE(review): this assumes the BSP job treats the trailing partitions
     * as the GPU ones -- confirm against MatrixMultiplicationHybridBSP.
     *
     * @param conf configuration used for file system access
     * @param matrix row-major matrix values
     * @param rows number of rows to write
     * @param columns number of columns (not used by this method)
     * @param path directory the partition files are written into
     * @param numBspTask number of partition files to write
     * @param numGPUBspTask number of those partitions meant for GPU tasks
     * @param GPUPercentage percentage (1..100) of rows assigned to GPU tasks;
     *        any other value assigns all rows to CPU tasks
     * @return the paths of the written partition files, in partition order
     * @throws IOException if a partition file cannot be written
     */
    public static List<Path> writeDistributedRowMatrix(Configuration conf, double[][] matrix, int rows, int columns,
            Path path, int numBspTask, int numGPUBspTask, int GPUPercentage) throws IOException {

        List<Path> splittedFiles = new ArrayList<Path>();

        // Compute work distributions
        int cpuTaskNum = numBspTask - numGPUBspTask;
        int inputVectorsForGPU = 0;
        if ((numGPUBspTask > 0) && (GPUPercentage > 0) && (GPUPercentage <= 100)) {
            // long arithmetic avoids int overflow for very large row counts
            inputVectorsForGPU = (int) (((long) rows * GPUPercentage) / 100);
        }
        int inputVectorsForCPU = rows - inputVectorsForGPU;
        int inputVectorsPerGPUTask = (numGPUBspTask > 0) ? (inputVectorsForGPU / numGPUBspTask) : 0;
        int inputVectorsPerCPUTask = (cpuTaskNum > 0) ? (inputVectorsForCPU / cpuTaskNum) : 0;

        FileSystem fs = FileSystem.get(conf);

        // Running offset: each partition starts where the previous one ended,
        // which stays correct when CPU and GPU partitions differ in size.
        int start = 0;
        for (int part = 0; part < numBspTask; part++) {

            Path partIn = new Path(path, "part" + part + ".seq");
            splittedFiles.add(partIn);

            // Partitions 0 .. cpuTaskNum-1 are CPU partitions, the rest GPU.
            int interval = (part >= cpuTaskNum) ? inputVectorsPerGPUTask : inputVectorsPerCPUTask;
            int end = Math.min(start + interval, rows);
            if ((numBspTask - 1) == part) {
                end = rows; // set to totalRows
            }
            LOG.info("Partition " + part + " file " + partIn.getParent().getName() + "/" + partIn.getName()
                    + " from " + start + " to " + (end - 1));

            final SequenceFile.Writer dataWriter = SequenceFile.createWriter(fs, conf, partIn, IntWritable.class,
                    VectorWritable.class, CompressionType.NONE);
            try {
                for (int i = start; i < end; i++) {
                    DenseDoubleVector rowVector = new DenseDoubleVector(matrix[i]);
                    dataWriter.append(new IntWritable(i), new VectorWritable(rowVector));
                }
            } finally {
                // Close even if an append fails, so the writer is not leaked.
                dataWriter.close();
            }

            start = end;
        }

        return splittedFiles;
    }

    /**
     * Prints the top-left {@code rows x columns} part of {@code matrix} to
     * standard output, one row per line, each value followed by a space.
     * Does nothing when {@code matrix} is {@code null}.
     *
     * @param matrix values to print, may be {@code null}
     * @param rows number of rows to print
     * @param columns number of columns to print per row
     */
    public static void printMatrix(double[][] matrix, int rows, int columns) {
        if (matrix == null) {
            return;
        }
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < columns; c++) {
                final double cell = matrix[r][c];
                System.out.print(cell + " ");
            }
            System.out.println();
        }
    }

    /**
     * Prints a header line with the dimensions and path of this matrix to
     * standard output, then loads the matrix and prints its values.
     */
    public void printDistributedRowMatrix() {
        final String header = "printMatrix (" + this.numRows + " x " + this.numCols + ") Path: " + this.rowPath;
        System.out.println(header);

        // Loaded after the header is printed, as printMatrix tolerates null.
        final double[][] values = this.toDoubleArray();
        printMatrix(values, this.numRows, this.numCols);
    }

    /**
     * Reads this matrix from its row path using the stored configuration.
     *
     * @return the matrix, or {@code null} if the static
     *         {@code readDistributedRowMatrix(Configuration, Path)} returns
     *         {@code null}
     */
    public DenseDoubleMatrix readDistributedRowMatrix() {
        return DistributedRowMatrix.readDistributedRowMatrix(this.conf, this.rowPath);
    }

    /**
     * Loads this matrix and returns its values as a two-dimensional array.
     *
     * @return the values, or {@code null} if the matrix could not be loaded
     */
    public double[][] toDoubleArray() {
        final DenseDoubleMatrix dense = this.readDistributedRowMatrix();
        return (dense == null) ? null : dense.getValues();
    }

    /**
     * Loads this matrix and {@code other} and checks that they have the same
     * shape and the same rows. Rows are compared with {@link Arrays#equals},
     * i.e. without any numerical tolerance.
     *
     * @param other matrix to compare against
     * @return {@code true} if both matrices could be loaded and are equal,
     *         {@code false} otherwise
     */
    public boolean verify(DistributedRowMatrix other) {
        final DenseDoubleMatrix mine = this.readDistributedRowMatrix();
        final DenseDoubleMatrix theirs = other.readDistributedRowMatrix();

        if (mine == null || theirs == null) {
            return false;
        }

        final boolean sameShape = (mine.getRowCount() == theirs.getRowCount())
                && (mine.getColumnCount() == theirs.getColumnCount());
        if (!sameShape) {
            return false;
        }

        int r = 0;
        while (r < mine.getRowCount()) {
            if (!Arrays.equals(mine.getRow(r), theirs.getRow(r))) {
                return false;
            }
            r++;
        }
        return true;
    }
}