Java tutorial
/** * (C) Copyright IBM Corp. 2010, 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.ibm.bi.dml.runtime.util; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; import org.apache.commons.math3.linear.Array2DRowRealMatrix; import com.ibm.bi.dml.parser.Expression.ValueType; import com.ibm.bi.dml.runtime.DMLRuntimeException; import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject; import com.ibm.bi.dml.runtime.io.MatrixReader; import com.ibm.bi.dml.runtime.io.MatrixReaderFactory; import com.ibm.bi.dml.runtime.io.MatrixWriter; import com.ibm.bi.dml.runtime.io.MatrixWriterFactory; import com.ibm.bi.dml.runtime.io.ReadProperties; import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics; import com.ibm.bi.dml.runtime.matrix.data.CTableMap; import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties; import com.ibm.bi.dml.runtime.matrix.data.IJV; import com.ibm.bi.dml.runtime.matrix.data.InputInfo; import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock; import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes; import com.ibm.bi.dml.runtime.matrix.data.OutputInfo; import com.ibm.bi.dml.runtime.matrix.data.SparseRowsIterator; import com.ibm.bi.dml.udf.Matrix; /** * This class provides methods to read and write matrix blocks from to HDFS using different data formats. * Those functionalities are used especially for CP read/write and exporting in-memory matrices to HDFS * (before executing MR jobs). * */ public class DataConverter { ////////////// // READING and WRITING of matrix blocks to/from HDFS // (textcell, binarycell, binaryblock) /////// /** * * @param mat * @param dir * @param outputinfo * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException */ public static void writeMatrixToHDFS(MatrixBlock mat, String dir, OutputInfo outputinfo, MatrixCharacteristics mc) throws IOException { writeMatrixToHDFS(mat, dir, outputinfo, mc, -1, null); } /** * * @param mat * @param dir * @param outputinfo * @param mc * @param replication * @param formatProperties * @throws IOException */ public static void writeMatrixToHDFS(MatrixBlock mat, String dir, OutputInfo outputinfo, MatrixCharacteristics mc, int replication, FileFormatProperties formatProperties) throws IOException { try { MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(outputinfo, replication, formatProperties); writer.writeMatrixToHDFS(mat, dir, mc.getRows(), mc.getCols(), mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros()); } catch (Exception e) { throw new IOException(e); } } /** * * @param dir * @param inputinfo * @param rlen * @param clen * @param brlen * @param bclen * @return * @throws IOException */ public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen, boolean localFS) throws IOException { ReadProperties prop = new ReadProperties(); prop.path = dir; prop.inputInfo = inputinfo; prop.rlen = rlen; prop.clen = clen; prop.brlen = brlen; prop.bclen = bclen; prop.localFS = localFS; //expected matrix is sparse (default SystemML usecase) return readMatrixFromHDFS(prop); } /** * * @param dir * @param inputinfo * @param rlen * @param clen * @param brlen * @param bclen * @return * @throws IOException */ public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) throws IOException { ReadProperties prop = new ReadProperties(); prop.path = dir; prop.inputInfo = inputinfo; prop.rlen = rlen; prop.clen = clen; prop.brlen = brlen; prop.bclen = bclen; //expected matrix is sparse (default SystemML usecase) return readMatrixFromHDFS(prop); } /** * * @param dir * @param inputinfo * @param rlen * @param clen * @param brlen * @param bclen * @param expectedSparsity * @return * @throws IOException */ public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen, double expectedSparsity) throws IOException { ReadProperties prop = new ReadProperties(); prop.path = dir; prop.inputInfo = inputinfo; prop.rlen = rlen; prop.clen = clen; prop.brlen = brlen; prop.bclen = bclen; prop.expectedSparsity = expectedSparsity; return readMatrixFromHDFS(prop); } /** * * @param dir * @param inputinfo * @param rlen * @param clen * @param brlen * @param bclen * @param expectedSparsity * @param localFS * @return * @throws IOException */ public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen, double expectedSparsity, boolean localFS) throws IOException { ReadProperties prop = new ReadProperties(); prop.path = dir; prop.inputInfo = inputinfo; prop.rlen = rlen; prop.clen = clen; prop.brlen = brlen; prop.bclen = bclen; prop.expectedSparsity = expectedSparsity; prop.localFS = localFS; return readMatrixFromHDFS(prop); } /** * * @param dir * @param inputinfo * @param rlen * @param clen * @param brlen * @param bclen * @param expectedSparsity * @param localFS * @return * @throws IOException */ public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen, double expectedSparsity, FileFormatProperties formatProperties) throws IOException { ReadProperties prop = new ReadProperties(); prop.path = dir; prop.inputInfo = inputinfo; prop.rlen = rlen; prop.clen = clen; prop.brlen = brlen; prop.bclen = bclen; prop.expectedSparsity = expectedSparsity; prop.formatProperties = formatProperties; //prop.printMe(); return readMatrixFromHDFS(prop); } /** * Core method for reading matrices in format textcell, matrixmarket, binarycell, or binaryblock * from HDFS into main memory. For expected dense matrices we directly copy value- or block-at-a-time * into the target matrix. In contrast, for sparse matrices, we append (column-value)-pairs and do a * final sort if required in order to prevent large reorg overheads and increased memory consumption * in case of unordered inputs. * * DENSE MxN input: * * best/average/worst: O(M*N) * SPARSE MxN input * * best (ordered, or binary block w/ clen<=bclen): O(M*N) * * average (unordered): O(M*N*log(N)) * * worst (descending order per row): O(M * N^2) * * NOTE: providing an exact estimate of 'expected sparsity' can prevent a full copy of the result * matrix block (required for changing sparse->dense, or vice versa) * * @param dir * @param inputinfo * @param rlen * @param clen * @param brlen * @param bclen * @param expectedSparsity * @return * @throws IOException */ public static MatrixBlock readMatrixFromHDFS(ReadProperties prop) throws IOException { //Timing time = new Timing(true); long estnnz = (long) (prop.expectedSparsity * prop.rlen * prop.clen); //core matrix reading MatrixBlock ret = null; try { MatrixReader reader = MatrixReaderFactory.createMatrixReader(prop); ret = reader.readMatrixFromHDFS(prop.path, prop.rlen, prop.clen, prop.brlen, prop.bclen, estnnz); } catch (DMLRuntimeException rex) { throw new IOException(rex); } //System.out.println("read matrix ("+prop.rlen+","+prop.clen+","+ret.getNonZeros()+") in "+time.stop()); return ret; } ////////////// // Utils for CREATING and COPYING matrix blocks /////// /** * Creates a two-dimensional double matrix of the input matrix block. * * @param mb * @return */ public static double[][] convertToDoubleMatrix(MatrixBlock mb) { int rows = mb.getNumRows(); int cols = mb.getNumColumns(); double[][] ret = new double[rows][cols]; //0-initialized if (mb.getNonZeros() > 0) { if (mb.isInSparseFormat()) { SparseRowsIterator iter = mb.getSparseRowsIterator(); while (iter.hasNext()) { IJV cell = iter.next(); ret[cell.i][cell.j] = cell.v; } } else { for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) ret[i][j] = mb.getValueDenseUnsafe(i, j); } } return ret; } /** * * @param mb * @return */ public static boolean[] convertToBooleanVector(MatrixBlock mb) { int rows = mb.getNumRows(); int cols = mb.getNumColumns(); boolean[] ret = new boolean[rows * cols]; //false-initialized if (mb.getNonZeros() > 0) { if (mb.isInSparseFormat()) { SparseRowsIterator iter = mb.getSparseRowsIterator(); while (iter.hasNext()) { IJV cell = iter.next(); ret[cell.i * rows + cell.j] = (cell.v != 0.0); } } else { if (!mb.isEmptyBlock(false)) { for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) ret[i * cols + j] = (mb.getValueDenseUnsafe(i, j) != 0.0); } } } return ret; } /** * * @param mb * @return */ public static int[] convertToIntVector(MatrixBlock mb) { int rows = mb.getNumRows(); int cols = mb.getNumColumns(); int[] ret = new int[rows * cols]; //0-initialized if (mb.getNonZeros() > 0) { if (mb.isInSparseFormat()) { SparseRowsIterator iter = mb.getSparseRowsIterator(); while (iter.hasNext()) { IJV cell = iter.next(); ret[cell.i * rows + cell.j] = (int) cell.v; } } else { //memcopy row major representation if at least 1 non-zero if (!mb.isEmptyBlock(false)) for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) ret[i * cols + j] = (int) (mb.getValueDenseUnsafe(i, j)); } } return ret; } /** * * @param mb * @return */ public static double[] convertToDoubleVector(MatrixBlock mb) { int rows = mb.getNumRows(); int cols = mb.getNumColumns(); double[] ret = new double[rows * cols]; //0-initialized if (mb.getNonZeros() > 0) { if (mb.isInSparseFormat()) { SparseRowsIterator iter = mb.getSparseRowsIterator(); while (iter.hasNext()) { IJV cell = iter.next(); ret[cell.i * rows + cell.j] = cell.v; } } else { //memcopy row major representation if at least 1 non-zero if (!mb.isEmptyBlock(false)) System.arraycopy(mb.getDenseArray(), 0, ret, 0, rows * cols); } } return ret; } /** * * @param mb * @return */ public static List<Double> convertToDoubleList(MatrixBlock mb) { int rows = mb.getNumRows(); int cols = mb.getNumColumns(); long nnz = mb.getNonZeros(); ArrayList<Double> ret = new ArrayList<Double>(); if (mb.isInSparseFormat()) { SparseRowsIterator iter = mb.getSparseRowsIterator(); while (iter.hasNext()) { IJV cell = iter.next(); ret.add(cell.v); } for (long i = nnz; i < (long) rows * cols; i++) ret.add(0d); //add remaining values } else { for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) ret.add(mb.getValueDenseUnsafe(i, j)); } return ret; } /** * Creates a dense Matrix Block and copies the given double matrix into it. * * @param data * @return * @throws DMLRuntimeException */ public static MatrixBlock convertToMatrixBlock(double[][] data) throws DMLRuntimeException { int rows = data.length; int cols = (rows > 0) ? data[0].length : 0; MatrixBlock mb = new MatrixBlock(rows, cols, false); try { //copy data to mb (can be used because we create a dense matrix) mb.init(data, rows, cols); } catch (Exception e) { } //can never happen //check and convert internal representation mb.examSparsity(); return mb; } /** * Creates a dense Matrix Block and copies the given double vector into it. * * @param data * @return * @throws DMLRuntimeException */ public static MatrixBlock convertToMatrixBlock(double[] data, boolean columnVector) throws DMLRuntimeException { int rows = columnVector ? data.length : 1; int cols = columnVector ? 1 : data.length; MatrixBlock mb = new MatrixBlock(rows, cols, false); try { //copy data to mb (can be used because we create a dense matrix) mb.init(data, rows, cols); } catch (Exception e) { } //can never happen //check and convert internal representation mb.examSparsity(); return mb; } /** * * @param map * @return */ public static MatrixBlock convertToMatrixBlock(HashMap<MatrixIndexes, Double> map) { // compute dimensions from the map long nrows = 0, ncols = 0; for (MatrixIndexes index : map.keySet()) { nrows = Math.max(nrows, index.getRowIndex()); ncols = Math.max(ncols, index.getColumnIndex()); } // convert to matrix block return convertToMatrixBlock(map, (int) nrows, (int) ncols); } /** * NOTE: this method also ensures the specified matrix dimensions * * @param map * @return */ public static MatrixBlock convertToMatrixBlock(HashMap<MatrixIndexes, Double> map, int rlen, int clen) { int nnz = map.size(); boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, nnz); MatrixBlock mb = new MatrixBlock(rlen, clen, sparse, nnz); // copy map values into new block if (sparse) //SPARSE <- cells { //append cells to sparse target (prevent shifting) for (Entry<MatrixIndexes, Double> e : map.entrySet()) { MatrixIndexes index = e.getKey(); double value = e.getValue(); int rix = (int) index.getRowIndex(); int cix = (int) index.getColumnIndex(); if (value != 0 && rix <= rlen && cix <= clen) mb.appendValue(rix - 1, cix - 1, value); } //sort sparse target representation mb.sortSparseRows(); } else //DENSE <- cells { //directly insert cells into dense target for (Entry<MatrixIndexes, Double> e : map.entrySet()) { MatrixIndexes index = e.getKey(); double value = e.getValue(); int rix = (int) index.getRowIndex(); int cix = (int) index.getColumnIndex(); if (value != 0 && rix <= rlen && cix <= clen) mb.quickSetValue(rix - 1, cix - 1, value); } } return mb; } /** * * @param map * @return */ public static MatrixBlock convertToMatrixBlock(CTableMap map) { // compute dimensions from the map int nrows = (int) map.getMaxRow(); int ncols = (int) map.getMaxColumn(); // convert to matrix block return convertToMatrixBlock(map, nrows, ncols); } /** * NOTE: this method also ensures the specified matrix dimensions * * @param map * @return */ public static MatrixBlock convertToMatrixBlock(CTableMap map, int rlen, int clen) { return map.toMatrixBlock(rlen, clen); } /** * * @param mb * @param colwise * @return * @throws DMLRuntimeException */ public static MatrixBlock[] convertToMatrixBlockPartitions(MatrixBlock mb, boolean colwise) throws DMLRuntimeException { MatrixBlock[] ret = null; int rows = mb.getNumRows(); int cols = mb.getNumColumns(); long nnz = mb.getNonZeros(); boolean sparse = mb.isInSparseFormat(); double sparsity = ((double) nnz) / (rows * cols); if (colwise) //COL PARTITIONS { //allocate output partitions ret = new MatrixBlock[cols]; for (int j = 0; j < cols; j++) ret[j] = new MatrixBlock(rows, 1, false); //cache-friendly sequential read/append if (!mb.isEmptyBlock(false)) { if (sparse) { //SPARSE SparseRowsIterator iter = mb.getSparseRowsIterator(); while (iter.hasNext()) { IJV cell = iter.next(); ret[cell.j].appendValue(cell.i, 0, cell.v); } } else { //DENSE for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) ret[j].appendValue(i, 0, mb.getValueDenseUnsafe(i, j)); } } } else //ROW PARTITIONS { //allocate output partitions ret = new MatrixBlock[rows]; for (int i = 0; i < rows; i++) ret[i] = new MatrixBlock(1, cols, sparse, (long) (cols * sparsity)); //cache-friendly sparse/dense row slicing if (!mb.isEmptyBlock(false)) { for (int i = 0; i < rows; i++) mb.sliceOperations(i, i, 0, cols - 1, ret[i]); } } return ret; } /** * Helper method that converts SystemML matrix variable (<code>varname</code>) into a Array2DRowRealMatrix format, * which is useful in invoking Apache CommonsMath. * * @param ec * @param varname * @return * @throws DMLRuntimeException */ public static Array2DRowRealMatrix convertToArray2DRowRealMatrix(MatrixObject mo) throws DMLRuntimeException { Matrix.ValueType vt = (mo.getValueType() == ValueType.DOUBLE ? Matrix.ValueType.Double : Matrix.ValueType.Integer); Matrix mathInput = new Matrix(mo.getFileName(), mo.getNumRows(), mo.getNumColumns(), vt); mathInput.setMatrixObject(mo); double[][] data = mathInput.getMatrixAsDoubleArray(); Array2DRowRealMatrix matrixInput = new Array2DRowRealMatrix(data, false); return matrixInput; } }