/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.controlprogram.caching;

import java.io.IOException;
import java.lang.ref.SoftReference;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData;
import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
import org.apache.sysml.runtime.matrix.MetaData;
import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.util.DataConverter;
import org.apache.sysml.runtime.util.IndexRange;
import org.apache.sysml.runtime.util.MapReduceTool;

/**
 * Represents a matrix in the control program. This class contains methods to read
 * matrices from HDFS and convert them to a specific format/representation. It
 * is also able to write several formats/representations of matrices to HDFS.
 *
 * IMPORTANT: Preserve one-to-one correspondence between {@link MatrixObject}
 * and {@link MatrixBlock} objects, for cache purposes. Do not change a
 * {@link MatrixBlock} object without informing its {@link MatrixObject} object.
 */
public class MatrixObject extends CacheableData<MatrixBlock>
{
    private static final long serialVersionUID = 6374712373206495637L;

    public enum UpdateType {
        COPY,
        INPLACE,
        INPLACE_PINNED;
        public boolean isInPlace() {
            return (this != COPY);
        }
    }

    //additional matrix-specific flags
    private UpdateType _updateType = UpdateType.COPY;

    //information relevant to partitioned matrices
    private boolean _partitioned = false; //indicates if obj partitioned
    private PDataPartitionFormat _partitionFormat = null; //indicates how obj partitioned
    private int _partitionSize = -1; //indicates n for BLOCKWISE_N
    private String _partitionCacheName = null; //name of cache block
    private MatrixBlock _partitionInMemory = null;
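
    /*
     * Illustrative usage sketch (not part of the original class): the typical
     * buffer-pool lifecycle of a MatrixObject via the CacheableData API. The
     * file path, dimensions, and surrounding context are assumptions for
     * illustration only.
     *
     *   MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "hdfs:/tmp/X",
     *       new MatrixFormatMetaData(new MatrixCharacteristics(1000, 1000, 1000, 1000),
     *           OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo));
     *   MatrixBlock mb = mo.acquireRead(); //pins the block in memory (reads from HDFS if needed)
     *   //... read-only access to mb ...
     *   mo.release();                      //unpins; the block becomes evictable again
     */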

    /**
     * Constructor that takes the value type and the HDFS filename.
     *
     * @param vt value type
     * @param file file name
     */
    public MatrixObject(ValueType vt, String file) {
        this(vt, file, null); //HDFS file path
    }

    /**
     * Constructor that takes the value type, HDFS filename and associated metadata.
     *
     * @param vt value type
     * @param file file name
     * @param mtd metadata
     */
    public MatrixObject(ValueType vt, String file, MetaData mtd) {
        super(DataType.MATRIX, vt);
        _metaData = mtd;
        _hdfsFileName = file;
        _cache = null;
        _data = null;
    }

    /**
     * Copy constructor that copies meta data but NO data.
     *
     * @param mo matrix object
     */
    public MatrixObject(MatrixObject mo) {
        //base copy constructor
        super(mo);

        MatrixFormatMetaData metaOld = (MatrixFormatMetaData) mo.getMetaData();
        _metaData = new MatrixFormatMetaData(
            new MatrixCharacteristics(metaOld.getMatrixCharacteristics()),
            metaOld.getOutputInfo(), metaOld.getInputInfo());

        _updateType = mo._updateType;
        _partitioned = mo._partitioned;
        _partitionFormat = mo._partitionFormat;
        _partitionSize = mo._partitionSize;
        _partitionCacheName = mo._partitionCacheName;
    }

    public void setUpdateType(UpdateType flag) {
        _updateType = flag;
    }

    public UpdateType getUpdateType() {
        return _updateType;
    }

    @Override
    public void updateMatrixCharacteristics(MatrixCharacteristics mc) {
        ((MatrixDimensionsMetaData) _metaData).setMatrixCharacteristics(mc);
    }

    /**
     * Make the matrix metadata consistent with the in-memory matrix data.
     *
     * @throws CacheException if CacheException occurs
     */
    @Override
    public void refreshMetaData() throws CacheException {
        if (_data == null || _metaData == null) //refresh only for existing data
            throw new CacheException("Cannot refresh meta data because there is no data or meta data. ");
            //we need to throw an exception, otherwise input/output format cannot be inferred

        MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics();
        mc.setDimension(_data.getNumRows(), _data.getNumColumns());
        mc.setNonZeros(_data.getNonZeros());
    }

    public long getNumRows() {
        MatrixCharacteristics mc = getMatrixCharacteristics();
        return mc.getRows();
    }

    public long getNumColumns() {
        MatrixCharacteristics mc = getMatrixCharacteristics();
        return mc.getCols();
    }

    public long getNumRowsPerBlock() {
        MatrixCharacteristics mc = getMatrixCharacteristics();
        return mc.getRowsPerBlock();
    }

    public long getNumColumnsPerBlock() {
        MatrixCharacteristics mc = getMatrixCharacteristics();
        return mc.getColsPerBlock();
    }

    public long getNnz() {
        MatrixCharacteristics mc = getMatrixCharacteristics();
        return mc.getNonZeros();
    }

    public double getSparsity() {
        MatrixCharacteristics mc = getMatrixCharacteristics();
        return ((double) mc.getNonZeros()) / mc.getRows() / mc.getCols();
    }

    // *********************************************
    // ***                                       ***
    // ***       HIGH-LEVEL PUBLIC METHODS       ***
    // ***     FOR PARTITIONED MATRIX ACCESS     ***
    // ***   (all other methods still usable)    ***
    // ***                                       ***
    // *********************************************

    public void setPartitioned(PDataPartitionFormat format, int n) {
        _partitioned = true;
        _partitionFormat = format;
        _partitionSize = n;
    }

    public void unsetPartitioned() {
        _partitioned = false;
        _partitionFormat = null;
        _partitionSize = -1;
    }

    public boolean isPartitioned() {
        return _partitioned;
    }

    public PDataPartitionFormat getPartitionFormat() {
        return _partitionFormat;
    }

    public int getPartitionSize() {
        return _partitionSize;
    }

    public synchronized void setInMemoryPartition(MatrixBlock block) {
        _partitionInMemory = block;
    }
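
    /*
     * Illustrative usage sketch (assumed context, e.g., a parfor data
     * partitioning scenario; the variable names and values are hypothetical):
     *
     *   mo.setPartitioned(PDataPartitionFormat.ROW_WISE, -1);
     *   //read row 7 as a 1 x ncol partition (index ranges are 1-based)
     *   MatrixBlock row = mo.readMatrixPartition(new IndexRange(7, 7, 1, mo.getNumColumns()));
     */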

    /**
     * NOTE: For reading matrix partitions, we could cache (in its real sense)
     * the read blocks with soft references (no need for eviction, as partitioning
     * is only applied to read-only matrices). However, since we currently support
     * only row- and column-wise partitioning, caching is not applied yet. This
     * could be changed once we also support column-block-wise and row-block-wise.
     * Furthermore, as we reject partitioning of vectors and support only full row
     * or column indexing, no metadata (apart from the partition flag) is required.
     *
     * @param pred index range
     * @return matrix block
     * @throws CacheException if CacheException occurs
     */
    public synchronized MatrixBlock readMatrixPartition(IndexRange pred)
        throws CacheException
    {
        if (LOG.isTraceEnabled())
            LOG.trace("Acquire partition " + getVarName() + " " + pred);
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

        if (!_partitioned)
            throw new CacheException("MatrixObject not available to indexed read.");

        //return static partition if set from outside of the program
        if (_partitionInMemory != null)
            return _partitionInMemory;

        MatrixBlock mb = null;
        try {
            boolean blockwise = (_partitionFormat == PDataPartitionFormat.ROW_BLOCK_WISE
                || _partitionFormat == PDataPartitionFormat.COLUMN_BLOCK_WISE);

            //preparations for block wise access
            MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
            MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
            int brlen = mc.getRowsPerBlock();
            int bclen = mc.getColsPerBlock();

            //get filename depending on format
            String fname = getPartitionFileName(pred, brlen, bclen);

            //probe cache
            if (blockwise && _partitionCacheName != null && _partitionCacheName.equals(fname)) {
                mb = _cache.get(); //try getting block from cache
            }

            if (mb == null) //block not in cache
            {
                //get rows and cols
                long rows = -1;
                long cols = -1;
                switch (_partitionFormat) {
                    case ROW_WISE:
                        rows = 1;
                        cols = mc.getCols();
                        break;
                    case ROW_BLOCK_WISE:
                        rows = brlen;
                        cols = mc.getCols();
                        break;
                    case COLUMN_WISE:
                        rows = mc.getRows();
                        cols = 1;
                        break;
                    case COLUMN_BLOCK_WISE:
                        rows = mc.getRows();
                        cols = bclen;
                        break;
                    default:
                        throw new CacheException("Unsupported partition format: " + _partitionFormat);
                }

                //read the partition from HDFS (or create an empty block if the file does not exist)
                if (MapReduceTool.existsFileOnHDFS(fname))
                    mb = readBlobFromHDFS(fname, rows, cols);
                else {
                    mb = new MatrixBlock((int) rows, (int) cols, true);
                    LOG.warn("Reading empty matrix partition " + fname);
                }
            }

            //post processing
            if (blockwise) {
                //put block into cache
                _partitionCacheName = fname;
                _cache = new SoftReference<MatrixBlock>(mb);

                if (_partitionFormat == PDataPartitionFormat.ROW_BLOCK_WISE) {
                    int rix = (int) ((pred.rowStart - 1) % brlen);
                    mb = mb.sliceOperations(rix, rix,
                        (int) (pred.colStart - 1), (int) (pred.colEnd - 1), new MatrixBlock());
                }
                if (_partitionFormat == PDataPartitionFormat.COLUMN_BLOCK_WISE) {
                    int cix = (int) ((pred.colStart - 1) % bclen);
                    mb = mb.sliceOperations((int) (pred.rowStart - 1), (int) (pred.rowEnd - 1),
                        cix, cix, new MatrixBlock());
                }
            }

            //NOTE: currently no special treatment of non-existing partitions necessary
            //      because empty blocks are written anyway
        }
        catch (Exception ex) {
            throw new CacheException(ex);
        }

        if (DMLScript.STATISTICS) {
            long t1 = System.nanoTime();
            CacheStatistics.incrementAcquireRTime(t1 - t0);
        }

        return mb;
    }

    public String getPartitionFileName(IndexRange pred, int brlen, int bclen)
        throws CacheException
    {
        if (!_partitioned)
            throw new CacheException("MatrixObject not available to indexed read.");

        StringBuilder sb = new StringBuilder();
        sb.append(_hdfsFileName);

        switch (_partitionFormat) {
            case ROW_WISE:
                sb.append(Lop.FILE_SEPARATOR);
                sb.append(pred.rowStart);
                break;
            case ROW_BLOCK_WISE:
                sb.append(Lop.FILE_SEPARATOR);
                sb.append((pred.rowStart - 1) / brlen + 1);
                break;
            case COLUMN_WISE:
                sb.append(Lop.FILE_SEPARATOR);
                sb.append(pred.colStart);
                break;
            case COLUMN_BLOCK_WISE:
                sb.append(Lop.FILE_SEPARATOR);
                sb.append((pred.colStart - 1) / bclen + 1);
                break;
            default:
                throw new CacheException("Unsupported partition format: " + _partitionFormat);
        }

        return sb.toString();
    }
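
    /*
     * Worked example for the file name scheme above (hypothetical values):
     * for a matrix stored at "hdfs:/tmp/X" and partitioned ROW_BLOCK_WISE
     * with brlen=1000, an index range starting at row 1500 maps to block
     * (1500-1)/1000 + 1 = 2, i.e., the partition file "hdfs:/tmp/X/2".
     */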

    // *********************************************
    // ***                                       ***
    // ***      LOW-LEVEL PROTECTED METHODS      ***
    // ***         EXTEND CACHEABLE DATA         ***
    // ***     ONLY CALLED BY THE SUPERCLASS     ***
    // ***                                       ***
    // *********************************************

    @Override
    protected boolean isBelowCachingThreshold() {
        return super.isBelowCachingThreshold()
            || getUpdateType() == UpdateType.INPLACE_PINNED;
    }

    @Override
    protected MatrixBlock readBlobFromCache(String fname) throws IOException {
        return (MatrixBlock) LazyWriteBuffer.readBlock(fname, true);
    }

    @Override
    protected MatrixBlock readBlobFromHDFS(String fname, long rlen, long clen)
        throws IOException
    {
        MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
        MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
        long begin = 0;

        if (LOG.isTraceEnabled()) {
            LOG.trace("Reading matrix from HDFS... " + getVarName() + " Path: " + fname
                + ", dimensions: [" + mc.getRows() + ", " + mc.getCols() + ", " + mc.getNonZeros() + "]");
            begin = System.currentTimeMillis();
        }

        //estimate sparsity from known non-zeros (assume dense if nnz unknown)
        double sparsity = (mc.getNonZeros() >= 0
            ? ((double) mc.getNonZeros()) / (mc.getRows() * mc.getCols()) : 1.0d);
        MatrixBlock newData = DataConverter.readMatrixFromHDFS(fname, iimd.getInputInfo(),
            rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), sparsity, getFileFormatProperties());

        //sanity check correct output
        if (newData == null)
            throw new IOException("Unable to load matrix from file: " + fname);

        if (LOG.isTraceEnabled())
            LOG.trace("Reading Completed: " + (System.currentTimeMillis() - begin) + " msec.");

        return newData;
    }
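
    /*
     * Worked example for the sparsity estimate above (hypothetical values):
     * a 10,000 x 1,000 matrix with 500,000 non-zeros yields
     * sparsity = 500,000 / (10,000 * 1,000) = 0.05, which typically leads
     * the reader to allocate a sparse in-memory representation.
     */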

    @Override
    protected MatrixBlock readBlobFromRDD(RDDObject rdd, MutableBoolean writeStatus)
        throws IOException
    {
        //note: the read of a matrix block from an RDD might trigger
        //lazy evaluation of pending transformations.
        RDDObject lrdd = rdd;

        //prepare return status (by default only collect)
        writeStatus.setValue(false);

        MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
        MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
        InputInfo ii = iimd.getInputInfo();
        MatrixBlock mb = null;
        try {
            //prevent unnecessary collect through rdd checkpoint
            if (rdd.allowsShortCircuitCollect()) {
                lrdd = (RDDObject) rdd.getLineageChilds().get(0);
            }

            //obtain matrix block from RDD
            int rlen = (int) mc.getRows();
            int clen = (int) mc.getCols();
            int brlen = (int) mc.getRowsPerBlock();
            int bclen = (int) mc.getColsPerBlock();
            long nnz = mc.getNonZeros();

            //guarded rdd collect
            if (ii == InputInfo.BinaryBlockInputInfo && //guarded collect not for binary cell
                !OptimizerUtils.checkSparkCollectMemoryBudget(rlen, clen, brlen, bclen, nnz, getPinnedSize())) {
                //write RDD to hdfs and read to prevent invalid collect mem consumption
                //note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower
                if (!MapReduceTool.existsFileOnHDFS(_hdfsFileName)) { //prevent overwrite existing file
                    long newnnz = SparkExecutionContext.writeRDDtoHDFS(lrdd, _hdfsFileName, iimd.getOutputInfo());
                    ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics().setNonZeros(newnnz);
                    ((RDDObject) rdd).setHDFSFile(true); //mark rdd as hdfs file (for restore)
                    writeStatus.setValue(true);          //mark for no cache-write on read
                }
                mb = readBlobFromHDFS(_hdfsFileName);
            }
            else if (ii == InputInfo.BinaryCellInputInfo) {
                //collect matrix block from binary cell RDD
                mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, nnz);
            }
            else {
                //collect matrix block from binary block RDD
                mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, brlen, bclen, nnz);
            }
        }
        catch (DMLRuntimeException ex) {
            throw new IOException(ex);
        }

        //sanity check correct output
        if (mb == null) {
            throw new IOException("Unable to load matrix from rdd: " + lrdd.getVarName());
        }

        return mb;
    }
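
    /*
     * Illustrative note on the guarded collect above (hypothetical numbers):
     * a dense 100,000 x 10,000 matrix occupies roughly 100,000 * 10,000 * 8 B
     * = 8 GB, which would exceed a typical driver memory budget; in that case
     * the RDD is written to HDFS and re-read instead of being collected.
     */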

    /**
     * Writes in-memory matrix to HDFS in a specified format.
     */
    @Override
    protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop)
        throws IOException, DMLRuntimeException
    {
        long begin = 0;
        if (LOG.isTraceEnabled()) {
            LOG.trace(" Writing matrix to HDFS... " + getVarName() + " Path: " + fname
                + ", Format: " + (ofmt != null ? ofmt : "inferred from metadata"));
            begin = System.currentTimeMillis();
        }

        MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;

        if (_data != null) {
            // Get the dimension information from the metadata stored within MatrixObject
            MatrixCharacteristics mc = iimd.getMatrixCharacteristics();

            // Write the matrix to HDFS in requested format
            OutputInfo oinfo = (ofmt != null
                ? OutputInfo.stringToOutputInfo(ofmt)
                : InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));

            // when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions
            // note: this is only required if singlenode (due to binarycell default)
            if (oinfo == OutputInfo.BinaryBlockOutputInfo
                && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE
                && (mc.getRowsPerBlock() != ConfigurationManager.getBlocksize()
                    || mc.getColsPerBlock() != ConfigurationManager.getBlocksize())) {
                DataConverter.writeMatrixToHDFS(_data, fname, oinfo,
                    new MatrixCharacteristics(mc.getRows(), mc.getCols(),
                        ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(), mc.getNonZeros()),
                    rep, fprop);
            }
            else {
                DataConverter.writeMatrixToHDFS(_data, fname, oinfo, mc, rep, fprop);
            }

            if (LOG.isTraceEnabled())
                LOG.trace("Writing matrix to HDFS (" + fname + ") - COMPLETED... "
                    + (System.currentTimeMillis() - begin) + " msec.");
        }
        else if (LOG.isTraceEnabled()) {
            LOG.trace("Writing matrix to HDFS (" + fname + ") - NOTHING TO WRITE (_data == null).");
        }

        if (DMLScript.STATISTICS)
            CacheStatistics.incrementHDFSWrites();
    }

    @Override
    protected void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String outputFormat)
        throws IOException, DMLRuntimeException
    {
        //prepare output info
        MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
        OutputInfo oinfo = (outputFormat != null
            ? OutputInfo.stringToOutputInfo(outputFormat)
            : InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));

        //note: the write of an RDD to HDFS might trigger
        //lazy evaluation of pending transformations.
        long newnnz = SparkExecutionContext.writeRDDtoHDFS(rdd, fname, oinfo);
        ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics().setNonZeros(newnnz);
    }
}
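
/*
 * Illustrative usage sketch (assumed context): exporting a MatrixObject to
 * HDFS through the public CacheableData API, which internally dispatches to
 * writeBlobToHDFS above. The path and format string are hypothetical.
 *
 *   mo.exportData("hdfs:/tmp/X.csv", "csv");
 */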