Java tutorial
/** * (C) Copyright IBM Corp. 2010, 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.ibm.bi.dml.runtime.controlprogram.caching; import java.io.IOException; import java.lang.ref.SoftReference; import org.apache.commons.lang.mutable.MutableBoolean; import com.ibm.bi.dml.api.DMLScript; import com.ibm.bi.dml.api.DMLScript.RUNTIME_PLATFORM; import com.ibm.bi.dml.hops.OptimizerUtils; import com.ibm.bi.dml.lops.Lop; import com.ibm.bi.dml.parser.DMLTranslator; import com.ibm.bi.dml.parser.Expression.DataType; import com.ibm.bi.dml.parser.Expression.ValueType; import com.ibm.bi.dml.runtime.DMLRuntimeException; import com.ibm.bi.dml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat; import com.ibm.bi.dml.runtime.controlprogram.context.SparkExecutionContext; import com.ibm.bi.dml.runtime.instructions.spark.data.BroadcastObject; import com.ibm.bi.dml.runtime.instructions.spark.data.RDDObject; import com.ibm.bi.dml.runtime.instructions.spark.data.RDDProperties; import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics; import com.ibm.bi.dml.runtime.matrix.MatrixDimensionsMetaData; import com.ibm.bi.dml.runtime.matrix.MatrixFormatMetaData; import com.ibm.bi.dml.runtime.matrix.MetaData; import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties; import com.ibm.bi.dml.runtime.matrix.data.InputInfo; import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock; import com.ibm.bi.dml.runtime.matrix.data.NumItemsByEachReducerMetaData; import com.ibm.bi.dml.runtime.matrix.data.OutputInfo; import com.ibm.bi.dml.runtime.util.DataConverter; import com.ibm.bi.dml.runtime.util.IndexRange; import com.ibm.bi.dml.runtime.util.MapReduceTool; /** * Represents a matrix in control program. This class contains method to read * matrices from HDFS and convert them to a specific format/representation. It * is also able to write several formats/representation of matrices to HDFS. * IMPORTANT: Preserve one-to-one correspondence between {@link MatrixObject} * and {@link MatrixBlock} objects, for cache purposes. Do not change a * {@link MatrixBlock} object without informing its {@link MatrixObject} object. * */ public class MatrixObject extends CacheableData { private static final long serialVersionUID = 6374712373206495637L; /** * Current state of pinned variables, required for guarded collect. */ private static ThreadLocal<Long> sizePinned = new ThreadLocal<Long>() { @Override protected Long initialValue() { return 0L; } }; /** * Cache for actual data, evicted by garbage collector. */ private SoftReference<MatrixBlock> _cache = null; /** * Container object that holds the actual data. */ private MatrixBlock _data = null; /** * The name of HDFS file in which the data is backed up. */ private String _hdfsFileName = null; // file name and path /** * Flag that indicates whether or not hdfs file exists. * It is used for improving the performance of "rmvar" instruction. * When it has value <code>false</code>, one can skip invocations to * utility functions such as MapReduceTool.deleteFileIfExistOnHDFS(), * which can be potentially expensive. */ private boolean _hdfsFileExists = false; /** * <code>true</code> if the in-memory or evicted matrix may be different from * the matrix located at {@link #_hdfsFileName}; <code>false</code> if the two * matrices should be the same. */ private boolean _dirtyFlag = false; /** * Object that holds the metadata associated with the matrix, which * includes: 1) Matrix dimensions, if available 2) Number of non-zeros, if * available 3) Block dimensions, if applicable 4) InputInfo -- subsequent * operations that use this Matrix expect it to be in this format. * * When the matrix is written to HDFS (local file system, as well?), one * must get the OutputInfo that matches with InputInfo stored inside _mtd. */ private MetaData _metaData = null; //additional names and flags private String _varName = ""; //plan variable name private String _cacheFileName = null; //local eviction file name private boolean _requiresLocalWrite = false; //flag if local write for read obj private boolean _isAcquireFromEmpty = false; //flag if read from status empty private boolean _cleanupFlag = true; //flag if obj unpinned (cleanup enabled) private boolean _updateInPlaceFlag = false; //flag if in-place update //spark-specific handles //note: we use the abstraction of LineageObjects for two reasons: (1) to keep track of cleanup //for lazily evaluated RDDs, and (2) as abstraction for environments that do not necessarily have spark libraries available private RDDObject _rddHandle = null; //RDD handle private BroadcastObject _bcHandle = null; //Broadcast handle private RDDProperties _rddProperties = null; /** * Information relevant to partitioned matrices. */ private boolean _partitioned = false; //indicates if obj partitioned private PDataPartitionFormat _partitionFormat = null; //indicates how obj partitioned private int _partitionSize = -1; //indicates n for BLOCKWISE_N private String _partitionCacheName = null; //name of cache block private MatrixBlock _partitionInMemory = null; /** * Information relevant to specific external file formats */ FileFormatProperties _formatProperties = null; public RDDProperties getRddProperties() { return _rddProperties; } public void setRddProperties(RDDProperties _rddProperties) { this._rddProperties = _rddProperties; } /** * Constructor that takes only the HDFS filename. */ public MatrixObject(ValueType vt, String file) { this(vt, file, null); //HDFS file path } /** * Constructor that takes both HDFS filename and associated metadata. */ public MatrixObject(ValueType vt, String file, MetaData mtd) { super(DataType.MATRIX, vt); _metaData = mtd; _hdfsFileName = file; _cache = null; _data = null; } /** * Copy constructor that copies meta data but NO data. * * @param mo */ public MatrixObject(MatrixObject mo) { super(mo.getDataType(), mo.getValueType()); _hdfsFileName = mo._hdfsFileName; _hdfsFileExists = mo._hdfsFileExists; MatrixFormatMetaData metaOld = (MatrixFormatMetaData) mo.getMetaData(); _metaData = new MatrixFormatMetaData(new MatrixCharacteristics(metaOld.getMatrixCharacteristics()), metaOld.getOutputInfo(), metaOld.getInputInfo()); _varName = mo._varName; _cleanupFlag = mo._cleanupFlag; _updateInPlaceFlag = mo._updateInPlaceFlag; _partitioned = mo._partitioned; _partitionFormat = mo._partitionFormat; _partitionSize = mo._partitionSize; _partitionCacheName = mo._partitionCacheName; } public void setVarName(String s) { _varName = s; } public String getVarName() { return _varName; } @Override public void setMetaData(MetaData md) { _metaData = md; } @Override public MetaData getMetaData() { return _metaData; } @Override public void removeMetaData() { _metaData = null; } @Override public void updateMatrixCharacteristics(MatrixCharacteristics mc) { ((MatrixDimensionsMetaData) _metaData).setMatrixCharacteristics(mc); } /** * Make the matrix metadata consistent with the in-memory matrix data * @throws CacheException */ public void refreshMetaData() throws CacheException { if (_data == null || _metaData == null) //refresh only for existing data throw new CacheException("Cannot refresh meta data because there is no data or meta data. "); //we need to throw an exception, otherwise input/output format cannot be inferred MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics(); mc.setDimension(_data.getNumRows(), _data.getNumColumns()); mc.setNonZeros(_data.getNonZeros()); } public void setFileFormatProperties(FileFormatProperties formatProperties) { _formatProperties = formatProperties; } public FileFormatProperties getFileFormatProperties() { return _formatProperties; } public boolean isFileExists() { return _hdfsFileExists; } public void setFileExists(boolean flag) { _hdfsFileExists = flag; } public String getFileName() { return _hdfsFileName; } public synchronized void setFileName(String file) { if (!_hdfsFileName.equals(file)) { _hdfsFileName = file; if (!isEmpty(true)) _dirtyFlag = true; } } /** * * @return */ public long getNumRows() { MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData; MatrixCharacteristics mc = meta.getMatrixCharacteristics(); return mc.getRows(); } /** * * @return */ public long getNumColumns() { MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData; MatrixCharacteristics mc = meta.getMatrixCharacteristics(); return mc.getCols(); } /** * * @return */ public long getNumRowsPerBlock() { MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData; MatrixCharacteristics mc = meta.getMatrixCharacteristics(); return mc.getRowsPerBlock(); } /** * * @return */ public long getNumColumnsPerBlock() { MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData; MatrixCharacteristics mc = meta.getMatrixCharacteristics(); return mc.getColsPerBlock(); } /** * * @return */ public long getNnz() { MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData; MatrixCharacteristics mc = meta.getMatrixCharacteristics(); return mc.getNonZeros(); } /** * * @return */ public double getSparsity() { MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData; MatrixCharacteristics mc = meta.getMatrixCharacteristics(); return ((double) mc.getNonZeros()) / mc.getRows() / mc.getCols(); } /** * * @return */ public MatrixCharacteristics getMatrixCharacteristics() { MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData; return meta.getMatrixCharacteristics(); } /** * <code>true</code> if the in-memory or evicted matrix may be different from * the matrix located at {@link #_hdfsFileName}; <code>false</code> if the two * matrices are supposed to be the same. */ public boolean isDirty() { return _dirtyFlag; } public String toString() { StringBuilder str = new StringBuilder(); str.append("Matrix: "); str.append(_hdfsFileName + ", "); //System.out.println(_hdfsFileName); if (_metaData instanceof NumItemsByEachReducerMetaData) { str.append("NumItemsByEachReducerMetaData"); } else { try { MatrixFormatMetaData md = (MatrixFormatMetaData) _metaData; if (md != null) { MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics(); str.append(mc.toString()); InputInfo ii = md.getInputInfo(); if (ii == null) str.append("null"); else { str.append(", "); str.append(InputInfo.inputInfoToString(ii)); } } else { str.append("null, null"); } } catch (Exception ex) { LOG.error(ex); } } str.append(", "); str.append(isDirty() ? "dirty" : "not-dirty"); return str.toString(); } public RDDObject getRDDHandle() { return _rddHandle; } public void setRDDHandle(RDDObject rdd) { //cleanup potential old back reference if (_rddHandle != null) _rddHandle.setBackReference(null); //add new rdd handle _rddHandle = rdd; if (_rddHandle != null) rdd.setBackReference(this); } public BroadcastObject getBroadcastHandle() { return _bcHandle; } public void setBroadcastHandle(BroadcastObject bc) { //cleanup potential old back reference if (_bcHandle != null) _bcHandle.setBackReference(null); //add new broadcast handle _bcHandle = bc; if (_bcHandle != null) bc.setBackReference(this); } // ********************************************* // *** *** // *** HIGH-LEVEL METHODS THAT SPECIFY *** // *** THE LOCKING AND CACHING INTERFACE *** // *** *** // ********************************************* /** * Acquires a shared "read-only" lock, produces the reference to the matrix data, * restores the matrix to main memory, reads from HDFS if needed. * * Synchronized because there might be parallel threads (parfor local) that * access the same MatrixObjectNew object (in case it was created before the loop). * * In-Status: EMPTY, EVICTABLE, EVICTED, READ; * Out-Status: READ(+1). * * @return the matrix data reference * @throws CacheException */ public synchronized MatrixBlock acquireRead() throws CacheException { if (LOG.isTraceEnabled()) LOG.trace("Acquire read " + _varName); long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; if (!isAvailableToRead()) throw new CacheStatusException("MatrixObject not available to read."); //get object from cache if (_data == null) getCache(); //read data from HDFS/RDD if required //(probe data for cache_nowrite / jvm_reuse) if (isEmpty(true) && _data == null) { try { if (DMLScript.STATISTICS) CacheStatistics.incrementHDFSHits(); if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead()) { //check filename if (_hdfsFileName == null) throw new CacheException("Cannot read matrix for empty filename."); //read matrix from hdfs _data = readMatrixFromHDFS(_hdfsFileName); //mark for initial local write despite read operation _requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ; } else { //read matrix from rdd (incl execute pending rdd operations) MutableBoolean writeStatus = new MutableBoolean(); _data = readMatrixFromRDD(getRDDHandle(), writeStatus); //mark for initial local write (prevent repeated execution of rdd operations) if (writeStatus.booleanValue()) _requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ; else _requiresLocalWrite = true; } _dirtyFlag = false; } catch (IOException e) { throw new CacheIOException("Reading of " + _hdfsFileName + " (" + _varName + ") failed.", e); } _isAcquireFromEmpty = true; } else if (DMLScript.STATISTICS) { if (_data != null) CacheStatistics.incrementMemHits(); } //cache status maintenance super.acquire(false, _data == null); updateStatusPinned(true); if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); CacheStatistics.incrementAcquireRTime(t1 - t0); } return _data; } /** * Acquires the exclusive "write" lock for a thread that wants to change matrix * cell values. Produces the reference to the matrix data, restores the matrix * to main memory, reads from HDFS if needed. * * In-Status: EMPTY, EVICTABLE, EVICTED; * Out-Status: MODIFY. * * @return the matrix data reference * @throws CacheException */ public synchronized MatrixBlock acquireModify() throws CacheException { if (LOG.isTraceEnabled()) LOG.trace("Acquire modify " + _varName); long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; if (!isAvailableToModify()) throw new CacheStatusException("MatrixObject not available to modify."); //get object from cache if (_data == null) getCache(); //read data from HDFS if required if (isEmpty(true) && _data == null) { //check filename if (_hdfsFileName == null) throw new CacheException("Cannot read matrix for empty filename."); //load data try { _data = readMatrixFromHDFS(_hdfsFileName); } catch (IOException e) { throw new CacheIOException("Reading of " + _hdfsFileName + " (" + _varName + ") failed.", e); } } //cache status maintenance super.acquire(true, _data == null); updateStatusPinned(true); _dirtyFlag = true; _isAcquireFromEmpty = false; if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); CacheStatistics.incrementAcquireMTime(t1 - t0); } return _data; } /** * Acquires the exclusive "write" lock for a thread that wants to throw away the * old matrix data and link up with new matrix data. Abandons the old matrix data * without reading it. Sets the new matrix data reference. * In-Status: EMPTY, EVICTABLE, EVICTED; * Out-Status: MODIFY. * * @param newData : the new matrix data reference * @return the matrix data reference, which is the same as the argument * @throws CacheException */ public synchronized MatrixBlock acquireModify(MatrixBlock newData) throws CacheException { if (LOG.isTraceEnabled()) LOG.trace("Acquire modify newdata " + _varName); long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; if (!isAvailableToModify()) throw new CacheStatusException("MatrixObject not available to modify."); //clear old data clearData(); //cache status maintenance super.acquire(true, false); //no need to load evicted matrix _dirtyFlag = true; _isAcquireFromEmpty = false; //set references to new data if (newData == null) throw new CacheException("acquireModify with empty matrix block."); _data = newData; updateStatusPinned(true); if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); CacheStatistics.incrementAcquireMTime(t1 - t0); } return _data; } /** * Releases the shared ("read-only") or exclusive ("write") lock. Updates * the matrix size, last-access time, metadata, etc. * * Synchronized because there might be parallel threads (parfor local) that * access the same MatrixObjectNew object (in case it was created before the loop). * * In-Status: READ, MODIFY; * Out-Status: READ(-1), EVICTABLE, EMPTY. * * @throws CacheStatusException */ public synchronized void release() throws CacheException { if (LOG.isTraceEnabled()) LOG.trace("Release " + _varName); long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; boolean write = false; if (isModify()) { //set flags for write write = true; _dirtyFlag = true; //update meta data refreshMetaData(); } //compact empty in-memory block if (_data.isEmptyBlock(false) && _data.isAllocated()) _data.cleanupBlock(true, true); //cache status maintenance (pass cacheNoWrite flag) super.release(_isAcquireFromEmpty && !_requiresLocalWrite); updateStatusPinned(false); if (isCachingActive() //only if caching is enabled (otherwise keep everything in mem) && isCached(true) //not empty and not read/modify && !isUpdateInPlace() //pinned result variable && !isBelowCachingThreshold()) //min size for caching { if (write || _requiresLocalWrite) { //evict blob String filePath = getCacheFilePathAndName(); try { writeMatrix(filePath); } catch (Exception e) { throw new CacheException("Eviction to local path " + filePath + " (" + _varName + ") failed.", e); } _requiresLocalWrite = false; } //create cache createCache(); _data = null; } else if (LOG.isTraceEnabled()) { LOG.trace("Var " + _varName + " not subject to caching: rows=" + _data.getNumRows() + ", cols=" + _data.getNumColumns() + ", state=" + getStatusAsString()); } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); CacheStatistics.incrementReleaseTime(t1 - t0); } } /** * Sets the matrix data reference to <code>null</code>, abandons the old matrix. * Makes the "envelope" empty. Run it to finalize the matrix (otherwise the * evicted matrix file may remain undeleted). * * In-Status: EMPTY, EVICTABLE, EVICTED; * Out-Status: EMPTY. * @throws CacheException */ public synchronized void clearData() throws CacheException { if (LOG.isTraceEnabled()) LOG.trace("Clear data " + _varName); // check if cleanup enabled and possible if (!_cleanupFlag) return; // do nothing if (!isAvailableToModify()) throw new CacheStatusException("MatrixObject (" + this.getDebugName() + ") not available to modify. Status = " + this.getStatusAsString() + "."); // clear existing WB / FS representation (but prevent unnecessary probes) if (!(isEmpty(true) || (_data != null && isBelowCachingThreshold()) || (_data != null && !isCachingActive()))) //additional condition for JMLC freeEvictedBlob(); // clear the in-memory data _data = null; clearCache(); // clear rdd/broadcast back refs if (_rddHandle != null) _rddHandle.setBackReference(null); if (_bcHandle != null) _bcHandle.setBackReference(null); // change object state EMPTY _dirtyFlag = false; setEmpty(); } public synchronized void exportData() throws CacheException { exportData(-1); } /** * Writes, or flushes, the matrix data to HDFS. * * In-Status: EMPTY, EVICTABLE, EVICTED, READ; * Out-Status: EMPTY, EVICTABLE, EVICTED, READ. * * @throws CacheException */ public synchronized void exportData(int replication) throws CacheException { exportData(_hdfsFileName, null, replication, null); _hdfsFileExists = true; } /** * * @param fName * @param outputFormat * @param formatProperties * @throws CacheException */ public synchronized void exportData(String fName, String outputFormat, FileFormatProperties formatProperties) throws CacheException { exportData(fName, outputFormat, -1, formatProperties); } /** * * @param fName * @param outputFormat * @throws CacheException */ public synchronized void exportData(String fName, String outputFormat) throws CacheException { exportData(fName, outputFormat, -1, null); } /** * Synchronized because there might be parallel threads (parfor local) that * access the same MatrixObjectNew object (in case it was created before the loop). * If all threads export the same data object concurrently it results in errors * because they all write to the same file. Efficiency for loops and parallel threads * is achieved by checking if the in-memory matrix block is dirty. * * NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore * the output format and most importantly would bypass reblocking during write (which effects the * potential degree of parallelism). However, we copy files on HDFS if certain criteria are given. * * @param fName * @param outputFormat * @throws CacheException */ public synchronized void exportData(String fName, String outputFormat, int replication, FileFormatProperties formatProperties) throws CacheException { if (LOG.isTraceEnabled()) LOG.trace("Export data " + _varName + " " + fName); long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; //prevent concurrent modifications if (!isAvailableToRead()) throw new CacheStatusException("MatrixObject not available to read."); LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat); boolean pWrite = false; // !fName.equals(_hdfsFileName); //persistent write flag if (fName.equals(_hdfsFileName)) { _hdfsFileExists = true; pWrite = false; } else { pWrite = true; // i.e., export is called from "write" instruction } //actual export (note: no direct transfer of local copy in order to ensure blocking (and hence, parallelism)) if (isDirty() || //use dirty for skipping parallel exports (pWrite && !isEqualOutputFormat(outputFormat))) { // CASE 1: dirty in-mem matrix or pWrite w/ different format (write matrix to fname; load into memory if evicted) // a) get the matrix if (isEmpty(true)) { //read data from HDFS if required (never read before), this applies only to pWrite w/ different output formats //note: for large rdd outputs, we compile dedicated writespinstructions (no need to handle this here) try { if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead()) _data = readMatrixFromHDFS(_hdfsFileName); else _data = readMatrixFromRDD(getRDDHandle(), new MutableBoolean()); _dirtyFlag = false; } catch (IOException e) { throw new CacheIOException("Reading of " + _hdfsFileName + " (" + _varName + ") failed.", e); } } //get object from cache if (_data == null) getCache(); super.acquire(false, _data == null); //incl. read matrix if evicted // b) write the matrix try { writeMetaData(fName, outputFormat, formatProperties); writeMatrixToHDFS(fName, outputFormat, replication, formatProperties); if (!pWrite) _dirtyFlag = false; } catch (Exception e) { throw new CacheIOException("Export to " + fName + " failed.", e); } finally { release(); } } else if (pWrite) // pwrite with same output format { //CASE 2: matrix already in same format but different file on hdfs (copy matrix to fname) try { MapReduceTool.deleteFileIfExistOnHDFS(fName); MapReduceTool.deleteFileIfExistOnHDFS(fName + ".mtd"); if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead()) MapReduceTool.copyFileOnHDFS(_hdfsFileName, fName); else //write might trigger rdd operations and nnz maintenance writeMatrixFromRDDtoHDFS(getRDDHandle(), fName, outputFormat); writeMetaData(fName, outputFormat, formatProperties); } catch (Exception e) { throw new CacheIOException("Export to " + fName + " failed.", e); } } else if (getRDDHandle() != null && //pending rdd operation !getRDDHandle().allowsShortCircuitRead()) { //CASE 3: pending rdd operation (other than checkpoints) try { writeMatrixFromRDDtoHDFS(getRDDHandle(), fName, outputFormat); writeMetaData(fName, outputFormat, formatProperties); } catch (Exception e) { throw new CacheIOException("Export to " + fName + " failed.", e); } } else { //CASE 4: data already in hdfs (do nothing, no need for export) LOG.trace(this.getDebugName() + ": Skip export to hdfs since data already exists."); } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); CacheStatistics.incrementExportTime(t1 - t0); } } /** * * @param fName * @param outputFormat * @return * @throws CacheIOException */ public synchronized boolean moveData(String fName, String outputFormat) throws CacheIOException { boolean ret = false; try { //ensure input file is persistent on hdfs (pending RDD operations), //file might have been written during export or collect via write/read if (getRDDHandle() != null && !MapReduceTool.existsFileOnHDFS(_hdfsFileName)) { writeMatrixFromRDDtoHDFS(getRDDHandle(), _hdfsFileName, outputFormat); } //export or rename to target file on hdfs if (isDirty() || (!isEqualOutputFormat(outputFormat) && isEmpty(true))) { exportData(fName, outputFormat); ret = true; } else if (isEqualOutputFormat(outputFormat)) { MapReduceTool.deleteFileIfExistOnHDFS(fName); MapReduceTool.deleteFileIfExistOnHDFS(fName + ".mtd"); MapReduceTool.renameFileOnHDFS(_hdfsFileName, fName); writeMetaData(fName, outputFormat, null); ret = true; } } catch (Exception e) { throw new CacheIOException("Move to " + fName + " failed.", e); } return ret; } // ********************************************* // *** *** // *** HIGH-LEVEL PUBLIC METHODS *** // *** FOR PARTITIONED MATRIX ACCESS *** // *** (all other methods still usable) *** // *** *** // ********************************************* /** * @param n * */ public void setPartitioned(PDataPartitionFormat format, int n) { _partitioned = true; _partitionFormat = format; _partitionSize = n; } public void unsetPartitioned() { _partitioned = false; _partitionFormat = null; _partitionSize = -1; } /** * * @return */ public boolean isPartitioned() { return _partitioned; } public PDataPartitionFormat getPartitionFormat() { return _partitionFormat; } public int getPartitionSize() { return _partitionSize; } public synchronized void setInMemoryPartition(MatrixBlock block) { _partitionInMemory = block; } /** * NOTE: for reading matrix partitions, we could cache (in its real sense) the read block * with soft references (no need for eviction, as partitioning only applied for read-only matrices). * However, since we currently only support row- and column-wise partitioning caching is not applied yet. * This could be changed once we also support column-block-wise and row-block-wise. Furthermore, * as we reject to partition vectors and support only full row or column indexing, no metadata (apart from * the partition flag) is required. * * @param pred * @return * @throws CacheException */ public synchronized MatrixBlock readMatrixPartition(IndexRange pred) throws CacheException { if (LOG.isTraceEnabled()) LOG.trace("Acquire partition " + _varName + " " + pred); long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; if (!_partitioned) throw new CacheStatusException("MatrixObject not available to indexed read."); //return static partition of set from outside of the program if (_partitionInMemory != null) return _partitionInMemory; MatrixBlock mb = null; try { boolean blockwise = (_partitionFormat == PDataPartitionFormat.ROW_BLOCK_WISE || _partitionFormat == PDataPartitionFormat.COLUMN_BLOCK_WISE); //preparations for block wise access MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData; MatrixCharacteristics mc = iimd.getMatrixCharacteristics(); int brlen = mc.getRowsPerBlock(); int bclen = mc.getColsPerBlock(); //get filename depending on format String fname = getPartitionFileName(pred, brlen, bclen); //probe cache if (blockwise && _partitionCacheName != null && _partitionCacheName.equals(fname)) { mb = _cache.get(); //try getting block from cache } if (mb == null) //block not in cache { //get rows and cols long rows = -1; long cols = -1; switch (_partitionFormat) { case ROW_WISE: rows = 1; cols = mc.getCols(); break; case ROW_BLOCK_WISE: rows = brlen; cols = mc.getCols(); break; case COLUMN_WISE: rows = mc.getRows(); cols = 1; break; case COLUMN_BLOCK_WISE: rows = mc.getRows(); cols = bclen; break; default: throw new CacheException("Unsupported partition format: " + _partitionFormat); } //read the if (MapReduceTool.existsFileOnHDFS(fname)) mb = readMatrixFromHDFS(fname, rows, cols); else { mb = new MatrixBlock((int) rows, (int) cols, true); LOG.warn("Reading empty matrix partition " + fname); } } //post processing if (blockwise) { //put block into cache _partitionCacheName = fname; _cache = new SoftReference<MatrixBlock>(mb); if (_partitionFormat == PDataPartitionFormat.ROW_BLOCK_WISE) { int rix = (int) ((pred.rowStart - 1) % brlen); mb = mb.sliceOperations(rix, rix, (int) (pred.colStart - 1), (int) (pred.colEnd - 1), new MatrixBlock()); } if (_partitionFormat == PDataPartitionFormat.COLUMN_BLOCK_WISE) { int cix = (int) ((pred.colStart - 1) % bclen); mb = mb.sliceOperations((int) (pred.rowStart - 1), (int) (pred.rowEnd - 1), cix, cix, new MatrixBlock()); } } //NOTE: currently no special treatment of non-existing partitions necessary // because empty blocks are written anyway } catch (Exception ex) { throw new CacheException(ex); } if (DMLScript.STATISTICS) { long t1 = System.nanoTime(); CacheStatistics.incrementAcquireRTime(t1 - t0); } return mb; } /** * * @param pred * @return * @throws CacheStatusException */ public String getPartitionFileName(IndexRange pred, int brlen, int bclen) throws CacheStatusException { if (!_partitioned) throw new CacheStatusException("MatrixObject not available to indexed read."); StringBuilder sb = new StringBuilder(); sb.append(_hdfsFileName); switch (_partitionFormat) { case ROW_WISE: sb.append(Lop.FILE_SEPARATOR); sb.append(pred.rowStart); break; case ROW_BLOCK_WISE: sb.append(Lop.FILE_SEPARATOR); sb.append((pred.rowStart - 1) / brlen + 1); break; case COLUMN_WISE: sb.append(Lop.FILE_SEPARATOR); sb.append(pred.colStart); break; case COLUMN_BLOCK_WISE: sb.append(Lop.FILE_SEPARATOR); sb.append((pred.colStart - 1) / bclen + 1); break; default: throw new CacheStatusException("MatrixObject not available to indexed read."); } return sb.toString(); } // ********************************************* // *** *** // *** LOW-LEVEL PROTECTED METHODS *** // *** EXTEND CACHEABLE DATA *** // *** ONLY CALLED BY THE SUPERCLASS *** // *** *** // ********************************************* @Override protected boolean isBlobPresent() { return (_data != null); } @Override protected void evictBlobFromMemory(MatrixBlock mb) throws CacheIOException { throw new CacheIOException("Redundant explicit eviction."); } @Override protected void restoreBlobIntoMemory() throws CacheIOException { long begin = 0; if (LOG.isTraceEnabled()) { LOG.trace("RESTORE of Matrix " + _varName + ", " + _hdfsFileName); begin = System.currentTimeMillis(); } String filePath = getCacheFilePathAndName(); if (LOG.isTraceEnabled()) LOG.trace("CACHE: Restoring matrix... " + _varName + " HDFS path: " + (_hdfsFileName == null ? "null" : _hdfsFileName) + ", Restore from path: " + filePath); if (_data != null) throw new CacheIOException(filePath + " : Cannot restore on top of existing in-memory data."); try { _data = readMatrix(filePath); } catch (IOException e) { throw new CacheIOException(filePath + " : Restore failed.", e); } //check for success if (_data == null) throw new CacheIOException(filePath + " : Restore failed."); if (LOG.isTraceEnabled()) LOG.trace("Restoring matrix - COMPLETED ... " + (System.currentTimeMillis() - begin) + " msec."); } @Override protected void freeEvictedBlob() { String cacheFilePathAndName = getCacheFilePathAndName(); long begin = 0; if (LOG.isTraceEnabled()) { LOG.trace("CACHE: Freeing evicted matrix... " + _varName + " HDFS path: " + (_hdfsFileName == null ? "null" : _hdfsFileName) + " Eviction path: " + cacheFilePathAndName); begin = System.currentTimeMillis(); } LazyWriteBuffer.deleteMatrix(cacheFilePathAndName); if (LOG.isTraceEnabled()) LOG.trace("Freeing evicted matrix - COMPLETED ... " + (System.currentTimeMillis() - begin) + " msec."); } @Override protected boolean isBelowCachingThreshold() { long rlen = _data.getNumRows(); long clen = _data.getNumColumns(); long nnz = _data.getNonZeros(); //get in-memory size (assume dense, if nnz unknown) double sparsity = OptimizerUtils.getSparsity(rlen, clen, nnz); double size = MatrixBlock.estimateSizeInMemory(rlen, clen, sparsity); return (!_data.isAllocated() || size <= CACHING_THRESHOLD); } // ******************************************* // *** *** // *** LOW-LEVEL PRIVATE METHODS *** // *** FOR MATRIX I/O *** // *** *** // ******************************************* private boolean isUpdateInPlace() { return _updateInPlaceFlag; } /** * */ private String getCacheFilePathAndName() { if (_cacheFileName == null) { StringBuilder sb = new StringBuilder(); sb.append(CacheableData.cacheEvictionLocalFilePath); sb.append(CacheableData.cacheEvictionLocalFilePrefix); sb.append(String.format("%09d", getUniqueCacheID())); sb.append(CacheableData.cacheEvictionLocalFileExtension); _cacheFileName = sb.toString(); } return _cacheFileName; } /** * * @param filePathAndName * @return * @throws IOException */ private MatrixBlock readMatrix(String filePathAndName) throws IOException { return LazyWriteBuffer.readMatrix(filePathAndName); } /** * * @param filePathAndName * @return * @throws IOException */ private MatrixBlock readMatrixFromHDFS(String filePathAndName) throws IOException { MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData; MatrixCharacteristics mc = iimd.getMatrixCharacteristics(); return readMatrixFromHDFS(filePathAndName, mc.getRows(), mc.getCols()); } /** * * @param rdd * @return * @throws IOException */ private MatrixBlock readMatrixFromRDD(RDDObject rdd, MutableBoolean writeStatus) throws IOException { //note: the read of a matrix block from an RDD might trigger //lazy evaluation of pending transformations. RDDObject lrdd = rdd; //prepare return status (by default only collect) writeStatus.setValue(false); MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData; MatrixCharacteristics mc = iimd.getMatrixCharacteristics(); MatrixBlock mb = null; try { //prevent unnecessary collect through rdd checkpoint if (rdd.allowsShortCircuitCollect()) { lrdd = (RDDObject) rdd.getLineageChilds().get(0); } //obtain matrix block from RDD int rlen = (int) mc.getRows(); int clen = (int) mc.getCols(); int brlen = (int) mc.getRowsPerBlock(); int bclen = (int) mc.getColsPerBlock(); long nnz = mc.getNonZeros(); //guarded rdd collect if (!OptimizerUtils.checkSparkCollectMemoryBudget(rlen, clen, brlen, bclen, nnz, sizePinned.get())) { //write RDD to hdfs and read to prevent invalid collect mem consumption //note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower if (!MapReduceTool.existsFileOnHDFS(_hdfsFileName)) { //prevent overwrite existing file long newnnz = SparkExecutionContext.writeRDDtoHDFS(lrdd, _hdfsFileName, iimd.getOutputInfo()); ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics().setNonZeros(newnnz); ((RDDObject) rdd).setHDFSFile(true); //mark rdd as hdfs file (for restore) writeStatus.setValue(true); //mark for no cache-write on read } mb = readMatrixFromHDFS(_hdfsFileName); } else { //collect matrix block from RDD mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, brlen, bclen, nnz); } } catch (DMLRuntimeException ex) { throw new IOException(ex); } //sanity check correct output if (mb == null) { throw new IOException("Unable to load matrix from rdd: " + lrdd.getVarName()); } return mb; } /** * * @param rdd * @param fname * @param outputFormat * @throws DMLRuntimeException */ private void writeMatrixFromRDDtoHDFS(RDDObject rdd, String fname, String outputFormat) throws DMLRuntimeException { //prepare output info MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData; OutputInfo oinfo = (outputFormat != null ? OutputInfo.stringToOutputInfo(outputFormat) : InputInfo.getMatchingOutputInfo(iimd.getInputInfo())); //note: the write of an RDD to HDFS might trigger //lazy evaluation of pending transformations. long newnnz = SparkExecutionContext.writeRDDtoHDFS(rdd, fname, oinfo); ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics().setNonZeros(newnnz); } /** * * @param filePathAndName * @param rlen * @param clen * @return * @throws IOException */ private MatrixBlock readMatrixFromHDFS(String filePathAndName, long rlen, long clen) throws IOException { long begin = 0; MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData; MatrixCharacteristics mc = iimd.getMatrixCharacteristics(); if (LOG.isTraceEnabled()) { LOG.trace("Reading matrix from HDFS... " + _varName + " Path: " + filePathAndName + ", dimensions: [" + mc.getRows() + ", " + mc.getCols() + ", " + mc.getNonZeros() + "]"); begin = System.currentTimeMillis(); } double sparsity = (mc.getNonZeros() >= 0 ? ((double) mc.getNonZeros()) / (mc.getRows() * mc.getCols()) : 1.0d); //expected sparsity MatrixBlock newData = DataConverter.readMatrixFromHDFS(filePathAndName, iimd.getInputInfo(), rlen, clen, mc.getRowsPerBlock(), mc.getColsPerBlock(), sparsity, _formatProperties); //sanity check correct output if (newData == null) { throw new IOException("Unable to load matrix from file: " + filePathAndName); } if (LOG.isTraceEnabled()) LOG.trace("Reading Completed: " + (System.currentTimeMillis() - begin) + " msec."); return newData; } /** * * @param filePathAndName * @throws DMLRuntimeException * @throws IOException */ private void writeMatrix(String filePathAndName) throws DMLRuntimeException, IOException { LazyWriteBuffer.writeMatrix(filePathAndName, _data); } /** * Writes in-memory matrix to HDFS in a specified format. * * @throws DMLRuntimeException * @throws IOException */ private void writeMatrixToHDFS(String filePathAndName, String outputFormat, int replication, FileFormatProperties formatProperties) throws DMLRuntimeException, IOException { long begin = 0; if (LOG.isTraceEnabled()) { LOG.trace(" Writing matrix to HDFS... " + _varName + " Path: " + filePathAndName + ", Format: " + (outputFormat != null ? outputFormat : "inferred from metadata")); begin = System.currentTimeMillis(); } MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData; if (_data != null) { // Get the dimension information from the metadata stored within MatrixObject MatrixCharacteristics mc = iimd.getMatrixCharacteristics(); // Write the matrix to HDFS in requested format OutputInfo oinfo = (outputFormat != null ? OutputInfo.stringToOutputInfo(outputFormat) : InputInfo.getMatchingOutputInfo(iimd.getInputInfo())); // when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions // note: this is only required if singlenode (due to binarycell default) if (oinfo == OutputInfo.BinaryBlockOutputInfo && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE && (mc.getRowsPerBlock() != DMLTranslator.DMLBlockSize || mc.getColsPerBlock() != DMLTranslator.DMLBlockSize)) { DataConverter.writeMatrixToHDFS( _data, filePathAndName, oinfo, new MatrixCharacteristics(mc.getRows(), mc.getCols(), DMLTranslator.DMLBlockSize, DMLTranslator.DMLBlockSize, mc.getNonZeros()), replication, formatProperties); } else { DataConverter.writeMatrixToHDFS(_data, filePathAndName, oinfo, mc, replication, formatProperties); } if (LOG.isTraceEnabled()) LOG.trace("Writing matrix to HDFS (" + filePathAndName + ") - COMPLETED... " + (System.currentTimeMillis() - begin) + " msec."); } else if (LOG.isTraceEnabled()) { LOG.trace("Writing matrix to HDFS (" + filePathAndName + ") - NOTHING TO WRITE (_data == null)."); } if (DMLScript.STATISTICS) CacheStatistics.incrementHDFSWrites(); } /** * * @param filePathAndName * @param outputFormat * @throws DMLRuntimeException * @throws IOException */ private void writeMetaData(String filePathAndName, String outputFormat, FileFormatProperties formatProperties) throws DMLRuntimeException, IOException { MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData; if (iimd != null) { // Write the matrix to HDFS in requested format OutputInfo oinfo = (outputFormat != null ? OutputInfo.stringToOutputInfo(outputFormat) : InputInfo.getMatchingOutputInfo(iimd.getInputInfo())); if (oinfo != OutputInfo.MatrixMarketOutputInfo) { // Get the dimension information from the metadata stored within MatrixObject MatrixCharacteristics mc = iimd.getMatrixCharacteristics(); // when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions // note: this is only required if singlenode (due to binarycell default) if (oinfo == OutputInfo.BinaryBlockOutputInfo && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE && (mc.getRowsPerBlock() != DMLTranslator.DMLBlockSize || mc.getColsPerBlock() != DMLTranslator.DMLBlockSize)) { mc = new MatrixCharacteristics(mc.getRows(), mc.getCols(), DMLTranslator.DMLBlockSize, DMLTranslator.DMLBlockSize, mc.getNonZeros()); } MapReduceTool.writeMetaDataFile(filePathAndName + ".mtd", valueType, mc, oinfo, formatProperties); } } else { throw new DMLRuntimeException( "Unexpected error while writing mtd file (" + filePathAndName + ") -- metadata is null."); } } /** * * @param outputFormat * @return */ private boolean isEqualOutputFormat(String outputFormat) { boolean ret = true; if (outputFormat != null) { try { MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData; OutputInfo oi1 = InputInfo.getMatchingOutputInfo(iimd.getInputInfo()); OutputInfo oi2 = OutputInfo.stringToOutputInfo(outputFormat); if (oi1 != oi2) { ret = false; } } catch (Exception ex) { ret = false; } } return ret; } @Override public synchronized String getDebugName() { int maxLength = 23; String debugNameEnding = (_hdfsFileName == null ? "null" : (_hdfsFileName.length() < maxLength ? _hdfsFileName : "..." + _hdfsFileName.substring(_hdfsFileName.length() - maxLength + 3))); return _varName + " " + debugNameEnding; } // ******************************************* // *** *** // *** LOW-LEVEL PRIVATE METHODS *** // *** FOR SOFTREFERENCE CACHE *** // *** *** // ******************************************* /** * */ private void createCache() { _cache = new SoftReference<MatrixBlock>(_data); } /** * */ private void getCache() { if (_cache != null) { _data = _cache.get(); clearCache(); } } /** * */ private void clearCache() { if (_cache != null) { _cache.clear(); _cache = null; } } /** * * @param add */ private void updateStatusPinned(boolean add) { if (_data != null) { //data should never be null long size = sizePinned.get(); size += (add ? 1 : -1) * _data.getSizeInMemory(); sizePinned.set(Math.max(size, 0)); } } /** * see clear data * * @param flag */ public void enableCleanup(boolean flag) { _cleanupFlag = flag; } /** * see clear data * * @return */ public boolean isCleanupEnabled() { return _cleanupFlag; } /** * * @param flag */ public void enableUpdateInPlace(boolean flag) { _updateInPlaceFlag = flag; } /** * * @return */ public boolean isUpdateInPlaceEnabled() { return _updateInPlaceFlag; } /** * */ public void setEmptyStatus() { setEmpty(); } }