/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.controlprogram.caching;

import java.io.File;
import java.io.IOException;
import java.lang.ref.SoftReference;

import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy;
import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
import org.apache.sysml.runtime.instructions.cp.Data;
import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject;
import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData;
import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
import org.apache.sysml.runtime.matrix.MetaData;
import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.util.LocalFileUtils;
import org.apache.sysml.runtime.util.MapReduceTool;

/**
 * Each object of this class is a cache envelope for some large piece of data
 * called a "cache block". For example, the body of a matrix can be the cache block.
 * The term cache block refers strictly to the cacheable portion of the data object,
 * often excluding metadata and auxiliary parameters, as defined in the subclasses.
 * Under the protection of the envelope, the data blob may be evicted to
 * the file system; the subclass must then set its reference to <code>null</code>
 * to allow Java garbage collection. If other parts of the system continue to
 * keep references to the cache block, its eviction will not release any memory.
 */
public abstract class CacheableData<T extends CacheBlock> extends Data
{
    private static final long serialVersionUID = -413810592207212835L;

    /** Global logging instance for all subclasses of CacheableData */
    protected static final Log LOG = LogFactory.getLog(CacheableData.class.getName());

    // global constant configuration parameters
    public static final long CACHING_THRESHOLD = 4 * 1024; //objects below this size [in bytes] are not subject to caching
    public static final double CACHING_BUFFER_SIZE = 0.15;
    public static final RPolicy CACHING_BUFFER_POLICY = RPolicy.FIFO;
    public static final boolean CACHING_BUFFER_PAGECACHE = false;
    public static final boolean CACHING_WRITE_CACHE_ON_READ = false;
    public static final String CACHING_COUNTER_GROUP_NAME = "SystemML Caching Counters";
    public static final String CACHING_EVICTION_FILEEXTENSION = ".dat";
    public static final boolean CACHING_ASYNC_FILECLEANUP = true;

    /**
     * Defines all possible cache status types for a data blob.
     * An object of class {@link CacheableData} can be in one of the following
     * five status types:
     *
     * <code>EMPTY</code>: Either there is no data blob at all, or the data blob
     * resides in a specified import file and has never been downloaded yet.
     * <code>READ</code>: The data blob is in main memory; one or more threads are
     * referencing and reading it (shared "read-only" lock). This status uses a
     * counter. Eviction is NOT allowed.
     * <code>MODIFY</code>: The data blob is in main memory; exactly one thread is
     * referencing and modifying it (exclusive "write" lock). Eviction is NOT allowed.
     * <code>CACHED</code>: The data blob is in main memory, and nobody is using nor
     * referencing it. There is always a persistent recovery object for it.
     * <code>CACHED_NOWRITE</code>: Like <code>CACHED</code>, but without a persistent
     * copy in the local eviction file (e.g., data read from HDFS that can be re-read).
     */
    protected enum CacheStatus {
        EMPTY,
        READ,
        MODIFY,
        CACHED,
        CACHED_NOWRITE,
    }

    /** Global flag indicating if caching is enabled (controls eviction) */
    private static boolean _activeFlag = false;

    /** Global sequence for generating unique ids. */
    private static IDSequence _seq = null;

    // Global eviction path and prefix (prefix used for isolation purposes)
    public static String cacheEvictionLocalFilePath = null; //set during init
    public static String cacheEvictionLocalFilePrefix = "cache";

    /** Current state of pinned variables, required for guarded collect. */
    private static ThreadLocal<Long> sizePinned = new ThreadLocal<Long>() {
        @Override
        protected Long initialValue() {
            return 0L;
        }
    };

    static {
        _seq = new IDSequence();
    }

    /**
     * The unique (JVM-wide) ID of a cacheable data object; to ensure unique IDs across JVMs, we
     * concatenate filenames with a unique prefix (map task ID).
     */
    private final int _uniqueID;

    /** The cache status of the data blob (whether it can be or is evicted, etc.). */
    private CacheStatus _cacheStatus = null;

    /** Cache for actual data, evicted by garbage collector. */
    protected SoftReference<T> _cache = null;

    /** Container object that holds the actual data. */
    protected T _data = null;

    /**
     * Object that holds the metadata associated with the matrix, which
     * includes: 1) matrix dimensions, if available, 2) number of non-zeros, if
     * available, 3) block dimensions, if applicable, and 4) InputInfo -- subsequent
     * operations that use this matrix expect it to be in this format.
     *
     * When the matrix is written to HDFS (or the local file system), one
     * must get the OutputInfo that matches the InputInfo stored inside _mtd.
     */
    protected MetaData _metaData = null;

    /** The name of the HDFS file in which the data is backed up. */
    protected String _hdfsFileName = null; // file name and path

    /**
     * Flag that indicates whether or not the hdfs file exists. It is used
     * for improving the performance of the "rmvar" instruction. When it has
     * value <code>false</code>, one can skip file system existence
     * checks which can be expensive.
     */
    private boolean _hdfsFileExists = false;
    /** Information relevant to specific external file formats. */
    private FileFormatProperties _formatProps = null;

    /**
     * <code>true</code> if the in-memory or evicted matrix may be different from
     * the matrix located at {@link #_hdfsFileName}; <code>false</code> if the two
     * matrices should be the same.
     */
    private boolean _dirtyFlag = false;

    // additional private flags and meta data
    private int _numReadThreads = 0;             //number of threads for read from HDFS
    private boolean _cleanupFlag = true;         //flag if obj unpinned (cleanup enabled)
    private String _varName = "";                //plan variable name
    private String _cacheFileName = null;        //local eviction file name
    private boolean _requiresLocalWrite = false; //flag if local write for read obj
    private boolean _isAcquireFromEmpty = false; //flag if read from status empty

    //spark-specific handles
    //note: we use the abstraction of LineageObjects for two reasons: (1) to keep track of cleanup
    //for lazily evaluated RDDs, and (2) as an abstraction for environments that do not necessarily
    //have spark libraries available
    private RDDObject _rddHandle = null;        //RDD handle
    private BroadcastObject<T> _bcHandle = null; //Broadcast handle
    protected GPUObject _gpuHandle = null;

    /**
     * Basic constructor for any cacheable data.
     *
     * @param dt data type
     * @param vt value type
     */
    protected CacheableData(DataType dt, ValueType vt) {
        super(dt, vt);
        _uniqueID = (int) _seq.getNextID();
        _cacheStatus = CacheStatus.EMPTY;
        _numReadThreads = 0;
    }

    /**
     * Copy constructor for cacheable data (of same type).
     *
     * @param that cacheable data object
     */
    protected CacheableData(CacheableData<T> that) {
        this(that.getDataType(), that.getValueType());
        _cleanupFlag = that._cleanupFlag;
        _hdfsFileName = that._hdfsFileName;
        _hdfsFileExists = that._hdfsFileExists;
        _varName = that._varName;
        _gpuHandle = that._gpuHandle;
    }

    /**
     * Enables or disables the cleanup of the associated
     * data object on clearData().
     *
     * @param flag true if cleanup
     */
    public void enableCleanup(boolean flag) {
        _cleanupFlag = flag;
    }

    /**
     * Indicates if cleanup of the associated data object
     * is enabled on clearData().
     *
     * @return true if cleanup enabled
     */
    public boolean isCleanupEnabled() {
        return _cleanupFlag;
    }

    public void setVarName(String s) {
        _varName = s;
    }

    public String getVarName() {
        return _varName;
    }

    public boolean isHDFSFileExists() {
        return _hdfsFileExists;
    }

    public void setHDFSFileExists(boolean flag) {
        _hdfsFileExists = flag;
    }

    public String getFileName() {
        return _hdfsFileName;
    }

    public synchronized void setFileName(String file) {
        if (_hdfsFileName != null && !_hdfsFileName.equals(file))
            if (!isEmpty(true))
                _dirtyFlag = true;
        _hdfsFileName = file;
    }
    /**
     * <code>true</code> if the in-memory or evicted matrix may be different from
     * the matrix located at {@link #_hdfsFileName}; <code>false</code> if the two
     * matrices are supposed to be the same.
     *
     * @return true if dirty
     */
    public boolean isDirty() {
        return _dirtyFlag;
    }

    public void setDirty(boolean flag) {
        _dirtyFlag = flag;
    }

    public FileFormatProperties getFileFormatProperties() {
        return _formatProps;
    }

    public void setFileFormatProperties(FileFormatProperties props) {
        _formatProps = props;
    }

    @Override
    public void setMetaData(MetaData md) {
        _metaData = md;
    }

    @Override
    public MetaData getMetaData() {
        return _metaData;
    }

    @Override
    public void removeMetaData() {
        _metaData = null;
    }

    public MatrixCharacteristics getMatrixCharacteristics() {
        MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
        return meta.getMatrixCharacteristics();
    }

    public abstract void refreshMetaData() throws CacheException;

    public RDDObject getRDDHandle() {
        return _rddHandle;
    }

    public void setRDDHandle(RDDObject rdd) {
        //cleanup potential old back reference
        if (_rddHandle != null)
            _rddHandle.setBackReference(null);

        //add new rdd handle
        _rddHandle = rdd;
        if (_rddHandle != null)
            rdd.setBackReference(this);
    }

    public BroadcastObject<T> getBroadcastHandle() {
        return _bcHandle;
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    public void setBroadcastHandle(BroadcastObject bc) {
        //cleanup potential old back reference
        if (_bcHandle != null)
            _bcHandle.setBackReference(null);

        //add new broadcast handle
        _bcHandle = bc;
        if (_bcHandle != null)
            bc.setBackReference(this);
    }

    public GPUObject getGPUObject() {
        return _gpuHandle;
    }

    public void setGPUObject(GPUObject handle) {
        _gpuHandle = handle;
    }

    // *********************************************
    // ***                                       ***
    // ***   HIGH-LEVEL METHODS THAT SPECIFY     ***
    // ***   THE LOCKING AND CACHING INTERFACE   ***
    // ***                                       ***
    // *********************************************
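    // Illustrative usage sketch (not part of this class): the typical lock
    // lifecycle as seen by instruction code, assuming 'mo' is a MatrixObject
    // (the most common concrete subclass) obtained from the symbol table.
    //
    //   MatrixBlock mb = mo.acquireRead(); //pin block in memory (READ, count+1)
    //   ...read-only access to mb...       //no writes allowed under the read lock
    //   mo.release();                      //unpin (count-1); block becomes evictable
    //
    // Every acquire* call must be paired with exactly one release(); an
    // unbalanced acquire leaves the block pinned and disables eviction.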
    /**
     * Acquires a shared "read-only" lock, produces the reference to the cache block,
     * restores the cache block to main memory, and reads from HDFS if needed.
     *
     * Synchronized because there might be parallel threads (parfor local) that
     * access the same object (in case it was created before the loop).
     *
     * In-Status:  EMPTY, EVICTABLE, EVICTED, READ;
     * Out-Status: READ(+1).
     *
     * @return cacheable data
     * @throws CacheException if CacheException occurs
     */
    public synchronized T acquireRead() throws CacheException {
        if (LOG.isTraceEnabled())
            LOG.trace("Acquire read " + getVarName());
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

        if (!isAvailableToRead())
            throw new CacheException("MatrixObject not available to read.");

        //get object from cache
        if (_data == null)
            getCache();

        //call acquireHostRead if gpuHandle is set as well as is allocated
        if (_gpuHandle != null && _gpuHandle.isAllocated()) {
            _gpuHandle.acquireHostRead();
            if (_data == null)
                getCache();
        }

        //read data from HDFS/RDD if required
        //(probe data for cache_nowrite / jvm_reuse)
        if (isEmpty(true) && _data == null) {
            try {
                if (DMLScript.STATISTICS)
                    CacheStatistics.incrementHDFSHits();

                if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead()) {
                    //check filename
                    if (_hdfsFileName == null)
                        throw new CacheException("Cannot read matrix for empty filename.");

                    //read cacheable data from hdfs
                    _data = readBlobFromHDFS(_hdfsFileName);

                    //mark for initial local write despite read operation
                    _requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
                }
                else {
                    //read matrix from rdd (incl execute pending rdd operations)
                    MutableBoolean writeStatus = new MutableBoolean();
                    _data = readBlobFromRDD(getRDDHandle(), writeStatus);

                    //mark for initial local write (prevent repeated execution of rdd operations)
                    if (writeStatus.booleanValue())
                        _requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
                    else
                        _requiresLocalWrite = true;
                }

                setDirty(false);
            }
            catch (IOException e) {
                throw new CacheException("Reading of " + _hdfsFileName + " (" + getVarName() + ") failed.", e);
            }

            _isAcquireFromEmpty = true;
        }
        else if (DMLScript.STATISTICS) {
            if (_data != null)
                CacheStatistics.incrementMemHits();
        }

        //cache status maintenance
        acquire(false, _data == null);
        updateStatusPinned(true);

        if (DMLScript.STATISTICS) {
            long t1 = System.nanoTime();
            CacheStatistics.incrementAcquireRTime(t1 - t0);
        }

        return _data;
    }

    /**
     * Acquires the exclusive "write" lock for a thread that wants to change cache block
     * cell values. Produces the reference to the cache block, restores the cache block
     * to main memory, and reads from HDFS if needed.
     *
     * In-Status:  EMPTY, EVICTABLE, EVICTED;
     * Out-Status: MODIFY.
     *
     * @return cacheable data
     * @throws CacheException if CacheException occurs
     */
    public synchronized T acquireModify() throws CacheException {
        if (LOG.isTraceEnabled())
            LOG.trace("Acquire modify " + getVarName());
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

        if (!isAvailableToModify())
            throw new CacheException("MatrixObject not available to modify.");

        //get object from cache
        if (_data == null)
            getCache();

        //read data from HDFS if required
        if (isEmpty(true) && _data == null) {
            //check filename
            if (_hdfsFileName == null)
                throw new CacheException("Cannot read matrix for empty filename.");

            //load data
            try {
                _data = readBlobFromHDFS(_hdfsFileName);
            }
            catch (IOException e) {
                throw new CacheException("Reading of " + _hdfsFileName + " (" + getVarName() + ") failed.", e);
            }
        }

        //cache status maintenance
        acquire(true, _data == null);
        updateStatusPinned(true);
        setDirty(true);
        _isAcquireFromEmpty = false;

        if (DMLScript.STATISTICS) {
            long t1 = System.nanoTime();
            CacheStatistics.incrementAcquireMTime(t1 - t0);
        }

        return _data;
    }
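    // Illustrative sketch of an in-place update under the exclusive lock
    // (hypothetical cell update on a MatrixObject 'mo'):
    //
    //   MatrixBlock mb = mo.acquireModify(); //exclusive lock, marks object dirty
    //   mb.setValue(0, 0, 7.0);              //mutate the pinned cache block
    //   mo.release();                        //refresh meta data, re-enable eviction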
    /**
     * Acquires the exclusive "write" lock for a thread that wants to throw away the
     * old cache block data and link up with new cache block data. Abandons the old data
     * without reading it and sets the new data reference.
     *
     * In-Status:  EMPTY, EVICTABLE, EVICTED;
     * Out-Status: MODIFY.
     *
     * @param newData new data
     * @return cacheable data
     * @throws CacheException if CacheException occurs
     */
    public synchronized T acquireModify(T newData) throws CacheException {
        if (LOG.isTraceEnabled())
            LOG.trace("Acquire modify newdata " + getVarName());
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

        if (!isAvailableToModify())
            throw new CacheException("CacheableData not available to modify.");

        //clear old data
        clearData();

        //cache status maintenance
        acquire(true, false); //no need to load evicted matrix
        setDirty(true);
        _isAcquireFromEmpty = false;

        //set references to new data
        if (newData == null)
            throw new CacheException("acquireModify with empty cache block.");
        _data = newData;
        updateStatusPinned(true);

        if (DMLScript.STATISTICS) {
            long t1 = System.nanoTime();
            CacheStatistics.incrementAcquireMTime(t1 - t0);
        }

        return _data;
    }

    /**
     * Releases the shared ("read-only") or exclusive ("write") lock. Updates
     * size information, last-access time, metadata, etc.
     *
     * Synchronized because there might be parallel threads (parfor local) that
     * access the same object (in case it was created before the loop).
     *
     * In-Status:  READ, MODIFY;
     * Out-Status: READ(-1), EVICTABLE, EMPTY.
     *
     * @throws CacheException if CacheException occurs
     */
    public synchronized void release() throws CacheException {
        if (LOG.isTraceEnabled())
            LOG.trace("Release " + getVarName());
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

        boolean write = false;
        if (isModify()) {
            //set flags for write
            write = true;
            setDirty(true);

            //update meta data
            refreshMetaData();
        }

        //compact empty in-memory block
        _data.compactEmptyBlock();

        //cache status maintenance (pass cacheNoWrite flag)
        release(_isAcquireFromEmpty && !_requiresLocalWrite);
        updateStatusPinned(false);

        if (isCachingActive()          //only if caching is enabled (otherwise keep everything in mem)
            && isCached(true)          //not empty and not read/modify
            && !isBelowCachingThreshold()) //min size for caching
        {
            if (write || _requiresLocalWrite) {
                //evict blob
                String filePath = getCacheFilePathAndName();
                try {
                    LazyWriteBuffer.writeBlock(filePath, _data);
                }
                catch (Exception e) {
                    throw new CacheException("Eviction to local path " + filePath + " (" + getVarName() + ") failed.", e);
                }
                _requiresLocalWrite = false;
            }

            //create cache
            createCache();
            _data = null;
        }
        else if (LOG.isTraceEnabled()) {
            LOG.trace("Var " + getVarName() + " not subject to caching, state=" + getStatusAsString());
        }

        if (DMLScript.STATISTICS) {
            long t1 = System.nanoTime();
            CacheStatistics.incrementReleaseTime(t1 - t0);
        }
    }

    protected void clearReusableData() {
    }
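    // Illustrative result-handoff sketch: a result block computed outside the
    // envelope is linked in via acquireModify(T), then unpinned (hypothetical names):
    //
    //   MatrixBlock result = ...;  //newly computed block
    //   mo.acquireModify(result);  //abandon old data, link new block (MODIFY)
    //   mo.release();              //refresh meta data; may write to lazy write buffer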
    /**
     * Sets the cache block reference to <code>null</code> and abandons the old block.
     * Makes the "envelope" empty. Run it to finalize the object (otherwise the
     * evicted cache block file may remain undeleted).
     *
     * In-Status:  EMPTY, EVICTABLE, EVICTED;
     * Out-Status: EMPTY.
     *
     * @throws CacheException if CacheException occurs
     */
    public synchronized void clearData() throws CacheException {
        if (LOG.isTraceEnabled())
            LOG.trace("Clear data " + getVarName());

        // check if cleanup enabled and possible
        if (!isCleanupEnabled())
            return; // do nothing
        if (!isAvailableToModify())
            throw new CacheException("CacheableData (" + getDebugName() + ") not available to "
                + "modify. Status = " + getStatusAsString() + ".");

        // clear existing WB / FS representation (but prevent unnecessary probes)
        if (!(isEmpty(true)
            || (_data != null && isBelowCachingThreshold())
            || (_data != null && !isCachingActive()))) //additional condition for JMLC
            freeEvictedBlob();

        // clear the in-memory data
        clearReusableData();
        _data = null;
        clearCache();

        // clear rdd/broadcast back refs
        if (_rddHandle != null)
            _rddHandle.setBackReference(null);
        if (_bcHandle != null)
            _bcHandle.setBackReference(null);
        if (_gpuHandle != null)
            _gpuHandle.clearData();

        // change object state EMPTY
        setDirty(false);
        setEmpty();
    }

    public synchronized void exportData() throws CacheException {
        exportData(-1);
    }

    /**
     * Writes, or flushes, the cache block data to HDFS.
     *
     * In-Status:  EMPTY, EVICTABLE, EVICTED, READ;
     * Out-Status: EMPTY, EVICTABLE, EVICTED, READ.
     *
     * @param replication HDFS replication factor (-1 for the default)
     * @throws CacheException if CacheException occurs
     */
    public synchronized void exportData(int replication) throws CacheException {
        exportData(_hdfsFileName, null, replication, null);
        _hdfsFileExists = true;
    }

    public synchronized void exportData(String fName, String outputFormat) throws CacheException {
        exportData(fName, outputFormat, -1, null);
    }

    public synchronized void exportData(String fName, String outputFormat, FileFormatProperties formatProperties)
        throws CacheException
    {
        exportData(fName, outputFormat, -1, formatProperties);
    }
    /**
     * Synchronized because there might be parallel threads (parfor local) that
     * access the same object (in case it was created before the loop).
     * If all threads export the same data object concurrently, this results in errors
     * because they all write to the same file. Efficiency for loops and parallel threads
     * is achieved by checking if the in-memory block is dirty.
     *
     * NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore
     * the output format and, most importantly, would bypass reblocking during write (which affects
     * the potential degree of parallelism). However, we copy files on HDFS if certain criteria are given.
     *
     * @param fName file name
     * @param outputFormat format
     * @param replication HDFS replication factor (-1 for the default)
     * @param formatProperties file format properties
     * @throws CacheException if CacheException occurs
     */
    public synchronized void exportData(String fName, String outputFormat, int replication,
        FileFormatProperties formatProperties) throws CacheException
    {
        if (LOG.isTraceEnabled())
            LOG.trace("Export data " + getVarName() + " " + fName);
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

        //prevent concurrent modifications
        if (!isAvailableToRead())
            throw new CacheException("MatrixObject not available to read.");

        LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat);

        //TODO remove
        if (getGPUObject() != null && getGPUObject().isAllocated()) {
            getGPUObject().acquireHostRead();
        }

        boolean pWrite = false; // !fName.equals(_hdfsFileName); //persistent write flag
        if (fName.equals(_hdfsFileName)) {
            setHDFSFileExists(true);
            pWrite = false;
        }
        else {
            pWrite = true; // i.e., export is called from "write" instruction
        }

        //actual export (note: no direct transfer of local copy in order to ensure blocking (and hence, parallelism))
        if (isDirty() //use dirty for skipping parallel exports
            || (pWrite && !isEqualOutputFormat(outputFormat)))
        {
            // CASE 1: dirty in-mem matrix or pWrite w/ different format (write matrix to fname; load into memory if evicted)

            // a) get the matrix
            if (isEmpty(true)) {
                //read data from HDFS if required (never read before), this applies only to pWrite w/ different output formats
                //note: for large rdd outputs, we compile dedicated write spinstructions (no need to handle this here)
                try {
                    if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead())
                        _data = readBlobFromHDFS(_hdfsFileName);
                    else
                        _data = readBlobFromRDD(getRDDHandle(), new MutableBoolean());
                    setDirty(false);
                }
                catch (IOException e) {
                    throw new CacheException("Reading of " + _hdfsFileName + " (" + getVarName() + ") failed.", e);
                }
            }

            //get object from cache
            if (_data == null)
                getCache();
            acquire(false, _data == null); //incl. read matrix if evicted

            // b) write the matrix
            try {
                writeMetaData(fName, outputFormat, formatProperties);
                writeBlobToHDFS(fName, outputFormat, replication, formatProperties);
                if (!pWrite)
                    setDirty(false);
            }
            catch (Exception e) {
                throw new CacheException("Export to " + fName + " failed.", e);
            }
            finally {
                release();
            }
        }
        else if (pWrite) // pwrite with same output format
        {
            //CASE 2: matrix already in same format but different file on hdfs (copy matrix to fname)
            try {
                MapReduceTool.deleteFileIfExistOnHDFS(fName);
                MapReduceTool.deleteFileIfExistOnHDFS(fName + ".mtd");
                if (getRDDHandle() == null || getRDDHandle().allowsShortCircuitRead())
                    MapReduceTool.copyFileOnHDFS(_hdfsFileName, fName);
                else //write might trigger rdd operations and nnz maintenance
                    writeBlobFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
                writeMetaData(fName, outputFormat, formatProperties);
            }
            catch (Exception e) {
                throw new CacheException("Export to " + fName + " failed.", e);
            }
        }
        else if (getRDDHandle() != null //pending rdd operation
            && !getRDDHandle().allowsShortCircuitRead())
        {
            //CASE 3: pending rdd operation (other than checkpoints)
            try {
                writeBlobFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
                writeMetaData(fName, outputFormat, formatProperties);
            }
            catch (Exception e) {
                throw new CacheException("Export to " + fName + " failed.", e);
            }
        }
        else {
            //CASE 4: data already in hdfs (do nothing, no need for export)
            LOG.trace(this.getDebugName() + ": Skip export to hdfs since data already exists.");
        }

        if (DMLScript.STATISTICS) {
            long t1 = System.nanoTime();
            CacheStatistics.incrementExportTime(t1 - t0);
        }
    }
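    // Illustrative export sketch (hypothetical target file name; the format string
    // must be one understood by OutputInfo.stringToOutputInfo, e.g., "csv"):
    //
    //   mo.exportData("/user/ml/X", "csv"); //writes data file and X.mtd meta data
    //
    // If the in-memory copy is not dirty and the target format matches, the call
    // reduces to an HDFS file copy (case 2) or a no-op (case 4).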
    // --------- ABSTRACT LOW-LEVEL CACHE I/O OPERATIONS ----------

    /**
     * Checks if the data blob reference points to some in-memory object.
     * This method is called when releasing the (last) lock. Do not call
     * this method for a blob that has been evicted.
     *
     * @return <code>true</code> if the blob is in main memory and the
     *         reference points to it;
     *         <code>false</code> if the blob reference is <code>null</code>.
     */
    protected boolean isBlobPresent() {
        return (_data != null);
    }

    /**
     * Low-level cache I/O method that physically restores the data blob to
     * main memory. Must be defined by a subclass, never called by users.
     *
     * @throws CacheException if CacheException occurs
     */
    protected void restoreBlobIntoMemory() throws CacheException {
        String cacheFilePathAndName = getCacheFilePathAndName();
        long begin = LOG.isTraceEnabled() ? System.currentTimeMillis() : 0;

        if (LOG.isTraceEnabled())
            LOG.trace("CACHE: Restoring matrix... " + getVarName() + " HDFS path: "
                + (_hdfsFileName == null ? "null" : _hdfsFileName)
                + ", Restore from path: " + cacheFilePathAndName);

        if (_data != null)
            throw new CacheException(cacheFilePathAndName + " : Cannot restore on top of existing in-memory data.");

        try {
            _data = readBlobFromCache(cacheFilePathAndName);
        }
        catch (IOException e) {
            throw new CacheException(cacheFilePathAndName + " : Restore failed.", e);
        }

        //check for success
        if (_data == null)
            throw new CacheException(cacheFilePathAndName + " : Restore failed.");

        if (LOG.isTraceEnabled())
            LOG.trace("Restoring matrix - COMPLETED ... " + (System.currentTimeMillis() - begin) + " msec.");
    }

    protected abstract T readBlobFromCache(String fname) throws IOException;

    /**
     * Low-level cache I/O method that deletes the file containing the
     * evicted data blob, without reading it.
     * Must be defined by a subclass, never called by users.
     */
    protected void freeEvictedBlob() {
        String cacheFilePathAndName = getCacheFilePathAndName();
        long begin = LOG.isTraceEnabled() ? System.currentTimeMillis() : 0;

        if (LOG.isTraceEnabled())
            LOG.trace("CACHE: Freeing evicted matrix... " + getVarName() + " HDFS path: "
                + (_hdfsFileName == null ? "null" : _hdfsFileName)
                + " Eviction path: " + cacheFilePathAndName);

        LazyWriteBuffer.deleteBlock(cacheFilePathAndName);

        if (LOG.isTraceEnabled())
            LOG.trace("Freeing evicted matrix - COMPLETED ... " + (System.currentTimeMillis() - begin) + " msec.");
    }

    protected boolean isBelowCachingThreshold() {
        return (_data.getInMemorySize() <= CACHING_THRESHOLD);
    }

    protected ValueType[] getSchema() {
        return null;
    }
    @Override //Data
    public synchronized String getDebugName() {
        int maxLength = 23;
        String debugNameEnding = (_hdfsFileName == null ? "null" :
            (_hdfsFileName.length() < maxLength ? _hdfsFileName :
                "..." + _hdfsFileName.substring(_hdfsFileName.length() - maxLength + 3)));
        return getVarName() + " " + debugNameEnding;
    }

    protected T readBlobFromHDFS(String fname) throws IOException {
        MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
        MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
        return readBlobFromHDFS(fname, mc.getRows(), mc.getCols());
    }

    protected abstract T readBlobFromHDFS(String fname, long rlen, long clen)
        throws IOException;

    protected abstract T readBlobFromRDD(RDDObject rdd, MutableBoolean status)
        throws IOException;

    protected abstract void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop)
        throws IOException, DMLRuntimeException;

    protected abstract void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String ofmt)
        throws IOException, DMLRuntimeException;

    protected void writeMetaData(String filePathAndName, String outputFormat, FileFormatProperties formatProperties)
        throws DMLRuntimeException, IOException
    {
        MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;

        if (iimd == null)
            throw new DMLRuntimeException("Unexpected error while writing mtd file ("
                + filePathAndName + ") -- metadata is null.");

        // Write the matrix to HDFS in requested format
        OutputInfo oinfo = (outputFormat != null ? OutputInfo.stringToOutputInfo(outputFormat)
            : InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));

        if (oinfo != OutputInfo.MatrixMarketOutputInfo) {
            // Get the dimension information from the metadata stored within MatrixObject
            MatrixCharacteristics mc = iimd.getMatrixCharacteristics();

            // when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions
            // note: this is only required if singlenode (due to binarycell default)
            if (oinfo == OutputInfo.BinaryBlockOutputInfo
                && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE
                && (mc.getRowsPerBlock() != ConfigurationManager.getBlocksize()
                    || mc.getColsPerBlock() != ConfigurationManager.getBlocksize()))
            {
                mc = new MatrixCharacteristics(mc.getRows(), mc.getCols(),
                    ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(), mc.getNonZeros());
            }

            //write the actual meta data file
            MapReduceTool.writeMetaDataFile(filePathAndName + ".mtd",
                valueType, getSchema(), dataType, mc, oinfo, formatProperties);
        }
    }

    protected boolean isEqualOutputFormat(String outputFormat) {
        boolean ret = true;
        if (outputFormat != null) {
            try {
                MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
                OutputInfo oi1 = InputInfo.getMatchingOutputInfo(iimd.getInputInfo());
                OutputInfo oi2 = OutputInfo.stringToOutputInfo(outputFormat);
                if (oi1 != oi2)
                    ret = false;
            }
            catch (Exception ex) {
                ret = false;
            }
        }
        return ret;
    }

    // ------------- IMPLEMENTED CACHE LOGIC METHODS --------------

    protected int getUniqueCacheID() {
        return _uniqueID;
    }

    protected String getCacheFilePathAndName() {
        if (_cacheFileName == null) {
            StringBuilder sb = new StringBuilder();
            sb.append(CacheableData.cacheEvictionLocalFilePath);
            sb.append(CacheableData.cacheEvictionLocalFilePrefix);
            sb.append(String.format("%09d", getUniqueCacheID()));
            sb.append(CacheableData.CACHING_EVICTION_FILEEXTENSION);
            _cacheFileName = sb.toString();
        }
        return _cacheFileName;
    }
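    // For example (illustrative values): with cacheEvictionLocalFilePath
    // "/tmp/systemml/<uuid>/cache/" and unique ID 42, the eviction file name is
    // "/tmp/systemml/<uuid>/cache/cache000000042.dat" ("cache" prefix, zero-padded
    // 9-digit ID, ".dat" extension).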
    /**
     * This method "acquires the lock" to ensure that the data blob is in main memory
     * (not evicted) while it is being accessed. When called, the method will try to
     * restore the blob if it has been evicted. There are two kinds of locks it may
     * acquire: a shared "read" lock (if the argument is <code>false</code>) or the
     * exclusive "modify" lock (if the argument is <code>true</code>).
     * The method can fail in three ways:
     * (1) if there is a lock status conflict;
     * (2) if there is not enough cache memory to restore the blob;
     * (3) if the restore method returns an error.
     * The method locks the data blob in memory (which disables eviction) and updates
     * its last-access timestamp. For the shared "read" lock, acquiring a new lock
     * increments the associated count. The "read" count has to be decremented once
     * the blob is no longer used, which may re-enable eviction. This method has to
     * be called only once per matrix operation and coupled with {@link #release()},
     * because it increments the lock count and the other method decrements this count.
     *
     * @param isModify <code>true</code> for the exclusive "modify" lock,
     *                 <code>false</code> for a shared "read" lock.
     * @param restore true if restore
     * @throws CacheException if CacheException occurs
     */
    protected void acquire(boolean isModify, boolean restore) throws CacheException {
        switch (_cacheStatus) {
            case CACHED:
                if (restore)
                    restoreBlobIntoMemory();
                //intentional fall-through to acquire the lock
            case CACHED_NOWRITE:
            case EMPTY:
                if (isModify)
                    setModify();
                else
                    addOneRead();
                break;
            case READ:
                if (isModify)
                    throw new CacheException("READ-MODIFY not allowed.");
                else
                    addOneRead();
                break;
            case MODIFY:
                throw new CacheException("MODIFY-MODIFY not allowed.");
        }

        if (LOG.isTraceEnabled())
            LOG.trace("Acquired lock on " + this.getDebugName() + ", status: " + this.getStatusAsString());
    }
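    // Summary of the state transitions implemented by acquire() above and
    // release(boolean) below (derived from the switch logic):
    //
    //   EMPTY | CACHED | CACHED_NOWRITE --acquire(read)---> READ (count+1)
    //   EMPTY | CACHED | CACHED_NOWRITE --acquire(modify)-> MODIFY
    //   READ (count>1) --release--> READ (count-1)
    //   READ (count=1) --release--> CACHED | CACHED_NOWRITE | EMPTY
    //   MODIFY         --release--> CACHED | EMPTY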
    /**
     * Call this method to permit eviction for the stored data blob, or to
     * decrement its "read" count if it is "read"-locked by other threads.
     * It is expected that you eliminate all external references to the blob
     * prior to calling this method, because otherwise eviction will
     * duplicate the blob, but not release memory. This method has to be
     * called only once per process and coupled with {@link #acquire(boolean, boolean)},
     * because it decrements the lock count and the other method increments
     * the lock count.
     *
     * @param cacheNoWrite if true, a released read lock moves the status to
     *                     CACHED_NOWRITE instead of CACHED (no local eviction file written)
     * @throws CacheException if CacheException occurs
     */
    protected void release(boolean cacheNoWrite) throws CacheException {
        switch (_cacheStatus) {
            case EMPTY:
            case CACHED:
            case CACHED_NOWRITE:
                throw new CacheException("Redundant release.");
            case READ:
                removeOneRead(isBlobPresent(), cacheNoWrite);
                break;
            case MODIFY:
                if (isBlobPresent())
                    setCached();
                else
                    setEmpty();
                break;
        }

        if (LOG.isTraceEnabled())
            LOG.trace("Released lock on " + this.getDebugName() + ", status: " + this.getStatusAsString());
    }

    // **************************************************
    // ***                                            ***
    // ***  CACHE STATUS FIELD - CLASSES AND METHODS  ***
    // ***                                            ***
    // **************************************************

    public String getStatusAsString() {
        return _cacheStatus.toString();
    }

    public boolean isCached(boolean inclCachedNoWrite) {
        if (inclCachedNoWrite)
            return (_cacheStatus == CacheStatus.CACHED || _cacheStatus == CacheStatus.CACHED_NOWRITE);
        else
            return (_cacheStatus == CacheStatus.CACHED);
    }

    public void setEmptyStatus() {
        setEmpty();
    }

    protected boolean isEmpty(boolean inclCachedNoWrite) {
        if (inclCachedNoWrite)
            return (_cacheStatus == CacheStatus.EMPTY || _cacheStatus == CacheStatus.CACHED_NOWRITE);
        else
            return (_cacheStatus == CacheStatus.EMPTY);
    }

    protected boolean isModify() {
        return (_cacheStatus == CacheStatus.MODIFY);
    }

    protected void setEmpty() {
        _cacheStatus = CacheStatus.EMPTY;
    }

    protected void setModify() {
        _cacheStatus = CacheStatus.MODIFY;
    }

    protected void setCached() {
        _cacheStatus = CacheStatus.CACHED;
    }

    protected void addOneRead() {
        _numReadThreads++;
        _cacheStatus = CacheStatus.READ;
    }

    protected void removeOneRead(boolean doesBlobExist, boolean cacheNoWrite) {
        _numReadThreads--;
        if (_numReadThreads == 0) {
            if (cacheNoWrite)
                _cacheStatus = (doesBlobExist ? CacheStatus.CACHED_NOWRITE : CacheStatus.EMPTY);
            else
                _cacheStatus = (doesBlobExist ? CacheStatus.CACHED : CacheStatus.EMPTY);
        }
    }

    protected boolean isAvailableToRead() {
        return (_cacheStatus == CacheStatus.EMPTY
            || _cacheStatus == CacheStatus.CACHED
            || _cacheStatus == CacheStatus.CACHED_NOWRITE
            || _cacheStatus == CacheStatus.READ);
    }

    protected boolean isAvailableToModify() {
        return (_cacheStatus == CacheStatus.EMPTY
            || _cacheStatus == CacheStatus.CACHED
            || _cacheStatus == CacheStatus.CACHED_NOWRITE);
    }

    // *******************************************
    // ***                                     ***
    // ***      LOW-LEVEL PRIVATE METHODS      ***
    // ***       FOR SOFTREFERENCE CACHE       ***
    // ***                                     ***
    // *******************************************

    /**
     * Creates a new cache soft reference to the currently
     * referenced cache block.
     */
    protected void createCache() {
        _cache = new SoftReference<T>(_data);
    }

    /**
     * Tries to get the cache block from the cache soft reference
     * and subsequently clears the cache soft reference if existing.
     */
    protected void getCache() {
        if (_cache != null) {
            _data = _cache.get();
            clearCache();
        }
    }

    /** Clears the cache soft reference if existing. */
    protected void clearCache() {
        if (_cache != null) {
            _cache.clear();
            _cache = null;
        }
    }

    protected void updateStatusPinned(boolean add) {
        if (_data != null) { //data should never be null
            long size = sizePinned.get();
            size += (add ? 1 : -1) * _data.getInMemorySize();
            sizePinned.set(Math.max(size, 0));
        }
    }

    protected long getPinnedSize() {
        return sizePinned.get();
    }

    // --------- STATIC CACHE INIT/CLEANUP OPERATIONS ----------

    public synchronized static void cleanupCacheDir() {
        //cleanup remaining cached writes
        LazyWriteBuffer.cleanup();

        //delete cache dir and files
        cleanupCacheDir(true);
    }
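    // Illustrative lifecycle of the static caching infrastructure (hypothetical
    // call site, e.g., around script execution):
    //
    //   CacheableData.initCaching();     //create working dir, init lazy write buffer
    //   ...execute runtime program...
    //   CacheableData.cleanupCacheDir(); //flush buffer, delete eviction files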
    /**
     * Deletes the DML-script-specific caching working dir.
     *
     * @param withDir if true, delete the directory itself
     */
    public synchronized static void cleanupCacheDir(boolean withDir) {
        //get directory name
        String dir = cacheEvictionLocalFilePath;

        //clean files with cache prefix
        if (dir != null) { //if previous init cache
            File fdir = new File(dir);
            if (fdir.exists()) { //just for robustness
                File[] files = fdir.listFiles();
                for (File f : files)
                    if (f.getName().startsWith(cacheEvictionLocalFilePrefix))
                        f.delete();
                if (withDir)
                    fdir.delete(); //deletes dir only if empty
            }
        }

        _activeFlag = false;
    }

    /**
     * Inits caching with the default uuid of DMLScript.
     *
     * @throws IOException if IOException occurs
     */
    public synchronized static void initCaching() throws IOException {
        initCaching(DMLScript.getUUID());
    }

    /**
     * Creates the DML-script-specific caching working dir.
     *
     * Takes the UUID in order to allow for a custom uuid, e.g., for remote parfor caching.
     *
     * @param uuid ID
     * @throws IOException if IOException occurs
     */
    public synchronized static void initCaching(String uuid) throws IOException {
        try {
            String dir = LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_CACHE);
            LocalFileUtils.createLocalFileIfNotExist(dir);
            cacheEvictionLocalFilePath = dir;
        }
        catch (DMLRuntimeException e) {
            throw new IOException(e);
        }

        //init write-ahead buffer
        LazyWriteBuffer.init();

        _activeFlag = true; //turn on caching
    }

    public static synchronized boolean isCachingActive() {
        return _activeFlag;
    }

    public static synchronized void disableCaching() {
        _activeFlag = false;
    }

    public static synchronized void enableCaching() {
        _activeFlag = true;
    }

    public synchronized boolean moveData(String fName, String outputFormat) throws CacheException {
        boolean ret = false;

        try {
            //export or rename to target file on hdfs
            if ((isDirty() || (!isEqualOutputFormat(outputFormat) && isEmpty(true)))
                || (getRDDHandle() != null && !MapReduceTool.existsFileOnHDFS(_hdfsFileName)))
            {
                exportData(fName, outputFormat);
                ret = true;
            }
            else if (isEqualOutputFormat(outputFormat)) {
                MapReduceTool.deleteFileIfExistOnHDFS(fName);
                MapReduceTool.deleteFileIfExistOnHDFS(fName + ".mtd");
                MapReduceTool.renameFileOnHDFS(_hdfsFileName, fName);
                writeMetaData(fName, outputFormat, null);
                ret = true;
            }
        }
        catch (Exception e) {
            throw new CacheException("Move to " + fName + " failed.", e);
        }

        return ret;
    }

    @Override
    public String toString() {
        StringBuilder str = new StringBuilder();
        str.append(getClass().getSimpleName());
        str.append(": ");
        str.append(_hdfsFileName + ", ");

        if (_metaData instanceof NumItemsByEachReducerMetaData) {
            str.append("NumItemsByEachReducerMetaData");
        }
        else {
            try {
                MatrixFormatMetaData md = (MatrixFormatMetaData) _metaData;
                if (md != null) {
                    MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics();
                    str.append(mc.toString());

                    InputInfo ii = md.getInputInfo();
                    if (ii == null)
                        str.append("null");
                    else {
                        str.append(", ");
                        str.append(InputInfo.inputInfoToString(ii));
                    }
                }
                else {
                    str.append("null, null");
                }
            }
            catch (Exception ex) {
                LOG.error(ex);
            }
        }
        str.append(", ");
        str.append(isDirty() ? "dirty" : "not-dirty");

        return str.toString();
    }
}