gov.nih.nci.caarray.dataStorage.DataStorageFacade.java Source code

Introduction

Here is the source code for gov.nih.nci.caarray.dataStorage.DataStorageFacade.java
Source

//======================================================================================
// Copyright 5AM Solutions Inc, Yale University
//
// Distributed under the OSI-approved BSD 3-Clause License.
// See http://ncip.github.com/caarray/LICENSE.txt for details.
//======================================================================================
package gov.nih.nci.caarray.dataStorage;

import gov.nih.nci.caarray.dataStorage.spi.DataStorage;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.Date;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.google.inject.Inject;
import com.google.inject.name.Named;

/**
 * Facade class to the data storage subsystem. Application code should inject and use this class to interact with the
 * data storage subsystem. In turn, this will delegate to appropriate storage engines as necessary.
 * 
 * @author dkokotov
 */
public class DataStorageFacade {
    private static final Logger LOG = Logger.getLogger(DataStorageFacade.class);

    private final Map<String, DataStorage> dataStorageEngines;
    private final String parsedDataStorage;
    private final String fileDataStorage;

    /**
     * Constructor.
     * 
     * @param dataStorageEngines a map of storage scheme name to the DataStorage implementations which can handle
     *            storing data using that scheme. The storage scheme name is matched against the scheme part of the data
     *            handle URI to locate the storage engine that can retrieve the data for that handle.
     * @param fileDataStorage the name of the storage scheme to use for storing raw file data. This is the scheme used
     *            when new file data needs to be stored; existing file data can use other schemes, as long as they have
     *            corresponding storage engine implementations. Must be present as a key in <i>dataStorageEngines</i>
     * @param parsedDataStorage the name of the storage scheme to use for storing parsed array data. This is the scheme
     *            used when new parsed data needs to be stored; existing parsed data can use other schemes, as long as
     *            they have corresponding storage engine implementations. Must be present as a key in
     *            <i>dataStorageEngines</i>
     */
    @Inject
    public DataStorageFacade(Map<String, DataStorage> dataStorageEngines,
            @Named(DataStorageModule.FILE_DATA_ENGINE) String fileDataStorage,
            @Named(DataStorageModule.PARSED_DATA_ENGINE) String parsedDataStorage) {
        this.dataStorageEngines = dataStorageEngines;
        this.fileDataStorage = fileDataStorage;
        this.parsedDataStorage = parsedDataStorage;
        Preconditions.checkArgument(dataStorageEngines.containsKey(fileDataStorage),
                "No data storage engine named " + fileDataStorage + " defined");
        Preconditions.checkArgument(dataStorageEngines.containsKey(parsedDataStorage),
                "No data storage engine named " + parsedDataStorage + " defined");
    }

    private DataStorage fileDataStorageEngine() {
        return this.dataStorageEngines.get(this.fileDataStorage);
    }

    private DataStorage parsedDataStorageEngine() {
        return this.dataStorageEngines.get(this.parsedDataStorage);
    }

    private DataStorage dataStorageEngine(URI handle) {
        return this.dataStorageEngines.get(handle.getScheme());
    }

    /**
     * Synchronize state of storate engines with state of caArray by removing from storage engines any data blocks that
     * are no longer referenced from any caArray entities. Only data blocks of a minimum threshold age are removed; this
     * is to handle data blocks which are referenced from entities created in an ongoing transaction that has not yet
     * been committed.
     * 
     * @param references the complete set of data handles that are currently held by entities in caArray. Any data
     *            blocks whose handles are not in this set and that are older than <i>minAge</i> will be removed.
     * @param minAge the minimum age for a data block to be eligible for removal, if it is unreferenced, in
     *            milliseconds. This should be at least as long as the maximum transaction time.
     */
    public void removeUnreferencedData(Set<URI> references, long minAge) {
        final Date creationThreshold = new Date(System.currentTimeMillis() - minAge);
        for (final Map.Entry<String, DataStorage> dsEntry : this.dataStorageEngines.entrySet()) {
            final Set<URI> dsReferences = Sets.newHashSet(Iterables.filter(references, new Predicate<URI>() {
                @Override
                public boolean apply(URI uri) {
                    return dsEntry.getKey().equals(uri.getScheme());
                }
            }));
            removeUnreferencedData(dsReferences, creationThreshold, dsEntry.getValue());
        }
    }

    private void removeUnreferencedData(Set<URI> references, Date creationThreshold, DataStorage ds) {
        final Set<URI> urisToRemove = Sets.newHashSet();
        final Iterable<StorageMetadata> existingMetadatas = ds.list();
        for (final StorageMetadata metadata : existingMetadatas) {
            if (metadata.getCreationTimestamp().before(creationThreshold)
                    && !references.contains(metadata.getHandle())) {
                urisToRemove.add(metadata.getHandle());
            }
        }
        ds.remove(urisToRemove);
    }

    /**
     * Return a <code>java.io.File</code> which will hold the data identified by the given handle. The client should
     * eventually call releaseFile() for this <code>handle</code>.
     * 
     * @param handle the handle referencing the data to be obtained.
     * @param compressed whether the file should hold the compressed (if true) or uncompressed (if false) view of the
     *            data. Compressed data uses GZip compression.
     * @return the <code>java.io.File</code> with the referenced data, possibly compressed.
     */
    public File openFile(URI handle, boolean compressed) {
        final DataStorage ds = dataStorageEngine(handle);
        if (ds == null) {
            throw new DataStoreException("No storage engine found for handle: " + handle);
        }

        return ds.openFile(handle, compressed);
    }

    /**
     * Releases the file that was previously obtained from the data storage subsystem for the data block with given
     * handle.
     * 
     * @param handle the handle referencing the data block that was obtained as a file via a call to openFile.
     * @param compressed whether the openFile call was to get compressed or uncompressed data.
     */
    public void releaseFile(URI handle, boolean compressed) {
        final DataStorage ds = dataStorageEngine(handle);
        if (ds == null) {
            throw new DataStoreException("No storage engine found for handle: " + handle);
        }
        ds.releaseFile(handle, compressed);
    }

    /**
     * Return a <code>java.io.InputStream</code> which can be used to read the data identified by the given handle. The
     * client should make sure to close the stream once it is done with it. Implementors may use a temporary file to
     * hold the data, but may also provide a stream that accesses the data from its original location.
     * 
     * @param handle the handle referencing the data to be obtained.
     * @param compressed whether the stream should hold the compressed (if true) or uncompressed (if false) view of the
     *            data. Compressed data uses GZip compression.
     * @return the <code>java.io.InputStream</code> with the referenced data, possibly compressed.
     */
    public InputStream openInputStream(URI handle, boolean compressed) {
        final DataStorage ds = dataStorageEngine(handle);
        if (ds == null) {
            throw new DataStoreException("No storage engine found for handle: " + handle);
        }

        return ds.openInputStream(handle, compressed);
    }

    /**
     * Write the uncompressed data identified by the given handle to the given OutputStream. The OutputStream is not
     * closed at the end.
     * 
     * @param handle the handle referencing the data to be obtained.
     * @param os the OutputStream to write the data to.
     * 
     * @throws IOException if there is an error writing to the stream or reading the data.
     */
    public void copyDataToStream(URI handle, OutputStream os) throws IOException {
        final InputStream in = openInputStream(handle, false);
        try {
            IOUtils.copy(in, os);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Add a new block of raw file data to the storage system. This should correspond to the contents of a CaArrayFile.
     * 
     * @param in a stream from which data will be read
     * @param compressed whether the data in the stream is compressed or not. If the data is compressed, GZip
     *            compression should be used.
     * @return a StorageMetadata object describing the added block of data, including a handle for later access to the
     *         data
     */
    public StorageMetadata addFile(InputStream in, boolean compressed) {
        return fileDataStorageEngine().add(in, compressed);
    }

    /**
     * Add a new block of parsed data to the storage system. This should correspond to the contents of a
     * AbstractDataColumn.
     * 
     * @param in a stream from which data will be read
     * @param compressed whether the data in the stream is compressed or not. If the data is compressed, GZip
     *            compression should be used.
     * @return a StorageMetadata object describing the added block of data, including a handle for later access to the
     *         data
     */
    public StorageMetadata addParsed(InputStream in, boolean compressed) {
        return parsedDataStorageEngine().add(in, compressed);
    }

    /**
     * Appends file data to an existing block in the storage system. If no handle is given, a new block is created.
     * The client must call completeFile() after the last chunk has been added.
     * 
     * @param handle the handle referencing the data to be appended to.
     * @param in a stream from which data will be read.
     * @return a StorageMetadata object describing the added block of data, must include a handle for later access to
     *         the data and the size of the partially uploaded file.
     */
    public StorageMetadata addFileChunk(URI handle, InputStream in) {
        return fileDataStorageEngine().addChunk(handle, in);
    }

    /**
     * When a file is added in chunks, this method must be called after the final chunk is added.  This will perform
     * any necessary processing after all the chunks have been put together.
     * 
     * @param handle the handle referencing the data.
     * @return a StorageMetadata object describing the block of data, must include a handle for later access to the
     *         data and the compressed/uncompressed sizes of the file.
     */
    public StorageMetadata finalizeChunkedFile(URI handle) {
        return fileDataStorageEngine().finalizeChunkedFile(handle);
    }
}