org.bdval.cache.TableCache.java Source code

Java tutorial

Introduction

Here is the source code for org.bdval.cache.TableCache.java

Source

/*
 * Copyright (C) 2008-2010 Institute for Computational Biomedicine,
 *                         Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.bdval.cache;

import edu.cornell.med.icb.geo.tools.FullGeneList;
import edu.cornell.med.icb.geo.tools.GeneList;
import edu.mssm.crover.tables.ArrayTable;
import edu.mssm.crover.tables.InvalidColumnException;
import edu.mssm.crover.tables.Table;
import edu.mssm.crover.tables.TypeMismatchException;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Class to cache (read/write) Tables.
 *
 * @author Kevin Dorff
 */
public class TableCache {
    /**
     * The logger to use.
     */
    private static final Logger LOG = Logger.getLogger(TableCache.class);

    /**
     * The directory to store the tables to.
     */
    private String cacheDirectoryName;

    /**
     * Template used to create cached table filenames.
     */
    private static final String S_CACHED_FILENAME = "%s/cached-table-%s-%s-%d.bin";

    /**
     * Create a TableCache storing tables in the "current working directory".
     *
     * @throws IOException the cacheDirectory doesn't exist or isn't a directory
     *                     or can't be read from or written to.
     */
    public TableCache() throws IOException {
        this(new File("."));
    }

    /**
     * Create a TableCache with the cache located in the
     * cacheDirectory directory.
     *
     * @param cacheDirectory the directory in which to store the cached tables
     * @throws IOException the cacheDirectory doesn't exist or isn't a directory
     *                     or can't be read from or written to.
     */
    public TableCache(final File cacheDirectory) throws IOException {
        super();
        if (!cacheDirectory.exists()) {
            throw new IOException("Specified directory does not exist " + cacheDirectory.getCanonicalPath());
        }
        if (!cacheDirectory.isDirectory()) {
            throw new IOException("Specified file is not a directory " + cacheDirectory.getCanonicalPath());
        }
        if (!cacheDirectory.canRead()) {
            throw new IOException("Specified directory is not readable " + cacheDirectory.getCanonicalPath());
        }
        if (!cacheDirectory.canWrite()) {
            throw new IOException("Specified directory is not writable " + cacheDirectory.getCanonicalPath());
        }
        cacheDirectoryName = cacheDirectory.getCanonicalPath();
        if (!cacheDirectoryName.endsWith("/") && !cacheDirectoryName.endsWith("\\")) {
            cacheDirectoryName += "/";
        }
    }

    /**
     * Check if a table has been saved to the cache.
     *
     * @param splitId     The split id
     * @param splitType   The Split type
     * @param datasetName The dataset name
     * @return true if table a table with the specified paramters has been cached
     */
    public boolean isTableCached(final int splitId, final String splitType, final String datasetName) {
        final File cachedTableFile = getCachedTableFile(splitId, splitType, datasetName);
        final String filename;
        try {
            filename = cachedTableFile.getCanonicalPath();
        } catch (IOException e) {
            return false;
        }
        synchronized (filename.intern()) {
            return cachedTableFile.exists();
        }
    }

    /**
     * Remove a specific table from the cache.
     *
     * @param splitId     The split id
     * @param splitType   The Split type
     * @param datasetName The dataset name
     */
    public void clearFromCache(final int splitId, final String splitType, final String datasetName) {
        final File cachedTableFile = getCachedTableFile(splitId, splitType, datasetName);
        if (cachedTableFile.exists()) {
            if (!cachedTableFile.delete()) {
                LOG.warn("Unable to delete cache file " + cachedTableFile.getAbsolutePath());
            }
        }
    }

    /**
     * Retrieve a cached table from the cache. isTableCached should be called
     * before calling this to verify that the table is cached. Null will
     * be returned if the table wasn't in the cache or there was a problem
     * reading the table.
     *
     * @param splitId     The split id
     * @param splitType   The Split type
     * @param datasetName The dataset name
     * @return the table read from the cache (or null if it could not be read)
     */
    public Table getCachedTable(final int splitId, final String splitType, final String datasetName) {
        return getCachedTable(splitId, splitType, datasetName, null);
    }

    /**
     * Retrieve a cached table from the cache. isTableCached should be called
     * before calling this to verify that the table is cached. Null will
     * be returned if the table wasn't in the cache or there was a problem
     * reading the table.
     *
     * @param splitId        The split id
     * @param splitType      The Split type
     * @param datasetName    The dataset name
     * @param geneListFilter A gene list. If not null, the gene list is queried with each double
     * column identifier to determine if the identifier is contained in the gene list. Columns
     * that do not match the gene list are not loaded.
     * @return the table read from the cache (or null if it could not be read)
     */
    public Table getCachedTable(final int splitId, final String splitType, final String datasetName,
            final GeneList geneListFilter) {
        final File cachedTableFile = getCachedTableFile(splitId, splitType, datasetName);

        if (geneListFilter != null && !(geneListFilter instanceof FullGeneList)) {
            final ObjectSet<CharSequence> tableColumnIds = getTableColumnIds(splitId, splitType, datasetName);
            geneListFilter.calculateProbeSetSelection(tableColumnIds);
        }
        DataInputStream dataInput = null;
        try {
            dataInput = new DataInputStream(new FastBufferedInputStream(new FileInputStream(cachedTableFile)));
            final ArrayTable result = new ArrayTable();
            final int numberOfColumns = dataInput.readInt();
            LOG.info("Reading cached table with " + numberOfColumns + " columns");
            for (int i = 0; i < numberOfColumns; i++) {
                final String colType = dataInput.readUTF();
                final String colId = dataInput.readUTF();

                if ("s".equals(colType)) {
                    final int numStrings = dataInput.readInt();
                    resize(result, numStrings);
                    final int columnIndex = result.addColumn(colId, String.class);
                    for (int j = 0; j < numStrings; j++) {
                        result.appendObject(columnIndex, dataInput.readUTF());
                    }
                } else if ("d".equals(colType)) {
                    final int numDoubles = dataInput.readInt();
                    resize(result, numDoubles);
                    if (geneListFilter != null && !geneListFilter.isProbesetInList(colId)) {
                        // the column does not match the gene list. Skip this column
                        // we don't need to read these doubles, just skip them;
                        final int numBytes = Double.SIZE * numDoubles / 8;
                        final int actualBytes = dataInput.skipBytes(numBytes);
                        if (actualBytes != numBytes) {
                            LOG.warn("actual bytes skipped (" + actualBytes + ") does" + "not equal expected of "
                                    + numBytes);
                        }
                        continue;
                    }

                    final int columnIndex = result.addColumn(colId, double.class);
                    for (int j = 0; j < numDoubles; j++) {
                        result.appendDoubleValue(columnIndex, dataInput.readDouble());
                    }

                } else {
                    LOG.error("UNKNOWN COLUMN TYPE " + colType + " cannot read cached table from file "
                            + filenameOf(cachedTableFile));
                    return null;
                }
            }
            return result;
        } catch (IOException e) {
            LOG.error(e);
            return null;
        } catch (TypeMismatchException e) {
            LOG.error("TypeMismatchException adding data to Table " + filenameOf(cachedTableFile), e);
            return null;
        } finally {
            IOUtils.closeQuietly(dataInput);
        }
    }

    public ObjectSet<CharSequence> getTableColumnIds(final int splitId, final String splitType,
            final String datasetName) {
        final File cachedTableFile = getCachedTableFile(splitId, splitType, datasetName);
        final ObjectSet<CharSequence> result = new ObjectOpenHashSet<CharSequence>();
        DataInputStream dataInput = null;
        try {
            dataInput = new DataInputStream(new FastBufferedInputStream(new FileInputStream(cachedTableFile)));
            final int numberOfColumns = dataInput.readInt();
            LOG.info("Reading cached table with " + numberOfColumns + " columns");
            for (int i = 0; i < numberOfColumns; i++) {
                final String colType = dataInput.readUTF();
                final String colId = dataInput.readUTF();

                result.add(colId);

                if ("s".equals(colType)) {
                    final int numStrings = dataInput.readInt();

                    for (int j = 0; j < numStrings; j++) {
                        dataInput.readUTF();
                    }
                } else if ("d".equals(colType)) {
                    final int numDoubles = dataInput.readInt();
                    // we don't need to read these doubles, just skip them;
                    final int numBytes = Double.SIZE * numDoubles / 8;
                    final int actualBytes = dataInput.skipBytes(numBytes);
                    if (actualBytes != numBytes) {
                        LOG.warn("actual bytes skipped (" + actualBytes + ") does " + "not equal expected of "
                                + numBytes);
                    }
                } else {
                    LOG.error("UNKNOWN COLUMN TYPE " + colType + " cannot read cached table from file "
                            + filenameOf(cachedTableFile));
                    return null;
                }
            }
            return result;
        } catch (IOException e) {
            LOG.error("Error getting column ids", e);
            return null;
        } finally {
            IOUtils.closeQuietly(dataInput);
        }
    }

    private void resize(final ArrayTable result, final int numRows) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("Resizing table to " + numRows + 1);
        }
        result.setInitialSize(numRows + 1);
        result.setChunk(numRows + 1);
    }

    /**
     * Given the specified paramters, save the table to the cache.
     *
     * @param splitId     The split id
     * @param splitType   The Split type
     * @param datasetName The dataset name
     * @param table       the Table to save.
     */
    public void saveTableToCache(final int splitId, final String splitType, final String datasetName,
            final Table table) {
        DataOutputStream dataOutput = null;
        try {
            if (!checkTableConfiguration(table)) {
                return;
            }

            final int numColumns = table.getColumnNumber();

            final File cachedTableFile = getCachedTableFile(splitId, splitType, datasetName);
            dataOutput = new DataOutputStream(new FastBufferedOutputStream(new FileOutputStream(cachedTableFile)));

            // Write the number of columns
            dataOutput.writeInt(numColumns);
            LOG.info("Writing " + numColumns + " columns");
            int numWritten = 0;
            for (int i = 0; i < numColumns; i++) {
                // For each column write if it is a "d"ouble or "s"tring column
                final String id = table.getIdentifier(i);
                if (table.getType(i) == String.class) {
                    dataOutput.writeUTF("s");
                    dataOutput.writeUTF(id);
                    final String[] stringColumnData = table.getStrings(id);
                    final int numStrings = stringColumnData.length;
                    dataOutput.writeInt(numStrings);
                    for (final String stringColumnItem : stringColumnData) {
                        // Each String
                        dataOutput.writeUTF(stringColumnItem);
                    }
                    numWritten++;
                } else if (table.getType(i) == double.class) {
                    dataOutput.writeUTF("d");
                    dataOutput.writeUTF(id);
                    final double[] doubleColumnData = table.getDoubles(id);
                    final int numDoubles = doubleColumnData.length;
                    dataOutput.writeInt(numDoubles);
                    for (final double doubleColumnItem : doubleColumnData) {
                        // Each double
                        dataOutput.writeDouble(doubleColumnItem);
                    }
                    numWritten++;
                }
            }

            dataOutput.flush();

            LOG.info("Wrote " + numWritten + " columns");
            LOG.info("++ SAVED TABLE TO CACHE for split-id=" + splitId + ", split-type=" + splitType
                    + ", dataset-name=" + datasetName);
        } catch (IOException e) {
            LOG.error("Cannot cache table. ", e);
        } catch (InvalidColumnException e) {
            LOG.error("Invalid table data", e);
        } finally {
            IOUtils.closeQuietly(dataOutput);
        }
    }

    /**
     * Given the specified paramters, return the File that will
     * be used for by TableCache.
     *
     * @param splitId     the split id
     * @param splitType   the split type
     * @param datasetName the dataset name
     * @return the File for the TableCache
     */
    private File getCachedTableFile(final int splitId, final String splitType, final String datasetName) {
        final String cachedTableFilename = String.format(S_CACHED_FILENAME, cacheDirectoryName, datasetName,
                StringUtils.defaultString(splitType, "complete"), splitId);
        return new File(cachedTableFilename);
    }

    /**
     * Return the complete path filename for a file.
     *
     * @param file the file to get the filename of
     * @return the complete path filename for the file
     */
    private String filenameOf(final File file) {
        try {
            return file.getCanonicalPath();
        } catch (IOException e) {
            return "<error obtaining filename>";
        }
    }

    /**
     * Verify the table is configured appropriately,
     * two columns and only String and double columns.
     *
     * @param table the table to verify
     * @return true if the table checks out appropriately.
     */
    private boolean checkTableConfiguration(final Table table) {
        final int numColumns = table.getColumnNumber();
        if (numColumns <= 1) {
            LOG.error("TableCache only supports tables with at least two columns.");
        }
        for (int i = 0; i < numColumns; i++) {
            if (table.getType(i) != String.class && table.getType(i) != double.class) {
                LOG.error(
                        "TableCache only supports tables where the " + "columns are String.class or double.class");
                return false;
            }
        }
        return true;
    }
}