org.mskcc.cbio.importer.io.internal.FileUtilsImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.mskcc.cbio.importer.io.internal.FileUtilsImpl.java

Source

/** Copyright (c) 2012 Memorial Sloan-Kettering Cancer Center.
**
** This library is free software; you can redistribute it and/or modify it
** under the terms of the GNU Lesser General Public License as published
** by the Free Software Foundation; either version 2.1 of the License, or
** any later version.
**
** This library is distributed in the hope that it will be useful, but
** WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF
** MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  The software and
** documentation provided hereunder is on an "as is" basis, and
** Memorial Sloan-Kettering Cancer Center 
** has no obligations to provide maintenance, support,
** updates, enhancements or modifications.  In no event shall
** Memorial Sloan-Kettering Cancer Center
** be liable to any party for direct, indirect, special,
** incidental or consequential damages, including lost profits, arising
** out of the use of this software and its documentation, even if
** Memorial Sloan-Kettering Cancer Center 
** has been advised of the possibility of such damage.  See
** the GNU Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public License
** along with this library; if not, write to the Free Software Foundation,
** Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
**/

// package
package org.mskcc.cbio.importer.io.internal;

// imports
import org.mskcc.cbio.importer.Config;
import org.mskcc.cbio.importer.CaseIDs;
import org.mskcc.cbio.importer.FileUtils;
import org.mskcc.cbio.importer.Converter;
import org.mskcc.cbio.importer.model.ImportDataRecord;
import org.mskcc.cbio.importer.model.PortalMetadata;
import org.mskcc.cbio.importer.model.DataMatrix;
import org.mskcc.cbio.importer.model.DatatypeMetadata;
import org.mskcc.cbio.importer.model.CaseListMetadata;
import org.mskcc.cbio.cgds.scripts.NormalizeExpressionLevels;
import org.mskcc.cbio.importer.model.CancerStudyMetadata;
import org.mskcc.cbio.importer.model.DataSourcesMetadata;
import org.mskcc.cbio.importer.util.MetadataUtils;
import org.mskcc.cbio.importer.util.Shell;

import org.mskcc.cbio.liftover.Hg18ToHg19;
import org.mskcc.cbio.oncotator.OncotateTool;
import org.mskcc.cbio.mutassessor.MutationAssessorTool;

import org.apache.commons.io.*;
import org.apache.commons.codec.digest.DigestUtils;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.commons.compress.compressors.gzip.GzipUtils;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

import org.springframework.beans.factory.annotation.Value;

import java.io.File;
import java.io.PrintWriter;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ByteArrayInputStream;

import java.lang.reflect.Constructor;

import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.zip.GZIPInputStream;

/**
 * Class which implements the FileUtils interface.
 */
class FileUtilsImpl implements org.mskcc.cbio.importer.FileUtils {

    // buffer size (bytes) used by the unzip method; never reassigned, so final
    private static final int BUFFER = 2048;

    // our logger
    private static final Log LOG = LogFactory.getLog(FileUtilsImpl.class);

    // ref to config, set once via constructor
    private Config config;

    // path to the liftOver executable, injected from the liftover_binary property
    private String liftoverBinary;

    @Value("${liftover_binary}")
    public void setLiftOverBinary(String liftoverBinary) {
        this.liftoverBinary = liftoverBinary;
    }

    // returns the liftOver binary location resolved to a canonical path
    public String getLiftOverBinary() {
        return MetadataUtils.getCanonicalPath(this.liftoverBinary);
    }

    // path to the liftOver chain file, injected from the liftover_chain_file property
    private String liftoverChain;

    @Value("${liftover_chain_file}")
    public void setLiftOverChain(String liftoverChain) {
        this.liftoverChain = liftoverChain;
    }

    // returns the liftOver chain file location resolved to a canonical path
    public String getLiftOverChain() {
        return MetadataUtils.getCanonicalPath(this.liftoverChain);
    }

    /**
     * Constructor: stores the given importer configuration.
     *
     * @param config Config importer configuration reference
     */
    public FileUtilsImpl(Config config) {
        this.config = config;
    }

    /**
     * Computes the MD5 digest for the given file.
     * Returns the 32 digit hexadecimal.
     *
     * @param file File the file to digest
     * @return String 32-character hex digest (empty string only if digest fails silently — it does not; exceptions propagate)
     * @throws Exception on I/O failure opening or reading the file
     */
    @Override
    public String getMD5Digest(File file) throws Exception {

        if (LOG.isInfoEnabled()) {
            LOG.info("getMD5Digest(): " + file.getCanonicalPath());
        }

        String toReturn = "";
        InputStream is = org.apache.commons.io.FileUtils.openInputStream(file);
        try {
            // stream the whole file through the digest
            toReturn = DigestUtils.md5Hex(is);
        } finally {
            // always release the stream, even if digesting throws
            IOUtils.closeQuietly(is);
        }

        // outta here
        return toReturn;
    }

    /**
     * Reads the precomputed md5 digest out of a .md5 file (firehose).
     * Assumes the file only contains one line with the checksum,
     * formatted as "&lt;digest&gt; &lt;filename&gt;".
     *
     * @param file File the .md5 file to read
     * @return String the checksum in uppercase hex, or "" if no checksum line was found
     * @throws Exception on I/O failure reading the file
     */
    @Override
    public String getPrecomputedMD5Digest(File file) throws Exception {

        if (LOG.isInfoEnabled()) {
            LOG.info("getPrecomputedMD5Digest(): " + file.getCanonicalPath());
        }

        String toReturn = "";
        LineIterator it = org.apache.commons.io.FileUtils.lineIterator(file);
        try {
            while (it.hasNext()) {
                // split once per line instead of twice
                String[] parts = it.nextLine().split(" ");
                if (parts.length == 2) {
                    toReturn = parts[0].toUpperCase();
                    // the file is documented to contain a single checksum line; stop at the first
                    break;
                }
            }
        } finally {
            LineIterator.closeQuietly(it);
        }

        // outta here
        return toReturn;
    }

    /**
     * Makes a directory, including parent directories if necessary.
     *
     * @param directory File the directory to create
     * @throws Exception if the directory (or a parent) cannot be created
     */
    @Override
    public void makeDirectory(File directory) throws Exception {

        // delegates to commons-io; unlike File.mkdirs(), this throws on failure
        org.apache.commons.io.FileUtils.forceMkdir(directory);
    }

    /**
     * Deletes a directory recursively.
     *
     * @param directory File the directory to delete, including all contents
     * @throws Exception if deletion fails
     */
    @Override
    public void deleteDirectory(File directory) throws Exception {

        // delegates to commons-io recursive delete
        org.apache.commons.io.FileUtils.deleteDirectory(directory);
    }

    /**
     * Lists all files in a given directory and its subdirectories.
     *
     * @param directory File the directory to search
     * @param extensions String[] file extensions to match
     * @param recursive boolean true to descend into subdirectories
     * @return Collection<File> matching files
     * @throws Exception on I/O failure
     */
    @Override
    public Collection<File> listFiles(File directory, String[] extensions, boolean recursive) throws Exception {

        return org.apache.commons.io.FileUtils.listFiles(directory, extensions, recursive);
    }

    /**
     * Returns the contents of the datafile as specified by ImportDataRecord
     * in an DataMatrix.  May return null if there is a problem reading the file.
     *
     * @param importDataRecord ImportDataRecord record describing the datafile
     * @return DataMatrix matrix built from the file contents
     * @throws Exception on I/O failure
     */
    @Override
    public DataMatrix getFileContents(ImportDataRecord importDataRecord) throws Exception {

        if (LOG.isInfoEnabled()) {
            LOG.info("getFileContents(): " + importDataRecord);
        }

        // resolve the path to the datafile (override file may be in play)
        String fileCanonicalPath = importDataRecord.getCanonicalPathToData();

        if (LOG.isInfoEnabled()) {
            LOG.info("getFileContents(): processing file: " + fileCanonicalPath);
        }

        // open the raw stream; compressed data is routed through readContent()
        InputStream rawStream = org.apache.commons.io.FileUtils.openInputStream(new File(fileCanonicalPath));
        InputStream fileContents = GzipUtils.isCompressedFilename(fileCanonicalPath.toLowerCase())
                ? readContent(importDataRecord, rawStream)
                : rawStream;

        return getDataMatrix(fileContents);
    }

    /**
     * Get the case list from the staging file.
     *
     * @param caseIDs CaseIDs used to recognize and normalize case ids
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param stagingFilename String
     * @return List<String> unique case ids (empty if the staging file is missing)
     * @throws Exception on I/O failure
     */
    @Override
    public List<String> getCaseListFromStagingFile(CaseIDs caseIDs, PortalMetadata portalMetadata,
            CancerStudyMetadata cancerStudyMetadata, String stagingFilename) throws Exception {

        if (LOG.isInfoEnabled()) {
            LOG.info("getCaseListFromStagingFile(): " + stagingFilename);
        }

        // a set gives us de-duplication for free
        HashSet<String> uniqueCaseIDs = new HashSet<String>();

        File stagingFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), stagingFilename);
        // sanity check - nothing to do without a staging file
        if (!stagingFile.exists()) {
            return new ArrayList<String>();
        }

        org.apache.commons.io.LineIterator rows = org.apache.commons.io.FileUtils.lineIterator(stagingFile);
        try {
            boolean headerPending = true;
            int mafCaseIDColumnIndex = 0;
            while (rows.hasNext()) {
                List<String> columns = Arrays.asList(rows.nextLine().split(Converter.VALUE_DELIMITER));
                if (headerPending) {
                    headerPending = false;
                    // MAF files carry case ids in a dedicated column
                    mafCaseIDColumnIndex = columns.indexOf(Converter.MUTATION_CASE_ID_COLUMN_HEADER);
                    if (mafCaseIDColumnIndex == -1) {
                        // not a MAF: the header row itself holds the case ids
                        for (String candidate : columns) {
                            if (caseIDs.isTumorCaseID(candidate)) {
                                uniqueCaseIDs.add(caseIDs.convertCaseID(candidate));
                            }
                        }
                        break;
                    }
                    continue;
                }
                // data row: the value at the MAF case-id column is a case id
                String candidate = columns.get(mafCaseIDColumnIndex);
                if (caseIDs.isTumorCaseID(candidate)) {
                    uniqueCaseIDs.add(caseIDs.convertCaseID(candidate));
                }
            }
        } finally {
            rows.close();
        }

        return new ArrayList<String>(uniqueCaseIDs);
    }

    /**
     * Creates a temporary file with the given contents.
     *
     * @param filename String name of the file to create in the temp directory
     * @param fileContent String contents to write (overwrites any existing file)
     * @return File the created temp file
     * @throws Exception on I/O failure
     */
    @Override
    public File createTmpFileWithContents(String filename, String fileContent) throws Exception {

        String tmpDirectoryPath = org.apache.commons.io.FileUtils.getTempDirectoryPath();
        File tmpFile = org.apache.commons.io.FileUtils.getFile(tmpDirectoryPath, filename);
        org.apache.commons.io.FileUtils.writeStringToFile(tmpFile, fileContent, false);
        return tmpFile;
    }

    /**
     * Creates (or overwrites) the given file with the given contents. Filename
     * is canonical path/filename.
     *
     * @param filename String canonical path/filename of the file to write
     * @param fileContent String contents to write
     * @return File the created file
     * @throws Exception on I/O failure
     */
    @Override
    public File createFileWithContents(String filename, String fileContent) throws Exception {

        File targetFile = org.apache.commons.io.FileUtils.getFile(filename);
        org.apache.commons.io.FileUtils.writeStringToFile(targetFile, fileContent, false);
        return targetFile;
    }

    /**
     * Downloads the given file specified via url to the given urlDestination.
     * Gzip-compressed sources are downloaded to a temp file, decompressed,
     * then moved to the destination.
     *
     * @param urlSource String source URL
     * @param urlDestination String destination URL (its file part is used as the local path)
     * @throws Exception if arguments are null/empty or on I/O failure
     */
    @Override
    public void downloadFile(String urlSource, String urlDestination) throws Exception {

        // sanity check - reject null or empty arguments
        if (urlSource == null || urlSource.length() == 0 || urlDestination == null
                || urlDestination.length() == 0) {
            throw new IllegalArgumentException("downloadFile(): urlSource or urlDestination argument is null...");
        }

        // URLs for given parameters
        URL source = new URL(urlSource);
        URL destination = new URL(urlDestination);

        // we have a compressed file
        if (GzipUtils.isCompressedFilename(urlSource)) {
            // download to temp destination first
            File tempDestinationFile = org.apache.commons.io.FileUtils.getFile(
                    org.apache.commons.io.FileUtils.getTempDirectory(), new File(source.getFile()).getName());
            if (LOG.isInfoEnabled()) {
                LOG.info("downloadFile(), " + urlSource + ", this may take a while...");
            }
            org.apache.commons.io.FileUtils.copyURLToFile(source, tempDestinationFile);
            if (LOG.isInfoEnabled()) {
                LOG.info("downloadFile(), gunzip: we have compressed file, decompressing...");
            }
            // decompress the file (gunzip helper defined elsewhere in this class)
            gunzip(tempDestinationFile.getCanonicalPath());
            if (LOG.isInfoEnabled()) {
                LOG.info("downloadFile(), gunzip complete...");
            }
            // move temp/decompressed file to final destination, replacing any existing file
            File destinationFile = new File(destination.getFile());
            if (destinationFile.exists()) {
                org.apache.commons.io.FileUtils.forceDelete(destinationFile);
            }
            org.apache.commons.io.FileUtils.moveFile(
                    org.apache.commons.io.FileUtils
                            .getFile(GzipUtils.getUncompressedFilename(tempDestinationFile.getCanonicalPath())),
                    destinationFile);

            // lets cleanup after ourselves - remove the downloaded compressed file
            tempDestinationFile.delete();
        }
        // uncompressed file, download directly to urlDestination
        else {
            if (LOG.isInfoEnabled()) {
                LOG.info("downloadFile(), " + urlSource + ", this may take a while...");
            }
            org.apache.commons.io.FileUtils.copyURLToFile(source,
                    org.apache.commons.io.FileUtils.getFile(destination.getFile()));
        }
    }

    /**
     * Returns a line iterator over the given file.
     *
     * @param urlFile String URL whose file part locates the file to read
     * @return LineIterator over the file's lines (caller must close it)
     * @throws Exception on I/O failure
     */
    @Override
    public LineIterator getFileContents(String urlFile) throws Exception {
        return org.apache.commons.io.FileUtils.lineIterator(new File(new URL(urlFile).getFile()));
    }

    /**
     * Method which writes the cancer study metadata file.
     *
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param numCases int number of cases, substituted into the description
     * @throws Exception on I/O failure
     */
    @Override
    public void writeCancerStudyMetadataFile(PortalMetadata portalMetadata, CancerStudyMetadata cancerStudyMetadata,
            int numCases) throws Exception {

        File metaFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), cancerStudyMetadata.getCancerStudyMetadataFilename());
        if (LOG.isInfoEnabled()) {
            LOG.info("writeMetadataFile(), meta file: " + metaFile);
        }
        PrintWriter writer = new PrintWriter(org.apache.commons.io.FileUtils.openOutputStream(metaFile, false));
        // close in a finally block so the writer is released even if a getter/replace throws
        try {
            writer.print("type_of_cancer: " + cancerStudyMetadata.getTumorType() + "\n");
            writer.print("cancer_study_identifier: " + cancerStudyMetadata + "\n");
            // fall back to the tumor type name when the study has no explicit name
            String name = (cancerStudyMetadata.getName().length() > 0) ? cancerStudyMetadata.getName()
                    : cancerStudyMetadata.getTumorTypeMetadata().getName();
            name = name.replaceAll(CancerStudyMetadata.TUMOR_TYPE_NAME_TAG,
                    cancerStudyMetadata.getTumorTypeMetadata().getName());
            writer.print("name: " + name + "\n");
            // substitute case count and tumor type tags into the description
            String description = cancerStudyMetadata.getDescription();
            description = description.replaceAll(CancerStudyMetadata.NUM_CASES_TAG, Integer.toString(numCases));
            description = description.replaceAll(CancerStudyMetadata.TUMOR_TYPE_TAG,
                    cancerStudyMetadata.getTumorTypeMetadata().getType());
            description = description.replaceAll(CancerStudyMetadata.TUMOR_TYPE_NAME_TAG,
                    cancerStudyMetadata.getTumorTypeMetadata().getName());
            writer.print("description: " + description + "\n");
            // citation and pmid are optional
            if (cancerStudyMetadata.getCitation().length() > 0) {
                writer.print("citation: " + cancerStudyMetadata.getCitation() + "\n");
            }
            if (cancerStudyMetadata.getPMID().length() > 0) {
                writer.print("pmid: " + cancerStudyMetadata.getPMID() + "\n");
            }
            writer.flush();
        } finally {
            writer.close();
        }
    }

    /**
     * Method which writes a metadata file for the
     * given DatatypeMetadata.  DataMatrix may be null.
     *
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param datatypeMetadata DatatypeMetadata
     * @param dataMatrix DataMatrix may be null; when present its gene/case counts are
     *        substituted into the profile description
     * @throws Exception on I/O failure
     */
    @Override
    public void writeMetadataFile(PortalMetadata portalMetadata, CancerStudyMetadata cancerStudyMetadata,
            DatatypeMetadata datatypeMetadata, DataMatrix dataMatrix) throws Exception {

        File metaFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), datatypeMetadata.getMetaFilename());
        if (LOG.isInfoEnabled()) {
            LOG.info("writeMetadataFile(), meta file: " + metaFile);
        }
        PrintWriter writer = new PrintWriter(org.apache.commons.io.FileUtils.openOutputStream(metaFile, false));
        // close in a finally block so the writer is released even if a getter/replace throws
        try {
            writer.print("cancer_study_identifier: " + cancerStudyMetadata + "\n");
            writer.print("genetic_alteration_type: " + datatypeMetadata.getMetaGeneticAlterationType() + "\n");
            String stableID = datatypeMetadata.getMetaStableID();
            stableID = stableID.replaceAll(DatatypeMetadata.CANCER_STUDY_TAG, cancerStudyMetadata.toString());
            writer.print("stable_id: " + stableID + "\n");
            writer.print("show_profile_in_analysis_tab: " + datatypeMetadata.getMetaShowProfileInAnalysisTab() + "\n");
            String profileDescription = datatypeMetadata.getMetaProfileDescription();
            if (dataMatrix != null) {
                // only a matrix can supply gene/case counts for the description
                profileDescription = profileDescription.replaceAll(DatatypeMetadata.NUM_GENES_TAG,
                        Integer.toString(dataMatrix.getGeneIDs().size()));
                profileDescription = profileDescription.replaceAll(DatatypeMetadata.NUM_CASES_TAG,
                        Integer.toString(dataMatrix.getCaseIDs().size()));
            }
            profileDescription = profileDescription.replaceAll(DatatypeMetadata.TUMOR_TYPE_TAG,
                    cancerStudyMetadata.getTumorType());
            writer.print("profile_description: " + profileDescription + "\n");
            writer.print("profile_name: " + datatypeMetadata.getMetaProfileName() + "\n");
            writer.flush();
        } finally {
            writer.close();
        }
    }

    /**
     * Creates a staging file with contents from the given DataMatrix.
     *
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param datatypeMetadata DatatypeMetadata
     * @param dataMatrix DataMatrix whose contents are written to the staging file
     * @throws Exception on I/O failure
     */
    @Override
    public void writeStagingFile(PortalMetadata portalMetadata, CancerStudyMetadata cancerStudyMetadata,
            DatatypeMetadata datatypeMetadata, DataMatrix dataMatrix) throws Exception {

        // staging file - substitute the cancer study tag into the configured filename
        String stagingFilename = datatypeMetadata.getStagingFilename();
        stagingFilename = stagingFilename.replaceAll(DatatypeMetadata.CANCER_STUDY_TAG,
                cancerStudyMetadata.toString());
        File stagingFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), stagingFilename);

        if (LOG.isInfoEnabled()) {
            LOG.info("writingStagingFile(), staging file: " + stagingFile);
        }

        FileOutputStream out = org.apache.commons.io.FileUtils.openOutputStream(stagingFile, false);
        // close in a finally block so the stream is released even if write() throws
        try {
            dataMatrix.write(out);
        } finally {
            IOUtils.closeQuietly(out);
        }

        // meta file
        if (datatypeMetadata.requiresMetafile()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("writingStagingFile(), creating metadata file for staging file: " + stagingFile);
            }
            writeMetadataFile(portalMetadata, cancerStudyMetadata, datatypeMetadata, dataMatrix);
        }
    }

    /**
     * Creates a staging file for mutation data (and meta file) with contents from the given DataMatrix.
     * This is called when the mutation file needs to be run through the Oncotator and Mutation Assessor Tools.
     *
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param datatypeMetadata DatatypeMetadata
     * @param dataMatrix DataMatrix
     * @throws Exception on I/O failure
     */
    @Override
    public void writeMutationStagingFile(PortalMetadata portalMetadata, CancerStudyMetadata cancerStudyMetadata,
            DatatypeMetadata datatypeMetadata, DataMatrix dataMatrix) throws Exception {

        // we only have a data matrix at this point; write its contents to a temp file for oncotator
        File oncotatorInputFile = org.apache.commons.io.FileUtils
                .getFile(org.apache.commons.io.FileUtils.getTempDirectory(), "oncotatorInputFile");
        FileOutputStream out = org.apache.commons.io.FileUtils.openOutputStream(oncotatorInputFile);
        // close in a finally block so the stream is released even if write() throws
        try {
            dataMatrix.write(out);
        } finally {
            IOUtils.closeQuietly(out);
        }

        // output should be the path/name of staging file
        String stagingFilename = datatypeMetadata.getStagingFilename();
        stagingFilename = stagingFilename.replaceAll(DatatypeMetadata.CANCER_STUDY_TAG,
                cancerStudyMetadata.toString());
        File stagingFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), stagingFilename);

        // run the temp file through oncotator; output lands in the staging file
        oncotateMAF(FileUtils.FILE_URL_PREFIX + oncotatorInputFile.getCanonicalPath(),
                FileUtils.FILE_URL_PREFIX + stagingFile.getCanonicalPath());

        // clean up the temp input
        if (oncotatorInputFile.exists()) {
            org.apache.commons.io.FileUtils.forceDelete(oncotatorInputFile);
        }

        // meta file
        if (datatypeMetadata.requiresMetafile()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("writingMutationStagingFile(), creating metadata file for staging file: " + stagingFile);
            }
            writeMetadataFile(portalMetadata, cancerStudyMetadata, datatypeMetadata, dataMatrix);
        }
    }

    /**
     * Creates a z-score staging file from the given dependencies.  It assumes that the
     * dependency - staging files have already been created.  This code also assumes
     * that the dependencies are ordered by cna, then expression.
     *
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param datatypeMetadata DatatypeMetadata
     * @param dependencies DatatypeMetadata[] exactly two entries: [0] cna, [1] expression
     * @throws Exception on I/O failure or bad dependency count
     */
    @Override
    public void writeZScoresStagingFile(PortalMetadata portalMetadata, CancerStudyMetadata cancerStudyMetadata,
            DatatypeMetadata datatypeMetadata, DatatypeMetadata[] dependencies) throws Exception {

        // sanity check - exactly cna + expression are required
        if (dependencies.length != 2) {
            throw new IllegalArgumentException(
                    "writeZScoresStagingFile(), datatypeMetadatas.length != 2, aborting...");
        }

        // check for existence of dependencies; silently return if either is missing
        if (LOG.isInfoEnabled()) {
            LOG.info("writeZScoresStagingFile(), checking for existence of dependencies: "
                    + Arrays.asList(dependencies));
        }
        File cnaFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), dependencies[0].getStagingFilename());
        if (!cnaFile.exists()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("writeZScoresStagingFile(), cannot find cna file dependency: "
                        + cnaFile.getCanonicalPath());
            }
            return;
        }

        File expressionFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), dependencies[1].getStagingFilename());
        if (!expressionFile.exists()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("writeZScoresStagingFile(), cannot find expression file dependency: "
                        + expressionFile.getCanonicalPath());
            }
            return;
        }

        // destination for the computed z-scores
        File zScoresFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), datatypeMetadata.getStagingFilename());

        // call NormalizeExpressionLevels with (cna, expression, output) paths
        String[] args = { cnaFile.getCanonicalPath(), expressionFile.getCanonicalPath(),
                zScoresFile.getCanonicalPath() };
        if (LOG.isInfoEnabled()) {
            LOG.info("writingZScoresStagingFlie(), calling NormalizeExpressionLevels: " + Arrays.toString(args));
        }
        try {
            NormalizeExpressionLevels.driver(args);
        } catch (RuntimeException e) {
            // normalization failed: log, remove any partial output, and bail out
            if (LOG.isInfoEnabled()) {
                LOG.info("writingZScoresStagingFlie(), exception thrown by NormalizeExpressionLevels: "
                        + e.getMessage() + ", aborting...");
            }
            if (zScoresFile.exists()) {
                org.apache.commons.io.FileUtils.forceDelete(zScoresFile);
            }
            return;
        }

        // meta file is only written when normalization succeeded
        if (datatypeMetadata.requiresMetafile()) {
            if (LOG.isInfoEnabled()) {
                LOG.info("writingZScoresStagingFile(), creating metadata file for staging file: "
                        + zScoresFile.getCanonicalPath());
            }
            writeMetadataFile(portalMetadata, cancerStudyMetadata, datatypeMetadata, null);
        }
    }

    /**
     * Returns an override file (if it exists) for the given portal & cancer study.  The override in this case
     * is the override file that a DataMatrix is created from.
     *
     * Null is returned if an override file is not found.
     *
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param filename String
     * @return File the override file, or null when none exists
     * @throws Exception on I/O failure
     */
    @Override
    public File getOverrideFile(PortalMetadata portalMetadata, CancerStudyMetadata cancerStudyMetadata,
            String filename) throws Exception {

        File overrideFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getOverrideDirectory(),
                cancerStudyMetadata.getStudyPath(), filename);
        if (overrideFile.exists()) {
            return overrideFile;
        }
        return null;
    }

    /**
     * If it exists, moves an override file into the proper
     * location in the given portal's staging area.
     *
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param overrideFilename String
     * @param stagingFilename String
     * @throws Exception on I/O failure
     */
    @Override
    public void applyOverride(PortalMetadata portalMetadata, CancerStudyMetadata cancerStudyMetadata,
            String overrideFilename, String stagingFilename) throws Exception {

        // nothing to do when no override file exists
        File overrideFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getOverrideDirectory(),
                cancerStudyMetadata.getStudyPath(), overrideFilename);
        if (!overrideFile.exists()) {
            return;
        }

        File stagingFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), stagingFilename);

        if (LOG.isInfoEnabled()) {
            LOG.info("applyOverride(), override file exists for " + stagingFile.getCanonicalPath() + ": "
                    + overrideFile.getCanonicalPath());
        }

        // copy override (file or whole directory) into the staging area
        if (overrideFile.isFile()) {
            org.apache.commons.io.FileUtils.copyFile(overrideFile, stagingFile);
        } else {
            org.apache.commons.io.FileUtils.copyDirectory(overrideFile, stagingFile);
        }
    }

    /**
     * Create a case list file from the given case list metadata file.
     *
     * @param portalMetadata PortalMetadata
     * @param cancerStudyMetadata CancerStudyMetadata
     * @param caseListMetadata CaseListMetadata
     * @param caseList String[] case ids to write
     * @throws Exception on I/O failure
     */
    @Override
    public void writeCaseListFile(PortalMetadata portalMetadata, CancerStudyMetadata cancerStudyMetadata,
            CaseListMetadata caseListMetadata, String[] caseList) throws Exception {

        File caseListFile = org.apache.commons.io.FileUtils.getFile(portalMetadata.getStagingDirectory(),
                cancerStudyMetadata.getStudyPath(), "case_lists", caseListMetadata.getCaseListFilename());

        if (LOG.isInfoEnabled()) {
            LOG.info("writeCaseListFile(), case list file: " + caseListFile.getCanonicalPath());
        }
        PrintWriter writer = new PrintWriter(org.apache.commons.io.FileUtils.openOutputStream(caseListFile, false));
        // close in a finally block so the writer is released even if a getter/replace throws
        try {
            writer.print("cancer_study_identifier: " + cancerStudyMetadata + "\n");
            String stableID = caseListMetadata.getMetaStableID();
            stableID = stableID.replaceAll(DatatypeMetadata.CANCER_STUDY_TAG, cancerStudyMetadata.toString());
            writer.print("stable_id: " + stableID + "\n");
            writer.print("case_list_name: " + caseListMetadata.getMetaCaseListName() + "\n");
            String caseListDescription = caseListMetadata.getMetaCaseListDescription();
            caseListDescription = caseListDescription.replaceAll(DatatypeMetadata.NUM_CASES_TAG,
                    Integer.toString(caseList.length));
            writer.print("case_list_description: " + caseListDescription + "\n");
            writer.print("case_list_category: " + caseListMetadata.getMetaCaseListCategory() + "\n");
            // case ids are written delimiter-separated on a single line
            writer.print("case_list_ids: ");
            for (String caseID : caseList) {
                writer.print(caseID + Converter.VALUE_DELIMITER);
            }
            writer.println();
            writer.flush();
        } finally {
            writer.close();
        }
    }

    /**
     * Runs all MAFs for the given dataSourcesMetadata through
     * the Oncotator and OMA tools.
     *
     * @param dataSourcesMetadata DataSourcesMetadata
     * @throws Exception on I/O failure
     */
    @Override
    public void oncotateAllMAFs(DataSourcesMetadata dataSourcesMetadata) throws Exception {

        // walk the datasource download directory and process every MAF found
        String[] mafExtensions = new String[] { DatatypeMetadata.MAF_FILE_EXT };
        File downloadDirectory = new File(dataSourcesMetadata.getDownloadDirectory());
        for (File maf : listFiles(downloadDirectory, mafExtensions, true)) {
            // stage the maf contents in a temp file; oncotateMAF writes back over the original maf
            File tmpInputFile = org.apache.commons.io.FileUtils
                    .getFile(org.apache.commons.io.FileUtils.getTempDirectory(), "oncotatorInputFile");
            org.apache.commons.io.FileUtils.copyFile(maf, tmpInputFile);
            oncotateMAF(FileUtils.FILE_URL_PREFIX + tmpInputFile.getCanonicalPath(),
                    FileUtils.FILE_URL_PREFIX + maf.getCanonicalPath());
            // remove the temp input
            org.apache.commons.io.FileUtils.forceDelete(tmpInputFile);
        }
    }

    /**
     * Runs a MAF file through the Oncotator and OMA tools.
     *
     * If the MAF appears to be on genome build 36 / hg18, it is first lifted
     * over to hg19 before annotation.
     *
     * @param inputMAFURL String (file:// URL of the MAF to annotate)
     * @param outputMAFURL String (file:// URL the annotated MAF is written to)
     * @throws Exception
     */
    @Override
    public void oncotateMAF(String inputMAFURL, String outputMAFURL) throws Exception {

        // sanity check
        if (inputMAFURL == null || inputMAFURL.length() == 0 || outputMAFURL == null
                || outputMAFURL.length() == 0) {
            // message previously named the wrong method ("oncotateMAFdownloadFile")
            throw new IllegalArgumentException(
                    "oncotateMAF(): inputMAFURL or outputMAFURL argument is null...");
        }

        URL inputMAF = new URL(inputMAFURL);
        URL outputMAF = new URL(outputMAFURL);

        // determine if we have to call liftover by peeking at the first data row
        boolean cleanOncotatorInputFile = false;
        File oncotatorInputFile = new File(inputMAF.getFile());
        boolean requiresLiftover;
        org.apache.commons.io.LineIterator it = org.apache.commons.io.FileUtils.lineIterator(oncotatorInputFile);
        try {
            it.nextLine(); // skip header
            String[] parts = it.nextLine().split("\t");
            // column index 3 is assumed to hold the genome build -- TODO confirm MAF layout
            requiresLiftover = parts[3].contains("36") || parts[3].equals("hg18");
        } finally {
            // previously the iterator was only closed on the liftover path,
            // leaking the file handle for hg19 MAFs
            it.close();
        }

        if (requiresLiftover) {
            File liftoverInputFile = org.apache.commons.io.FileUtils
                    .getFile(org.apache.commons.io.FileUtils.getTempDirectory(), "liftoverInputFile");
            org.apache.commons.io.FileUtils.copyFile(oncotatorInputFile, liftoverInputFile);
            oncotatorInputFile = new File(inputMAF.getFile());
            // call lift over: reads the temp copy, writes hg19 coordinates
            // back into the original input location
            if (LOG.isInfoEnabled()) {
                LOG.info("oncotateMAF(), calling Hg18ToHg19...");
            }
            Hg18ToHg19.driver(liftoverInputFile.getCanonicalPath(), oncotatorInputFile.getCanonicalPath(),
                    getLiftOverBinary(), getLiftOverChain());
            org.apache.commons.io.FileUtils.forceDelete(liftoverInputFile);
            cleanOncotatorInputFile = true;
        }

        // create a temp output file for the oncotator
        File oncotatorOutputFile = org.apache.commons.io.FileUtils
                .getFile(org.apache.commons.io.FileUtils.getTempDirectory(), "oncotatorOutputFile");
        // call oncotator
        if (LOG.isInfoEnabled()) {
            LOG.info("oncotateMAF(), calling OncotateTool...");
        }
        OncotateTool.driver(oncotatorInputFile.getCanonicalPath(), oncotatorOutputFile.getCanonicalPath(), true,
                true, true);
        // we call OMA here -
        // we use output from oncotator as input file
        if (LOG.isInfoEnabled()) {
            LOG.info("oncotateMAF(), calling MutationAssessorTool...");
        }
        File outputMAFFile = new File(outputMAF.getFile());
        outputMAFFile.createNewFile();
        MutationAssessorTool.driver(oncotatorOutputFile.getCanonicalPath(), outputMAFFile.getCanonicalPath(), false,
                true, true);

        // clean up temp files
        org.apache.commons.io.FileUtils.forceDelete(oncotatorOutputFile);
        if (cleanOncotatorInputFile)
            org.apache.commons.io.FileUtils.forceDelete(oncotatorInputFile);
    }

    /**
     * Copies the given portal's seg files (via scp) to the location used for
     * linking to IGV from the cBio Portal web site.
     *
     * @param portalMetadata PortalMetadata
     * @param datatypeMetadata DatatypeMetadata
     * @param remoteUserName String
     * @throws Exception
     */
    @Override
    public void copySegFiles(PortalMetadata portalMetadata, DatatypeMetadata datatypeMetadata,
            String remoteUserName) throws Exception {

        if (LOG.isInfoEnabled()) {
            LOG.info("copySegFiles()");
        }

        // check args (datatypeMetadata is dereferenced below, so guard it too)
        if (portalMetadata == null || datatypeMetadata == null || remoteUserName == null) {
            throw new IllegalArgumentException("portal, datatypeMetadata or remoteUserName must not be null");
        }

        // seg file location
        URL segFileLocation = portalMetadata.getIGVSegFileLinkingLocation();

        // we need this to determine location 
        Collection<DataSourcesMetadata> dataSourcesMetadata = config.getDataSourcesMetadata(Config.ALL);

        // iterate over all cancer studies
        for (CancerStudyMetadata cancerStudyMetadata : config.getCancerStudyMetadata(portalMetadata.getName())) {

            // lets determine if cancer study is in staging directory or studies directory
            String rootDirectory = MetadataUtils.getCancerStudyRootDirectory(portalMetadata, dataSourcesMetadata,
                    cancerStudyMetadata);

            if (rootDirectory == null) {
                if (LOG.isInfoEnabled()) {
                    // message previously said "loadStagingFiles()" (copy-paste bug)
                    LOG.info("copySegFiles(), cannot find root directory for study: " + cancerStudyMetadata
                            + " skipping...");
                }
                continue;
            }

            // construct staging filename for seg
            String sourceFilename = (rootDirectory + File.separator + cancerStudyMetadata.getStudyPath()
                    + File.separator + datatypeMetadata.getStagingFilename());
            sourceFilename = sourceFilename.replaceAll(DatatypeMetadata.CANCER_STUDY_TAG,
                    cancerStudyMetadata.toString());
            String destinationFilename = datatypeMetadata.getStagingFilename()
                    .replaceAll(DatatypeMetadata.CANCER_STUDY_TAG, cancerStudyMetadata.toString());

            // scp the seg file to the remote IGV linking location
            String[] command = new String[] { "scp", sourceFilename, remoteUserName + "@"
                    + segFileLocation.getHost() + ":" + segFileLocation.getFile() + destinationFilename };
            if (LOG.isInfoEnabled()) {
                LOG.info("executing: " + Arrays.asList(command));
            }
            if (Shell.exec(Arrays.asList(command), ".")) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("copy successful.");
                }
            } else if (LOG.isInfoEnabled()) {
                LOG.info("copy unsuccessful.");
            }
        }
    }

    /*
     * Given a gzip stream, decompresses it and returns an input stream to the
     * desired data file.  For tarballs (*.tar.gz) the archive is scanned for
     * the record's data file; otherwise the decompressed stream itself is
     * returned.  Returns null when a tarball contains no matching entry.
     *
     * @param importDataRecord ImportDataRecord
     * @param is InputStream
     * @return InputStream (positioned at the data file) or null
     */
    private InputStream readContent(ImportDataRecord importDataRecord, InputStream is) throws Exception {

        // decompress .gz file
        if (LOG.isInfoEnabled()) {
            LOG.info("readContent(), decompressing: " + importDataRecord.getCanonicalPathToData());
        }

        InputStream unzippedContent = new GzipCompressorInputStream(is);
        // plain .gz: the decompressed stream is the data file
        if (!importDataRecord.getCanonicalPathToData().toLowerCase().endsWith("tar.gz")) {
            return unzippedContent;
        }

        // tarball: scan for the entry containing the data file
        if (LOG.isInfoEnabled()) {
            LOG.info("readContent(), gzip file is a tarball, untarring");
        }
        // the target filename is loop-invariant, compute it once
        String dataFile = importDataRecord.getDataFilename();
        if (dataFile.contains(DatatypeMetadata.TUMOR_TYPE_TAG)) {
            dataFile = dataFile.replaceAll(DatatypeMetadata.TUMOR_TYPE_TAG,
                    importDataRecord.getTumorType().toUpperCase());
        }
        TarArchiveInputStream tis = new TarArchiveInputStream(unzippedContent);
        TarArchiveEntry entry;
        while ((entry = tis.getNextTarEntry()) != null) {
            if (entry.getName().contains(dataFile)) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("Processing tar-archive: " + importDataRecord.getDataFilename());
                }
                // stream is positioned at the matching entry's content
                return tis;
            }
        }

        // no matching entry found in the tarball
        return null;
    }

    /**
     * Helper function to create a DataMatrix from delimited stream content.
     *
     * The first line supplies column names; every subsequent line becomes a
     * data row.  Returns null if the stream has no header or no data rows.
     *
     * @param data InputStream
     * @return DataMatrix (or null when the stream lacks header or data)
     */
    private DataMatrix getDataMatrix(InputStream data) throws Exception {

        // iterate over all lines in the stream
        List<String> columnNames = null;
        List<LinkedList<String>> rowData = null;
        LineIterator it = IOUtils.lineIterator(data, null);
        try {
            int count = -1;
            while (it.hasNext()) {
                // first row is our column heading, create column vector
                // (parameterized construction replaces the former raw-type LinkedList)
                if (++count == 0) {
                    columnNames = new LinkedList<String>(
                            Arrays.asList(it.nextLine().split(Converter.VALUE_DELIMITER, -1)));
                }
                // all other rows are rows in the table
                else {
                    if (rowData == null) {
                        rowData = new LinkedList<LinkedList<String>>();
                    }
                    rowData.add(new LinkedList<String>(
                            Arrays.asList(it.nextLine().split(Converter.VALUE_DELIMITER, -1))));
                }
            }
        } finally {
            LineIterator.closeQuietly(it);
        }

        // problem reading from data?
        if (columnNames == null || rowData == null) {
            if (LOG.isInfoEnabled()) {
                LOG.info(
                        "getDataMatrix(), problem creating DataMatrix from file, data file probably missing data, returning null");
            }
            return null;
        }

        // made it here, we can create DataMatrix
        if (LOG.isInfoEnabled()) {
            LOG.info("creating new DataMatrix(), from file data");
        }

        // outta here
        return new DataMatrix(rowData, columnNames);
    }

    /**
     * Helper function to gunzip a file.  gzipFile param is a canonical path;
     * the output is written alongside it, named without the .gz extension.
     *
     * @param gzipFile String
     */
    private static void gunzip(String gzipFile) throws Exception {

        // output path is the input path minus its .gz suffix
        String uncompressedPath = GzipUtils.getUncompressedFilename(gzipFile);
        GZIPInputStream in = new GZIPInputStream(new FileInputStream(gzipFile));
        FileOutputStream out = null;

        try {
            out = new FileOutputStream(uncompressedPath);
            IOUtils.copy(in, out);
        } finally {
            // closeQuietly is null-safe, so no guard is needed on out
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(out);
        }
    }
}