org.caleydo.data.importer.tcga.FirehoseProvider.java Source code

Introduction

Here is the source code for org.caleydo.data.importer.tcga.FirehoseProvider.java
Source

/*******************************************************************************
 * Caleydo - Visualization for Molecular Biology - http://caleydo.org
 * Copyright (c) The Caleydo Team. All rights reserved.
 * Licensed under the new BSD license, available at http://caleydo.org/license
 ******************************************************************************/
package org.caleydo.data.importer.tcga;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.lang.SystemUtils;
import org.caleydo.core.util.collection.Pair;
import org.caleydo.data.importer.tcga.model.TumorType;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Table;
import com.google.common.collect.TreeBasedTable;
import com.google.common.io.Closeables;

public final class FirehoseProvider {
    private static final Logger log = Logger.getLogger(FirehoseProvider.class.getName());
    private static final int LEVEL = 4;

    private final TumorType tumor;
    private final String tumorSample;

    private final Date analysisRun;
    private final Date dataRun;

    private final File tmpAnalysisDir;
    private final File tmpDataDir;

    private final Settings settings;

    private final Calendar relevantDate;

    FirehoseProvider(TumorType tumor, Date analysisRun, Date dataRun, Settings settings) {
        this.tumor = tumor;
        this.relevantDate = Calendar.getInstance();
        this.relevantDate.setTime(analysisRun);
        this.tumorSample = guessTumorSample(tumor, this.relevantDate, settings);
        this.analysisRun = analysisRun;
        this.dataRun = dataRun;
        this.settings = settings;
        String tmpDir = settings.getTemporaryDirectory();
        this.tmpAnalysisDir = createTempDirectory(tmpDir, analysisRun, tumor.getName());
        this.tmpDataDir = createTempDirectory(tmpDir, dataRun, tumor.getName());
    }

    /**
     * logic determining the tumor sample based on the analysis run
     *
     * @param tumor
     * @param date
     * @return
     */
    private static String guessTumorSample(TumorType tumor, Calendar cal, Settings settings) {

        if (settings.isAwgRun())
            return tumor.toString();

        if (cal.get(Calendar.YEAR) >= 2013 && tumor.toString().equalsIgnoreCase("SKCM"))
            return tumor + "-TM";
        if (cal.get(Calendar.YEAR) >= 2013 && tumor.toString().equalsIgnoreCase("LAML"))
            return tumor + "-TB";
        if (cal.get(Calendar.YEAR) >= 2013)
            return tumor + "-TP";
        return tumor.toString();
    }

    /**
     * @return
     */
    public boolean is2014Run() {
        return relevantDate.get(Calendar.YEAR) >= 2014;
    }

    public boolean isPost20140416() {
        return relevantDate.get(Calendar.YEAR) >= 2014 && relevantDate.get(Calendar.MONTH) >= Calendar.APRIL;
    }

    private String getFileName(String suffix) {
        return tumorSample + suffix;
    }

    private File createTempDirectory(String tmpOutputDirectory, Date run, String tumor) {
        String runId;
        if (run == null)
            runId = "unknown";
        else {
            runId = Settings.formatClean(run);
        }
        return new File(
                tmpOutputDirectory + runId + SystemUtils.FILE_SEPARATOR + tumor + SystemUtils.FILE_SEPARATOR);
    }

    private Pair<TCGAFileInfo, Boolean> findStandardSampledClusteredFile(EDataSetType type) {
        return Pair.make(extractAnalysisRunFile(".expclu.gct", type.getTCGAAbbr() + "_Clustering_CNMF", LEVEL),
                false);
    }

    public Pair<TCGAFileInfo, Boolean> findRPPAMatrixFile(boolean loadFullGenes) {
        return findStandardSampledClusteredFile(EDataSetType.RPPA);
    }

    public Pair<TCGAFileInfo, Boolean> findMethylationMatrixFile(boolean loadFullGenes) {
        return findStandardSampledClusteredFile(EDataSetType.methylation);
    }

    public Pair<TCGAFileInfo, Boolean> findmRNAMatrixFile(boolean loadFullGenes) {
        if (loadFullGenes) {
            TCGAFileInfo r;
            if (isPost20140416()) {
                r = extractDataRunFile(".medianexp.txt", "mRNA_Preprocess_Median", LEVEL);
            } else {
                r = extractAnalysisRunFile(getFileName(".medianexp.txt"), "mRNA_Preprocess_Median", LEVEL);
            }
            if (r != null)
                return Pair.make(r, true);
        }
        return findStandardSampledClusteredFile(EDataSetType.mRNA);
    }

    public Pair<TCGAFileInfo, Boolean> findmRNAseqMatrixFile(boolean loadFullGenes) {
        if (loadFullGenes) {
            TCGAFileInfo r = extractDataRunFile(".uncv2.mRNAseq_RSEM_normalized_log2.txt", "mRNAseq_Preprocess",
                    isPost20140416() ? 3 : LEVEL);
            if (r == null)
                r = extractDataRunFile(".uncv1.mRNAseq_RPKM_log2.txt", "mRNAseq_Preprocess",
                        isPost20140416() ? 3 : LEVEL);
            if (r == null)
                r = extractDataRunFile(".mRNAseq_RPKM_log2.txt", "mRNAseq_Preprocess",
                        isPost20140416() ? 3 : LEVEL);
            if (r != null) {
                r = filterColumns(r, findStandardSampledClusteredFile(EDataSetType.mRNAseq));
                return Pair.make(r, true);
            }
        }
        return findStandardSampledClusteredFile(EDataSetType.mRNAseq);
    }

    private TCGAFileInfo filterColumns(TCGAFileInfo full, Pair<TCGAFileInfo, Boolean> sampled) {
        File in = full.getFile();
        File out = new File(in.getParentFile(), "F" + in.getName());
        TCGAFileInfo r = new TCGAFileInfo(out, full.getArchiveURL(), full.getSourceFileName());
        if (out.exists() && !settings.isCleanCache())
            return r;
        assert full != null;
        if (sampled == null || sampled.getFirst() == null) {
            log.severe("can't filter the full gene file: " + in + " - sampled not found");
            return full;
        }
        // full: 1row, 2col
        // sampled: 3row, 3col
        Set<String> good = readGoodSamples(sampled.getFirst().getFile());
        if (good == null)
            return full;
        try (BufferedReader fin = new BufferedReader(new FileReader(in)); PrintWriter w = new PrintWriter(out)) {
            String[] header = fin.readLine().split("\t");
            BitSet bad = filterCols(header, good);
            {
                StringBuilder b = new StringBuilder();
                for (int i = bad.nextSetBit(0); i >= 0; i = bad.nextSetBit(i + 1))
                    b.append(' ').append(header[i]);
                log.warning("remove bad samples of " + in + ":" + b);
            }
            w.append(header[0]);
            for (int i = 1; i < header.length; ++i) {
                if (bad.get(i))
                    continue;
                w.append('\t').append(header[i]);
            }
            String line;
            while ((line = fin.readLine()) != null) {
                w.println();
                int t = line.indexOf('\t');
                w.append(line.subSequence(0, t));
                int prev = t;
                int i = 1;
                for (t = line.indexOf('\t', t + 1); t >= 0; t = line.indexOf('\t', t + 1), ++i) {
                    if (!bad.get(i))
                        w.append(line.subSequence(prev, t));
                    prev = t;
                }
                if (!bad.get(i))
                    w.append(line.subSequence(prev, line.length()));

            }
        } catch (IOException e) {
            log.log(Level.SEVERE, "can't filter full file: " + in, e);
        }

        return r;
    }

    /**
     * @param header
     * @param good
     * @return
     */
    private static BitSet filterCols(String[] header, Set<String> good) {
        BitSet r = new BitSet(header.length);
        for (int i = 0; i < header.length; ++i)
            if (!good.contains(header[i]))
                r.set(i);
        return r;
    }

    private static Set<String> readGoodSamples(File file) {
        // sampled: 3row, >=3col
        try (BufferedReader r = new BufferedReader(new FileReader(file))) {
            r.readLine();
            r.readLine();
            String line = r.readLine();
            String[] samples = line.split("\t");
            return ImmutableSet.copyOf(Arrays.copyOfRange(samples, 2, samples.length));
        } catch (IOException e) {
            log.log(Level.SEVERE, "can't read sample header from: " + file, e);
        }
        return null;
    }

    public Pair<TCGAFileInfo, Boolean> findmicroRNAMatrixFile(boolean loadFullGenes) {
        if (loadFullGenes) {
            TCGAFileInfo r = extractDataRunFile(".miR_expression.txt", "miR_Preprocess",
                    isPost20140416() ? 3 : LEVEL);
            if (r != null) {
                r = filterColumns(r, findStandardSampledClusteredFile(EDataSetType.microRNA));
                return Pair.make(r, true);
            }
        }
        return findStandardSampledClusteredFile(EDataSetType.microRNA);
    }

    public Pair<TCGAFileInfo, Boolean> findmicroRNAseqMatrixFile(boolean loadFullGenes) {
        if (loadFullGenes) {
            TCGAFileInfo r = extractAnalysisRunFile(getFileName(".uncv2.miRseq_RSEM_normalized_log2.txt"),
                    "miRseq_Preprocess", isPost20140416() ? 3 : LEVEL);
            if (r == null)
                r = extractAnalysisRunFile(getFileName(".miRseq_RPKM_log2.txt"), "miRseq_Preprocess", LEVEL);
            if (r != null) {
                r = filterColumns(r, findStandardSampledClusteredFile(EDataSetType.microRNA));
                return Pair.make(r, true);
            }
        }
        return findStandardSampledClusteredFile(EDataSetType.microRNAseq);
    }

    public TCGAFileInfo findHiearchicalGrouping(EDataSetType type) {
        return extractAnalysisRunFile(getFileName(".allclusters.txt"), type.getTCGAAbbr() + "_Clustering_Consensus",
                LEVEL);
    }

    public TCGAFileInfo findCNMFGroupingFile(EDataSetType type) {
        return extractAnalysisRunFile(".membership.txt", type.getTCGAAbbr() + "_Clustering_CNMF", LEVEL);
    }

    public TCGAFileInfo findCopyNumberFile() {
        return extractAnalysisRunFile("all_thresholded.by_genes.txt", "CopyNumber_Gistic2", LEVEL);
    }

    public TCGAFileInfo findClinicalDataFile() {
        return extractDataRunFile(".clin.merged.txt", "Merge_Clinical", 1);
    }

    public TCGAFileInfo findMutSigReport() {
        return extractAnalysisRunFile(getFileName(".sig_genes.txt"), "MutSigNozzleReportCV", LEVEL);
    }

    public Pair<TCGAFileInfo, Integer> findMutationFile() {
        int startColumn = 8;
        TCGAFileInfo mutationFile = null;
        if (relevantDate.get(Calendar.YEAR) < 2013) { // test only for the <= 2012
            mutationFile = extractAnalysisRunFile(getFileName(".per_gene.mutation_counts.txt"),
                    "Mutation_Significance", LEVEL);

            if (mutationFile == null)
                mutationFile = extractAnalysisRunFile(getFileName(".per_gene.mutation_counts.txt"), "MutSigRun2.0",
                        LEVEL);
        }
        if (mutationFile == null) {
            // TODO always the -TP version
            TCGAFileInfo maf = null;
            if (!this.settings.isAwgRun()) {
                maf = extractAnalysisRunFile(tumor + "-TP.final_analysis_set.maf", "MutSigNozzleReport2.0", LEVEL);
            } else {
                maf = extractAnalysisRunFile(tumor + ".final_analysis_set.maf", "MutSigNozzleReport2.0", LEVEL);
            }
            if (maf != null) {
                return Pair.make(
                        new TCGAFileInfo(parseMAF(maf.getFile()), maf.getArchiveURL(), maf.getSourceFileName()), 1);
            }
        }
        return Pair.make(mutationFile, startColumn);
    }

    /**
     * @return
     */
    public String getReportURL() {
        return settings.getReportUrl(analysisRun, tumor);
    }

    private TCGAFileInfo extractAnalysisRunFile(String fileName, String pipelineName, int level) {
        return extractFile(fileName, pipelineName, level, true, false);
    }

    private TCGAFileInfo extractDataRunFile(String fileName, String pipelineName, int level) {
        return extractFile(fileName, pipelineName, level, false, true);
    }

    private TCGAFileInfo extractFile(String fileName, String pipelineName, int level, boolean isAnalysisRun,
            boolean hasTumor) {
        Date id = isAnalysisRun ? analysisRun : dataRun;

        String label = "unknown";
        // extract file to temp directory and return path to file
        URL url;
        try {
            if (isAnalysisRun)
                url = settings.getAnalysisURL(id, tumor, tumorSample, pipelineName, level);
            else
                url = settings.getDataURL(id, tumor, tumorSample, pipelineName, level);
            String urlString = url.getPath();
            label = urlString.substring(urlString.lastIndexOf('/') + 1, urlString.length());
            File outputDir = new File(isAnalysisRun ? tmpAnalysisDir : tmpDataDir, label);
            outputDir.mkdirs();

            return extractFileFromTarGzArchive(url, fileName, outputDir, hasTumor);
        } catch (MalformedURLException e) {
            log.log(Level.SEVERE, "invalid url generated from: " + id + " " + tumor + " " + tumorSample + " "
                    + pipelineName + " " + level);
            return null;
        }
    }

    private TCGAFileInfo extractFileFromTarGzArchive(URL inUrl, String fileToExtract, File outputDirectory,
            boolean hasTumor) {
        log.info(inUrl + " download and extract: " + fileToExtract);
        File targetFile = new File(outputDirectory, fileToExtract);

        // use cached
        if (targetFile.exists() && !settings.isCleanCache()) {
            log.fine(inUrl + " cache hit");
            return new TCGAFileInfo(targetFile, inUrl, fileToExtract);
        }

        File notFound = new File(outputDirectory, fileToExtract + "-notfound");
        if (notFound.exists() && !settings.isCleanCache()) {
            log.warning(inUrl + " marked as not found");
            return null;
        }

        String alternativeName = fileToExtract;
        if (hasTumor) {
            alternativeName = "/" + tumor.getBaseName() + fileToExtract;
            fileToExtract = "/" + tumor + fileToExtract;
        }

        TarArchiveInputStream tarIn = null;
        OutputStream out = null;
        try {
            InputStream in = new BufferedInputStream(inUrl.openStream());

            // ok we have the file
            tarIn = new TarArchiveInputStream(new GZIPInputStream(in));

            // search the correct entry
            ArchiveEntry act = tarIn.getNextEntry();
            while (act != null && !act.getName().endsWith(fileToExtract)
                    && !act.getName().endsWith(alternativeName)) {
                act = tarIn.getNextEntry();
            }
            if (act == null) // no entry found
                throw new FileNotFoundException("no entry named: " + fileToExtract + " found");

            byte[] buf = new byte[4096];
            int n;
            targetFile.getParentFile().mkdirs();
            // use a temporary file to recognize if we have aborted between run
            String tmpFile = targetFile.getAbsolutePath() + ".tmp";
            out = new BufferedOutputStream(new FileOutputStream(tmpFile));
            while ((n = tarIn.read(buf, 0, 4096)) > -1)
                out.write(buf, 0, n);
            out.close();
            Files.move(new File(tmpFile).toPath(), targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);

            log.info(inUrl + " extracted " + fileToExtract);
            return new TCGAFileInfo(targetFile, inUrl, fileToExtract);
        } catch (FileNotFoundException e) {
            log.log(Level.WARNING, inUrl + " can't extract" + fileToExtract + ": file not found", e);
            // file was not found, create a marker to remember this for quicker checks
            notFound.getParentFile().mkdirs();
            try {
                notFound.createNewFile();
            } catch (IOException e1) {
                log.log(Level.WARNING, inUrl + " can't create not-found marker", e);
            }
            return null;
        } catch (Exception e) {
            log.log(Level.SEVERE, inUrl + " can't extract" + fileToExtract + ": " + e.getMessage(), e);
            return null;
        } finally {
            Closeables.closeQuietly(tarIn);
            Closeables.closeQuietly(out);
        }
    }

    private static File parseMAF(File maf) {

        File out = new File(maf.getParentFile(), "P" + maf.getName());
        if (out.exists())
            return out;
        log.fine(maf.getAbsolutePath() + " parsing maf file");
        final String TAB = "\t";

        try (BufferedReader reader = Files.newBufferedReader(maf.toPath(), Charset.forName("UTF-8"))) {
            List<String> header = Arrays.asList(reader.readLine().split(TAB));
            int geneIndex = header.indexOf("Hugo_Symbol");
            int sampleIndex = header.indexOf("Tumor_Sample_Barcode");
            // gene x sample x mutated
            Table<String, String, Boolean> mutated = TreeBasedTable.create();
            String line = null;
            while ((line = reader.readLine()) != null) {
                String[] columns = line.split(TAB);
                mutated.put(columns[geneIndex], columns[sampleIndex], Boolean.TRUE);
            }

            File tmp = new File(out.getParentFile(), out.getName() + ".tmp");
            PrintWriter w = new PrintWriter(tmp);
            w.append("Hugo_Symbol");
            List<String> cols = new ArrayList<>(mutated.columnKeySet());
            for (String sample : cols) {
                w.append(TAB).append(sample);
            }
            w.println();
            Set<String> rows = mutated.rowKeySet();
            for (String gene : rows) {
                w.append(gene);
                for (String sample : cols) {
                    w.append(TAB).append(mutated.contains(gene, sample) ? '1' : '0');
                }
                w.println();
            }
            w.close();
            Files.move(tmp.toPath(), out.toPath(), StandardCopyOption.REPLACE_EXISTING);

            log.fine(maf.getAbsolutePath() + " parsed maf file stats: " + mutated.size() + " " + rows.size() + " "
                    + cols.size());
            return out;
        } catch (IOException e) {
            log.log(Level.SEVERE, maf.getAbsolutePath() + " maf parsing error: " + e.getMessage(), e);
        }
        return null;
    }

    public static void main(String[] args) {
        File file = new File(
                "/home/alexsb/Dropbox/Caleydo/data/ccle/CCLE_hybrid_capture1650_hg19_NoCommonSNPs_CDS_2012.05.07.maf");
        file = parseMAF(file);

    }

    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append("FirehoseProvider[");
        builder.append(tumor);
        builder.append("/");
        builder.append(tumorSample);
        builder.append("@");
        builder.append(Settings.format(analysisRun));
        builder.append(",");
        builder.append(Settings.format(dataRun));
        builder.append("]");
        return builder.toString();
    }

}