org.mda.bcb.tcgagsdata.create.ProcessFile.java Source code

Java tutorial

Introduction

Here is the source code for org.mda.bcb.tcgagsdata.create.ProcessFile.java

Source

/*
TcgaGSData Copyright 2014, 2015, 2016 University of Texas MD Anderson Cancer Center
    
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.mda.bcb.tcgagsdata.create;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.codec.digest.DigestUtils;
import org.mda.bcb.tcgagsdata.GSStringUtils;
import org.mda.bcb.tcgagsdata.TcgaGSData;

/**
 * Combines the {@code matrix_data.tsv} files found below the converted directory into a single
 * in-memory matrix and writes the combined result as tab-separated files.
 *
 * <p>{@link #process(Double)} drives the whole run: it selects the input files
 * ({@code <converted>/<disease>/<type>/<platform>/<level>/matrix_data.tsv}), sizes and fills
 * {@link #mCombinedData}, and then writes {@code gene_list.tsv}, one
 * {@code matrix_data_<md5prefix>.tsv} per two-character MD5 prefix, {@code disease_sample.tsv}
 * and, when the gene-name mixin supplies probe maps, a probe map file.
 *
 * <p>All files are read and written as ISO-8859-1.
 *
 * @author tdcasasent
 */
public class ProcessFile {
    /** Character set used for every file this class reads or writes. */
    private static final Charset FILE_CHARSET = StandardCharsets.ISO_8859_1;
    /** Column separator of all input and output files. */
    private static final String TAB = "\t";
    /** Value of type/platform/level that accepts every directory at that depth. */
    private static final String WILDCARD = "*";
    /** Entry name that is always ignored while scanning directories. */
    private static final String SKIPPED_NAME = ".DS_Store";

    /** First dimension of {@link #mCombinedData}; see {@link #countBarcodesAndGenes(TreeMap)}. */
    protected int M_BARCODES;
    /** Second dimension of {@link #mCombinedData}: number of distinct row labels in the inputs. */
    protected int M_GENES;
    /** Combined values, indexed {@code [sample column][gene index]}. */
    public double[][] mCombinedData;
    /** Row label to gene index (second index of {@link #mCombinedData}). */
    public HashMap<String, Integer> mGeneEqMap = new HashMap<>();
    /** Sample names in column order (first index of {@link #mCombinedData}). */
    public ArrayList<String> mSampleList = new ArrayList<>();
    /** One list per processed file: the disease name followed by that file's samples. */
    public ArrayList<ArrayList<String>> mDiseaseToSampleLists = new ArrayList<>();
    /** Column of {@link #mCombinedData} at which the next file's samples start. */
    public int mCurrentColumn = 0;
    //
    public String mConvertedDir = null;
    public String mCombinedDir = null;
    public String mDataDir = null;
    public String mType = null;
    public String mPlatform = null;
    public String mLevel = null;
    //
    protected String mIdPath = null;

    /**
     * Stores the configuration for a run and prints the program version.
     *
     * @param theConvertedDir root directory that is scanned for input files
     * @param theCombinedDir root directory below which the output directory is created
     * @param theType type directory name to accept, or {@code "*"} for all
     * @param thePlatform platform directory name to accept, or {@code "*"} for all
     * @param theLevel level directory name to accept, or {@code "*"} for all
     * @param theIdPath path handed to the gene-name mixin
     * @param theDataDir directory in which the probe map file is written
     */
    public ProcessFile(String theConvertedDir, String theCombinedDir, String theType, String thePlatform,
            String theLevel, String theIdPath, String theDataDir) {
        TcgaGSData.printVersion();
        mConvertedDir = theConvertedDir;
        mCombinedDir = theCombinedDir;
        mType = theType;
        mPlatform = thePlatform;
        mLevel = theLevel;
        mIdPath = theIdPath;
        mDataDir = theDataDir;
    }

    /**
     * Makes a first pass over the input files to size the combined matrix.
     *
     * <p>{@link #M_BARCODES} becomes the sum of the header column counts of all files. Every
     * header column is counted, including the leading row-label column, so the value is an upper
     * bound on the number of sample columns. {@link #M_GENES} becomes the number of distinct row
     * labels (text before the first tab) over all files.
     *
     * @param theFiles input files (keys) to scan
     * @throws FileNotFoundException if a file cannot be opened
     * @throws IOException if a file is empty or cannot be read
     */
    protected void countBarcodesAndGenes(TreeMap<File, String> theFiles) throws FileNotFoundException, IOException {
        M_BARCODES = 0;
        M_GENES = 0;
        TreeSet<String> genes = new TreeSet<>();
        for (File file : theFiles.keySet()) {
            long start = System.currentTimeMillis();
            int numOfCol;
            // Same charset as processFile, so the labels counted here match the labels stored later.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), FILE_CHARSET))) {
                String line = br.readLine();
                if (null == line) {
                    throw new IOException("Empty matrix file (no header line): " + file.getAbsolutePath());
                }
                numOfCol = line.split(TAB, -1).length;
                while (null != (line = br.readLine())) {
                    // A row without any tab is counted under its full text rather than failing.
                    int tabAt = line.indexOf('\t');
                    genes.add((tabAt < 0) ? line : line.substring(0, tabAt));
                }
            }
            M_BARCODES = M_BARCODES + numOfCol;
            long finish = System.currentTimeMillis();
            System.out.println("file " + ((finish - start) / 1000.0) + " seconds");
        }
        M_GENES = genes.size();
    }

    /**
     * Allocates {@link #mCombinedData} for the given files, fills every cell with the default
     * value and resets {@link #mCurrentColumn} to zero.
     *
     * @param theFiles input files used to size the matrix
     * @param theDefault value stored in every cell before data is loaded
     * @throws FileNotFoundException if a file cannot be opened
     * @throws IOException if a file is empty or cannot be read
     */
    protected void setupArray(TreeMap<File, String> theFiles, double theDefault)
            throws FileNotFoundException, IOException {
        countBarcodesAndGenes(theFiles);
        TcgaGSData.printWithFlag("ProcessFile::setupArray - M_BARCODES=" + M_BARCODES);
        TcgaGSData.printWithFlag("ProcessFile::setupArray - M_GENES=" + M_GENES);
        mCombinedData = new double[M_BARCODES][M_GENES];
        mCurrentColumn = 0;
        for (double[] sampleColumn : mCombinedData) {
            Arrays.fill(sampleColumn, theDefault);
        }
    }

    /**
     * Runs the complete combine: select files, load them, and write all output files.
     *
     * @param theDefault value for cells no input supplies; it is unboxed, so it must not be null
     * @throws IOException if reading or writing a file fails
     * @throws Exception if the configured platform/type is not recognized
     */
    public void process(Double theDefault) throws IOException, Exception {
        TcgaGSData.printWithFlag("ProcessFile::process - Start");
        TcgaGSData.printWithFlag("ProcessFile::process - get GeneNames_Mixin");
        GeneNames_Mixin gnm = getGeneNamesMixin();
        TcgaGSData.printWithFlag("ProcessFile::process - thePlatform=" + mPlatform);
        TreeMap<File, String> myFiles = getListOfFiles();
        TcgaGSData.printWithFlag("ProcessFile::process - filter files");
        myFiles = gnm.filterFiles(myFiles);
        TcgaGSData.printWithFlag("ProcessFile::process - setupArray");
        setupArray(myFiles, theDefault);
        TcgaGSData.printWithFlag("ProcessFile::process - M_BARCODES=" + M_BARCODES);
        TcgaGSData.printWithFlag("ProcessFile::process - M_GENES=" + M_GENES);
        TcgaGSData.printWithFlag("ProcessFile::process - scanAndProcessDirs");
        scanAndProcessDirs(myFiles);
        TcgaGSData.printWithFlag("ProcessFile::process - processList");
        gnm.processList();
        TreeMap<String, Integer> genes = gnm.mGeneEqTreeMap;
        TcgaGSData.printWithFlag("ProcessFile::process - writeGeneListFile");
        writeGeneListFile(genes);
        TcgaGSData.printWithFlag("ProcessFile::process - writeCombinedFiles");
        writeCombinedFiles(genes);
        TcgaGSData.printWithFlag("ProcessFile::process - writeDiseaseToSampleFile");
        writeDiseaseToSampleFile();
        // The probe map file is only written when the mixin provides a probe-to-gene map.
        if (null != gnm.mProbeToGeneSymbolMap) {
            TcgaGSData.printWithFlag("ProcessFile::process - writeMapFiles");
            writeMapFile(gnm);
        }
        TcgaGSData.printWithFlag("ProcessFile::process - Finish");
    }

    /**
     * Writes {@code <dataname>map.tsv} (lower-cased data name) into the data directory with one
     * row per probe: probe id, chromosome, probe location and gene id.
     *
     * @param theGnm mixin supplying the data name and the three probe maps
     * @throws IOException if the file cannot be written
     */
    protected void writeMapFile(GeneNames_Mixin theGnm) throws IOException {
        File mapFile = new File(mDataDir, theGnm.mDataName.toLowerCase() + "map.tsv");
        try (BufferedWriter bw = openWriter(mapFile)) {
            bw.write("probe_id\tchromosome\tprobe_location\tgene_id");
            bw.newLine();
            for (String probe : theGnm.mProbeToGeneSymbolMap.keySet()) {
                bw.write(probe);
                bw.write(TAB);
                bw.write(theGnm.mProbeToChromosomeMap.get(probe));
                bw.write(TAB);
                bw.write(theGnm.mProbeToGenomicLocationMap.get(probe));
                bw.write(TAB);
                bw.write(theGnm.mProbeToGeneSymbolMap.get(probe));
                // NOTE(review): each data row ends with a tab the header does not have. It is kept
                // so the file stays byte-identical for existing readers - confirm before removing.
                bw.write(TAB);
                bw.newLine();
            }
        }
    }

    /**
     * Returns the output directory, creating it if needed. Its name is the platform, or the
     * lower-cased type when the platform is the wildcard.
     *
     * @return the output directory below the combined directory
     */
    protected File getOutputDir() {
        String myDir = mPlatform;
        if (WILDCARD.equals(myDir)) {
            myDir = mType.toLowerCase();
        }
        File outputDir = new File(mCombinedDir, myDir);
        outputDir.mkdirs();
        return outputDir;
    }

    /**
     * Writes {@code gene_list.tsv}: a single line with all keys of the map, tab separated, in
     * the map's key order.
     *
     * @param theGeneEqList gene names to write (keys)
     * @throws IOException if the file cannot be written
     */
    protected void writeGeneListFile(TreeMap<String, Integer> theGeneEqList) throws IOException {
        File outputFile = new File(getOutputDir(), "gene_list.tsv");
        try (BufferedWriter bw = openWriter(outputFile)) {
            writeTabSeparatedLine(bw, theGeneEqList.keySet());
        }
    }

    /**
     * Writes {@code disease_sample.tsv}: one tab-separated line per entry of
     * {@link #mDiseaseToSampleLists}.
     *
     * @throws IOException if the file cannot be written
     */
    protected void writeDiseaseToSampleFile() throws IOException {
        File outputFile = new File(getOutputDir(), "disease_sample.tsv");
        try (BufferedWriter bw = openWriter(outputFile)) {
            for (ArrayList<String> outputList : mDiseaseToSampleLists) {
                writeTabSeparatedLine(bw, outputList);
            }
        }
    }

    /**
     * Writes {@code matrix_data_<prefix>.tsv}: a header of tab-prefixed sample names, then one
     * row per gene of the sublist holding the gene name and its value for every sample. NaN
     * cells are written as {@code NaN} and reported once per file in the log.
     *
     * @param theMd5prefix prefix that names the file
     * @param theGeneEqList gene name to gene index (second index of {@link #mCombinedData})
     * @param theGenesublist genes to write into this file
     * @throws IOException if the file cannot be written
     */
    protected void writeToMD5File(String theMd5prefix, TreeMap<String, Integer> theGeneEqList,
            ArrayList<String> theGenesublist) throws IOException {
        File outputFile = new File(getOutputDir(), "matrix_data_" + theMd5prefix + ".tsv");
        TcgaGSData.printWithFlag("ProcessFile::writeToMD5File - outputFile=" + outputFile.getName());
        try (BufferedWriter bw = openWriter(outputFile)) {
            // header: empty first cell, then one column per sample
            for (String sample : mSampleList) {
                bw.write(TAB);
                bw.write(sample);
            }
            bw.newLine();
            boolean foundNAN = false;
            final int sampleCount = mSampleList.size();
            for (String geneEq : theGenesublist) {
                bw.write(geneEq);
                int geneIndex = theGeneEqList.get(geneEq).intValue();
                for (int sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) {
                    double value = mCombinedData[sampleIndex][geneIndex];
                    bw.write(TAB);
                    // NaN never compares equal to anything, so it must be detected with isNaN.
                    if (Double.isNaN(value)) {
                        foundNAN = true;
                        bw.write("NaN");
                    } else {
                        bw.write(Double.toString(value));
                    }
                }
                bw.newLine();
            }
            if (foundNAN) {
                TcgaGSData.printWithFlag("Found NaN writing " + outputFile.getAbsolutePath());
            }
        }
    }

    /**
     * Groups the genes by the first two hex characters of the MD5 of their name and writes one
     * matrix file per group via {@link #writeToMD5File(String, TreeMap, ArrayList)}.
     *
     * @param theGeneEqList gene name to gene index
     * @throws IOException if a file cannot be written
     */
    protected void writeCombinedFiles(TreeMap<String, Integer> theGeneEqList) throws IOException {
        // Called for its side effect: the output directory exists even when there are no genes.
        getOutputDir();
        TcgaGSData.printWithFlag("mGeneEqMap.size()=" + theGeneEqList.size());
        TcgaGSData.printWithFlag("mSampleList.size()=" + mSampleList.size());
        HashMap<String, ArrayList<String>> hashprefixToGeneList = new HashMap<>();
        for (String geneEq : theGeneEqList.keySet()) {
            String md5prefix = DigestUtils.md5Hex(geneEq).substring(0, 2);
            ArrayList<String> genesublist = hashprefixToGeneList.get(md5prefix);
            if (null == genesublist) {
                genesublist = new ArrayList<>();
                hashprefixToGeneList.put(md5prefix, genesublist);
            }
            genesublist.add(geneEq);
        }
        for (Entry<String, ArrayList<String>> group : hashprefixToGeneList.entrySet()) {
            writeToMD5File(group.getKey(), theGeneEqList, group.getValue());
        }
    }

    /**
     * Creates the gene-name mixin for the configured platform (compared case-insensitively), or
     * the mutations mixin when no platform matches and the type is {@code mutations}.
     *
     * @return the mixin for this run, never null
     * @throws Exception if neither the platform nor the type is recognized
     */
    protected GeneNames_Mixin getGeneNamesMixin() throws Exception {
        GeneNames_Mixin result = null;
        TcgaGSData.printWithFlag("ProcessFile::processGeneEq - Start");
        if ("genome_wide_snp_6_hg19nocnvWxy".equalsIgnoreCase(mPlatform)) {
            result = new GN_SNP6(mGeneEqMap, mIdPath);
        } else if ("illuminahiseq_rnaseqv2_gene".equalsIgnoreCase(mPlatform)) {
            result = new GN_RNASeqV2(mGeneEqMap, mIdPath);
        } else if ("illuminahiseq_rnaseq_uncGeneRPKM".equalsIgnoreCase(mPlatform)) {
            result = new GN_RNASeq(mGeneEqMap, mIdPath);
        } else if ("humanmethylation450_level3".equalsIgnoreCase(mPlatform)) {
            result = new GN_Meth450(mGeneEqMap, mIdPath);
        } else if ("humanmethylation27_hg19Wxy".equalsIgnoreCase(mPlatform)) {
            result = new GN_Meth27(mGeneEqMap, mIdPath);
        } else if ("illuminahiseq_mirnaseq_isoform".equalsIgnoreCase(mPlatform)) {
            result = new GN_miRNASeq(mGeneEqMap, mIdPath);
        } else if ("mutations".equalsIgnoreCase(mType)) {
            result = new GN_Mutations(mGeneEqMap, mIdPath);
        } else {
            TcgaGSData.printWithFlag("Unrecognized directory " + mPlatform);
            throw new Exception("Unrecognized directory " + mPlatform);
        }
        TcgaGSData.printWithFlag("ProcessFile::processGeneEq - Finish");
        return result;
    }

    /**
     * Scans {@code <converted>/<disease>/<type>/<platform>/<level>} and returns, for every level
     * directory whose type, platform and level match the configuration (or its wildcard), the
     * path {@code matrix_data.tsv} inside it mapped to the disease directory name. The file's
     * existence is not checked here.
     *
     * @return candidate input files mapped to their disease name
     * @throws IllegalStateException if a directory on the way cannot be listed
     */
    protected TreeMap<File, String> getListOfFiles() {
        TreeMap<File, String> myFiles = new TreeMap<>();
        for (File diseaseDir : listChildren(new File(mConvertedDir))) {
            // every disease directory is accepted; only the skipped name is filtered out
            if (!isWantedDir(diseaseDir, WILDCARD)) {
                continue;
            }
            String disease = diseaseDir.getName();
            TcgaGSData.printWithFlag("ProcessFile::scanAndProcessDirs - disease=" + disease);
            for (File typeDir : listChildren(diseaseDir)) {
                if (!isWantedDir(typeDir, mType)) {
                    continue;
                }
                TcgaGSData.printWithFlag("ProcessFile::scanAndProcessDirs - type=" + typeDir.getName());
                for (File platformDir : listChildren(typeDir)) {
                    if (!isWantedDir(platformDir, mPlatform)) {
                        continue;
                    }
                    TcgaGSData.printWithFlag(
                            "ProcessFile::scanAndProcessDirs - platform=" + platformDir.getName());
                    for (File levelDir : listChildren(platformDir)) {
                        if (!isWantedDir(levelDir, mLevel)) {
                            continue;
                        }
                        TcgaGSData.printWithFlag(
                                "ProcessFile::scanAndProcessDirs - level=" + levelDir.getName());
                        myFiles.put(new File(levelDir, "matrix_data.tsv"), disease);
                    }
                }
            }
        }
        return myFiles;
    }

    /**
     * Loads every given file into the combined matrix, in map order.
     *
     * @param theFiles input file to disease name
     * @throws IOException if a file cannot be read
     */
    protected void scanAndProcessDirs(TreeMap<File, String> theFiles) throws IOException {
        TcgaGSData.printWithFlag("ProcessFile::scanAndProcessDirs - Start");
        for (Entry<File, String> entry : theFiles.entrySet()) {
            processFile(entry.getKey(), entry.getValue());
        }
        TcgaGSData.printWithFlag("ProcessFile::scanAndProcessDirs - Finished");
    }

    /**
     * Reads the {@code POINT_ONE_PERCENTILE} entry from {@code annotations.tsv} in the given
     * directory. If the entry occurs more than once, the last occurrence is used.
     *
     * @param theDir directory that may contain {@code annotations.tsv}
     * @return the value, or null when the file or the entry is absent
     * @throws IOException if the annotations file cannot be read
     * @throws NumberFormatException if the entry's value is not a number
     */
    protected Double getPointOnePercentile(File theDir) throws IOException {
        Double pointOnePercentile = null;
        File annotFile = new File(theDir, "annotations.tsv");
        if (annotFile.exists()) {
            // Decoded like every other file of this class instead of the UTF-8 default.
            List<String> lines = Files.readAllLines(Paths.get(annotFile.getAbsolutePath()), FILE_CHARSET);
            for (String line : lines) {
                if (line.startsWith("POINT_ONE_PERCENTILE")) {
                    pointOnePercentile = Double.valueOf(line.replace("POINT_ONE_PERCENTILE\t", ""));
                }
            }
        }
        if (null != pointOnePercentile) {
            TcgaGSData.printWithFlag("ProcessFile::getPointOnePercentile - found point one percentile for "
                    + theDir.getAbsolutePath());
            TcgaGSData.printWithFlag(
                    "ProcessFile::getPointOnePercentile - point one percentile " + pointOnePercentile);
        }
        return pointOnePercentile;
    }

    /**
     * Loads one matrix file: registers the header's samples for the disease, stores every data
     * row starting at {@link #mCurrentColumn}, then advances {@link #mCurrentColumn} by the
     * number of samples in the header.
     *
     * <p>The column always advances by the header's sample count, so a file that has a header
     * but no data rows still occupies its columns and later files stay aligned with
     * {@link #mSampleList}.
     *
     * @param theFile matrix file to read
     * @param theDisease disease name recorded for the file's samples
     * @throws IOException if the file is empty or cannot be read
     */
    protected void processFile(File theFile, String theDisease) throws IOException {
        TcgaGSData.printWithFlag("ProcessFile::processFile - Start");
        TcgaGSData
                .printWithFlag("ProcessFile::processFile - theFile.getAbsolutePath()=" + theFile.getAbsolutePath());
        Double pointOnePercentile = getPointOnePercentile(theFile.getParentFile());
        try (BufferedReader br = Files.newBufferedReader(Paths.get(theFile.getAbsolutePath()), FILE_CHARSET)) {
            // first line samples
            String line = br.readLine();
            if (null == line) {
                throw new IOException("Empty matrix file (no header line): " + theFile.getAbsolutePath());
            }
            TcgaGSData.printWithFlag("ProcessFile::processFile - populateSampleLists");
            String[] samples = GSStringUtils.afterTab(line).split(TAB, -1);
            populateSampleLists(theDisease, samples);
            // do rest
            int lineCnt = 0;
            TcgaGSData.printWithFlag("ProcessFile::processFile - before lines");
            while (null != (line = br.readLine())) {
                String geneEq = GSStringUtils.beforeTab(line);
                String data = GSStringUtils.afterTab(line);
                populateGeneAndData(geneEq, data.split(TAB, -1), mCurrentColumn, pointOnePercentile);
                lineCnt = lineCnt + 1;
                // progress output: a counter every 1000 rows, a line break every 10000
                if (0 == (lineCnt % 1000)) {
                    System.out.print(" " + lineCnt);
                }
                if (0 == (lineCnt % 10000)) {
                    TcgaGSData.printWithFlag("");
                }
            }
            TcgaGSData.printWithFlag(" -");
            TcgaGSData.printWithFlag("ProcessFile::processFile - after lines");
            mCurrentColumn = mCurrentColumn + samples.length;
        }
        TcgaGSData.printWithFlag("ProcessFile::processFile - Finish");
    }

    /**
     * Stores one data row. The gene gets the next free index in {@link #mGeneEqMap} if it is not
     * known yet, and the values are written to consecutive sample columns from {@code theStart}.
     *
     * @param theGeneEq row label of the gene
     * @param theData the row's values, one per sample
     * @param theStart first sample column to write
     * @param thePointOnePercentile offset for {@link #convertToPlusOne(double, double)}, or null
     *        to store the parsed values unchanged
     * @return the column after the last one written
     * @throws NumberFormatException if a value is neither a number nor a missing-value marker
     */
    protected int populateGeneAndData(String theGeneEq, String[] theData, int theStart,
            Double thePointOnePercentile) {
        Integer geneIndex = mGeneEqMap.get(theGeneEq);
        if (null == geneIndex) {
            geneIndex = mGeneEqMap.size();
            mGeneEqMap.put(theGeneEq, geneIndex);
        }
        int indexGE = geneIndex.intValue();
        int column = theStart;
        for (String value : theData) {
            mCombinedData[column][indexGE] = parseValue(value, thePointOnePercentile);
            column = column + 1;
        }
        return column;
    }

    /**
     * Re-bases a log2 value: undoes {@code log2(val + thePointOnePercentile)} to recover
     * {@code val} and returns {@code log2(val + 1)}.
     *
     * @param theValue value on the {@code log2(val + thePointOnePercentile)} scale
     * @param thePointOnePercentile offset that was added before the original log2
     * @return the value on the {@code log2(val + 1)} scale
     */
    protected double convertToPlusOne(double theValue, double thePointOnePercentile) {
        // theValue = Math.log(val + thePointOnePercentile) / Math.log(2.0);
        // theValue*Math.log(2.0) = Math.log(val + thePointOnePercentile)
        // Math.exp(theValue*Math.log(2.0)) = val + thePointOnePercentile
        // Math.exp(theValue*Math.log(2.0)) - thePointOnePercentile = val
        double value = Math.exp(theValue * Math.log(2.0)) - thePointOnePercentile;
        value = Math.log(value + 1) / Math.log(2.0);
        return value;
    }

    /**
     * Appends the samples to {@link #mSampleList} and records a new entry (disease followed by
     * the samples) in {@link #mDiseaseToSampleLists}.
     *
     * @param theDisease disease name of the file being loaded
     * @param theSamples sample names from the file's header
     */
    protected void populateSampleLists(String theDisease, String[] theSamples) {
        TcgaGSData.printWithFlag("ProcessFile::populateSampleLists - Start");
        List<String> samples = Arrays.asList(theSamples);
        mSampleList.addAll(samples);
        ArrayList<String> diseaseAndSamples = new ArrayList<>();
        diseaseAndSamples.add(theDisease);
        diseaseAndSamples.addAll(samples);
        mDiseaseToSampleLists.add(diseaseAndSamples);
        TcgaGSData.printWithFlag("ProcessFile::populateSampleLists - Finish");
    }

    /** Opens the file for writing with the charset used by this class. */
    private static BufferedWriter openWriter(File theFile) throws IOException {
        return Files.newBufferedWriter(Paths.get(theFile.getAbsolutePath()), FILE_CHARSET);
    }

    /** Writes the values separated by tabs, followed by a line break. */
    private static void writeTabSeparatedLine(BufferedWriter theWriter, Iterable<String> theValues)
            throws IOException {
        boolean first = true;
        for (String value : theValues) {
            if (first) {
                first = false;
            } else {
                theWriter.write(TAB);
            }
            theWriter.write(value);
        }
        theWriter.newLine();
    }

    /** Lists a directory, failing with the directory's path when it cannot be listed. */
    private static File[] listChildren(File theDir) {
        File[] children = theDir.listFiles();
        if (null == children) {
            // listFiles() returns null for a missing path, a non-directory or an I/O error
            throw new IllegalStateException("Unable to list directory: " + theDir.getAbsolutePath());
        }
        return children;
    }

    /** True for a directory, other than the skipped name, that equals the wanted name or wildcard. */
    private static boolean isWantedDir(File theDir, String theWanted) {
        String name = theDir.getName();
        return theDir.isDirectory() && !SKIPPED_NAME.equals(name)
                && (theWanted.equals(name) || WILDCARD.equals(theWanted));
    }

    /** Parses one cell: "NA", "NaN" (any case) and the empty string are missing values (NaN). */
    private double parseValue(String theValue, Double thePointOnePercentile) {
        if ("NA".equalsIgnoreCase(theValue) || "NaN".equalsIgnoreCase(theValue) || "".equals(theValue)) {
            return Double.NaN;
        }
        double parsed = Double.parseDouble(theValue);
        if (null != thePointOnePercentile) {
            parsed = convertToPlusOne(parsed, thePointOnePercentile);
        }
        return parsed;
    }
}