org.mskcc.juber.waltz.Waltz.java Source code

Java tutorial

Introduction

Here is the source code for org.mskcc.juber.waltz.Waltz.java

Source

/*******************************************************************************
 *
 * @author Juber Patel
 *
 *         Copyright (c) 2017 Innovation Lab, CMO, MSKCC.
 *
 *         This software was developed at the Innovation Lab, Center for
 *         Molecular Oncology,
 *         Memorial Sloan Kettering Cancer Center, New York, New York.
 *
 *         Licensed under the Apache License, Version 2.0 (the "License");
 *         you may not use this file except in compliance with the License.
 *         You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *         Unless required by applicable law or agreed to in writing, software
 *         distributed under the License is distributed on an "AS IS" BASIS,
 *         WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 *         implied.
 *         See the License for the specific language governing permissions and
 *         limitations under the License.
 *******************************************************************************/
package org.mskcc.juber.waltz;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.commons.io.FilenameUtils;

import com.google.common.base.Splitter;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;

/**
 * 
 */

/**
 * @author Juber Patel
 * 
 */
public class Waltz {
    // TODO compute genome size??
    // right now, hard coding the genome size that is used only for
    // reporting
    // excludes X, Y and MT
    public static final long genomeSize = 2881033286L;
    public static final int basePadding = 0;

    /**
     * @param args
     * @throws IOException
     * @throws Exception
     */
    public static void main(String[] args) throws IOException {
        // TODO Should this program be made single threaded?? That would be more
        // suitable for cluster run and it will also resolve some design issues.

        final String module = args[0];
        final int minimumMappingQuality = Integer.parseInt(args[1]);
        final String bamFile = args[2];
        final File referenceFastaFile = new File(args[3]);
        final File intervalsBedFile = new File(args[4]);
        String moduleArgument = null;
        if (args.length == 6) {
            moduleArgument = args[5];
        }

        // must not see args[] beyond this point

        final String bamIndexFile = FilenameUtils.removeExtension(bamFile) + ".bai";
        final String sampleName = FilenameUtils.removeExtension(new File(bamFile).getName());

        SamReaderFactory factory = SamReaderFactory.makeDefault();
        SamInputResource resource = SamInputResource.of(new File(bamFile)).index(new File(bamIndexFile));
        SamReader reader = factory.open(resource);
        SAMFileHeader header = reader.getFileHeader();
        // TODO implement this method properly! It is broken!
        int[] dummyInsertSize = estimateInsertSize(reader, 99.0);
        reader.close();

        // IntervalList[] inputIntervalLists = makeIntervalLists(chunkSize,
        // numberOfThreads, header);
        IntervalList[] inputIntervalLists = makeIntervalLists(intervalsBedFile, 1, header);
        WaltzOutput output = new WaltzOutput(sampleName);

        long start = System.currentTimeMillis();

        IntervalList intervalList = inputIntervalLists[0];
        WaltzWorker worker = new WaltzWorker(module, minimumMappingQuality, bamFile, bamIndexFile,
                referenceFastaFile, intervalList, moduleArgument, dummyInsertSize, output);

        // execute the worker
        worker.process();

        output.close();
        long time = System.currentTimeMillis() - start;
        System.out.println("Program finished in " + (time * 1.0) / 1000 + " seconds\n");

        /**
         * writeVariationFrequencies(insertions, sampleName +
         * "-insertions.txt");
         * writeVariationFrequencies(deletions, sampleName + "-deletions.txt");
         * writeVariationFrequencies(substituions,
         * sampleName + "-substitutions.txt");
         **/
    }

    /**
     * make given number of interval lists from the given bed file
     * 
     * @param targetBed
     * @param numberOfLists
     * @param header
     * @return
     * @throws IOException
     */
    private static IntervalList[] makeIntervalLists(File bedFile, int numberOfLists, SAMFileHeader header)
            throws IOException {
        boolean addChr = hasChr(header);

        IntervalList[] intervalLists = new IntervalList[numberOfLists];
        for (int i = 0; i < intervalLists.length; i++) {
            intervalLists[i] = new IntervalList(header);
        }

        Splitter tabSplitter = Splitter.on('\t');

        BufferedReader reader = new BufferedReader(new FileReader(bedFile));
        String line = null;
        int lineNumber = 1;
        while ((line = reader.readLine()) != null) {
            List<String> words = tabSplitter.splitToList(line);
            String chr = null;
            if (addChr) {
                chr = "chr" + words.get(0);
            } else {
                chr = words.get(0);
            }

            // adding padding
            int start = Integer.parseInt(words.get(1)) - basePadding;
            int end = Integer.parseInt(words.get(2)) + basePadding;

            Interval interval = new Interval(chr, start, end, false, words.get(4));
            // add intervals to interval lists equitably
            IntervalList intervalList = intervalLists[lineNumber % numberOfLists];
            intervalList.add(interval);
        }

        reader.close();
        return intervalLists;
    }

    private static boolean hasChr(SAMFileHeader header) {
        String name = header.getSequence(0).getSequenceName();
        if (name.startsWith("chr")) {
            return true;
        } else {
            return false;
        }
    }

    private static int[] estimateInsertSize(SamReader reader, double percentCovered) throws IOException {
        // System.out.println("Estimating min and max insert sizes that contain
        // "
        // + percentCovered + "% of the inserts from sampled reads");

        // System.out.println("Returning dummy range: 124 593");
        return new int[] { 124, 593 };

        // TODO the interval/reads to be sampled should be decided more
        // systematically
        /*
         * SAMRecordIterator iterator = reader.queryOverlapping("chr1",
         * 100000000,
         * 110000000);
         * 
         * List<Integer> insertsTemp = new ArrayList<Integer>();
         * 
         * while (iterator.hasNext())
         * {
         * SAMRecord record = iterator.next();
         * 
         * if (record.getReadUnmappedFlag() || record.getMateUnmappedFlag())
         * continue;
         * 
         * int insert = record.getInferredInsertSize();
         * 
         * if (insert <= 0)
         * continue;
         * 
         * insertsTemp.add(insert);
         * }
         * 
         * iterator.close();
         * 
         * double[] inserts = new double[insertsTemp.size()];
         * 
         * for (int i = 0; i < inserts.length; i++)
         * {
         * inserts[i] = insertsTemp.get(i);
         * }
         * 
         * insertsTemp = null;
         * 
         * Arrays.sort(inserts);
         * 
         * Percentile percentile = new Percentile();
         * percentile.setData(inserts);
         * double p = (100.0 - percentCovered) / 2;
         * 
         * double left = percentile.evaluate(p);
         * double right = percentile.evaluate(100 - p);
         * 
         * System.out.println((int) left + "\t" + (int) right);
         * 
         * return new int[] { (int) left, (int) right };
         * 
         */
    }

    private static IntervalList[] makeIntervalLists(int chunkSize, int numberOfLists, SAMFileHeader header) {
        String[] sequencesOfInterest = getSeqsOfInterest();

        List<Interval> intervals = new ArrayList<Interval>();

        long totalBases = 0;

        // make intervals of specified chunk size
        for (int i = 0; i < sequencesOfInterest.length; i++) {
            String sequenceName = sequencesOfInterest[i];
            int size = header.getSequence(sequenceName).getSequenceLength();
            totalBases += size;

            int start = 1;
            int end = chunkSize;

            while (start <= size) {
                intervals.add(new Interval(sequenceName, start, end));
                start = end + 1;
                end = start + chunkSize - 1;
            }
        }

        // shuffle the intervals
        Collections.shuffle(intervals);

        // create interval lists
        IntervalList[] intervalLists = new IntervalList[numberOfLists];
        for (int i = 0; i < intervalLists.length; i++) {
            intervalLists[i] = new IntervalList(header);
        }

        // add intervals to interval lists equitably
        for (int i = 0; i < intervals.size(); i++) {
            Interval interval = intervals.get(i);

            intervalLists[i % numberOfLists].add(interval);
        }

        return intervalLists;
    }

    private static String[] getSeqsOfInterest() {
        // return new String[] { "gi|215104|gb|J02459.1|LAMCG" };

        String[] seqs = new String[22];

        for (int i = 0; i < 22; i++) {
            seqs[i] = "chr" + Integer.toString(i + 1);
        }

        return seqs;
    }

}