edu.duke.igsp.gkde.Main.java Source code

Java tutorial

Introduction

Here is the source code for edu.duke.igsp.gkde.Main.java

Source

/*****************************************************************************
  Main.java
    
  (c) 2008-2013 - Alan Boyle
  Department of Genetics
  Stanford University
  aboyle@gmail.com
    
  Licensed under the GNU General Public License 3.0 license.
    
  This file is part of F-seq.
    
  F-seq is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
    
  F-seq is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
    
  You should have received a copy of the GNU General Public License
  along with F-seq.  If not, see <http://www.gnu.org/licenses/>.
    
******************************************************************************/

package edu.duke.igsp.gkde;

import java.awt.Dimension;
import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;

import javax.swing.JFrame;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;

import edu.duke.igsp.gkde.KDEChromosome.Settings;
import edu.duke.igsp.gkde.KDEChromosome;
import edu.duke.igsp.gkde.format.BedDensityWriter;
import edu.duke.igsp.gkde.format.NpfDensityWriter;
import edu.duke.igsp.gkde.format.BedReader;
import edu.duke.igsp.gkde.format.SamReader;
import edu.duke.igsp.gkde.format.DensityWriter;
import edu.duke.igsp.gkde.format.WiggleDensityWriter;
import edu.duke.igsp.gkde.background.*;

public class Main {

    public static void main(String[] argv) throws Exception {

        Options opts = new Options();
        opts.addOption("s", true, "wiggle track step (default=1)");
        opts.addOption("l", true, "feature length (default=600)");
        opts.addOption("f", true, "fragment size (default=estimated from data)");
        //    opts.addOption("b", true, "bandwidth (default=200)");
        //    opts.addOption("w", true, "window (default=3800");
        opts.addOption("wg", true, "wg threshold set (defualt = calculated)");
        opts.addOption("c", true, "genomic total read weight (defualt = calculated)");
        opts.addOption("h", false, "print usage");
        opts.addOption(OptionBuilder.withArgName("input dir").hasArg()
                .withDescription("input directory (default=current directory)").isRequired(false).create("d"));
        opts.addOption(OptionBuilder.withArgName("output dir").hasArg()
                .withDescription("output directory (default=current directory)").isRequired(false).create("o"));
        opts.addOption(OptionBuilder.withArgName("background dir").hasArg()
                .withDescription("background directory (default=none)").isRequired(false).create("b"));
        opts.addOption(OptionBuilder.withArgName("ploidy dir").hasArg()
                .withDescription("ploidy/input directory (default=none)").isRequired(false).create("p"));
        opts.addOption(OptionBuilder.withArgName("wig | bed | npf").hasArg()
                .withDescription("output format (default wig)").isRequired(false).create("of"));
        opts.addOption(OptionBuilder.withArgName("dnase | chip | faire | atac").hasArg()
                .withDescription("input data").isRequired(true).create("in"));
        opts.addOption(OptionBuilder.withArgName("weight clip").hasArg()
                .withDescription("weight clip value (default none)").isRequired(false).create("wc"));
        opts.addOption("t", true, "threshold (standard deviations) (default=4.0)");
        //    opts.addOption("r", true, "background ratio (default=2.0)");
        opts.addOption("v", false, "verbose output");

        CommandLineParser parser = new GnuParser();
        int fragment_size = -1;
        int fragment_offset = 0;
        long featureLength = 600l;
        //    float thresh = 2;
        float threshold = KDEChromosome.Settings.DEFAULT_THRESHOLD;
        int step = 1;
        boolean showHelp = false;
        boolean verbose = false;
        String inputDirectory = null;
        String backgroundDirectory = null;
        String ploidyDirectory = null;
        String[] files = null;
        String[] bgfiles = {};
        String[] ipfiles = {};
        String outputFormat = "wig";
        String inputDataType = "dnase";
        File outputDirectory = new File(System.getProperty("user.dir"));

        long bandwidth = 0l;
        long window = 0l;
        double ncuts = 0.0d;
        float temp_threshold = 0f;
        int weight_clip = 0;

        System.out.println("F-Seq Version 1.85");

        try {
            CommandLine cmd = parser.parse(opts, argv);
            showHelp = (cmd.hasOption("h"));
            verbose = (cmd.hasOption("v"));
            if (cmd.hasOption("s"))
                step = Integer.parseInt(cmd.getOptionValue("s"));
            if (cmd.hasOption("f"))
                fragment_size = Integer.parseInt(cmd.getOptionValue("f"));
            if (cmd.hasOption("d")) //input directory
                inputDirectory = cmd.getOptionValue("d");
            if (cmd.hasOption("b")) //background directory
                backgroundDirectory = cmd.getOptionValue("b");
            if (cmd.hasOption("p")) //ploidy|input directory
                ploidyDirectory = cmd.getOptionValue("p");
            if (cmd.hasOption("l")) // bandwidth
                featureLength = Long.parseLong(cmd.getOptionValue("l"));
            if (cmd.hasOption("of")) { // output format
                outputFormat = cmd.getOptionValue("of");
                if (!outputFormat.equals("wig") && !outputFormat.equals("bed") && !outputFormat.equals("npf")) {
                    System.out.println("Parameter error: output format must be 'wig' or 'bed'.");
                    showHelp = true;
                }
            }
            if (cmd.hasOption("in")) { // input data type
                inputDataType = cmd.getOptionValue("in");
                if (!inputDataType.equals("dnase") && !inputDataType.equals("chip")
                        && !inputDataType.equals("faire") && !inputDataType.equals("atac")) {
                    System.out.println(
                            "Parameter error: input data type must be 'dnase', 'chip', 'faire', or 'atac'.");
                    showHelp = true;
                }
            }
            if (cmd.hasOption("wc")) { // weight clip
                weight_clip = Integer.parseInt(cmd.getOptionValue("wc"));
            }
            if (cmd.hasOption("t")) { // threshold (standard deviations)
                threshold = Float.parseFloat(cmd.getOptionValue("t"));
            }
            if (cmd.hasOption("o")) { // output directory
                String out = cmd.getOptionValue("o");
                outputDirectory = new File(out);
                if (!outputDirectory.exists() && !outputDirectory.isDirectory()) {
                    System.out.println("Output directory '" + out + "' is not a valid directory.");
                    showHelp = true;
                }
            }

            if (cmd.hasOption("wg"))
                temp_threshold = Float.parseFloat(cmd.getOptionValue("wg"));
            if (cmd.hasOption("c"))
                ncuts = Double.parseDouble(cmd.getOptionValue("c"));

            // TESTING ONLY
            //   if(cmd.hasOption("w")) // window
            //     window = Long.parseLong(cmd.getOptionValue("w"));
            //if(cmd.hasOption("b")) // window
            //  bandwidth = Long.parseLong(cmd.getOptionValue("b"));

            files = cmd.getArgs(); // input files
            //bgfiles = cmd.getArgs(); // background files
        } catch (Exception e) {
            System.out.println("Error parsing arguments: " + e.getMessage());
            e.printStackTrace();
            showHelp = true;
        }

        if (showHelp || (inputDirectory == null && files.length == 0)) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("fseq [options]... [file(s)]...", opts);
            System.exit(1);
        }

        File[] pfiles = getFiles(inputDirectory, files);
        File[] background_files = getFiles(backgroundDirectory, bgfiles);
        File[] ploidy_files = getFiles(ploidyDirectory, ipfiles);

        KDEChromosome[] chrs = null;
        // assume all files are of the same type, if not we'll get parsing errors
        String path = pfiles[0].getPath();
        String extension = path.substring(path.lastIndexOf('.')).toLowerCase();
        System.out.println("Path: " + path + ", extension: " + extension);
        if (extension.equals(".bed")) {
            System.out.println("Parsing BED file.");
            chrs = BedReader.read(pfiles);
        } else if (extension.equals(".sam") || extension.equals(".bam")) {
            System.out.println("Parsing SAM/BAM file.");
            chrs = SamReader.read(pfiles, weight_clip);
        }
        //KDEChromosome[] input = BedReader.read(ifiles);

        //compute fragment offset
        if (fragment_size == -1) {
            fragment_size = wgShiftCalc(chrs);
        }
        fragment_offset = (int) (fragment_size / 2);

        if (ncuts == 0.0d) {
            for (int i = 0; i < chrs.length; ++i) {
                // computes the total read weight of all cuts on a chromosome
                ncuts += chrs[i].getTotalWeight();
            }
        }

        KDEChromosome.Settings settings = null;
        if (bandwidth > 0 || window > 0) {
            settings = new KDEChromosome.Settings(bandwidth, window, threshold, fragment_offset, ncuts,
                    inputDataType);
        } else {
            settings = new KDEChromosome.Settings(featureLength, threshold, fragment_offset, ncuts, inputDataType);
        }

        float wg_threshold = wgThreshold(settings, chrs);
        if (temp_threshold != 0f) {
            wg_threshold = temp_threshold;
        }
        //KDEChromosome.Settings bg_settings = null;
        //bg_settings = new KDEChromosome.Settings(featureLength*2, threshold, fragment_offset);

        //int background_size = 0;
        //int input_size = 0;
        //float bg_ratio = 0;
        //float sd = 0;

        if (verbose) {
            System.out.println("Settings: ");
            System.out.println("\twindow=" + (settings.window * 2));
            System.out.println("\tbandwidth=" + (settings.bandwidth));
            //System.out.println("\tfragment offset=" + (settings.offset));
            System.out.println("\tthreshold = " + wg_threshold);
            System.out.println("\test. fragment size = " + fragment_size);
            System.out.println("\tsequence length = " + chrs[0].getSequenceLength());
        }

        //    if(backgroundDirectory != null) {
        //       for(int i = 0; i < input.length; ++i) {
        //          background_size += input[i].getLength();
        //       }
        //       for(int i = 0; i < chrs.length; ++i) {
        //          input_size += chrs[i].getLength();
        //       }
        //       bg_ratio = (float)input_size/(float)background_size;
        //       sd = computeSD(bg_settings, input);
        //       //System.out.println("Sample Ratio: " + bg_ratio);
        //       //System.out.println("Input Size: " + input_size);
        //       //System.out.println("Background Size: " + background_size);
        //       //System.out.println("Input standard deviation: " + (settings.threshold * (float)Math.sqrt((double)bg_ratio * (double)sd * (double)sd)));
        //       //System.out.println("Data standard deviation: " + settings.threshold * computeSD(settings, chrs));
        //    }

        for (int i = 0; i < chrs.length; ++i) {
            if (chrs[i].getFirstPos() == chrs[i].getLastPos()) {
                System.out.println("Warning: " + chrs[i].getChromosome() + " has size zero.  Skipping.");
                continue;
            }
            File ofile = Util.makeUniqueFileWithExtension(outputDirectory, chrs[i].getChromosome(), outputFormat);

            DensityWriter dw = null;
            if (outputFormat.equals("wig")) {
                dw = new WiggleDensityWriter(ofile, chrs[i].getChromosome(), chrs[i].getFirstPos(), step);
            } else {
                if (outputFormat.equals("npf")) {
                    dw = new NpfDensityWriter(ofile, chrs[i].getChromosome(), chrs[i].getFirstPos(), step);
                } else {
                    dw = new BedDensityWriter(ofile, chrs[i].getChromosome(), chrs[i].getFirstPos(), step);
                }
            }

            //Function takes all? or new function for each?
            //      if(backgroundDirectory != null) {
            //         boolean hit = false;
            //         for(int j = 0; j < background_files.length; ++j) {
            //            if(background_files[j].getName().equals(chrs[i].getChromosome() + ".bff")) {
            //               System.out.println("Running background on Chromosome " + chrs[i].getChromosome());
            //               chrs[i].runBG(settings, dw, verbose, wg_threshold, background_files[j]);
            //               hit = true;
            //            }
            //         }
            //         if(!hit)
            //            System.out.println("No background for Chromosome " + chrs[i].getChromosome());
            //      } else {
            //         if(ploidyDirectory !=)
            //         chrs[i].run(settings, dw, verbose, wg_threshold);
            //      }
            chrs[i].run(settings, dw, verbose, wg_threshold, background_files, ploidy_files);
            dw.close();
        }

        //kde.showGraph();
    }

    private static float computeSD(Settings settings, KDEChromosome[] chrs) {
        Random r = new Random();

        double size = 0;
        double ncuts = 0;

        for (int i = 0; i < chrs.length; ++i) {
            size += (int) Math.abs(chrs[i].getLastPos() - chrs[i].getFirstPos());
            ncuts += chrs[i].getTotalWeight();
        }

        int totalWindow = 1 + (int) (settings.window * 2);
        int cutDensity = (int) ((ncuts / size) * totalWindow);
        int thresholdIterations = 10000;
        double[] densities = new double[thresholdIterations];
        double[] cuts = new double[cutDensity];

        for (int i = 0; i < thresholdIterations; ++i) {
            for (int j = 0; j < cuts.length; ++j)
                cuts[j] = r.nextInt(totalWindow);
            Arrays.sort(cuts);
            densities[i] = density(settings, (long) settings.window, (int) (cuts.length / 2.0), cuts);
        }

        double std = Util.std(densities);

        return (float) (std);
    }

    private static float wgThreshold(Settings settings, KDEChromosome[] chrs) {
        Random r = new Random();

        double size = 0;
        double ncuts = 0;

        for (int i = 0; i < chrs.length; ++i) {
            size += (int) Math.abs(chrs[i].getLastPos() - chrs[i].getFirstPos());
            ncuts += chrs[i].getTotalWeight();
        }

        int totalWindow = 1 + (int) (settings.window * 2);
        int cutDensity = (int) ((ncuts / size) * totalWindow);
        int thresholdIterations = 10000;
        double[] densities = new double[thresholdIterations];
        double[] cuts = new double[cutDensity];

        for (int i = 0; i < thresholdIterations; ++i) {
            for (int j = 0; j < cuts.length; ++j)
                cuts[j] = r.nextInt(totalWindow);
            Arrays.sort(cuts);
            densities[i] = density(settings, (long) settings.window, (int) (cuts.length / 2.0), cuts);
        }

        double mean = Util.mean(densities);
        double std = Util.std(densities);

        return (float) (mean + settings.threshold * std);
    }

    private static int wgShiftCalc(KDEChromosome chrs[]) {
        //     for(int i = 0; i < chrs.length; ++i) {
        //These fragments are usually < 200bp so we can assume that the average is < 500

        //     }
        //Doing Wilcoxon-Mann-Whitney Rank Sum Test
        //Can compute confidence interval if necessary

        //get biggest chr for estimate
        int max_chr = 0;
        long max_chr_length = chrs[0].getLength();
        for (int i = 0; i < chrs.length; ++i) {
            if (chrs[i].getLength() > max_chr_length) {
                max_chr = i;
                max_chr_length = chrs[i].getLength(); //This is actually the size of the array - poor name
            }
        }

        KDEChromosome.Sequence[] seqs = chrs[max_chr].getCuts();
        long cur_pos;
        long last_pos;
        ArrayList<Integer> d_temp = new ArrayList<Integer>();
        int k;

        for (k = 0; k < max_chr_length; ++k) {
            if (seqs[k].getStrand()) {
                break;
            }
        }

        int max_range = 500; //max search range for sequences
        int max = 1000000; //1 million is a good estimate
        int prior = 150;
        if (max > seqs.length - max_range - k) {
            max = seqs.length - max_range - k;
        }

        for (int j = k; j < k + max; ++j) {
            if (j > max_chr_length) {
                break;
            }
            if (seqs[j].getStrand()) {
                last_pos = seqs[j].getPosition();
                int back_check = j - 10000;
                if (j < 10000) {
                    back_check = j;
                }
                for (int i = back_check; i < j + 10000; ++i) {
                    if (i >= max)
                        break;
                    if (!seqs[i].getStrand()) {
                        if (Math.abs(seqs[i].getPosition() - last_pos - prior) < max_range) {
                            Integer temp = new Integer((int) (seqs[i].getPosition() - last_pos));
                            d_temp.add(temp);
                        }
                    }
                }
            }
        }

        int[] d = new int[d_temp.size()];
        for (int j = 0; j < d.length; ++j)
            d[j] = d_temp.get(j).intValue();

        Arrays.sort(d);
        int median;
        if (d.length % 2 == 0) { //even
            median = (int) ((double) (d[(int) Math.floor((double) d.length / 2.0)]
                    + d[(int) Math.ceil((double) d.length / 2.0)]) / 2.0);
        } else {
            median = d[d.length / 2];
        }
        // System.out.println("median " + median);
        // for(int m = 0; m < d.length; ++m) {
        //     System.out.println(d[m]);
        // }
        return median;
    }

    private static float density(Settings settings, long chromPos, int cutIdx, double[] cuts) {

        long minPos = chromPos - settings.window;
        long maxPos = chromPos + settings.window;

        double[] PRECOMPUTE = settings.precompute;

        double sum = 0.0;
        for (int i = cutIdx - 1; i > -1; --i) {
            if (cuts[i] < minPos)
                break;
            int d = Math.abs((int) (cuts[i] - chromPos));
            sum += settings.precompute[d];
        }

        for (int i = cutIdx; i < cuts.length; ++i) {
            if (cuts[i] > maxPos)
                break;

            int d = Math.abs((int) (cuts[i] - chromPos));

            //System.out.println(d);
            if (d > PRECOMPUTE.length - 1)
                throw new IllegalStateException();
            sum += PRECOMPUTE[d];
        }

        return (float) (sum / (double) settings.bandwidth);
    }

    private static File[] getFiles(String directory, String[] files) {

        File[] inputFiles = null;

        if (directory != null) {
            File dir = new File(directory);
            if (!dir.exists() || !dir.isDirectory()) {
                System.out.println("Directory parameter " + directory + " does not exist or is not a directory.");
                System.exit(1);
            }
            if (files.length == 0) {
                inputFiles = dir.listFiles(new FileFilter() {
                    public boolean accept(File f) {
                        return !f.isHidden() && f.isFile();
                    }
                });
            }
        } else {
            directory = System.getProperty("user.dir");
        }

        if (inputFiles == null) {
            inputFiles = new File[files.length];
            for (int i = 0; i < files.length; ++i) {
                File f = new File(files[i]);
                boolean exists = false;
                if (!(exists = f.exists())) {
                    f = new File(directory, files[i]);
                    exists = f.exists();
                }
                if (!exists) {
                    System.out.println("Input file " + files[i] + " does not exist.");
                    System.exit(1);
                }
                inputFiles[i] = f;
            }
        }

        return inputFiles;
    }
}