edu.msu.cme.rdp.multicompare.Reprocess.java Source code

Introduction

Here is the source code for edu.msu.cme.rdp.multicompare.Reprocess.java
Source

/*
 * Copyright (C) 2012 Michigan State University <rdpstaff at msu.edu>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.msu.cme.rdp.multicompare;

import edu.msu.cme.rdp.classifier.cli.CmdOptions;
import edu.msu.cme.rdp.classifier.io.ClassificationResultFormatter;
import edu.msu.cme.rdp.classifier.utils.ClassifierFactory;
import edu.msu.cme.rdp.multicompare.visitors.DefaultPrintVisitor;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import org.apache.commons.cli.*;
import org.apache.commons.io.output.NullWriter;

/**
 *
 * @author fishjord
 */
public class Reprocess {

    private static Options options = new Options();

    public static HashSet<String> readTaxonFilterFile(String file) throws IOException {
        HashSet<String> ret = new HashSet<String>();

        BufferedReader reader = new BufferedReader(new FileReader(file));
        String line;

        while ((line = reader.readLine()) != null) {
            if (line.trim().equals(""))
                continue;
            ret.add(line.trim());
            System.err.println(line.trim());
        }
        reader.close();
        return ret;
    }

    static {
        options.addOption(new Option(CmdOptions.HIER_OUTFILE_SHORT_OPT, CmdOptions.HIER_OUTFILE_LONG_OPT, true,
                CmdOptions.HIER_OUTFILE_DESC));
        options.addOption(new Option(CmdOptions.OUTFILE_SHORT_OPT, CmdOptions.OUTFILE_LONG_OPT, true,
                CmdOptions.OUTFILE_DESC));
        options.addOption(new Option(CmdOptions.RANK_SHORT_OPT, CmdOptions.RANK_LONG_OPT, true,
                "the rank to apply the confidence cutoff."
                        + " Only assignment reuslts with confidence above the cutoff at the specified rank will be included in the assignment detail outfile. Default is the lowest rank"));
        options.addOption(new Option(CmdOptions.TAXON_SHORT_OPT, CmdOptions.TAXON_LONG_OPT, true,
                "the taxon name filter containing one taxon per line.\n"
                        + "Only assignment results matching the taxon name will be included in assignment detail outfile."));
        options.addOption(
                new Option(CmdOptions.FORMAT_SHORT_OPT, CmdOptions.FORMAT_LONG_OPT, true, CmdOptions.FORMAT_DESC));
        options.addOption(new Option(CmdOptions.BOOTSTRAP_SHORT_OPT, CmdOptions.BOOTSTRAP_LONG_OPT, true,
                CmdOptions.BOOTSTRAP_DESC));
        options.addOption(new Option(CmdOptions.TRAINPROPFILE_SHORT_OPT, CmdOptions.TRAINPROPFILE_LONG_OPT, true,
                CmdOptions.TRAINPROPFILE_DESC));
        options.addOption(
                new Option(CmdOptions.GENE_SHORT_OPT, CmdOptions.GENE_LONG_OPT, true, CmdOptions.GENE_DESC));
    }

    /**
     * This class reprocesses the classification results (allrank output) and print out hierarchy output file, based on the confidence cutoff;
     * and print out only the detail classification results with assignment at certain rank with confidence above the cutoff or/and matching a given taxon.
     * @param args
     * @throws Exception 
     */
    public static void main(String[] args) throws Exception {

        PrintWriter assign_out = new PrintWriter(new NullWriter());
        float conf = 0.8f;
        PrintStream heir_out = null;
        String hier_out_filename = null;
        ClassificationResultFormatter.FORMAT format = ClassificationResultFormatter.FORMAT.allRank;
        String rank = null;
        String taxonFilterFile = null;
        String train_propfile = null;
        String gene = null;
        List<MCSample> samples = new ArrayList();

        try {
            CommandLine line = new PosixParser().parse(options, args);
            if (line.hasOption(CmdOptions.HIER_OUTFILE_SHORT_OPT)) {
                hier_out_filename = line.getOptionValue(CmdOptions.HIER_OUTFILE_SHORT_OPT);
                heir_out = new PrintStream(hier_out_filename);
            } else {
                throw new Exception(
                        "It make sense to provide output filename for " + CmdOptions.HIER_OUTFILE_LONG_OPT);
            }
            if (line.hasOption(CmdOptions.OUTFILE_SHORT_OPT)) {
                assign_out = new PrintWriter(line.getOptionValue(CmdOptions.OUTFILE_SHORT_OPT));
            }

            if (line.hasOption(CmdOptions.RANK_SHORT_OPT)) {
                rank = line.getOptionValue(CmdOptions.RANK_SHORT_OPT);
            }
            if (line.hasOption(CmdOptions.TAXON_SHORT_OPT)) {
                taxonFilterFile = line.getOptionValue(CmdOptions.TAXON_SHORT_OPT);
            }

            if (line.hasOption(CmdOptions.BOOTSTRAP_SHORT_OPT)) {
                conf = Float.parseFloat(line.getOptionValue(CmdOptions.BOOTSTRAP_SHORT_OPT));
                if (conf < 0 || conf > 1) {
                    throw new IllegalArgumentException("Confidence must be in the range [0,1]");
                }
            }
            if (line.hasOption(CmdOptions.FORMAT_SHORT_OPT)) {
                String f = line.getOptionValue(CmdOptions.FORMAT_SHORT_OPT);
                if (f.equalsIgnoreCase("allrank")) {
                    format = ClassificationResultFormatter.FORMAT.allRank;
                } else if (f.equalsIgnoreCase("fixrank")) {
                    format = ClassificationResultFormatter.FORMAT.fixRank;
                } else if (f.equalsIgnoreCase("db")) {
                    format = ClassificationResultFormatter.FORMAT.dbformat;
                } else if (f.equalsIgnoreCase("filterbyconf")) {
                    format = ClassificationResultFormatter.FORMAT.filterbyconf;
                } else {
                    throw new IllegalArgumentException(
                            "Not valid output format, only allrank, fixrank, filterbyconf and db allowed");
                }
            }
            if (line.hasOption(CmdOptions.TRAINPROPFILE_SHORT_OPT)) {
                if (gene != null) {
                    throw new IllegalArgumentException(
                            "Already specified the gene from the default location. Can not specify train_propfile");
                } else {
                    train_propfile = line.getOptionValue(CmdOptions.TRAINPROPFILE_SHORT_OPT);
                }
            }
            if (line.hasOption(CmdOptions.GENE_SHORT_OPT)) {
                if (train_propfile != null) {
                    throw new IllegalArgumentException(
                            "Already specified train_propfile. Can not specify gene any more");
                }
                gene = line.getOptionValue(CmdOptions.GENE_SHORT_OPT).toLowerCase();

                if (!gene.equals(ClassifierFactory.RRNA_16S_GENE) && !gene.equals(ClassifierFactory.FUNGALLSU_GENE)
                        && !gene.equals(ClassifierFactory.FUNGALITS_warcup_GENE)
                        && !gene.equals(ClassifierFactory.FUNGALITS_unite_GENE)) {
                    throw new IllegalArgumentException(gene + " is NOT valid, only allows "
                            + ClassifierFactory.RRNA_16S_GENE + ", " + ClassifierFactory.FUNGALLSU_GENE + ", "
                            + ClassifierFactory.FUNGALITS_warcup_GENE + " and "
                            + ClassifierFactory.FUNGALITS_unite_GENE);
                }
            }
            args = line.getArgs();
            if (args.length < 1) {
                throw new Exception("Incorrect number of command line arguments");
            }

            for (String arg : args) {
                String[] inFileNames = arg.split(",");
                String inputFile = inFileNames[0];
                File idmappingFile = null;

                if (inFileNames.length == 2) {
                    idmappingFile = new File(inFileNames[1]);
                    if (!idmappingFile.exists()) {
                        System.err.println("Failed to find input file \"" + inFileNames[1] + "\"");
                        return;
                    }
                }

                MCSample nextSample = new MCSampleResult(inputFile, idmappingFile);
                samples.add(nextSample);

            }
        } catch (Exception e) {
            System.out.println("Command Error: " + e.getMessage());
            new HelpFormatter().printHelp(120,
                    "Reprocess [options] <Classification_allrank_result>[,idmappingfile] ...", "", options, "");
            return;
        }

        if (train_propfile == null && gene == null) {
            gene = ClassifierFactory.RRNA_16S_GENE;
        }

        HashSet<String> taxonFilter = null;
        if (taxonFilterFile != null) {
            taxonFilter = readTaxonFilterFile(taxonFilterFile);
        }

        MultiClassifier multiClassifier = new MultiClassifier(train_propfile, gene);
        DefaultPrintVisitor printVisitor = new DefaultPrintVisitor(heir_out, samples);
        MultiClassifierResult result = multiClassifier.multiClassificationParser(samples, conf, assign_out, format,
                rank, taxonFilter);

        result.getRoot().topDownVisit(printVisitor);

        assign_out.close();
        heir_out.close();
        if (multiClassifier.hasCopyNumber()) {
            // print copy number corrected counts
            File cn_corrected_s = new File(new File(hier_out_filename).getParentFile(),
                    "cncorrected_" + hier_out_filename);
            PrintStream cn_corrected_hier_out = new PrintStream(cn_corrected_s);
            printVisitor = new DefaultPrintVisitor(cn_corrected_hier_out, samples, true);
            result.getRoot().topDownVisit(printVisitor);
            cn_corrected_hier_out.close();
        }

    }
}