edu.msu.cme.rdp.classifier.comparison.ComparisonCmd.java Source code

Java tutorial

Introduction

Here is the source code for edu.msu.cme.rdp.classifier.comparison.ComparisonCmd.java

Source

/*
 * ComparisonCmd.java
 *
 * This is a command line comparison class
 * Created on November 7, 2003, 5:14 PM
 */
package edu.msu.cme.rdp.classifier.comparison;

/**
 *
 * @author  wangqion
 */
import edu.msu.cme.rdp.classifier.ClassificationResult;
import edu.msu.cme.rdp.classifier.Classifier;
import edu.msu.cme.rdp.classifier.RankAssignment;
import edu.msu.cme.rdp.classifier.ShortSequenceException;
import edu.msu.cme.rdp.classifier.TrainingDataException;
import edu.msu.cme.rdp.classifier.cli.CmdOptions;
import edu.msu.cme.rdp.classifier.io.ClassificationResultFormatter;
import edu.msu.cme.rdp.classifier.utils.ClassifierFactory;
import edu.msu.cme.rdp.readseq.readers.Sequence;
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

public class ComparisonCmd {

    // long options
    public static final String QUERYFILE1_LONG_OPT = "queryFile1";
    public static final String QUERYFILE2_LONG_OPT = "queryFile2";
    public static final String COMPARE_OUTFILE_LONG_OPT = "compare_outputFile";

    //short options
    public static final String QUERYFILE1_SHORT_OPT = "q1";
    public static final String QUERYFILE2_SHORT_OPT = "q2";
    public static final String COMPARE_OUTFILE_SHORT_OPT = "o";

    // description of the options
    public static final String QUERYFILE_DESC = "query file contains sequences in one of the following formats: Fasta, Genbank and EMBL.";
    public static final String COMPARE_OUTFILE_DESC = "output file name for the comparsion results.";

    private static Options options = new Options();
    private ClassifierFactory factory;
    private float confidenceCutoff = CmdOptions.DEFAULT_CONF; // default
    private static final String SAMPLE1 = "sample1";
    private static final String SAMPLE2 = "sample2";

    static {
        options.addOption(new Option(QUERYFILE1_SHORT_OPT, QUERYFILE1_LONG_OPT, true, QUERYFILE_DESC));
        options.addOption(new Option(QUERYFILE2_SHORT_OPT, QUERYFILE2_LONG_OPT, true, QUERYFILE_DESC));
        options.addOption(new Option(CmdOptions.OUTFILE_SHORT_OPT, CmdOptions.OUTFILE_LONG_OPT, true,
                CmdOptions.OUTFILE_DESC));
        options.addOption(
                new Option(COMPARE_OUTFILE_SHORT_OPT, COMPARE_OUTFILE_LONG_OPT, true, COMPARE_OUTFILE_DESC));
        options.addOption(new Option(CmdOptions.TRAINPROPFILE_SHORT_OPT, CmdOptions.TRAINPROPFILE_LONG_OPT, true,
                CmdOptions.TRAINPROPFILE_DESC));
        options.addOption(
                new Option(CmdOptions.FORMAT_SHORT_OPT, CmdOptions.FORMAT_LONG_OPT, true, CmdOptions.FORMAT_DESC));
        options.addOption(new Option(CmdOptions.BOOTSTRAP_SHORT_OPT, CmdOptions.BOOTSTRAP_LONG_OPT, true,
                CmdOptions.BOOTSTRAP_DESC));
        options.addOption(
                new Option(CmdOptions.GENE_SHORT_OPT, CmdOptions.GENE_LONG_OPT, true, CmdOptions.GENE_DESC));
    }

    /** Creates a new Manager*/
    public ComparisonCmd(String propfile, String gene) throws IOException, TrainingDataException {
        if (propfile != null) {
            ClassifierFactory.setDataProp(propfile, false);
        }
        factory = ClassifierFactory.getFactory(gene);
    }

    /** given the parser and root, calls the tester to do the classification
     */
    public void doClassify(String s1, String s2, String detailFile, String sigFile,
            ClassificationResultFormatter.FORMAT format) throws IOException {

        FileInputStream inStream1 = new FileInputStream(s1);
        FileInputStream inStream2 = new FileInputStream(s2);
        BufferedWriter wt = new BufferedWriter(new FileWriter(detailFile));

        Classifier aClassifier = factory.createClassifier();
        TaxonTree root = null;
        SequenceReader parser = null;
        Sequence pSeq = null;
        int count = 0;

        try {
            parser = new SequenceReader(inStream1);
            while ((pSeq = parser.readNextSequence()) != null) {
                try {
                    ClassificationResult result = aClassifier.classify(pSeq);
                    root = reconstructTree(result, root, SAMPLE1);
                    wt.write(ClassificationResultFormatter.getOutput(result, format));

                } catch (ShortSequenceException e) {
                    System.out.println(e.getMessage());
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }

            parser = new SequenceReader(inStream2);
            while ((pSeq = parser.readNextSequence()) != null) {
                try {
                    ClassificationResult result = aClassifier.classify(pSeq);
                    root = reconstructTree(result, root, SAMPLE2);
                    wt.write(ClassificationResultFormatter.getOutput(result, format));

                } catch (ShortSequenceException e) {
                    System.out.println(e.getMessage());
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        } finally {
            parser.close();
            wt.close();
        }

        if (root != null) {
            // change the assignment count for the taxa based on the confidence cutoff value
            SigCalculator cal = new SigCalculator(root.getS1Count(), root.getS2Count(), this.confidenceCutoff);
            root.changeConfidence(cal);
            BufferedWriter sigWt = new BufferedWriter(new FileWriter(sigFile));
            sigWt.write("sample1:\t" + s1 + "\n");
            sigWt.write("sample2:\t" + s2 + "\n");
            sigWt.write("confidence:\t" + this.confidenceCutoff + "\n");
            printSignificance(root, sigWt);
            sigWt.close();
        }

    }

    /** reconstruct the hierarchical tree based on the ClassificationResult
     * we assume that the first assignment is always the root and the only root
     **/
    private TaxonTree reconstructTree(ClassificationResult result, TaxonTree root, String sample) {
        List assignments = result.getAssignments();

        Iterator assignIt = assignments.iterator();
        int nodeCount = 0;
        SeqInfo seqInfo = new SeqInfo(result.getSequence().getSeqName(), result.getSequence().getDesc());
        seqInfo.setReverse(result.isReverse());

        TaxonTree curTree = null;
        while (assignIt.hasNext()) {
            RankAssignment assign = (RankAssignment) assignIt.next();

            if (nodeCount == 0) {
                if (root == null) { // the parent of the root is null
                    root = new TaxonTree(assign.getTaxid(), assign.getName(), assign.getRank(), null);
                } else {
                    if (root.getTaxid() != assign.getTaxid()) {
                        // this should never occur
                        throw new IllegalStateException("Error: the root " + assign.getTaxid()
                                + " of assignment for " + result.getSequence().getSeqName()
                                + " is different from the other sequences " + root.getTaxid());
                    }
                }
                curTree = root;

            } else {
                curTree = curTree.getChildbyTaxid(assign.getTaxid(), assign.getName(), assign.getRank());
            }
            Score score = new Score(assign.getConfidence(), seqInfo, curTree);
            if (sample.equals(SAMPLE1)) {
                curTree.addS1Score(score);
            } else {
                curTree.addS2Score(score);
            }
            nodeCount++;
        }
        return root;
    }

    private void printSignificance(TaxonTree root, BufferedWriter wt) throws IOException {
        ComparisonResultTreeSet resultSet = new ComparisonResultTreeSet();

        Iterator taxonIt = root.getTaxonIterator(10);
        while (taxonIt.hasNext()) {
            resultSet.add(taxonIt.next());
        }

        Iterator it = resultSet.iterator();

        wt.write("Significance\tRank\tName\tSample1\tSample2\n");
        while (it.hasNext()) {
            AbstractNode node = (AbstractNode) it.next();
            wt.write(node.getSignificance() + "\t" + node.getRank() + "\t" + node.getName() + "\t"
                    + node.getS1Count() + "\t" + node.getS2Count() + "\n");
        }

    }

    private void setConfidenceCutoff(float conf) {
        if (conf < 0 || conf > 1) {
            throw new IllegalArgumentException("The confidence cutoff value should be between 0 - 1.0");
        }
        this.confidenceCutoff = conf;
    }

    /**
     * Prints the license information to std err.
     */
    public static void printLicense() {
        String license = "Copyright 2006-2011 Michigan State University Board of Trustees.\n\n"
                + "This program is free software; you can redistribute it and/or modify it under the "
                + "terms of the GNU General Public License as published by the Free Software Foundation; "
                + "either version 2 of the License, or (at your option) any later version.\n\n"
                + "This program is distributed in the hope that it will be useful, "
                + "but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY "
                + "or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.\n\n"
                + "You should have received a copy of the GNU General Public License along with this program; "
                + "if not, write to the Free Software Foundation, Inc., 59 Temple Place, "
                + "Suite 330, Boston, MA 02111-1307 USA\n\n" + "Authors's mailng address:\n"
                + "Center for Microbial Ecology\n" + "2225A Biomedical Physical Science\n"
                + "Michigan State University\n" + "East Lansing, Michigan USA 48824-4320\n"
                + "E-mail: James R. Cole at colej@msu.edu\n" + "\tQiong Wang at wangqion@msu.edu\n"
                + "\tJames M. Tiedje at tiedjej@msu.edu\n\n";

        System.err.println(license);
    }

    public static void main(String[] args) throws Exception {

        String queryFile1 = null;
        String queryFile2 = null;
        String class_outputFile = null;
        String compare_outputFile = null;
        String propFile = null;
        ClassificationResultFormatter.FORMAT format = CmdOptions.DEFAULT_FORMAT;
        float conf_cutoff = CmdOptions.DEFAULT_CONF;
        String gene = null;

        try {
            CommandLine line = new PosixParser().parse(options, args);

            if (line.hasOption(QUERYFILE1_SHORT_OPT)) {
                queryFile1 = line.getOptionValue(QUERYFILE1_SHORT_OPT);
            } else {
                throw new Exception("queryFile1 must be specified");
            }
            if (line.hasOption(QUERYFILE2_SHORT_OPT)) {
                queryFile2 = line.getOptionValue(QUERYFILE2_SHORT_OPT);
            } else {
                throw new Exception("queryFile2 must be specified");
            }

            if (line.hasOption(CmdOptions.OUTFILE_SHORT_OPT)) {
                class_outputFile = line.getOptionValue(CmdOptions.OUTFILE_SHORT_OPT);
            } else {
                throw new Exception("outputFile for classification results must be specified");
            }

            if (line.hasOption(COMPARE_OUTFILE_SHORT_OPT)) {
                compare_outputFile = line.getOptionValue(COMPARE_OUTFILE_SHORT_OPT);
            } else {
                throw new Exception("outputFile for comparsion results must be specified");
            }

            if (line.hasOption(CmdOptions.TRAINPROPFILE_SHORT_OPT)) {
                if (gene != null) {
                    throw new IllegalArgumentException(
                            "Already specified the gene from the default location. Can not specify train_propfile");
                } else {
                    propFile = line.getOptionValue(CmdOptions.TRAINPROPFILE_SHORT_OPT);
                }
            }
            if (line.hasOption(CmdOptions.BOOTSTRAP_SHORT_OPT)) {
                conf_cutoff = Float.parseFloat(line.getOptionValue(CmdOptions.BOOTSTRAP_SHORT_OPT));
            }
            if (line.hasOption(CmdOptions.FORMAT_SHORT_OPT)) {
                String f = line.getOptionValue(CmdOptions.FORMAT_SHORT_OPT);
                if (f.equalsIgnoreCase("allrank")) {
                    format = ClassificationResultFormatter.FORMAT.allRank;
                } else if (f.equalsIgnoreCase("fixrank")) {
                    format = ClassificationResultFormatter.FORMAT.fixRank;
                } else if (f.equalsIgnoreCase("filterbyconf")) {
                    format = ClassificationResultFormatter.FORMAT.filterbyconf;
                } else if (f.equalsIgnoreCase("db")) {
                    format = ClassificationResultFormatter.FORMAT.dbformat;
                } else {
                    throw new IllegalArgumentException(
                            "Not valid output format, only allrank, fixrank, filterbyconf and db allowed");
                }

            }
            if (line.hasOption(CmdOptions.GENE_SHORT_OPT)) {
                if (propFile != null) {
                    throw new IllegalArgumentException(
                            "Already specified train_propfile. Can not specify gene any more");
                }
                gene = line.getOptionValue(CmdOptions.GENE_SHORT_OPT).toLowerCase();

                if (!gene.equals(ClassifierFactory.RRNA_16S_GENE)
                        && !gene.equals(ClassifierFactory.FUNGALLSU_GENE)) {
                    throw new IllegalArgumentException(gene + " is NOT valid, only allows "
                            + ClassifierFactory.RRNA_16S_GENE + " and " + ClassifierFactory.FUNGALLSU_GENE);
                }
            }
        } catch (Exception e) {
            System.out.println("Command Error: " + e.getMessage());
            new HelpFormatter().printHelp(120, "ComparisonCmd", "", options, "", true);
            return;
        }

        if (propFile == null && gene == null) {
            gene = ClassifierFactory.RRNA_16S_GENE;
        }

        ComparisonCmd cmd = new ComparisonCmd(propFile, gene);
        cmd.setConfidenceCutoff(conf_cutoff);
        printLicense();

        cmd.doClassify(queryFile1, queryFile2, class_outputFile, compare_outputFile, format);

    }
}