edu.msu.cme.rdp.classifier.ClassifierCmd.java Source code

Java tutorial

Introduction

Here is the source code for edu.msu.cme.rdp.classifier.ClassifierCmd.java

Source

/*
 * Created on Feb 20, 2006
 *
 */
/**
 * This is a simple command line class to do classification.
 */
package edu.msu.cme.rdp.classifier;

import edu.msu.cme.rdp.classifier.cli.CmdOptions;
import edu.msu.cme.rdp.classifier.io.ClassificationResultFormatter;
import edu.msu.cme.rdp.classifier.utils.ClassifierFactory;
import edu.msu.cme.rdp.classifier.utils.ClassifierSequence;
import edu.msu.cme.rdp.readseq.readers.SequenceReader;
import edu.msu.cme.rdp.readseq.readers.SeqReader;
import edu.msu.cme.rdp.readseq.readers.Sequence;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

/**
 * This is the legacy command line class to do the classification. See edu.msu.cme.rdp.multicompare.main
 * @author wangqion
 */
public class ClassifierCmd {

    private static final Options options = new Options();

    static {
        options.addOption(new Option(CmdOptions.QUERYFILE_SHORT_OPT, CmdOptions.QUERYFILE_LONG_OPT, false,
                CmdOptions.QUERYFILE_DESC)); // keep this for compatibility with old interface
        options.addOption(new Option(CmdOptions.OUTFILE_SHORT_OPT, CmdOptions.OUTFILE_LONG_OPT, true,
                CmdOptions.OUTFILE_DESC));
        options.addOption(new Option(CmdOptions.TRAINPROPFILE_SHORT_OPT, CmdOptions.TRAINPROPFILE_LONG_OPT, true,
                CmdOptions.TRAINPROPFILE_DESC));
        options.addOption(
                new Option(CmdOptions.FORMAT_SHORT_OPT, CmdOptions.FORMAT_LONG_OPT, true, CmdOptions.FORMAT_DESC));
        options.addOption(
                new Option(CmdOptions.GENE_SHORT_OPT, CmdOptions.GENE_LONG_OPT, true, CmdOptions.GENE_DESC));
        options.addOption(new Option(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT,
                CmdOptions.MIN_BOOTSTRAP_WORDS_LONG_OPT, true, CmdOptions.MIN_WORDS_DESC));
    }

    /** It classifies query sequences from the input file.
     * If the property file of the mapping of the training files is not null, the default property file will be override.
     * The classification results will be writen to the output file.
     */
    public void doClassify(String inputFile, String outFile, String propfile,
            ClassificationResultFormatter.FORMAT format, String gene, int min_bootstrap_words)
            throws IOException, TrainingDataException {
        if (propfile != null) {
            ClassifierFactory.setDataProp(propfile, false);
        }
        if (format == null) {
            format = CmdOptions.DEFAULT_FORMAT;
        }
        ClassifierFactory factory = ClassifierFactory.getFactory(gene);
        Classifier aClassifier = factory.createClassifier();
        SeqReader parser = new SequenceReader(new File(inputFile));
        BufferedWriter wt = new BufferedWriter(new FileWriter(outFile));
        Sequence pSeq = null;

        try {
            while ((pSeq = parser.readNextSequence()) != null) {
                try {
                    ClassificationResult result = aClassifier.classify(new ClassifierSequence(pSeq),
                            min_bootstrap_words);
                    wt.write(ClassificationResultFormatter.getOutput(result, format));

                } catch (ShortSequenceException e) {
                    System.out.println(e.getMessage());
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        } finally {
            wt.close();
        }

    }

    /**
     * Prints the license information to std err.
     */
    public static void printLicense() {
        String license = "Copyright 2006-2011 Michigan State University Board of Trustees.\n\n"
                + "This program is free software; you can redistribute it and/or modify it under the "
                + "terms of the GNU General Public License as published by the Free Software Foundation; "
                + "either version 2 of the License, or (at your option) any later version.\n\n"
                + "This program is distributed in the hope that it will be useful, "
                + "but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY "
                + "or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.\n\n"
                + "You should have received a copy of the GNU General Public License along with this program; "
                + "if not, write to the Free Software Foundation, Inc., 59 Temple Place, "
                + "Suite 330, Boston, MA 02111-1307 USA\n\n" + "Authors's mailng address:\n"
                + "Center for Microbial Ecology\n" + "2225A Biomedical Physical Science\n"
                + "Michigan State University\n" + "East Lansing, Michigan USA 48824-4320\n"
                + "E-mail: James R. Cole at colej@msu.edu\n" + "\tQiong Wang at wangqion@msu.edu\n"
                + "\tJames M. Tiedje at tiedjej@msu.edu\n\n";

        System.err.println(license);
    }

    /**
     * This is the main method to do classification.
     * <p>Usage: java ClassifierCmd queryFile outputFile [property file].
     * <br>
     * queryFile can be one of the following formats: Fasta, Genbank and EMBL. 
     * <br>
     * outputFile will be used to save the classification output.
     * <br>
     * property file contains the mapping of the training files.
     * <br>
     * Note: the training files and the property file should be in the same directory.
     * The default property file is set to data/classifier/16srrna/rRNAClassifier.properties.
     */
    public static void main(String[] args) throws Exception {

        String queryFile = null;
        String outputFile = null;
        String propFile = null;
        String gene = null;
        ClassificationResultFormatter.FORMAT format = CmdOptions.DEFAULT_FORMAT;
        int min_bootstrap_words = Classifier.MIN_BOOTSTRSP_WORDS;

        try {
            CommandLine line = new PosixParser().parse(options, args);

            if (line.hasOption(CmdOptions.OUTFILE_SHORT_OPT)) {
                outputFile = line.getOptionValue(CmdOptions.OUTFILE_SHORT_OPT);
            } else {
                throw new Exception("outputFile must be specified");
            }

            if (line.hasOption(CmdOptions.TRAINPROPFILE_SHORT_OPT)) {
                if (gene != null) {
                    throw new IllegalArgumentException(
                            "Already specified the gene from the default location. Can not specify train_propfile");
                } else {
                    propFile = line.getOptionValue(CmdOptions.TRAINPROPFILE_SHORT_OPT);
                }
            }
            if (line.hasOption(CmdOptions.FORMAT_SHORT_OPT)) {
                String f = line.getOptionValue(CmdOptions.FORMAT_SHORT_OPT);
                if (f.equalsIgnoreCase("allrank")) {
                    format = ClassificationResultFormatter.FORMAT.allRank;
                } else if (f.equalsIgnoreCase("fixrank")) {
                    format = ClassificationResultFormatter.FORMAT.fixRank;
                } else if (f.equalsIgnoreCase("filterbyconf")) {
                    format = ClassificationResultFormatter.FORMAT.filterbyconf;
                } else if (f.equalsIgnoreCase("db")) {
                    format = ClassificationResultFormatter.FORMAT.dbformat;
                } else {
                    throw new IllegalArgumentException(
                            "Not valid output format, only allrank, fixrank, filterbyconf and db allowed");
                }
            }
            if (line.hasOption(CmdOptions.GENE_SHORT_OPT)) {
                if (propFile != null) {
                    throw new IllegalArgumentException(
                            "Already specified train_propfile. Can not specify gene any more");
                }
                gene = line.getOptionValue(CmdOptions.GENE_SHORT_OPT).toLowerCase();

                if (!gene.equals(ClassifierFactory.RRNA_16S_GENE)
                        && !gene.equals(ClassifierFactory.FUNGALLSU_GENE)) {
                    throw new IllegalArgumentException(gene + " is NOT valid, only allows "
                            + ClassifierFactory.RRNA_16S_GENE + " and " + ClassifierFactory.FUNGALLSU_GENE);
                }
            }
            if (line.hasOption(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT)) {
                min_bootstrap_words = Integer
                        .parseInt(line.getOptionValue(CmdOptions.MIN_BOOTSTRAP_WORDS_SHORT_OPT));
                if (min_bootstrap_words < Classifier.MIN_BOOTSTRSP_WORDS) {
                    throw new IllegalArgumentException(CmdOptions.MIN_BOOTSTRAP_WORDS_LONG_OPT
                            + " must be at least " + Classifier.MIN_BOOTSTRSP_WORDS);
                }
            }

            args = line.getArgs();
            if (args.length != 1) {
                throw new Exception("Expect one query file");
            }
            queryFile = args[0];

        } catch (Exception e) {
            System.out.println("Command Error: " + e.getMessage());
            new HelpFormatter().printHelp(120,
                    "ClassifierCmd [options] <samplefile>\nNote this is the legacy command for one sample classification ",
                    "", options, "");

            return;
        }

        if (propFile == null && gene == null) {
            gene = CmdOptions.DEFAULT_GENE;
        }
        ClassifierCmd classifierCmd = new ClassifierCmd();

        printLicense();
        classifierCmd.doClassify(queryFile, outputFile, propFile, format, gene, min_bootstrap_words);

    }
}