Java tutorial
/* * ComparisonCmd.java * * This is a command line comparison class * Created on November 7, 2003, 5:14 PM */ package edu.msu.cme.rdp.classifier.comparison; /** * * @author wangqion */ import edu.msu.cme.rdp.classifier.ClassificationResult; import edu.msu.cme.rdp.classifier.Classifier; import edu.msu.cme.rdp.classifier.RankAssignment; import edu.msu.cme.rdp.classifier.ShortSequenceException; import edu.msu.cme.rdp.classifier.TrainingDataException; import edu.msu.cme.rdp.classifier.cli.CmdOptions; import edu.msu.cme.rdp.classifier.io.ClassificationResultFormatter; import edu.msu.cme.rdp.classifier.utils.ClassifierFactory; import edu.msu.cme.rdp.readseq.readers.Sequence; import edu.msu.cme.rdp.readseq.readers.SequenceReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.util.Iterator; import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; public class ComparisonCmd { // long options public static final String QUERYFILE1_LONG_OPT = "queryFile1"; public static final String QUERYFILE2_LONG_OPT = "queryFile2"; public static final String COMPARE_OUTFILE_LONG_OPT = "compare_outputFile"; //short options public static final String QUERYFILE1_SHORT_OPT = "q1"; public static final String QUERYFILE2_SHORT_OPT = "q2"; public static final String COMPARE_OUTFILE_SHORT_OPT = "o"; // description of the options public static final String QUERYFILE_DESC = "query file contains sequences in one of the following formats: Fasta, Genbank and EMBL."; public static final String COMPARE_OUTFILE_DESC = "output file name for the comparsion results."; private static Options options = new Options(); private ClassifierFactory factory; private float confidenceCutoff = CmdOptions.DEFAULT_CONF; // default private static final String SAMPLE1 = "sample1"; private static final String SAMPLE2 = "sample2"; static { options.addOption(new Option(QUERYFILE1_SHORT_OPT, QUERYFILE1_LONG_OPT, true, QUERYFILE_DESC)); options.addOption(new Option(QUERYFILE2_SHORT_OPT, QUERYFILE2_LONG_OPT, true, QUERYFILE_DESC)); options.addOption(new Option(CmdOptions.OUTFILE_SHORT_OPT, CmdOptions.OUTFILE_LONG_OPT, true, CmdOptions.OUTFILE_DESC)); options.addOption( new Option(COMPARE_OUTFILE_SHORT_OPT, COMPARE_OUTFILE_LONG_OPT, true, COMPARE_OUTFILE_DESC)); options.addOption(new Option(CmdOptions.TRAINPROPFILE_SHORT_OPT, CmdOptions.TRAINPROPFILE_LONG_OPT, true, CmdOptions.TRAINPROPFILE_DESC)); options.addOption( new Option(CmdOptions.FORMAT_SHORT_OPT, CmdOptions.FORMAT_LONG_OPT, true, CmdOptions.FORMAT_DESC)); options.addOption(new Option(CmdOptions.BOOTSTRAP_SHORT_OPT, CmdOptions.BOOTSTRAP_LONG_OPT, true, CmdOptions.BOOTSTRAP_DESC)); options.addOption( new Option(CmdOptions.GENE_SHORT_OPT, CmdOptions.GENE_LONG_OPT, true, CmdOptions.GENE_DESC)); } /** Creates a new Manager*/ public ComparisonCmd(String propfile, String gene) throws IOException, TrainingDataException { if (propfile != null) { ClassifierFactory.setDataProp(propfile, false); } factory = ClassifierFactory.getFactory(gene); } /** given the parser and root, calls the tester to do the classification */ public void doClassify(String s1, String s2, String detailFile, String sigFile, ClassificationResultFormatter.FORMAT format) throws IOException { FileInputStream inStream1 = new FileInputStream(s1); FileInputStream inStream2 = new FileInputStream(s2); BufferedWriter wt = new BufferedWriter(new FileWriter(detailFile)); Classifier aClassifier = factory.createClassifier(); TaxonTree root = null; SequenceReader parser = null; Sequence pSeq = null; int count = 0; try { parser = new SequenceReader(inStream1); while ((pSeq = parser.readNextSequence()) != null) { try { ClassificationResult result = aClassifier.classify(pSeq); root = reconstructTree(result, root, SAMPLE1); wt.write(ClassificationResultFormatter.getOutput(result, format)); } catch (ShortSequenceException e) { System.out.println(e.getMessage()); } catch (Exception e) { e.printStackTrace(); } } parser = new SequenceReader(inStream2); while ((pSeq = parser.readNextSequence()) != null) { try { ClassificationResult result = aClassifier.classify(pSeq); root = reconstructTree(result, root, SAMPLE2); wt.write(ClassificationResultFormatter.getOutput(result, format)); } catch (ShortSequenceException e) { System.out.println(e.getMessage()); } catch (Exception e) { e.printStackTrace(); } } } finally { parser.close(); wt.close(); } if (root != null) { // change the assignment count for the taxa based on the confidence cutoff value SigCalculator cal = new SigCalculator(root.getS1Count(), root.getS2Count(), this.confidenceCutoff); root.changeConfidence(cal); BufferedWriter sigWt = new BufferedWriter(new FileWriter(sigFile)); sigWt.write("sample1:\t" + s1 + "\n"); sigWt.write("sample2:\t" + s2 + "\n"); sigWt.write("confidence:\t" + this.confidenceCutoff + "\n"); printSignificance(root, sigWt); sigWt.close(); } } /** reconstruct the hierarchical tree based on the ClassificationResult * we assume that the first assignment is always the root and the only root **/ private TaxonTree reconstructTree(ClassificationResult result, TaxonTree root, String sample) { List assignments = result.getAssignments(); Iterator assignIt = assignments.iterator(); int nodeCount = 0; SeqInfo seqInfo = new SeqInfo(result.getSequence().getSeqName(), result.getSequence().getDesc()); seqInfo.setReverse(result.isReverse()); TaxonTree curTree = null; while (assignIt.hasNext()) { RankAssignment assign = (RankAssignment) assignIt.next(); if (nodeCount == 0) { if (root == null) { // the parent of the root is null root = new TaxonTree(assign.getTaxid(), assign.getName(), assign.getRank(), null); } else { if (root.getTaxid() != assign.getTaxid()) { // this should never occur throw new IllegalStateException("Error: the root " + assign.getTaxid() + " of assignment for " + result.getSequence().getSeqName() + " is different from the other sequences " + root.getTaxid()); } } curTree = root; } else { curTree = curTree.getChildbyTaxid(assign.getTaxid(), assign.getName(), assign.getRank()); } Score score = new Score(assign.getConfidence(), seqInfo, curTree); if (sample.equals(SAMPLE1)) { curTree.addS1Score(score); } else { curTree.addS2Score(score); } nodeCount++; } return root; } private void printSignificance(TaxonTree root, BufferedWriter wt) throws IOException { ComparisonResultTreeSet resultSet = new ComparisonResultTreeSet(); Iterator taxonIt = root.getTaxonIterator(10); while (taxonIt.hasNext()) { resultSet.add(taxonIt.next()); } Iterator it = resultSet.iterator(); wt.write("Significance\tRank\tName\tSample1\tSample2\n"); while (it.hasNext()) { AbstractNode node = (AbstractNode) it.next(); wt.write(node.getSignificance() + "\t" + node.getRank() + "\t" + node.getName() + "\t" + node.getS1Count() + "\t" + node.getS2Count() + "\n"); } } private void setConfidenceCutoff(float conf) { if (conf < 0 || conf > 1) { throw new IllegalArgumentException("The confidence cutoff value should be between 0 - 1.0"); } this.confidenceCutoff = conf; } /** * Prints the license information to std err. */ public static void printLicense() { String license = "Copyright 2006-2011 Michigan State University Board of Trustees.\n\n" + "This program is free software; you can redistribute it and/or modify it under the " + "terms of the GNU General Public License as published by the Free Software Foundation; " + "either version 2 of the License, or (at your option) any later version.\n\n" + "This program is distributed in the hope that it will be useful, " + "but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY " + "or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.\n\n" + "You should have received a copy of the GNU General Public License along with this program; " + "if not, write to the Free Software Foundation, Inc., 59 Temple Place, " + "Suite 330, Boston, MA 02111-1307 USA\n\n" + "Authors's mailng address:\n" + "Center for Microbial Ecology\n" + "2225A Biomedical Physical Science\n" + "Michigan State University\n" + "East Lansing, Michigan USA 48824-4320\n" + "E-mail: James R. Cole at colej@msu.edu\n" + "\tQiong Wang at wangqion@msu.edu\n" + "\tJames M. Tiedje at tiedjej@msu.edu\n\n"; System.err.println(license); } public static void main(String[] args) throws Exception { String queryFile1 = null; String queryFile2 = null; String class_outputFile = null; String compare_outputFile = null; String propFile = null; ClassificationResultFormatter.FORMAT format = CmdOptions.DEFAULT_FORMAT; float conf_cutoff = CmdOptions.DEFAULT_CONF; String gene = null; try { CommandLine line = new PosixParser().parse(options, args); if (line.hasOption(QUERYFILE1_SHORT_OPT)) { queryFile1 = line.getOptionValue(QUERYFILE1_SHORT_OPT); } else { throw new Exception("queryFile1 must be specified"); } if (line.hasOption(QUERYFILE2_SHORT_OPT)) { queryFile2 = line.getOptionValue(QUERYFILE2_SHORT_OPT); } else { throw new Exception("queryFile2 must be specified"); } if (line.hasOption(CmdOptions.OUTFILE_SHORT_OPT)) { class_outputFile = line.getOptionValue(CmdOptions.OUTFILE_SHORT_OPT); } else { throw new Exception("outputFile for classification results must be specified"); } if (line.hasOption(COMPARE_OUTFILE_SHORT_OPT)) { compare_outputFile = line.getOptionValue(COMPARE_OUTFILE_SHORT_OPT); } else { throw new Exception("outputFile for comparsion results must be specified"); } if (line.hasOption(CmdOptions.TRAINPROPFILE_SHORT_OPT)) { if (gene != null) { throw new IllegalArgumentException( "Already specified the gene from the default location. Can not specify train_propfile"); } else { propFile = line.getOptionValue(CmdOptions.TRAINPROPFILE_SHORT_OPT); } } if (line.hasOption(CmdOptions.BOOTSTRAP_SHORT_OPT)) { conf_cutoff = Float.parseFloat(line.getOptionValue(CmdOptions.BOOTSTRAP_SHORT_OPT)); } if (line.hasOption(CmdOptions.FORMAT_SHORT_OPT)) { String f = line.getOptionValue(CmdOptions.FORMAT_SHORT_OPT); if (f.equalsIgnoreCase("allrank")) { format = ClassificationResultFormatter.FORMAT.allRank; } else if (f.equalsIgnoreCase("fixrank")) { format = ClassificationResultFormatter.FORMAT.fixRank; } else if (f.equalsIgnoreCase("filterbyconf")) { format = ClassificationResultFormatter.FORMAT.filterbyconf; } else if (f.equalsIgnoreCase("db")) { format = ClassificationResultFormatter.FORMAT.dbformat; } else { throw new IllegalArgumentException( "Not valid output format, only allrank, fixrank, filterbyconf and db allowed"); } } if (line.hasOption(CmdOptions.GENE_SHORT_OPT)) { if (propFile != null) { throw new IllegalArgumentException( "Already specified train_propfile. Can not specify gene any more"); } gene = line.getOptionValue(CmdOptions.GENE_SHORT_OPT).toLowerCase(); if (!gene.equals(ClassifierFactory.RRNA_16S_GENE) && !gene.equals(ClassifierFactory.FUNGALLSU_GENE)) { throw new IllegalArgumentException(gene + " is NOT valid, only allows " + ClassifierFactory.RRNA_16S_GENE + " and " + ClassifierFactory.FUNGALLSU_GENE); } } } catch (Exception e) { System.out.println("Command Error: " + e.getMessage()); new HelpFormatter().printHelp(120, "ComparisonCmd", "", options, "", true); return; } if (propFile == null && gene == null) { gene = ClassifierFactory.RRNA_16S_GENE; } ComparisonCmd cmd = new ComparisonCmd(propFile, gene); cmd.setConfidenceCutoff(conf_cutoff); printLicense(); cmd.doClassify(queryFile1, queryFile2, class_outputFile, compare_outputFile, format); } }