Java tutorial
/* * Created on June 24, 2002, 2:11 PM * * Copyright 2006 Michigan State University Board of Trustees * * ClassifierTraineeMaker is used to create training files to be used by the classifier * */ package edu.msu.cme.rdp.classifier.train; import java.io.*; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; /** * A command line class to create training information from the raw data. * @author wangqion * @version */ public class ClassifierTraineeMaker { private static final Options options = new Options(); static { options.addOption(new Option("t", "tax_file", true, "contains the hierarchical taxonomy information in the following format:\n" + "taxid*taxon name*parent taxid*depth*rank\nFields taxid, the parent taxid and depth should be in integer format\n" + "The taxid, or the combination of taxon name and rank is unique\n" + "depth indicates the depth from the root taxon.\n Note: the depth for the root is 0")); options.addOption(new Option("s", "seq", true, "training sequences in FASTA format with lineage in the header:\n" + "a list taxon names seperated by ';' with highest rank taxon first.\n" + "The lowest rank of the lineage have to be the same for all sequence.\n" + "The lowest rank is not limited to genus")); options.addOption(new Option("n", "version_no", true, "an integer used to refer to a training set")); options.addOption(new Option("v", "version", true, "the version of the hierarchical taxonomy")); options.addOption(new Option("m", "mod", true, "the modifcation information of the taxonomy")); options.addOption(new Option("o", "out_dir", true, "the output directory")); options.addOption(new Option("c", "copynumber_file", true, "contains at least name, rank and the mean copy number of taxa. A header line is required to find the corresponding columns" + "\nOnly the copy number of the lowest rank taxa will be loaded and the copy number of the other taxa are derived from these.")); } /** Creates a new ClassifierTraineeMaker * @param taxFile contains the hierarchical taxonomy information in the following format: * taxid*taxon name*parent taxid*depth*rank". * taxid, the parent taxid and depth should be in integer format. * depth indicates the depth from the root taxon. * @param seqFile contains the raw training sequences in fasta format. * The header of this fasta file starts with ">", followed by the sequence name, white space(s) * and a list taxon names seperated by ';' with highest rank taxon first. * For example: >seq1 ROOT;Ph1;Fam1;G1; * <br>Note: a sequence can only be assigned to the lowest rank taxon. * @param trainset_no is used to mark the training files generated. * @param version indicates the version of the hierarchical taxonomy. * @param modification holds the modification information of the taxonomy if any. * @param outdir specifies the output directory. * The parsed training information will be saved into four files in the given output directory. */ public ClassifierTraineeMaker(String taxFile, String seqFile, String cnFile, int trainset_no, String version, String modification, String outdir) throws FileNotFoundException, IOException { Reader tax = new FileReader(taxFile); try { TreeFactory factory = new TreeFactory(tax, trainset_no, version, modification); LineageSequenceParser parser = new LineageSequenceParser(new File(seqFile)); factory.parseSequenceFile(parser); if (cnFile != null) { factory.parseCopyNumberFile(cnFile); } //after parsing all the sequences in training set, calculates the prior probability for each word factory.createGenusWordConditionalProb(); if (!(new File(outdir)).exists()) { (new File(outdir)).mkdir(); } outdir = outdir + File.separator; factory.printTrainingFiles(outdir); factory.printWordPriors(outdir); factory.printWordConditionalProbIndexArr(outdir); factory.printGenusIndex_WordProbArr(outdir); } catch (NameRankDupException ex) { ex.printStackTrace(); } } /** * Prints the license information to std err. */ public static void printLicense() { String license = "Copyright 2006 Michigan State University Board of Trustees.\n\n" + "This program is free software; you can redistribute it and/or modify it under the " + "terms of the GNU General Public License as published by the Free Software Foundation; " + "either version 2 of the License, or (at your option) any later version.\n\n" + "This program is distributed in the hope that it will be useful, " + "but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY " + "or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.\n\n" + "You should have received a copy of the GNU General Public License along with this program; " + "if not, write to the Free Software Foundation, Inc., 59 Temple Place, " + "Suite 330, Boston, MA 02111-1307 USA\n\n" + "Authors's mailng address:\n" + "Center for Microbial Ecology\n" + "2225A Biomedical Physical Science\n" + "Michigan State University\n" + "East Lansing, Michigan USA 48824-4320\n" + "E-mail: James R. Cole at colej@msu.edu\n" + "\tQiong Wang at wangqion@msu.edu\n" + "\tJames M. Tiedje at tiedjej@msu.edu\n\n"; System.err.println(license); } /** This is the main method to create training files from raw taxonomic information. * <p> * Usage: java ClassifierTraineeMaker tax_file rawseq.fa trainsetNo version version_modification output_directory. * See the ClassifierTraineeMaker constructor for more detail. * @param args * @throws FileNotFoundException * @throws IOException */ public static void main(String[] args) throws FileNotFoundException, IOException { String taxFile; String cnFile = null; String seqFile; int trainset_no = 1; String version = null; String modification = null; String outdir = null; try { CommandLine line = new PosixParser().parse(options, args); if (line.hasOption("t")) { taxFile = line.getOptionValue("t"); } else { throw new Exception("taxon file must be specified"); } if (line.hasOption("c")) { cnFile = line.getOptionValue("c"); } if (line.hasOption("s")) { seqFile = line.getOptionValue("s"); } else { throw new Exception("seq file must be specified"); } if (line.hasOption("n")) { try { trainset_no = Integer.parseInt(line.getOptionValue("n")); } catch (NumberFormatException ex) { throw new IllegalArgumentException("trainset_no needs to be an integer."); } } if (line.hasOption("o")) { outdir = line.getOptionValue("o"); } else { throw new Exception("output directory must be specified"); } if (line.hasOption("v")) { version = line.getOptionValue("v"); } if (line.hasOption("m")) { modification = line.getOptionValue("m"); } } catch (Exception e) { System.out.println("Command Error: " + e.getMessage()); new HelpFormatter().printHelp(120, "train", "", options, "", true); return; } ClassifierTraineeMaker maker = new ClassifierTraineeMaker(taxFile, seqFile, cnFile, trainset_no, version, modification, outdir); } }