/*
 * Copyright 2006 Michigan State University Board of Trustees
 * ClassifierTraineeMaker is used to create training files to be used by the classifier
 */
package edu.msu.cme.rdp.classifier.train;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

/**
 * A command line class to create training information from the raw data.
 * @author  wangqion
 * @version 
 */
public class ClassifierTraineeMaker {
    // Command line options accepted by main(); filled in by the static initializer below.
    // Every option is constructed with hasArg == true, i.e. each one expects a value.
    private static final Options options = new Options();

    static {
        // -t / --tax_file: taxonomy file; main() reports an error when it is absent.
        options.addOption(new Option("t", "tax_file", true,
                "contains the hierarchical taxonomy information in the following format:\n"
                        + "taxid*taxon name*parent taxid*depth*rank\nFields taxid, the parent taxid and depth should be in integer format\n"
                        + "The taxid, or the combination of taxon name and rank is unique\n"
                        + "depth indicates the depth from the root taxon.\n Note: the depth for the root is 0"));
        // -s / --seq: training sequence file; main() reports an error when it is absent.
        options.addOption(new Option("s", "seq", true,
                "training sequences in FASTA format with lineage in the header:\n"
                        + "a list taxon names seperated by ';' with highest rank taxon first.\n"
                        + "The lowest rank of the lineage have to be the same for all sequence.\n"
                        + "The lowest rank is not limited to genus"));
        // -n / --version_no: optional; main() parses it as an int and falls back to 1 when it is not given.
        options.addOption(new Option("n", "version_no", true, "an integer used to refer to a training set"));
        // -v / --version and -m / --mod: optional; main() leaves them null when they are not given.
        options.addOption(new Option("v", "version", true, "the version of the hierarchical taxonomy"));
        options.addOption(new Option("m", "mod", true, "the modifcation information of the taxonomy"));
        // -o / --out_dir: output directory; main() reports an error when it is absent.
        options.addOption(new Option("o", "out_dir", true, "the output directory"));
        // -c / --copynumber_file: optional; main() leaves cnFile null when it is not given.
        options.addOption(new Option("c", "copynumber_file", true,
                "contains at least name, rank and the mean copy number of taxa. A header line is required to find the corresponding columns"
                        + "\nOnly the copy number of the lowest rank taxa will be loaded and the copy number of the other taxa are derived from these."));
        // NOTE(review): the closing brace of this static initializer is not visible in this excerpt - confirm the file is intact.

    /** Creates a new ClassifierTraineeMaker.
     * @param taxFile contains the hierarchical taxonomy information in the following format:
     * taxid*taxon name*parent taxid*depth*rank.
     * taxid, the parent taxid and depth should be in integer format.
     * depth indicates the depth from the root taxon.
     * @param seqFile contains the raw training sequences in fasta format.
     * The header of this fasta file starts with ">", followed by the sequence name, white space(s)
     * and a list of taxon names separated by ';' with the highest rank taxon first.
     * For example: >seq1     ROOT;Ph1;Fam1;G1;
     * <br>Note: a sequence can only be assigned to the lowest rank taxon.
     * @param cnFile optional (may be null) copy number file; contains at least the name, rank
     * and the mean copy number of taxa.
     * @param trainset_no is used to mark the training files generated.
     * @param version indicates the version of the hierarchical taxonomy.
     * @param modification holds the modification information of the taxonomy if any.
     * @param outdir specifies the output directory.
     * The parsed training information will be saved into four files in the given output directory.
     */
    public ClassifierTraineeMaker(String taxFile, String seqFile, String cnFile, int trainset_no, String version,
            String modification, String outdir) throws FileNotFoundException, IOException {
        // Opens the taxonomy file for reading; FileReader throws FileNotFoundException when it cannot be opened.
        // NOTE(review): no close() of this reader is visible in this excerpt - confirm it is released.
        Reader tax = new FileReader(taxFile);

        try {
            // The taxonomy reader and the training set metadata are handed to TreeFactory.
            TreeFactory factory = new TreeFactory(tax, trainset_no, version, modification);
            // The training sequence file is wrapped in a LineageSequenceParser.
            LineageSequenceParser parser = new LineageSequenceParser(new File(seqFile));
            // The copy number file is optional; this branch only applies when one was supplied.
            // NOTE(review): the body of this branch and the sequence parsing that presumably follows
            // are not visible in this excerpt.
            if (cnFile != null) {
            //after parsing all the sequences in training set, calculates the prior probability for each word
            // Creates the output directory when it does not exist yet.
            // NOTE(review): mkdir() creates a single directory level only and its boolean result is not checked here.
            if (!(new File(outdir)).exists()) {
                (new File(outdir)).mkdir();
            // Appends the platform file separator to outdir (presumably so file names can be concatenated to it - confirm).
            outdir = outdir + File.separator;
        // NOTE(review): the handling of NameRankDupException is not visible in this excerpt.
        } catch (NameRankDupException ex) {

     * Prints the license information to std err.
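    /** This is the main method to create training files from raw taxonomic information.
     * <p>
     * Usage: java ClassifierTraineeMaker -t tax_file -s rawseq.fa -o output_directory
     * [-n trainsetNo] [-v version] [-m version_modification] [-c copynumber_file].
     * See the ClassifierTraineeMaker constructor for more detail.
     * @param args the command line options listed in the usage above
     * @throws FileNotFoundException may be propagated from the ClassifierTraineeMaker constructor
     * @throws IOException may be propagated from the ClassifierTraineeMaker constructor
     */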
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // taxFile and seqFile are assigned only from the command line; the others start from the defaults below.
        String taxFile;
        String cnFile = null;
        String seqFile;
        int trainset_no = 1;
        String version = null;
        String modification = null;
        String outdir = null;

        try {
            // Parse the arguments against the statically defined option set.
            CommandLine line = new PosixParser().parse(options, args);

            // -t is mandatory: a missing value is reported through the catch block below.
            if (line.hasOption("t")) {
                taxFile = line.getOptionValue("t");
            } else {
                throw new Exception("taxon file must be specified");
            // -c is optional: cnFile stays null when it is not given.
            if (line.hasOption("c")) {
                cnFile = line.getOptionValue("c");
            // -s is mandatory.
            if (line.hasOption("s")) {
                seqFile = line.getOptionValue("s");
            } else {
                throw new Exception("seq file must be specified");

            // -n is optional (default 1); a non-numeric value is rethrown as IllegalArgumentException,
            // which the catch (Exception e) below also handles.
            if (line.hasOption("n")) {
                try {
                    trainset_no = Integer.parseInt(line.getOptionValue("n"));
                } catch (NumberFormatException ex) {
                    throw new IllegalArgumentException("trainset_no needs to be an integer.");
            // -o is mandatory.
            if (line.hasOption("o")) {
                outdir = line.getOptionValue("o");
            } else {
                throw new Exception("output directory must be specified");
            // -v and -m are optional: version and modification stay null when they are not given.
            if (line.hasOption("v")) {
                version = line.getOptionValue("v");
            if (line.hasOption("m")) {
                modification = line.getOptionValue("m");

        // Any parsing or validation failure is reported on standard output, followed by the usage help.
        // NOTE(review): no return/exit is visible after the help output in this excerpt - confirm that
        // execution does not continue to the constructor call with unset arguments.
        } catch (Exception e) {
            System.out.println("Command Error: " + e.getMessage());
            new HelpFormatter().printHelp(120, "train", "", options, "", true);

        // Runs the training file generation with the collected arguments.
        ClassifierTraineeMaker maker = new ClassifierTraineeMaker(taxFile, seqFile, cnFile, trainset_no, version,
                modification, outdir);