tikka.hhl.apps.CommandLineOptions.java Source code

Introduction

Here is the source code for tikka.hhl.apps.CommandLineOptions.java
Source

///////////////////////////////////////////////////////////////////////////////
//  Copyright (C) 2010 Taesun Moon, The University of Texas at Austin
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 3 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this program; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
///////////////////////////////////////////////////////////////////////////////
package tikka.hhl.apps;

import tikka.opennlp.io.DataFormatEnum;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;

import org.apache.commons.cli.*;

/**
 * Handles options from the command line. Also sets the default parameter
 * values, which remain in effect for every option that is not supplied.
 *
 * <p>Options are dispatched on the first character of their short name; short
 * names that share a first character (for example {@code itr} and
 * {@code ite}) are then told apart by their full short name. Options that are
 * not recognized are ignored.
 *
 * @author tsmoon
 */
public class CommandLineOptions {

    /**
     * Number of topics
     */
    protected int topics = 50;
    /**
     * Number of types (either word or morpheme) to print per state or topic
     */
    protected int outputPerClass = 50;
    /**
     * Hyperparameter of topic prior
     */
    protected double alpha = 1;
    /**
     * Hyperparameter for word/topic prior
     */
    protected double beta = 0.1;
    /**
     * Path for training data. Should be a full directory. A single trailing
     * file separator is removed when the option is parsed.
     */
    protected String trainDataDir = null;
    /**
     * Path for test data. Should be a full directory. A single trailing
     * file separator is removed when the option is parsed.
     */
    private String testDataDir = null;
    /**
     * Full path of model to be loaded
     */
    protected String modelInputPath = null;
    /**
     * Full path to save model to
     */
    protected String modelOutputPath = null;
    /**
     * Root of path to output annotated texts to. A single trailing file
     * separator is removed when the option is parsed.
     */
    protected String annotatedTrainTextOutDir = null;
    /**
     * Root of path to output annotated test set texts to. A single trailing
     * file separator is removed when the option is parsed.
     */
    private String annotatedTestTextOutDir = null;
    /**
     * Number of training iterations
     */
    protected int numIterations = 100;
    /**
     * Number of iterations for test set burnin
     */
    protected int testSetBurninIterations = 10;
    /**
     * Number to seed random number generator. This is set to -1 so it may be
     * checked during the initialization stages. If the value is -1 from the
     * calling function, it means the randomSeed must be set to 0. Otherwise,
     * use whatever value has been passed to it from the command line.
     */
    protected int randomSeed = -1;
    /**
     * Specifier of training data format.
     */
    protected DataFormatEnum.DataFormat dataFormat = DataFormatEnum.DataFormat.CONLL2K;
    /**
     * Name of file to generate tabulated output to
     */
    protected String tabularOutputFilename = null;
    /**
     * Name of file to dump test data sample scores (perplexity) to
     */
    protected String testDataSampleScoreOutputFilename = null;
    /**
     * Name of file to dump training data sample scores (bayes factor) to
     */
    protected String trainDataSampleScoreOutputFilename = null;
    /**
     * Output buffer to write normalized, tabulated data to. Opened on
     * {@link #tabularOutputFilename} when the {@code ot} option is given;
     * {@code null} otherwise.
     */
    protected BufferedWriter tabulatedOutput;
    /**
     * Output buffer to dump test data sample scores to. Opened on
     * {@link #testDataSampleScoreOutputFilename} when the {@code oste} option
     * is given; {@code null} otherwise.
     */
    protected BufferedWriter testDataSampleScoreOutput;
    /**
     * Output buffer to dump training data sample scores to. Opened on
     * {@link #trainDataSampleScoreOutputFilename} when the {@code ostr}
     * option is given; {@code null} otherwise.
     */
    protected BufferedWriter trainDataSampleScoreOutput;
    /**
     * Temperature at which to start annealing process
     */
    protected double initialTemperature = 1;
    /**
     * Decrement at which to reduce the temperature in annealing process
     */
    protected double temperatureDecrement = 0.1;
    /**
     * Stop changing temperature after the following temp has been reached.
     */
    protected double targetTemperature = 1;
    /**
     * Hyperparameters for (hierarchical) dirichlet processes
     */
    protected double muStem = 300, muStemBase = 3000, muAffixBase = 3000, muAffix = 300, betaStemBase = 3000,
            betaStem = 300;
    /**
     * Probability of null morph or, equivalently, the probability of a
     * morpheme boundary
     */
    protected double stemBoundaryProb = 0.2, affixBoundaryProb = 0.25;
    /**
     * Hyperparameter for state transition priors
     */
    protected double psi = 0.1;
    /**
     * Hyperparameter for "switch" prior. The original documentation refers to
     * {@code switchVector} and {@code fourthOrderSwitches}; neither is
     * declared in this class.
     */
    protected double xi = 0.1;
    /**
     * The total number of switch types. There are only two. One which bears
     * topical content and one which does not.
     */
    protected final int switchQ = 2;
    /**
     * Model to use for training. Uses unhelpful, non-mnemonic names
     */
    protected String experimentModel = "m1";
    /**
     * Hyperparameter for word emission priors
     */
    protected double gamma = 0.0001;
    /**
     * Number of states in HMM.
     */
    protected int states = 15;
    /**
     * Number of topic states. Intended to be less than {@link #states}, so
     * that the topic states are a subset of the full states (not validated
     * here).
     */
    protected int topicSubStates = 1;
    /**
     * Number of samples to take
     */
    protected int samples = 100;
    /**
     * Number of iterations between samples
     */
    protected int lag = 10;

    /**
     * Reads every option present in the parsed command line and overrides the
     * corresponding default value. The three output options ({@code ot},
     * {@code oste}, {@code ostr}) additionally open a writer on the named
     * file.
     *
     * <p>An unknown data format value prints a message to standard error and
     * terminates the JVM with exit status 1.
     *
     * @param cline the parsed command line
     * @throws IOException if one of the output files cannot be opened
     * @throws NumberFormatException if a numeric option value cannot be parsed
     */
    public CommandLineOptions(CommandLine cline) throws IOException {

        for (Option option : cline.getOptions()) {
            String opt = option.getOpt();
            String value = option.getValue();
            switch (opt.charAt(0)) {
            case 'a':
                alpha = Double.parseDouble(value);
                break;
            case 'b':
                beta = Double.parseDouble(value);
                break;
            case 'c':
                // Literal-first comparisons so that a missing value reaches
                // the error branch instead of raising a NullPointerException.
                if ("conll2k".equals(value)) {
                    dataFormat = DataFormatEnum.DataFormat.CONLL2K;
                } else if ("hashslash".equals(value)) {
                    dataFormat = DataFormatEnum.DataFormat.HASHSLASH;
                } else if ("pipesep".equals(value)) {
                    dataFormat = DataFormatEnum.DataFormat.PIPESEP;
                } else if ("raw".equals(value)) {
                    dataFormat = DataFormatEnum.DataFormat.RAW;
                } else {
                    System.err.println("\"" + value + "\" is an unknown data format option.");
                    System.exit(1);
                }
                break;
            case 'd':
                trainDataDir = stripTrailingSeparator(value);
                break;
            case 'e':
                experimentModel = value;
                break;
            case 'f':
                testDataDir = stripTrailingSeparator(value);
                break;
            case 'g':
                gamma = Double.parseDouble(value);
                break;
            case 'i':
                if ("itr".equals(opt)) {
                    numIterations = Integer.parseInt(value);
                } else if ("ite".equals(opt)) {
                    testSetBurninIterations = Integer.parseInt(value);
                }
                break;
            case 'j':
                annotatedTestTextOutDir = stripTrailingSeparator(value);
                break;
            case 'k':
                if ("ks".equals(opt)) {
                    samples = Integer.parseInt(value);
                } else if ("kl".equals(opt)) {
                    lag = Integer.parseInt(value);
                }
                break;
            case 'l':
                modelInputPath = value;
                break;
            case 'm':
                modelOutputPath = value;
                break;
            case 'n':
                annotatedTrainTextOutDir = stripTrailingSeparator(value);
                break;
            case 'o':
                if ("ot".equals(opt)) {
                    tabularOutputFilename = value;
                    tabulatedOutput = openWriter(tabularOutputFilename);
                } else if ("oste".equals(opt)) {
                    testDataSampleScoreOutputFilename = value;
                    testDataSampleScoreOutput = openWriter(testDataSampleScoreOutputFilename);
                } else if ("ostr".equals(opt)) {
                    trainDataSampleScoreOutputFilename = value;
                    trainDataSampleScoreOutput = openWriter(trainDataSampleScoreOutputFilename);
                }
                break;
            case 'p':
                if ("pi".equals(opt)) {
                    initialTemperature = Double.parseDouble(value);
                } else if ("pr".equals(opt)) {
                    targetTemperature = Double.parseDouble(value);
                } else if ("pt".equals(opt)) {
                    temperatureDecrement = Double.parseDouble(value);
                }
                break;
            case 'q':
                topicSubStates = Integer.parseInt(value);
                break;
            case 'r':
                randomSeed = Integer.parseInt(value);
                break;
            case 's':
                states = Integer.parseInt(value);
                break;
            case 't':
                topics = Integer.parseInt(value);
                break;
            case 'w':
                outputPerClass = Integer.parseInt(value);
                break;
            case 'x':
                if ("xmustembase".equals(opt)) {
                    muStemBase = Double.parseDouble(value);
                } else if ("xmustem".equals(opt)) {
                    muStem = Double.parseDouble(value);
                } else if ("xbetastembase".equals(opt)) {
                    betaStemBase = Double.parseDouble(value);
                } else if ("xbetastem".equals(opt)) {
                    betaStem = Double.parseDouble(value);
                } else if ("xmuaffixbase".equals(opt)) {
                    muAffixBase = Double.parseDouble(value);
                } else if ("xmuaffix".equals(opt)) {
                    muAffix = Double.parseDouble(value);
                } else if ("xpsi".equals(opt)) {
                    psi = Double.parseDouble(value);
                } else if ("xxi".equals(opt)) {
                    xi = Double.parseDouble(value);
                } else if ("xstemboundaryprob".equals(opt)) {
                    stemBoundaryProb = Double.parseDouble(value);
                } else if ("xaffixboundaryprob".equals(opt)) {
                    affixBoundaryProb = Double.parseDouble(value);
                }
                break;
            }
        }
    }

    /**
     * Removes one trailing platform file separator from a path, if present.
     *
     * @param path the path as given on the command line
     * @return {@code path} without its final separator, or {@code path}
     *         itself when it does not end with one
     */
    private static String stripTrailingSeparator(String path) {
        if (path.endsWith(File.separator)) {
            return path.substring(0, path.length() - File.separator.length());
        }
        return path;
    }

    /**
     * Opens a buffered writer on the named file, using the platform default
     * character encoding.
     *
     * @param filename path of the file to write to
     * @return a writer positioned at the start of the file
     * @throws IOException if the file cannot be opened for writing
     */
    private static BufferedWriter openWriter(String filename) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename)));
    }

    /**
     * @return the hyperparameter of the topic prior
     */
    public double getAlpha() {
        return alpha;
    }

    /**
     * @return the hyperparameter for the word/topic prior
     */
    public double getBeta() {
        return beta;
    }

    /**
     * @return the training data directory, or {@code null} if not given
     */
    public String getTrainDataDir() {
        return trainDataDir;
    }

    /**
     * @return the test data directory, or {@code null} if not given
     */
    public String getTestDataDir() {
        return testDataDir;
    }

    /**
     * @return the format of the training data
     */
    public DataFormatEnum.DataFormat getDataFormat() {
        return dataFormat;
    }

    /**
     * @return the number of training iterations
     */
    public int getNumIterations() {
        return numIterations;
    }

    /**
     * @return the name of the tabulated output file, or {@code null} if not
     *         given
     */
    public String getTabularOutputFilename() {
        return tabularOutputFilename;
    }

    /**
     * @return the writer for tabulated output, or {@code null} if no
     *         tabulated output file was given
     */
    public BufferedWriter getTabulatedOutput() {
        return tabulatedOutput;
    }

    /**
     * @return the number of types to print per state or topic
     */
    public int getOutputPerClass() {
        return outputPerClass;
    }

    /**
     * @return the random seed, or -1 if none was given on the command line
     */
    public int getRandomSeed() {
        return randomSeed;
    }

    /**
     * @return the temperature at which annealing starts
     */
    public double getInitialTemperature() {
        return initialTemperature;
    }

    /**
     * @return the temperature at which annealing stops
     */
    public double getTargetTemperature() {
        return targetTemperature;
    }

    /**
     * @return the amount by which the temperature is reduced during annealing
     */
    public double getTemperatureDecrement() {
        return temperatureDecrement;
    }

    /**
     * @return the number of topics
     */
    public int getTopics() {
        return topics;
    }

    /**
     * @return the value of the {@code xbetastembase} hyperparameter
     */
    public double getBetaStemBase() {
        return betaStemBase;
    }

    /**
     * @return the value of the {@code xbetastem} hyperparameter
     */
    public double getBetaStem() {
        return betaStem;
    }

    /**
     * @return the name of the model to use for training
     */
    public String getExperimentModel() {
        return experimentModel;
    }

    /**
     * @return the value of the {@code xmustembase} hyperparameter
     */
    public double getMuStemBase() {
        return muStemBase;
    }

    /**
     * @return the value of the {@code xmuaffixbase} hyperparameter
     */
    public double getMuAffixBase() {
        return muAffixBase;
    }

    /**
     * @return the value of the {@code xmustem} hyperparameter
     */
    public double getMuStem() {
        return muStem;
    }

    /**
     * @return the value of the {@code xmuaffix} hyperparameter
     */
    public double getMuAffix() {
        return muAffix;
    }

    /**
     * @return the hyperparameter for state transition priors
     */
    public double getPsi() {
        return psi;
    }

    /**
     * @return the hyperparameter for the "switch" prior
     */
    public double getXi() {
        return xi;
    }

    /**
     * @return the stem boundary probability
     */
    public double getStemBoundaryProb() {
        return stemBoundaryProb;
    }

    /**
     * @return the affix boundary probability
     */
    public double getAffixBoundaryProb() {
        return affixBoundaryProb;
    }

    /**
     * @return the hyperparameter for word emission priors
     */
    public double getGamma() {
        return gamma;
    }

    /**
     * @return the number of states in the HMM
     */
    public int getStates() {
        return states;
    }

    /**
     * @return the number of topic states
     */
    public int getTopicSubStates() {
        return topicSubStates;
    }

    /**
     * @return the path of the model to load, or {@code null} if not given
     */
    public String getModelInputPath() {
        return modelInputPath;
    }

    /**
     * @return the path to save the model to, or {@code null} if not given
     */
    public String getModelOutputPath() {
        return modelOutputPath;
    }

    /**
     * @return the output root for annotated training texts, or {@code null}
     *         if not given
     */
    public String getAnnotatedTrainTextOutDir() {
        return annotatedTrainTextOutDir;
    }

    /**
     * @return the output root for annotated test set texts, or {@code null}
     *         if not given
     */
    public String getAnnotatedTestTextOutDir() {
        return annotatedTestTextOutDir;
    }

    /**
     * @return the number of samples to take
     */
    public int getSamples() {
        return samples;
    }

    /**
     * @return the number of iterations between samples
     */
    public int getLag() {
        return lag;
    }

    /**
     * @return the writer for test data sample scores, or {@code null} if no
     *         such output file was given
     */
    public BufferedWriter getTestDataSampleScoreOutput() {
        return testDataSampleScoreOutput;
    }

    /**
     * @return the name of the test data sample score file, or {@code null}
     *         if not given
     */
    public String getTestDataSampleScoreOutputFilename() {
        return testDataSampleScoreOutputFilename;
    }

    /**
     * @return the writer for training data sample scores, or {@code null} if
     *         no such output file was given
     */
    public BufferedWriter getTrainDataSampleScoreOutput() {
        return trainDataSampleScoreOutput;
    }

    /**
     * @return the name of the training data sample score file, or
     *         {@code null} if not given
     */
    public String getTrainDataSampleScoreOutputFilename() {
        return trainDataSampleScoreOutputFilename;
    }

    /**
     * @return the number of iterations for test set burnin
     */
    public int getTestSetBurninIterations() {
        return testSetBurninIterations;
    }
}