Example usage for org.apache.commons.cli2.builder DefaultOptionBuilder DefaultOptionBuilder

Introduction

In this page you can find the example usage for org.apache.commons.cli2.builder DefaultOptionBuilder DefaultOptionBuilder.

Prototype

public DefaultOptionBuilder()

Source Link

Document

Creates a new DefaultOptionBuilder using defaults

Usage

From source file:org.apache.mahout.regression.penalizedlinear.LinearRegularizePath.java

private boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get training data (CSV or white-spaced TEXT file)").create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option dependent = builder.withLongName("dependent").withRequired(true)
            .withArgument(argumentBuilder.withName("dependent").withMinimum(1).withMaximum(1).create())
            .withDescription("the dependent features").create();

    Option independent = builder.withLongName("independent").withRequired(true)
            .withArgument(argumentBuilder.withName("independent").create())
            .withDescription("the independent features").create();

    Option interaction = builder.withLongName("interaction").withRequired(true)
            .withArgument(argumentBuilder.withName("interaction").withMinimum(0).create())
            .withDescription(/*  w ww .ja  va2s . c o  m*/
                    "the interactions of features, the format is: feature1:feature2 (identical features are OK)")
            .create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(dependent).withOption(independent).withOption(interaction).withOption(bias)
            .withOption(lambda).withOption(alpha).create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    parameter = new LinearRegularizePathParameter();
    parameter.numOfCV = 1;
    parameter.alpha = Float.parseFloat((String) cmdLine.getValue(alpha));
    parameter.intercept = cmdLine.hasOption(bias);
    parameter.dependent = (String) cmdLine.getValue(dependent);
    String independentString = "";
    for (Object x : cmdLine.getValues(independent)) {
        independentString += x.toString() + ",";
    }
    parameter.independent = independentString.substring(0, Math.max(independentString.length() - 1, 0));
    String interactionString = "";
    for (Object x : cmdLine.getValues(interaction)) {
        interactionString += x.toString() + ",";
    }
    parameter.interaction = interactionString.substring(0, Math.max(interactionString.length() - 1, 0));

    if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numofCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputFile);
    output = (String) cmdLine.getValue(outputFile);
    return true;
}

From source file:org.apache.mahout.regression.penalizedlinear.PenalizedLinearDriver.java

private boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription(// w ww .java  2  s .c  o m
                    "where to get training data (Mahout sequence file of VectorWritable); in each line, the first element is response; rest are predictors.")
            .create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option numOfCV = builder.withLongName("numOfCV")
            .withArgument(
                    argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create())
            .withDescription("number of cross validation, the rule of thumb is 5 or 10").create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(lambda).withOption(alpha).withOption(bias).withOption(numOfCV).create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    parameter = new PenalizedLinearParameter();
    parameter.setNumOfCV(Integer.parseInt((String) cmdLine.getValue(numOfCV)));
    parameter.setAlpha(Float.parseFloat((String) cmdLine.getValue(alpha)));
    parameter.setIntercept(cmdLine.hasOption(bias));

    if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numOfCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputFile);
    output = (String) cmdLine.getValue(outputFile);

    return true;
}

From source file:org.apache.mahout.text.SparseVectorsFromSequenceFiles.java

public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("input dir containing the documents in sequence file format").withShortName("i")
            .create();//from   w  w  w  .  ja  v a2s  . c  o m

    Option outputDirOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();
    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.")
            .withShortName("x").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();
    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();
    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
            .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
            .withOption(helpOpt).withOption(sequentialAccessVectorOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.overwriteOutput(outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            analyzerClass.newInstance();
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if (wString.equalsIgnoreCase("tf")) {
                processIdf = false;
            } else if (wString.equalsIgnoreCase("tfidf")) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if (power.equals("INF")) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }
        HadoopUtil.overwriteOutput(outputDir);
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, minSupport, maxNGramSize,
                minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent,
                    norm, sequentialAccessOutput, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.text.wikipedia.WikipediaXmlSplitter.java

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();/*w  ww .  j  a v a2  s  . co m*/

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")

            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();
    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + "  <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + "    <generator>MediaWiki 1.13alpha</generator>\n" + "    <case>first-letter</case>\n"
            + "    <namespaces>\n" + "      <namespace key=\"-2\">Media</namespace>\n"
            + "      <namespace key=\"-1\">Special</namespace>\n" + "      <namespace key=\"0\" />\n"
            + "      <namespace key=\"1\">Talk</namespace>\n" + "      <namespace key=\"2\">User</namespace>\n"
            + "      <namespace key=\"3\">User talk</namespace>\n"
            + "      <namespace key=\"4\">Wikipedia</namespace>\n"
            + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + "      <namespace key=\"6\">Image</namespace>\n"
            + "      <namespace key=\"7\">Image talk</namespace>\n"
            + "      <namespace key=\"8\">MediaWiki</namespace>\n"
            + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + "      <namespace key=\"10\">Template</namespace>\n"
            + "      <namespace key=\"11\">Template talk</namespace>\n"
            + "      <namespace key=\"12\">Help</namespace>\n"
            + "      <namespace key=\"13\">Help talk</namespace>\n"
            + "      <namespace key=\"14\">Category</namespace>\n"
            + "      <namespace key=\"15\">Category talk</namespace>\n"
            + "      <namespace key=\"100\">Portal</namespace>\n"
            + "      <namespace key=\"101\">Portal talk</namespace>\n" + "    </namespaces>\n"
            + "  </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);

    // If the specified path for the input file is incorrect, return immediately
    if (!dumpFile.exists()) {
        log.error("Input file path {} doesn't exist", dumpFilePath);
        return;
    }

    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }
    int fileNumber = 0;
    while (it.hasNext()) {
        String thisLine = it.next();
        if (thisLine.trim().startsWith("<page>")) {
            boolean end = false;
            while (!thisLine.trim().startsWith("</page>")) {
                content.append(thisLine).append('\n');
                if (it.hasNext()) {
                    thisLine = it.next();
                } else {
                    end = true;
                    break;
                }
            }
            content.append(thisLine).append('\n');

            if (content.length() > chunkSize || end) {
                content.append("</mediawiki>");
                fileNumber++;
                String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml";
                BufferedWriter chunkWriter = new BufferedWriter(
                        new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
                try {
                    chunkWriter.write(content.toString(), 0, content.length());
                } finally {
                    Closeables.close(chunkWriter, false);
                }
                if (fileNumber >= numChunks) {
                    break;
                }
                content = new StringBuilder();
                content.append(header);
            }
        }
    }
}

From source file:org.apache.mahout.text.WikipediaToSequenceFile.java

/**
 * Takes in two arguments://from   w w w  .  j  av a 2  s.c o m
 * <ol>
 * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
 * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a
 * {@link org.apache.hadoop.io.SequenceFile}</li>
 * </ol>
 */
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt = obuilder.withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription("Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c").create();

    Option exactMatchOpt = obuilder.withLongName("exactMatch")
            .withDescription("If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e").create();

    Option allOpt = obuilder.withLongName("all").withDescription("If set, Select all files. Default is false")
            .withShortName("all").create();

    Option removeLabelOpt = obuilder.withLongName("removeLabels")
            .withDescription("If set, remove [[Category:labels]] from document text after extracting label."
                    + "Default is false")
            .withShortName("rl").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt)
            .withOption(removeLabelOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

        String catFile = "";
        if (cmdLine.hasOption(categoriesOpt)) {
            catFile = (String) cmdLine.getValue(categoriesOpt);
        }

        boolean all = false;
        if (cmdLine.hasOption(allOpt)) {
            all = true;
        }

        boolean removeLabels = false;
        if (cmdLine.hasOption(removeLabelOpt)) {
            removeLabels = true;
        }

        runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels);
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (InterruptedException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The Path for input files.").withShortName("i").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The Path write output to").withShortName("o").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
            .withShortName("ng").create();

    Option minSupportOpt = obuilder.withLongName("minSupport").withRequired(false)
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT)
            .withShortName("s").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers").withRequired(false)
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription(//from ww w  . j  a  v a  2  s .com
                    "(Optional) Number of reduce tasks. Default Value: " + DEFAULT_PASS1_NUM_REDUCE_TASKS)
            .withShortName("nr").create();

    Option preprocessOpt = obuilder.withLongName("preprocess").withRequired(false)
            .withDescription("If set, input is SequenceFile<Text,Text> where the value is the document, "
                    + " which will be tokenized using the specified analyzer.")
            .withShortName("p").create();

    Option unigramOpt = obuilder.withLongName("unigram").withRequired(false)
            .withDescription("If set, unigrams will be emitted in the final output alongside collocations")
            .withShortName("u").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("w").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
            .withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(minSupportOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(analyzerNameOpt)
            .withOption(preprocessOpt).withOption(unigramOpt).withOption(helpOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return 1;
        }

        Path input = new Path(cmdLine.getValue(inputOpt).toString());
        Path output = new Path(cmdLine.getValue(outputOpt).toString());

        int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.overwriteOutput(output);
        }

        int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
        if (cmdLine.hasOption(minSupportOpt)) {
            minSupport = Integer.parseInt(cmdLine.getValue(minSupportOpt).toString());
        }
        log.info("Minimum Support value: {}", minSupport);

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of pass1 reduce tasks: {}", reduceTasks);

        boolean emitUnigrams = cmdLine.hasOption(unigramOpt);

        if (cmdLine.hasOption(preprocessOpt)) {
            log.info("Input will be preprocessed");

            Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
            if (cmdLine.hasOption(analyzerNameOpt)) {
                String className = cmdLine.getValue(analyzerNameOpt).toString();
                analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
                // try instantiating it, b/c there isn't any point in setting it if
                // you can't instantiate it
                analyzerClass.newInstance();
            }

            Path tokenizedPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);

            DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath);
            input = tokenizedPath;
        } else {
            log.info("Input will NOT be preprocessed");
        }

        // parse input and extract collocations
        long ngramCount = generateCollocations(input, output, emitUnigrams, maxNGramSize, reduceTasks,
                minSupport);

        // tally collocations and perform LLR calculation
        computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue, reduceTasks);

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        return 1;
    }

    return 0;
}

From source file:org.apache.mahout.utils.vectors.arff.Driver.java

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(//w ww  .j a v  a2 s.com
                    "The file or directory containing the ARFF files.  If it is a directory, all .arff files will be converted")
            .withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The output directory.  Files will have the same name as the input, but with the extension .mvc")
            .withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true)
            .withArgument(abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create())
            .withDescription("The file to output the label bindings").withShortName("t").create();

    Option jsonDictonaryOpt = obuilder.withLongName("json-dictonary").withRequired(false)
            .withDescription("Write dictonary in JSON format").withShortName("j").create();

    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false)
            .withArgument(abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create())
            .withDescription("The delimiter for outputing the dictionary").withShortName("l").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
            .withOption(helpOpt).withOption(dictOutOpt).withOption(jsonDictonaryOpt).withOption(delimiterOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {

            CommandLineUtil.printHelp(group);
            return;
        }
        if (cmdLine.hasOption(inputOpt)) { // Lucene case
            File input = new File(cmdLine.getValue(inputOpt).toString());
            long maxDocs = Long.MAX_VALUE;
            if (cmdLine.hasOption(maxOpt)) {
                maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
            }
            if (maxDocs < 0) {
                throw new IllegalArgumentException("maxDocs must be >= 0");
            }
            String outDir = cmdLine.getValue(outputOpt).toString();
            log.info("Output Dir: {}", outDir);

            String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString()
                    : "\t";
            File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
            boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt);
            ARFFModel model = new MapBackedARFFModel();
            if (input.exists() && input.isDirectory()) {
                File[] files = input.listFiles(new FilenameFilter() {
                    @Override
                    public boolean accept(File file, String name) {
                        return name.endsWith(".arff");
                    }
                });

                for (File file : files) {
                    writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary);
                }
            } else {
                writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary);
            }
        }

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.utils.vectors.libsvm.Driver.java

/**
 * The main method./*from  w w w.ja va2s  .c om*/
 * 
 * @param args the arguments
 * @throws IOException Signals that an I/O exception has occurred.
 */
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The file or directory containing the ARFF files.  If it is a directory, all .arff files will be converted")
            .withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The output directory.  Files will have the same name as the input, but with the extension .mvc")
            .withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true)
            .withArgument(abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create())
            .withDescription("The file to output the label bindings").withShortName("t").create();

    Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false)
            .withArgument(abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The VectorWriter to use, either seq (SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)")
            .withShortName("e").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
            .withOption(helpOpt).withOption(dictOutOpt).withOption(outWriterOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {

            CommandLineUtil.printHelp(group);
            return;
        }

        if (cmdLine.hasOption(inputOpt)) {// Lucene case
            File input = new File(cmdLine.getValue(inputOpt).toString());
            long maxDocs = Long.MAX_VALUE;
            if (cmdLine.hasOption(maxOpt)) {
                maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
            }
            if (maxDocs < 0) {
                throw new IllegalArgumentException("maxDocs must be >= 0");
            }
            String outDir = cmdLine.getValue(outputOpt).toString();
            Driver.log.info("Output Dir: {}", outDir);
            String outWriter = null;
            if (cmdLine.hasOption(outWriterOpt)) {
                outWriter = cmdLine.getValue(outWriterOpt).toString();
            }
            File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
            List<Double> labels = new ArrayList<Double>();
            if (input.exists() && input.isDirectory()) {
                File[] files = input.listFiles();

                for (File file : files) {
                    //            Driver.writeFile(outWriter, outDir, file, maxDocs, labels);
                }
            } else {
                //          Driver.writeFile(outWriter, outDir, input, maxDocs, labels);
            }
            Driver.log.info("Dictionary Output file: {}", dictOut);
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(dictOut), Charset.forName("UTF8")));
            for (Double label : labels) {
                writer.append(label.toString()).append('\n');
            }
            writer.close();

        }

    } catch (OptionException e) {
        Driver.log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java

public static void main(String[] args) {

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option indexOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("The Lucene index directory").withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output file. If not specified, the result is printed on console.")
            .withShortName("o").create();

    Option fieldOpt = obuilder.withLongName("field").withRequired(true)
            .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create())
            .withDescription("The content field in the index").withShortName("f").create();

    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false)
            .withArgument(abuilder.withName("idField").withMinimum(1).withMaximum(1).create())
            .withDescription(//from  www . ja va2 s . c om
                    "The field for the document ID in the index.  If null, then the Lucene internal doc "
                            + "id is used which is prone to error if the underlying index changes")
            .withShortName("i").create();

    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true)
            .withArgument(abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory containing Sequence Files for the Clusters").withShortName("s")
            .create();

    Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true)
            .withArgument(abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The directory containing points sequence files mapping input vectors to their cluster.  ")
            .withShortName("p").create();
    Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false)
            .withArgument(abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum number of points required in a cluster to print the labels for")
            .withShortName("m").create();
    Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false)
            .withArgument(abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create())
            .withDescription("The maximum number of labels to print per cluster").withShortName("x").create();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt)
            .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt)
            .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
        Path pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
        String indexDir = cmdLine.getValue(indexOpt).toString();
        String contentField = cmdLine.getValue(fieldOpt).toString();

        String idField = null;

        if (cmdLine.hasOption(idFieldOpt)) {
            idField = cmdLine.getValue(idFieldOpt).toString();
        }
        String output = null;
        if (cmdLine.hasOption(outputOpt)) {
            output = cmdLine.getValue(outputOpt).toString();
        }
        int maxLabels = DEFAULT_MAX_LABELS;
        if (cmdLine.hasOption(maxLabelsOpt)) {
            maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
        }
        int minSize = DEFAULT_MIN_IDS;
        if (cmdLine.hasOption(minClusterSizeOpt)) {
            minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
        }
        ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize,
                maxLabels);

        if (idField != null) {
            clusterLabel.setIdField(idField);
        }
        if (output != null) {
            clusterLabel.setOutput(output);
        }

        clusterLabel.getLabels();

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (IOException e) {
        log.error("Exception", e);
    }
}

From source file:org.apache.mahout.utils.vectors.lucene.Driver.java

public static void main(String[] args) throws IOException {

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("The Lucene directory").withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output file").withShortName("o").create();

    Option fieldOpt = obuilder.withLongName("field").withRequired(true)
            .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create())
            .withDescription("The field in the index").withShortName("f").create();

    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false)
            .withArgument(abuilder.withName("idField").withMinimum(1).withMaximum(1).create())
            .withDescription(/*from w w w.  ja v a 2s  . c  o  m*/
                    "The field in the index containing the index.  If null, then the Lucene internal doc "
                            + "id is used which is prone to error if the underlying index changes")
            .create();

    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true)
            .withArgument(abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create())
            .withDescription("The output of the dictionary").withShortName("t").create();

    Option seqDictOutOpt = obuilder.withLongName("seqDictOut").withRequired(false)
            .withArgument(abuilder.withName("seqDictOut").withMinimum(1).withMaximum(1).create())
            .withDescription("The output of the dictionary as sequence file").withShortName("st").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();

    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false)
            .withArgument(abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create())
            .withDescription("The delimiter for outputting the dictionary").withShortName("l").create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a double or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + "  Expressed as an integer between 0 and 100. Default is 99.")
            .withShortName("x").create();

    Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false)
            .withArgument(abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs that can have a null term vector. These are noise document and can occur if the "
                            + "analyzer used strips out all terms in the target field. This percentage is expressed as a value "
                            + "between 0 and 1. The default is 0.")
            .withShortName("err").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt)
            .withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
            .withOption(dictOutOpt).withOption(seqDictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt)
            .withOption(weightOpt).withOption(minDFOpt).withOption(maxPercentErrorDocsOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {

            CommandLineUtil.printHelp(group);
            return;
        }

        if (cmdLine.hasOption(inputOpt)) { // Lucene case
            Driver luceneDriver = new Driver();
            luceneDriver.setLuceneDir(cmdLine.getValue(inputOpt).toString());

            if (cmdLine.hasOption(maxOpt)) {
                luceneDriver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString()));
            }

            if (cmdLine.hasOption(weightOpt)) {
                luceneDriver.setWeightType(cmdLine.getValue(weightOpt).toString());
            }

            luceneDriver.setField(cmdLine.getValue(fieldOpt).toString());

            if (cmdLine.hasOption(minDFOpt)) {
                luceneDriver.setMinDf(Integer.parseInt(cmdLine.getValue(minDFOpt).toString()));
            }

            if (cmdLine.hasOption(maxDFPercentOpt)) {
                luceneDriver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()));
            }

            if (cmdLine.hasOption(powerOpt)) {
                String power = cmdLine.getValue(powerOpt).toString();
                if ("INF".equals(power)) {
                    luceneDriver.setNorm(Double.POSITIVE_INFINITY);
                } else {
                    luceneDriver.setNorm(Double.parseDouble(power));
                }
            }

            if (cmdLine.hasOption(idFieldOpt)) {
                luceneDriver.setIdField(cmdLine.getValue(idFieldOpt).toString());
            }

            if (cmdLine.hasOption(maxPercentErrorDocsOpt)) {
                luceneDriver.setMaxPercentErrorDocs(
                        Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString()));
            }

            luceneDriver.setOutFile(cmdLine.getValue(outputOpt).toString());

            luceneDriver.setDelimiter(
                    cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t");

            luceneDriver.setDictOut(cmdLine.getValue(dictOutOpt).toString());

            if (cmdLine.hasOption(seqDictOutOpt)) {
                luceneDriver.setSeqDictOut(cmdLine.getValue(seqDictOutOpt).toString());
            }

            luceneDriver.dumpVectors();
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}