Example usage for org.apache.mahout.common.commandline.DefaultOptionCreator.inputOption()

Introduction

This page collects example usages of org.apache.mahout.common.commandline.DefaultOptionCreator.inputOption().

Prototype

public static DefaultOptionBuilder inputOption()

Source Link

Document

Returns a default command line option for input directory specification.

Usage

From source file:mahout.vectorizer.SparseVectorsFromSequenceFiles.java

License:Apache License

/**
 * Parses the command line, tokenizes the input documents and writes term
 * frequency (TF) or TF-IDF vectors under the output directory.
 *
 * <p>Steps, in order: build the option group, parse {@code args}, tokenize
 * the input, create TF vectors, optionally prune high document-frequency
 * terms (when {@code maxDFSigma >= 0}), and optionally convert to TF-IDF
 * (the default unless {@code --weight tf} is given).
 *
 * @param args the raw command line arguments
 * @return {@code 0} when the pipeline ran; {@code -1} when help was requested
 *         or the arguments could not be parsed
 * @throws Exception if any of the underlying jobs fail, or a numeric option
 *         (other than the n-gram size) cannot be parsed
 */
@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, "
                            + "it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
                            + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less than 0 "
                            + "no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                // Deliberately lenient: an unparsable n-gram size falls back to the default of 1.
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

        // TF-IDF is the default weighting; only an explicit "tf" turns the IDF step off.
        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = cmdLine.hasOption(logNormalizeOpt);

        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom
        // to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = cmdLine.hasOption(sequentialAccessVectorOpt);
        boolean namedVectors = cmdLine.hasOption(namedVectorOpt);

        // When pruning, the raw TF vectors go to a temporary "-toprune" folder so the
        // pruned result can take the regular document-vector folder name.
        boolean shouldPrune = maxDFSigma >= 0.0;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

        if (processIdf) {
            // Normalization is deferred to the TF-IDF step, so none is requested here.
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            // NOTE(review): index 1 is read as the vector count - confirm against calculateDF.
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            // maxDF is a long, so convert straight to long instead of narrowing through int.
            maxDF = (long) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");

            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            // The unpruned vectors are no longer needed once pruning has finished.
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        // Invalid arguments must not be reported as a successful run.
        return -1;
    }
    return 0;
}

From source file:my.mahout.AbstractJob.java

License:Apache License

/**
 * Registers the default input directory option ('-i'), which takes a
 * directory name as its argument, and keeps a reference to it in
 * {@code inputOption}.
 *
 * <p>Once this method has been called the input is required; a later call
 * to {@link #parseArguments(String[])} sets the inputPath from the value
 * supplied for this option.
 */
protected void addInputOption() {
    inputOption = addOption(
            DefaultOptionCreator.inputOption()
                    .create());
}

From source file:net.hubs1.mahout.cluster.CRLFSeparatedToSequenceFile.java

License:Apache License

/**
 * Takes in two arguments:/* w ww  .jav  a  2  s. co  m*/
 * <ol>
 * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
 * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a
 * {@link org.apache.hadoop.io.SequenceFile}</li>
 * </ol>
 */
public static void main(String[] args) throws IOException {

    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(dirInputPathOpt).withOption(dirOutputPathOpt)
            .withOption(helpOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

        runJob(inputPath, outputPath);
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (InterruptedException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:parse_wikipedia.ParseWikipedia.java

License:Apache License

/**
 * Command line entry point: reads the input and output options and passes
 * their values to {@code runJob}.
 *
 * <p>Argument parsing errors and job failures surfacing as
 * {@link InterruptedException} or {@link ClassNotFoundException} are logged
 * rather than rethrown.
 *
 * @param args the raw command line arguments
 * @throws IOException if the job fails with an I/O error
 */
public static void main(String[] args) throws IOException {
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Group group = gbuilder.withName("Options").withOption(dirInputPathOpt).withOption(dirOutputPathOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);

    try {
        CommandLine cmdLine = parser.parse(args);

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

        runJob(inputPath, outputPath);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so code further up the stack can still observe it.
        Thread.currentThread().interrupt();
        log.error("Exception", e);
    } catch (OptionException | ClassNotFoundException e) {
        log.error("Exception", e);
    }

}

From source file:tk.summerway.mahout9.tools.MyClusterDumper.java

License:Apache License

/**
 * Parses the cluster dumper command line and copies the supplied values into
 * this object's fields, logging each resulting value.
 *
 * <p>Input and output locations start from {@code getInputPath()},
 * {@code getInputFile()}, {@code getOutputPath()} and {@code getOutputFile()}
 * and are overridden when the matching option is present. The remaining
 * optional settings are only assigned when their option is present, except
 * the output format (defaults to TEXT) and the per-cluster point limit
 * (defaults to {@link Long#MAX_VALUE}).
 *
 * @param args the raw command line arguments
 * @return {@code true} when the arguments were parsed and applied;
 *         {@code false} when help was requested or parsing failed
 */
private boolean buildParse(String[] args) {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();
    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option outputFormatOpt = obuilder.withLongName(OUTPUT_FORMAT_OPT)
            .withArgument(abuilder.withName(OUTPUT_FORMAT_OPT).create())
            .withDescription(
                    "The optional output format for the results. Options: TEXT, CSV, JSON or GRAPH_ML. Default is TEXT")
            .withShortName("of").create();

    Option substringOpt = obuilder.withLongName(SUBSTRING_OPTION)
            .withArgument(abuilder.withName(SUBSTRING_OPTION).create())
            .withDescription("The number of chars of the asFormatString() to print").withShortName("b")
            .create();

    Option pointsDirOpt = obuilder.withLongName(POINTS_DIR_OPTION)
            .withArgument(abuilder.withName(POINTS_DIR_OPTION).create())
            .withDescription(
                    "The directory containing points sequence files mapping input vectors to their cluster. "
                            + "If specified, then the program will output the points associated with a cluster")
            .withShortName("p").create();

    Option samplePointsOpt = obuilder.withLongName(SAMPLE_POINTS)
            .withArgument(abuilder.withName(SAMPLE_POINTS).create())
            .withDescription("Specifies the maximum number of points to include _per_ cluster.  The default "
                    + "is to include all points")
            .withShortName("sp").create();

    Option dictionaryOpt = obuilder.withLongName(DICTIONARY_OPTION)
            .withArgument(abuilder.withName(DICTIONARY_OPTION).create()).withDescription("The dictionary file")
            .withShortName("d").create();

    Option dictionaryTypeOpt = obuilder.withLongName(DICTIONARY_TYPE_OPTION)
            .withArgument(abuilder.withName(DICTIONARY_TYPE_OPTION).create())
            .withDescription("The dictionary file type (text|sequencefile), default is text")
            .withShortName("dt").create();

    Option numWordsOpt = obuilder.withLongName(NUM_WORDS_OPTION)
            .withArgument(abuilder.withName(NUM_WORDS_OPTION).create())
            .withDescription("The number of top terms to print").withShortName("n").create();

    Option evaluateOpt = obuilder.withLongName(EVALUATE_CLUSTERS)
            .withArgument(abuilder.withName(EVALUATE_CLUSTERS).create())
            .withDescription("Run ClusterEvaluator and CDbwEvaluator over the input.  "
                    + "The output will be appended to the rest of the output at the end. Default is false.")
            .withShortName("e").create();

    Option distanceMeasureOpt = obuilder.withLongName("distanceMeasure")
            .withArgument(abuilder.withName("distanceMeasure").create())
            .withDescription("k-means distance measure class name").withShortName("dm").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(outputDirOpt)
            .withOption(outputFormatOpt).withOption(substringOpt).withOption(pointsDirOpt)
            .withOption(samplePointsOpt).withOption(dictionaryOpt).withOption(dictionaryTypeOpt)
            .withOption(numWordsOpt).withOption(evaluateOpt).withOption(distanceMeasureOpt).withOption(helpOpt)
            .create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return false;
        }

        seqFileDir = getInputPath();
        inputPath = getInputPath();
        inputFile = getInputFile();
        if (cmdLine.hasOption(inputDirOpt)) {
            String inputValue = cmdLine.getValue(inputDirOpt).toString();
            seqFileDir = new Path(inputValue);
            inputPath = new Path(inputValue);
            inputFile = new File(inputValue);
        }
        log.info("seqFileDir value: {}", seqFileDir);
        log.info("inputPath value: {}", inputPath);
        log.info("inputFile value: {}", inputFile);

        outputPath = getOutputPath();
        outputFile = getOutputFile();
        if (cmdLine.hasOption(outputDirOpt)) {
            String outputValue = cmdLine.getValue(outputDirOpt).toString();
            outputPath = new Path(outputValue);
            outputFile = new File(outputValue);
        }
        log.info("outputPath value: {}", outputPath);
        log.info("outputFile value: {}", outputFile);

        if (cmdLine.hasOption(pointsDirOpt)) {
            pointsDir = new Path(cmdLine.getValue(pointsDirOpt).toString());
        }
        log.info("pointsDir value: {}", pointsDir);

        if (cmdLine.hasOption(substringOpt)) {
            int sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
            // Negative lengths are ignored and the current value is kept.
            if (sub >= 0) {
                subString = sub;
            }
        }
        log.info("subString value: {}", subString);

        // Both dictionary options are optional; reading their value without this
        // guard threw a NullPointerException whenever they were omitted.
        if (cmdLine.hasOption(dictionaryOpt)) {
            termDictionary = cmdLine.getValue(dictionaryOpt).toString();
        }
        if (cmdLine.hasOption(dictionaryTypeOpt)) {
            dictionaryFormat = cmdLine.getValue(dictionaryTypeOpt).toString();
        }
        log.info("termDictionary value: {}", termDictionary);
        log.info("dictionaryFormat value: {}", dictionaryFormat);

        if (cmdLine.hasOption(numWordsOpt)) {
            numTopFeatures = Integer.parseInt(cmdLine.getValue(numWordsOpt).toString());
        }
        log.info("numTopFeatures value: {}", numTopFeatures);

        outputFormat = OUTPUT_FORMAT.TEXT;
        if (cmdLine.hasOption(outputFormatOpt)) {
            outputFormat = OUTPUT_FORMAT.valueOf(cmdLine.getValue(outputFormatOpt).toString());
        }
        log.info("outputFormat value: {}", outputFormat);

        if (cmdLine.hasOption(samplePointsOpt)) {
            maxPointsPerCluster = Long.parseLong(cmdLine.getValue(samplePointsOpt).toString());
        } else {
            maxPointsPerCluster = Long.MAX_VALUE;
        }
        log.info("maxPointsPerCluster value: {}", maxPointsPerCluster);

        runEvaluation = cmdLine.hasOption(evaluateOpt);
        log.info("runEvaluation value: {}", runEvaluation);

        String distanceMeasureClass = null;
        if (cmdLine.hasOption(distanceMeasureOpt)) {
            distanceMeasureClass = cmdLine.getValue(distanceMeasureOpt).toString();
            measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class);
        }
        log.info("distanceMeasureClass value: {}", distanceMeasureClass);

    } catch (OptionException e) {
        CommandLineUtil.printHelp(group);
        log.error("parse para error", e);
        // A parse failure must not be reported as a successful parse.
        return false;
    }
    return true;
}