Example usage for org.apache.mahout.common.commandline DefaultOptionCreator inputOption

Introduction

This page collects example usages of org.apache.mahout.common.commandline.DefaultOptionCreator.inputOption().

Prototype

public static DefaultOptionBuilder inputOption() 

Document

Returns a default command line option for input directory specification.
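
Before the individual examples, here is a minimal sketch of the pattern they all share: build the option, register it with a group, parse the command line, and read the value. It assumes Mahout and its commons-cli2 dependency are on the classpath; the class name InputOptionDemo is illustrative.

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.mahout.common.commandline.DefaultOptionCreator;

public class InputOptionDemo {
    public static void main(String[] args) throws Exception {
        // inputOption() returns a pre-configured DefaultOptionBuilder; create() yields the Option.
        Option inputOpt = DefaultOptionCreator.inputOption().create();
        Group group = new GroupBuilder().withName("Options").withOption(inputOpt).create();

        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args); // e.g. --input /path/to/input

        // The value arrives as a String; callers usually wrap it in a Hadoop Path.
        String inputDir = (String) cmdLine.getValue(inputOpt);
        System.out.println("input = " + inputDir);
    }
}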

Usage

From source file: mahout.vectorizer.SparseVectorsFromSequenceFiles.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, "
                            + "it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
                            + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less than 0 "
                            + "no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

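        // Weighting: TF-IDF is the default; passing --weight tf skips the IDF pass below.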
        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }

        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom
        // to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

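        // For TF-IDF, build raw TF vectors first (norm -1.0f, no log-normalization); weighting
        // and normalization happen later in the TF-IDF pass. TF-only runs apply them here.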
        if (processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");

            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}

From source file: my.mahout.AbstractJob.java

License: Apache License

/** Add the default input directory option, '-i' which takes a directory
 *  name as an argument. When {@link #parseArguments(String[])} is 
 *  called, the inputPath will be set based upon the value for this option.
 *  If this method is called, the input is required.
 */
protected void addInputOption() {
    this.inputOption = addOption(DefaultOptionCreator.inputOption().create());
}
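
A hedged sketch of how a subclass typically uses this helper (MyJob is hypothetical; addOutputOption(), parseArguments() and getInputPath() are the companion helpers on the same class):

import org.apache.hadoop.fs.Path;

public class MyJob extends AbstractJob {
    @Override
    public int run(String[] args) throws Exception {
        addInputOption();  // registers -i/--input
        addOutputOption(); // registers -o/--output
        if (parseArguments(args) == null) {
            return -1;     // help was printed or parsing failed
        }
        Path input = getInputPath(); // populated by parseArguments() from the -i value
        // ... run the job against 'input' ...
        return 0;
    }
}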

From source file: net.hubs1.mahout.cluster.CRLFSeparatedToSequenceFile.java

License: Apache License

/**
 * Takes in two arguments:
 * <ol>
 * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
 * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a
 * {@link org.apache.hadoop.io.SequenceFile}</li>
 * </ol>
 */
public static void main(String[] args) throws IOException {

    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(dirInputPathOpt).withOption(dirOutputPathOpt)
            .withOption(helpOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

        runJob(inputPath, outputPath);
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (InterruptedException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}
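
Invoking the converter then comes down to the two standard flags; a minimal illustration, with placeholder paths:

public class RunCRLFConversion {
    public static void main(String[] args) throws java.io.IOException {
        // --input/-i and --output/-o are the options registered by DefaultOptionCreator above.
        CRLFSeparatedToSequenceFile.main(new String[] {
                "--input", "/data/crlf-input",
                "--output", "/data/sequence-output" });
    }
}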

From source file: parse_wikipedia.ParseWikipedia.java

License: Apache License

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Group group = gbuilder.withName("Options").withOption(dirInputPathOpt).withOption(dirOutputPathOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);

    try {
        CommandLine cmdLine = parser.parse(args);

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

        runJob(inputPath, outputPath);
    } catch (OptionException | InterruptedException | ClassNotFoundException e) {
        log.error("Exception", e);
    }

}

From source file: tk.summerway.mahout9.tools.MyClusterDumper.java

License: Apache License

private boolean buildParse(String[] args) {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();
    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option outputFormatOpt = obuilder.withLongName(OUTPUT_FORMAT_OPT)
            .withArgument(abuilder.withName(OUTPUT_FORMAT_OPT).create())
            .withDescription(
                    "The optional output format for the results. Options: TEXT, CSV, JSON or GRAPH_ML. Default is TEXT")
            .withShortName("of").create();

    Option substringOpt = obuilder.withLongName(SUBSTRING_OPTION)
            .withArgument(abuilder.withName(SUBSTRING_OPTION).create())
            .withDescription("The number of chars of the asFormatString() to print").withShortName("b")
            .create();

    Option pointsDirOpt = obuilder.withLongName(POINTS_DIR_OPTION)
            .withArgument(abuilder.withName(POINTS_DIR_OPTION).create())
            .withDescription(
                    "The directory containing points sequence files mapping input vectors to their cluster. "
                            + "If specified, then the program will output the points associated with a cluster")
            .withShortName("p").create();

    Option samplePointsOpt = obuilder.withLongName(SAMPLE_POINTS)
            .withArgument(abuilder.withName(SAMPLE_POINTS).create())
            .withDescription("Specifies the maximum number of points to include _per_ cluster.  The default "
                    + "is to include all points")
            .withShortName("sp").create();

    Option dictionaryOpt = obuilder.withLongName(DICTIONARY_OPTION)
            .withArgument(abuilder.withName(DICTIONARY_OPTION).create()).withDescription("The dictionary file")
            .withShortName("d").create();

    Option dictionaryTypeOpt = obuilder.withLongName(DICTIONARY_TYPE_OPTION)
            .withArgument(abuilder.withName(DICTIONARY_TYPE_OPTION).create())
            .withDescription("The dictionary file type (text|sequencefile), default is text")
            .withShortName("dt").create();

    Option numWordsOpt = obuilder.withLongName(NUM_WORDS_OPTION)
            .withArgument(abuilder.withName(NUM_WORDS_OPTION).create())
            .withDescription("The number of top terms to print").withShortName("n").create();

    Option evaluateOpt = obuilder.withLongName(EVALUATE_CLUSTERS)
            .withArgument(abuilder.withName(EVALUATE_CLUSTERS).create())
            .withDescription("Run ClusterEvaluator and CDbwEvaluator over the input.  "
                    + "The output will be appended to the rest of the output at the end. Default is false.")
            .withShortName("e").create();

    Option distanceMeasureOpt = obuilder.withLongName("distanceMeasure")
            .withArgument(abuilder.withName("distanceMeasure").create())
            .withDescription("k-means distance measure class name").withShortName("dm").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(outputDirOpt)
            .withOption(outputFormatOpt).withOption(substringOpt).withOption(pointsDirOpt)
            .withOption(samplePointsOpt).withOption(dictionaryOpt).withOption(dictionaryTypeOpt)
            .withOption(numWordsOpt).withOption(evaluateOpt).withOption(distanceMeasureOpt).withOption(helpOpt)
            .create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return false;
        }

        seqFileDir = getInputPath();
        inputPath = getInputPath();
        inputFile = getInputFile();
        if (cmdLine.hasOption(inputDirOpt)) {
            seqFileDir = new Path(cmdLine.getValue(inputDirOpt).toString());
            inputPath = new Path(cmdLine.getValue(inputDirOpt).toString());
            inputFile = new File(cmdLine.getValue(inputDirOpt).toString());
        }
        log.info("seqFileDir value: {}", seqFileDir);
        log.info("inputPath value: {}", inputPath);
        log.info("inputFile value: {}", inputFile);

        outputPath = getOutputPath();
        outputFile = getOutputFile();
        if (cmdLine.hasOption(outputDirOpt)) {
            outputPath = new Path(cmdLine.getValue(outputDirOpt).toString());
            outputFile = new File(cmdLine.getValue(outputDirOpt).toString());
        }
        log.info("outputPath value: {}", outputPath);
        log.info("outputFile value: {}", outputFile);

        if (cmdLine.hasOption(pointsDirOpt)) {
            pointsDir = new Path(cmdLine.getValue(pointsDirOpt).toString());
        }
        log.info("pointsDir value: {}", pointsDir);

        if (cmdLine.hasOption(substringOpt)) {
            int sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
            if (sub >= 0) {
                subString = sub;
            }
        }
        log.info("subString value: {}", subString);

        // Guard the optional dictionary flags; getValue(...) is null when they are absent.
        if (cmdLine.hasOption(dictionaryOpt)) {
            termDictionary = cmdLine.getValue(dictionaryOpt).toString();
        }
        if (cmdLine.hasOption(dictionaryTypeOpt)) {
            dictionaryFormat = cmdLine.getValue(dictionaryTypeOpt).toString();
        }
        log.info("termDictionary value: {}", termDictionary);
        log.info("dictionaryFormat value: {}", dictionaryFormat);

        if (cmdLine.hasOption(numWordsOpt)) {
            numTopFeatures = Integer.parseInt(cmdLine.getValue(numWordsOpt).toString());
        }
        log.info("numTopFeatures value: {}", numTopFeatures);

        outputFormat = OUTPUT_FORMAT.TEXT;
        if (cmdLine.hasOption(outputFormatOpt)) {
            outputFormat = OUTPUT_FORMAT.valueOf(cmdLine.getValue(outputFormatOpt).toString());
        }
        log.info("outputFormat value: {}", outputFormat);

        if (cmdLine.hasOption(samplePointsOpt)) {
            maxPointsPerCluster = Long.parseLong(cmdLine.getValue(samplePointsOpt).toString());
        } else {
            maxPointsPerCluster = Long.MAX_VALUE;
        }
        log.info("maxPointsPerCluster value: {}", maxPointsPerCluster);

        runEvaluation = cmdLine.hasOption(evaluateOpt);
        log.info("runEvaluation value: {}", runEvaluation);

        String distanceMeasureClass = null;
        if (cmdLine.hasOption(distanceMeasureOpt)) {
            distanceMeasureClass = cmdLine.getValue(distanceMeasureOpt).toString();
        }
        if (distanceMeasureClass != null) {
            measure = ClassUtils.instantiateAs(distanceMeasureClass, DistanceMeasure.class);
        }
        log.info("distanceMeasureClass value: {}", distanceMeasureClass);

    } catch (OptionException e) {
        CommandLineUtil.printHelp(group);
        log.error("Error parsing command line arguments", e);
        return false; // signal the caller that parsing failed
    }
    return true;
}