org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java Source code

Introduction

Here is the source code for org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java, the driver for Apache Mahout's log-likelihood-ratio (LLR) collocation discovery MapReduce job.
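
A minimal sketch of launching the driver programmatically; the class name RunCollocDriver, the paths, and the option values below are illustrative placeholders. Hadoop (local or cluster mode) plus the Mahout jars are assumed to be on the classpath, and the input is assumed to already contain tokenized documents (otherwise add --preprocess):

import org.apache.mahout.utils.nlp.collocations.llr.CollocDriver;

public class RunCollocDriver {
    public static void main(String[] args) throws Exception {
        // CollocDriver.main delegates to ToolRunner.run with these arguments,
        // exactly as if they had been passed on the command line.
        CollocDriver.main(new String[] {
                "--input", "tokenized-documents",  // placeholder input path
                "--output", "colloc-output",       // placeholder output path
                "--maxNGramSize", "2",             // bigrams
                "--overwrite"                      // clear any previous output
        });
    }
}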

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.utils.nlp.collocations.llr;

import java.io.IOException;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.text.DefaultAnalyzer;
import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
import org.apache.mahout.utils.vectors.text.DocumentProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Driver for the LLR collocation discovery MapReduce job. The job runs in two
 * passes: pass 1 ({@link #generateCollocations}) counts ngrams and their
 * constituent subgrams, and pass 2 ({@link #computeNGramsPruneByLLR}) scores
 * each ngram with the log-likelihood ratio test and prunes low-scoring ones.
 */
public class CollocDriver extends Configured implements Tool {
    public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
    public static final String SUBGRAM_OUTPUT_DIRECTORY = "subgrams";
    public static final String NGRAM_OUTPUT_DIRECTORY = "ngrams";

    public static final String EMIT_UNIGRAMS = "emit-unigrams";
    public static final boolean DEFAULT_EMIT_UNIGRAMS = false;

    public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
    public static final int DEFAULT_PASS1_NUM_REDUCE_TASKS = 1;

    private static final Logger log = LoggerFactory.getLogger(CollocDriver.class);

    private CollocDriver() {
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new CollocDriver(), args);
    }

    @Override
    public int run(String[] args) throws Exception {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        Option inputOpt = obuilder.withLongName("input").withRequired(true)
                .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
                .withDescription("The Path for input files.").withShortName("i").create();

        Option outputOpt = obuilder.withLongName("output").withRequired(true)
                .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
                .withDescription("The Path write output to").withShortName("o").create();

        Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
                .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
                .withDescription("(Optional) The maximum size of ngrams to create"
                        + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
                .withShortName("ng").create();

        Option minSupportOpt = obuilder.withLongName("minSupport").withRequired(false)
                .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
                .withDescription("(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT)
                .withShortName("s").create();

        Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
                .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
                .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                        + LLRReducer.DEFAULT_MIN_LLR)
                .withShortName("ml").create();

        Option numReduceTasksOpt = obuilder.withLongName("numReducers").withRequired(false)
                .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
                .withDescription(
                        "(Optional) Number of reduce tasks. Default Value: " + DEFAULT_PASS1_NUM_REDUCE_TASKS)
                .withShortName("nr").create();

        Option preprocessOpt = obuilder.withLongName("preprocess").withRequired(false)
                .withDescription("If set, input is SequenceFile<Text,Text> where the value is the document, "
                        + " which will be tokenized using the specified analyzer.")
                .withShortName("p").create();

        Option unigramOpt = obuilder.withLongName("unigram").withRequired(false)
                .withDescription("If set, unigrams will be emitted in the final output alongside collocations")
                .withShortName("u").create();

        Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
                .withDescription("If set, overwrite the output directory").withShortName("w").create();

        Option analyzerNameOpt = obuilder.withLongName("analyzerName")
                .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
                .withDescription("The class name of the analyzer").withShortName("a").create();

        Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
                .create();

        Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
                .withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(minSupportOpt)
                .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(analyzerNameOpt)
                .withOption(preprocessOpt).withOption(unigramOpt).withOption(helpOpt).create();

        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            CommandLine cmdLine = parser.parse(args);

            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return 1;
            }

            Path input = new Path(cmdLine.getValue(inputOpt).toString());
            Path output = new Path(cmdLine.getValue(outputOpt).toString());

            int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;

            if (cmdLine.hasOption(maxNGramSizeOpt)) {
                try {
                    maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
                } catch (NumberFormatException ex) {
                    log.warn("Could not parse ngram size option");
                }
            }
            log.info("Maximum n-gram size is: {}", maxNGramSize);

            if (cmdLine.hasOption(overwriteOutput)) {
                HadoopUtil.overwriteOutput(output);
            }

            int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
            if (cmdLine.hasOption(minSupportOpt)) {
                minSupport = Integer.parseInt(cmdLine.getValue(minSupportOpt).toString());
            }
            log.info("Minimum Support value: {}", minSupport);

            float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
            if (cmdLine.hasOption(minLLROpt)) {
                minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
            }
            log.info("Minimum LLR value: {}", minLLRValue);

            int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
            if (cmdLine.hasOption(numReduceTasksOpt)) {
                reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
            }
            log.info("Number of pass1 reduce tasks: {}", reduceTasks);

            boolean emitUnigrams = cmdLine.hasOption(unigramOpt);

            if (cmdLine.hasOption(preprocessOpt)) {
                log.info("Input will be preprocessed");

                Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
                if (cmdLine.hasOption(analyzerNameOpt)) {
                    String className = cmdLine.getValue(analyzerNameOpt).toString();
                    analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
                    // instantiate it eagerly: there is no point in setting the class
                    // if it cannot be instantiated
                    analyzerClass.newInstance();
                }

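                // Tokenize the raw Text documents into the form the collocation
                // job expects, then redirect the job's input to that output.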
                Path tokenizedPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);

                DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath);
                input = tokenizedPath;
            } else {
                log.info("Input will NOT be preprocessed");
            }

            // parse input and extract collocations
            long ngramCount = generateCollocations(input, output, emitUnigrams, maxNGramSize, reduceTasks,
                    minSupport);

            // tally collocations and perform LLR calculation
            computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue, reduceTasks);

        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(group);
            return 1;
        }

        return 0;
    }

    /**
     * Generate all ngrams for the {@link DictionaryVectorizer} job
     * 
     * @param input
     *          input path containing tokenized documents
     * @param output
     *          output path where ngrams are generated including unigrams
     * @param maxNGramSize
     *          maximum size of ngrams to generate; the minimum meaningful value is 2 (bigrams)
     * @param minSupport
     *          minimum support to prune ngrams including unigrams
     * @param minLLRValue
     *          minimum threshold to prune ngrams
     * @param reduceTasks
     *          number of reducers used
     * @throws IOException
     */
    public static void generateAllGrams(Path input, Path output, int maxNGramSize, int minSupport,
            float minLLRValue, int reduceTasks) throws IOException {
        // parse input and extract collocations
        long ngramCount = generateCollocations(input, output, true, maxNGramSize, reduceTasks, minSupport);

        // tally collocations and perform LLR calculation
        computeNGramsPruneByLLR(ngramCount, output, true, minLLRValue, reduceTasks);
    }

    /**
     * Pass 1: generate collocations by counting ngrams and their constituent subgrams.
     *
     * @return the total number of ngrams encountered, read from the job's
     *         {@code NGRAM_TOTAL} counter
     */
    public static long generateCollocations(Path input, Path output, boolean emitUnigrams, int maxNGramSize,
            int reduceTasks, int minSupport) throws IOException {
        JobConf conf = new JobConf(CollocDriver.class);
        conf.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);

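        // Map output uses GramKey/Gram pairs; the custom partitioner and grouping
        // comparator act as a secondary sort, so all Gram values for the same
        // underlying key arrive together in a single reduce call.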
        conf.setMapOutputKeyClass(GramKey.class);
        conf.setMapOutputValueClass(Gram.class);
        conf.setPartitionerClass(GramKeyPartitioner.class);
        conf.setOutputValueGroupingComparator(GramKeyGroupComparator.class);

        conf.setOutputKeyClass(Gram.class);
        conf.setOutputValueClass(Gram.class);

        conf.setCombinerClass(CollocCombiner.class);

        conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);

        FileInputFormat.setInputPaths(conf, input);
        Path outPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
        FileOutputFormat.setOutputPath(conf, outPath);

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setMapperClass(CollocMapper.class);

        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setReducerClass(CollocReducer.class);
        conf.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
        conf.setInt(CollocReducer.MIN_SUPPORT, minSupport);
        conf.setNumReduceTasks(reduceTasks);

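        // Run pass 1 and return the total number of ngrams seen, read back from
        // the mapper's NGRAM_TOTAL counter; pass 2 needs it for the LLR test.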
        RunningJob job = JobClient.runJob(conf);
        return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
    }

    /**
     * Pass 2: perform the LLR calculation and prune ngrams whose score falls
     * below {@code minLLRValue}.
     */
    public static void computeNGramsPruneByLLR(long nGramTotal, Path output, boolean emitUnigrams,
            float minLLRValue, int reduceTasks) throws IOException {
        JobConf conf = new JobConf(CollocDriver.class);
        conf.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);

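        // Hand the pass-1 ngram total and the unigram flag to the LLR reducer
        // through the job configuration.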
        conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
        conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);

        conf.setMapOutputKeyClass(Gram.class);
        conf.setMapOutputValueClass(Gram.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(DoubleWritable.class);

        FileInputFormat.setInputPaths(conf, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
        Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
        FileOutputFormat.setOutputPath(conf, outPath);

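        // The subgrams written by pass 1 are already in the right shape, so an
        // identity mapper simply forwards them to the LLRReducer for scoring.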
        conf.setMapperClass(IdentityMapper.class);
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setReducerClass(LLRReducer.class);
        conf.setNumReduceTasks(reduceTasks);

        conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);
        JobClient.runJob(conf);
    }
}
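
The two passes can also be embedded in another pipeline through the generateAllGrams entry point, whose javadoc ties it to the DictionaryVectorizer job. A minimal sketch; the class name, paths, and numeric values are illustrative placeholders:

import org.apache.hadoop.fs.Path;
import org.apache.mahout.utils.nlp.collocations.llr.CollocDriver;

public class GenerateAllGramsExample {
    public static void main(String[] args) throws Exception {
        // input must already contain tokenized documents as SequenceFiles
        Path input = new Path("tokenized-documents");   // placeholder path
        Path output = new Path("colloc-output");        // placeholder path
        // maxNGramSize = 2 (bigrams), minSupport = 2, minLLR = 1.0f, 1 reducer
        CollocDriver.generateAllGrams(input, output, 2, 2, 1.0f, 1);
    }
}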