gov.llnl.ontology.mapreduce.stats.CompoundTokenCountMR.java Source code

Introduction

Here is the source code for gov.llnl.ontology.mapreduce.stats.CompoundTokenCountMR.java
Source

/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC. Produced at
 * the Lawrence Livermore National Laboratory. Written by Keith Stevens,
 * kstevens@cs.ucla.edu OCEC-10-073 All rights reserved. 
 *
 * This file is part of the C-Cat package and is covered under the terms and
 * conditions therein.
 *
 * The C-Cat package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package gov.llnl.ontology.mapreduce.stats;

import gov.llnl.ontology.mapreduce.CorpusTableMR;
import gov.llnl.ontology.mapreduce.MRArgOptions;
import gov.llnl.ontology.mapreduce.table.CorpusTable;
import gov.llnl.ontology.text.Sentence;
import gov.llnl.ontology.text.TextUtil;
import gov.llnl.ontology.util.Counter;
import gov.llnl.ontology.util.StringCounter;
import gov.llnl.ontology.util.StringPair;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import edu.ucla.sspace.text.CompoundWordIterator;
import edu.ucla.sspace.util.ReflectionUtil;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * A Map/Reduce job that counts the number of occurrences for each token in a
 * corpus.
 *
 * @author Keith Stevens
 */
public class CompoundTokenCountMR extends CorpusTableMR {

    /**
     * The job description used in help text.
     */
    public static final String ABOUT = "Computes token counts from a particular corpus.  "
            + "If no corpus is specified, then all corpora will be used to compute "
            + "the frequencies.  The co-occurrence counts will be stored in reduce "
            + "parts on hdfs under the specified <outdir>.";

    /**
     * Runs the {@link CompoundTokenCountMR}.
     */
    public static void main(String[] args) throws Exception {
        ToolRunner.run(HBaseConfiguration.create(), new CompoundTokenCountMR(), args);
    }

    /**
     * {@inheritDoc}
     */
    protected void validateOptions(MRArgOptions options) {
        options.validate(ABOUT, "<outdir>", CompoundTokenCountMR.class, 1, 'C', 'c');
    }

    /**
     * {@inheritDoc}
     */
    protected String jobName() {
        return "Token Count";
    }

    /**
     * {@inheritDoc}
     */
    protected void addOptions(MRArgOptions options) {
        options.addOption('c', "compoundWordList", "Specifies the compound word list.", true, "FILE", "Required");
    }

    /**
     * {@inheritDoc}
     */
    protected void setupConfiguration(MRArgOptions options, Configuration conf) {
        try {
            DistributedCache.addCacheFile(new URI(options.getStringOption('c')), conf);
        } catch (URISyntaxException use) {
            use.printStackTrace();
        }
    }

    /**
     * {@inheritDoc}
     */
    protected Class mapperClass() {
        return CompoundTokenCountMapper.class;
    }

    /**
     * Returns the {@link Class} object for the Mapper Value of this task.
     */
    protected Class mapperKeyClass() {
        return Text.class;
    }

    /**
     * Returns the {@link Class} object for the Mapper Value of this task.
     */
    protected Class mapperValueClass() {
        return IntWritable.class;
    }

    /**
     * Sets up the Reducer for this job.  
     */
    protected void setupReducer(String tableName, Job job, MRArgOptions options) {
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
        job.setNumReduceTasks(24);
    }

    /**
     * The {@link TableMapper} responsible for most of the work.
     */
    public static class CompoundTokenCountMapper extends CorpusTableMR.CorpusTableMapper<Text, IntWritable> {

        /**
         * Represents a single occurrence.
         */
        private static final IntWritable ONE = new IntWritable(1);

        private Set<String> wordList;

        /**
         * {@inheritDoc}
         */
        public void setup(Context context, Configuration conf) throws IOException, InterruptedException {
            Path wordListPath = DistributedCache.getLocalCacheFiles(conf)[0];
            BufferedReader br = new BufferedReader(new FileReader(wordListPath.toString()));

            wordList = Sets.newHashSet();
            for (String line = null; (line = br.readLine()) != null;)
                wordList.add(line.toLowerCase());
        }

        /**
         * {@inheritDoc}
         */
        public void map(ImmutableBytesWritable key, Result row, Context context)
                throws IOException, InterruptedException {
            Counter<String> counter = new StringCounter();
            for (Sentence sentence : table.sentences(row)) {
                List<String> tokens = Lists.newArrayList();
                for (StringPair tokenPos : sentence.taggedTokens())
                    if (tokenPos.x != null)
                        tokens.add(TextUtil.cleanTerm(tokenPos.x));
                Iterator<String> iter = new CompoundWordIterator(tokens.iterator(), wordList);
                while (iter.hasNext())
                    counter.count(iter.next());
            }

            for (Map.Entry<String, Integer> entry : counter)
                context.write(new Text(entry.getKey()), new IntWritable(entry.getValue()));

            context.getCounter("CompoundTokenCountMR", "Document").increment(1);
        }
    }
}