gov.llnl.ontology.mapreduce.stats.WordnetShortestPathMR.java Source code

Introduction

Here is the source code for gov.llnl.ontology.mapreduce.stats.WordnetShortestPathMR.java
Source

/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC. Produced at
 * the Lawrence Livermore National Laboratory. Written by Keith Stevens,
 * kstevens@cs.ucla.edu OCEC-10-073 All rights reserved. 
 *
 * This file is part of the C-Cat package and is covered under the terms and
 * conditions therein.
 *
 * The C-Cat package is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation and distributed hereunder to you.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
 * EXPRESS OR IMPLIED ARE MADE.  BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
 * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
 * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
 * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
 * RIGHTS.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package gov.llnl.ontology.mapreduce.stats;

import gov.llnl.ontology.mapreduce.table.CorpusTable;
import gov.llnl.ontology.text.hbase.LineDocInputFormat;
import gov.llnl.ontology.wordnet.OntologyReader;
import gov.llnl.ontology.wordnet.Synset;
import gov.llnl.ontology.wordnet.SynsetRelations;
import gov.llnl.ontology.wordnet.WordNetCorpusReader;

import edu.ucla.sspace.common.ArgOptions;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.io.PrintStream;
import java.util.HashSet;
import java.util.Set;

/**
 * A Map/Reduce job that computes the shortest path between every wordnet
 * synset.
 *
 * @author Keith Stevens
 */
public class WordnetShortestPathMR extends Configured implements Tool {

    /**
     * The base name for every configuration setting.
     */
    public static final String CONF_BASE = "gov.llnl.ontology.mapreduce.stats.WordnetShortestPathMR";

    /**
     * The configuration setting for the wordnet directory.
     */
    public static final String WORDNET = CONF_BASE + ".wordnet";

    /**
     * A temp file name for the synset pairwise combinations.
     */
    public static final String TEMP_TERM_PAIR_PATH = "wordnet-pair-file";

    /**
     * Runs the {@link IngestCorpusMR}.
     */
    public static void main(String[] args) throws Exception {
        ToolRunner.run(HBaseConfiguration.create(), new WordnetShortestPathMR(), args);
    }

    /**
     * {@inheritDoc}
     */
    public int run(String[] args) throws Exception {
        // Setup and valdiate the arguments.
        ArgOptions options = new ArgOptions();
        options.addOption('w', "wordnetDir", "The directory path to the wordnet data files", true, "PATH",
                "Required");

        options.parseOptions(args);
        if (!options.hasOption('w')) {
            System.err.println("usage: java WordnetShortestPathMR [OPTIONS] <outdir>\n" + options.prettyPrint());
        }

        // Open the wordnet reader and gather the set of all Synsets known by
        // the ontology.
        OntologyReader reader = WordNetCorpusReader.initialize(options.getStringOption('w'));
        Set<Synset> synsetSet = new HashSet<Synset>();
        for (String lemma : reader.wordnetTerms())
            for (Synset synset : reader.getSynsets(lemma))
                synsetSet.add(synset);

        // Compute each pairing of Synsets and write that pairing to a file in
        // HDFS.
        Synset[] synsets = synsetSet.toArray(new Synset[0]);
        PrintStream outStream = createPrintStream();
        for (int i = 0; i < synsets.length; ++i)
            for (int j = i + 1; j < synsets.length; ++j)
                outStream.printf("%s|%s\n", synsets[i].getName(), synsets[j].getName());
        outStream.close();

        // Store the wordnet directory information so that the mappers can load
        // it up.  They need it to figure out the shortest path information.
        Configuration conf = getConf();
        conf.set(WORDNET, options.getStringOption('w'));

        // Setup the job information.
        Job job = new Job(conf, "Compute Wordnet Shortest Paths");
        job.setJarByClass(WordnetShortestPathMR.class);

        job.setMapperClass(WordnetShortestPathMapper.class);

        // The input file will be the temporary file created with the synset
        // pairings.
        job.setInputFormatClass(LineDocInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(TEMP_TERM_PAIR_PATH));

        // The mappers do all of the real work, so we just write their output
        // straight to disk.
        job.setCombinerClass(Reducer.class);
        job.setReducerClass(Reducer.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));

        // Start the job.
        job.waitForCompletion(true);

        return 0;
    }

    /**
     * Creates a new temporary file on HDFS that will store the {@link Synset}
     * name pairs to be used for finding the shortest distance between all
     * nodes.
     */
    private static PrintStream createPrintStream() throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path finalPath = new Path(TEMP_TERM_PAIR_PATH);
        FSDataOutputStream ostr = fs.create(finalPath);
        return new PrintStream(ostr);
    }

    /**
     * The {@link Mapper} responsible for the real work.
     */
    public static class WordnetShortestPathMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /**
         * The {@link OntologyReader} needed for accessing any {@link Synset}.
         */
        private OntologyReader reader;

        /**
         * {@inheritDoc}
         */
        public void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            reader = WordNetCorpusReader.initialize(conf.get(WORDNET));
        }

        /**
         * {@inheritDoc}
         */
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] terms = value.toString().split("\\|");
            Synset synset1 = reader.getSynset(terms[0]);
            Synset synset2 = reader.getSynset(terms[1]);
            int shortestPath = SynsetRelations.shortestPathDistance(synset1, synset2);
            context.write(value, new IntWritable(shortestPath));
        }
    }
}