Java tutorial
/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package jobimtext.thesaurus.distributional.hadoop.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import jobimtext.thesaurus.distributional.hadoop.SumReducer.DoubleSumReducer;
import jobimtext.thesaurus.distributional.hadoop.util.HadoopUtil;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

/**
 * This is the last main MapReduce step in our process chain.
 *
 * This step counts the similarity of two words, i.e.: how often do
 * two words share the same feature?
 *
 * @author Richard Steuer
 */
@SuppressWarnings("deprecation")
public class SimCountsLog {

    /**
     * The mapper takes a feature and its word list (all the words
     * having this feature) as input. It then compares all words
     * pair-wise and outputs 1/log(n), with n being the number of
     * words in the line.
     */
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, DoubleWritable> {

        /*
         * this will be the (inverted) bag size of the words having a
         * feature in common
         */
        private DoubleWritable size = new DoubleWritable();

        private Text word = new Text();
        /*
         * max. number of words per feature (may be obtained
         * experimentally)
         *
         * this is important because this MapReduce step is
         * computationally very intensive ( O(n^2) )
         */
        private final int threshold = 1000;

        /* the line as a list */
        private List<String> line = new ArrayList<String>();

        private int linesize;

        /* the two-word window we are looping through */
        private String word1, word2;

        public void map(LongWritable key, Text value,
                OutputCollector<Text, DoubleWritable> output, Reporter reporter)
                throws IOException {

            // split the line into tokens: the first is the feature,
            // the rest are all words having this feature
            line = Arrays.asList(value.toString().split("\\s+"));
            linesize = line.size();

            /*
             * calculate the weight shared by all word pairs in this
             * line; inverted, to filter noise (like stopwords)
             */
            double weight = 1.0 / Math.log((double) linesize);

            // set the mapper output value once
            // (it stays the same for all words in the line)
            size.set(weight);

            // consider lower and upper limit
            if (linesize >= 2 && linesize < threshold) {

                // count all word pairs in that line,
                // skipping the first entry since it is the feature
                for (int i = 1; i < linesize; i++) {
                    for (int j = i; j < linesize; j++) {

                        // the same index (i == j) is deliberately not
                        // skipped: self-pairs are good for normalization

                        word1 = line.get(i);
                        word2 = line.get(j);

                        /* output words in both directions */
                        word.set(word1 + "\t" + word2);
                        output.collect(word, size);

                        word.set(word2 + "\t" + word1);
                        output.collect(word, size);

                    } // for (inner)
                } // for (outer)

            } // if < threshold

        } // map()

    } // class Map

    /**
     * The reducer step will sum all double values, i.e. the weights
     * for any (word1, word2) pair sharing a feature.
     */
    public static void main(String[] args) throws Exception {

        JobConf conf = HadoopUtil.generateJobConf(args);

        conf.setOutputKeyClass(Text.class);
        // the mapper emits DoubleWritable values, so the output value
        // class must be DoubleWritable as well
        conf.setOutputValueClass(DoubleWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(DoubleSumReducer.class);
        conf.setReducerClass(DoubleSumReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        /* number of milliseconds before killing a non-responding task */
        conf.set("mapred.task.timeout", "600000");

        /* change the block size to 128 MB */
        conf.set("dfs.block.size", "134217728");

        /* set the maximum number of map tasks per node */
        int maptasks = 100;

        /*
         * number of map tasks to deploy on each machine,
         * 0.5 to 2 * (cores/node)
         */
        conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
        conf.set("mapred.tasktracker.map", "" + maptasks);

        /*
         * the default number of map tasks per job, typically set to a
         * prime several times greater than the number of available hosts
         */
        conf.set("mapred.map.tasks", "" + maptasks);

        int reducetasks = 100;

        conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
        conf.set("mapred.tasktracker.reduce", "" + reducetasks);
        conf.set("mapred.reduce.tasks", "" + reducetasks);

        JobClient.runJob(conf);

    } // main

} // class SimCountsLog
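To make the weighting concrete (Math.log is the natural logarithm): consider a hypothetical input line consisting of a feature f followed by the three words w1, w2, w3. Then linesize = 4 and weight = 1/ln(4) ≈ 0.72, and the mapper emits twelve key/value pairs:

    (w1, w2) -> 0.72    (w2, w1) -> 0.72
    (w1, w3) -> 0.72    (w3, w1) -> 0.72
    (w2, w3) -> 0.72    (w3, w2) -> 0.72
    plus each self-pair (w1, w1), (w2, w2), (w3, w3) emitted twice

The reducer sums these contributions per key over all input lines, so the final score of a word pair grows with the number of features the two words share, while very long lines (unspecific features such as stopword contexts) add only a small weight to each pair.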
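The DoubleSumReducer used above as both combiner and reducer is imported from jobimtext.thesaurus.distributional.hadoop.SumReducer and is not listed in this tutorial. A minimal sketch of such a summing reducer for the old mapred API, assuming it does nothing beyond adding up the weights per key, might look like this:

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

/* Hypothetical sketch of a summing reducer; the actual
 * DoubleSumReducer shipped with JoBimText may differ. */
public class DoubleSumReducer extends MapReduceBase implements
        Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    private final DoubleWritable result = new DoubleWritable();

    public void reduce(Text key, Iterator<DoubleWritable> values,
            OutputCollector<Text, DoubleWritable> output, Reporter reporter)
            throws IOException {

        // add up all 1/log(n) contributions for this (word1, word2) key
        double sum = 0.0;
        while (values.hasNext()) {
            sum += values.next().get();
        }

        result.set(sum);
        output.collect(key, result);
    }
}

Note that because the same class serves as the combiner, its input and output types must be identical; that holds here, since both sides are (Text, DoubleWritable), and summing is associative, so applying it early on map output is safe.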
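Assuming the class is packaged into a jar (the name jobimtext.jar below is only illustrative) and that HadoopUtil.generateJobConf(args) passes args[0] and args[1] through as the input and output paths used above, a run could look like: hadoop jar jobimtext.jar jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCountsLog <input> <output>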