jobimtext.thesaurus.distributional.hadoop.mapreduce.TotalWords.java Source code

Introduction

Here is the source code for jobimtext.thesaurus.distributional.hadoop.mapreduce.TotalWords.java

Source

/*******************************************************************************
* Copyright 2012
* Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/

package jobimtext.thesaurus.distributional.hadoop.mapreduce;

import java.io.IOException;
import java.util.Iterator;

import jobimtext.thesaurus.distributional.hadoop.util.HadoopUtil;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

/**
 * This MapReduce step counts the total number of
 * words the text contains. Again, this information
 * is taken from the Context output.
 *
 * The result of this job is a single line
 * (containing the total word count).
 *
 * This Java class is currently not used in the project
 * because its functionality is integrated into the Pig
 * script of the FreqSig step.
 *
 * @author Richard Steuer
 */
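/*
 * Illustrative example (the values are hypothetical): given context
 * output lines of the form "word feature count", e.g.
 *
 *   mouse  -nn#device  12
 *   mouse  -nn#animal   3
 *
 * the job emits the single line
 *
 *   TOTALWORDS  15
 */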
public class TotalWords {

    @SuppressWarnings("deprecation")
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

        private Text keyOut = new Text();
        private String[] tokens;

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {

            // split the line into its three tokens: word, feature and count
            // (separated by whitespace, including non-breaking spaces)
            tokens = value.toString().split("(\\s|\\xA0)+");

            /*
             * Now we have: word = tokens[0], feature = tokens[1],
             * count = tokens[2].
             *
             * Emit one constant key for every input line and use the
             * pair's count as the value; summing all values in the
             * reducer then yields the total word count.
             */
            keyOut.set("TOTALWORDS");
            output.collect(keyOut, new IntWritable(Integer.parseInt(tokens[2])));
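            // e.g. for the hypothetical input line "mouse -nn#device 12"
            // this emits the pair ("TOTALWORDS", 12)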

        } // map()

    } // class map

    @SuppressWarnings("deprecation")
    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

        int sum;

        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
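            // Every map output shares the single key "TOTALWORDS", so
            // this reducer sees exactly one group and emits one line.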

            sum = 0;

            while (values.hasNext()) {

                sum += values.next().get();

            } // while

            output.collect(key, new IntWritable(sum));

        } // reduce()

    } // class reduce

    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {

        JobConf conf = HadoopUtil.generateJobConf(args);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
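        // Reusing the reducer as the combiner is safe here because
        // integer addition is associative and commutative.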
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);

    } // main

} // class TotalWords
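
Usage

A minimal sketch of how the job might be launched, assuming the class is packaged into a jar (the jar name jobimtext.jar and the paths below are hypothetical); as set in main(), args[0] is the input path and args[1] the output path:

hadoop jar jobimtext.jar jobimtext.thesaurus.distributional.hadoop.mapreduce.TotalWords /data/context /data/totalwords

The output directory then contains a single part file with one line, e.g. "TOTALWORDS 15".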