Java tutorial: word count with the Hadoop MapReduce API (org.apache.hadoop.mapreduce). The driver below accepts an optional -r <reduces> flag, pins each input split to 32 MB, and reports the job's running time.
/****************************************************************
 * Licensed to the AOS Community (AOS) under one or more
 * contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The AOS licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 ****************************************************************/
package io.aos.t4f.hadoop.mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountMapReduceTest2 {

    static final long MEGABYTES = 1024 * 1024;

    static void printUsage() {
        System.out.println("WordCount [-r <reduces>] <input> <output>");
        System.exit(-1);
    }

    private static List<String> parseArguments(String[] args, Job job) {
        List<String> argList = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-r".equals(args[i])) {
                    // Set the number of reducers to the specified parameter.
                    job.setNumReduceTasks(Integer.parseInt(args[++i]));
                } else {
                    argList.add(args[i]);
                }
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                printUsage();
            }
        }
        return argList;
    }

    public static void main(String... args) throws Exception {

        // Get the default configuration object.
        Configuration conf = new Configuration();

        // Add resources.
        conf.addResource("hdfs-default.xml");
        conf.addResource("hdfs-site.xml");
        conf.addResource("mapred-default.xml");
        conf.addResource("mapred-site.xml");

        Job job = new Job(conf);
        job.setJobName("WordCount");

        List<String> otherArgs = parseArguments(args, job);
        if (otherArgs.size() != 2) {
            printUsage();
        }

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // The keys are words (strings).
        job.setOutputKeyClass(Text.class);
        // The values are counts (ints).
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(MapClass.class);
        job.setCombinerClass(ReduceClass.class);
        job.setReducerClass(ReduceClass.class);

        // Set the input and output format classes.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Set the input and output paths.
        TextInputFormat.setInputPaths(job, otherArgs.get(0));
        TextOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

        /*
         * Set the minimum and maximum split sizes. This parameter helps to
         * control the number of map tasks: there is a separate map task for
         * each input split. In this example each split is 32 MB.
         */
        TextInputFormat.setMinInputSplitSize(job, 32 * MEGABYTES);
        TextInputFormat.setMaxInputSplitSize(job, 32 * MEGABYTES);

        // Set the jar file to run.
        job.setJarByClass(WordCountMapReduceTest2.class);

        // Submit the job and wait for it to complete.
        Date startTime = new Date();
        System.out.println("Job started: " + startTime);
        int exitCode = job.waitForCompletion(true) ? 0 : 1;
        if (exitCode == 0) {
            Date endTime = new Date();
            System.out.println("Job ended: " + endTime);
            System.out.println("The job took "
                    + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
        } else {
            System.out.println("Job failed!");
        }
        System.exit(exitCode);
    }

    public static class MapClass extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Emit a (word, 1) pair for every whitespace-delimited token in the line.
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, ONE);
            }
        }
    }

    public static class ReduceClass extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the counts for this word and emit a single (word, total) pair.
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
}
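Conceptually, the job computes the same thing as the plain-Java sketch below: tokenize each line the way MapClass does, emit a count of 1 per token, and sum the counts per word the way ReduceClass does. This is only an illustration of the map and reduce logic on a single machine, not part of the job above; the class name LocalWordCountSketch and the two sample input lines are made up for the example.

import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;

public class LocalWordCountSketch {

    public static void main(String... args) {
        // Hypothetical in-memory "input split": two lines of text.
        String[] lines = { "the quick brown fox", "the lazy dog" };

        // The shuffle phase groups map outputs by key; a sorted map stands in for it here.
        Map<String, Integer> counts = new TreeMap<String, Integer>();

        for (String line : lines) {
            // Map phase: the same whitespace tokenization as MapClass.
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                String word = tokenizer.nextToken();
                Integer count = counts.get(word);
                // Reduce phase folded in: sum the 1s per word, as ReduceClass does.
                counts.put(word, count == null ? 1 : count + 1);
            }
        }

        // One (word, count) line per key, in sorted order, like the job's text output.
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }
}

To run the real job on a cluster, package the class into a jar and launch it with the standard hadoop jar command, for example: hadoop jar <your-jar> io.aos.t4f.hadoop.mapreduce.WordCountMapReduceTest2 -r 2 <input> <output>. The jar name and the reduce count of 2 are placeholders; <input> and <output> are HDFS paths, and the output directory must not already exist.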