org.pentaho.hadoop.sample.wordcount.WordCount.java Source code

Java tutorial

Introduction

Here is the source code for org.pentaho.hadoop.sample.wordcount.WordCount.java

Source

/*******************************************************************************
*
* Pentaho Big Data
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
*    http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/

package org.pentaho.hadoop.sample.wordcount;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class WordCount {
    public static void main(String[] args) throws Exception {
        String hdfsHost = "localhost:9000";
        String jobTrackerHost = "localhost:9001";
        String fsPrefix = "hdfs";

        String dirInput = "/wordcount/input";
        String dirOutput = "/wordcount/output";

        if (args.length == 1 && (args[0].equals("--help") || args[0].equals("-h") || args[0].equals("/?"))) {
            System.out.println("Usage: WordCount <options>");
            System.out.println();
            System.out.println("Options:");
            System.out.println();
            System.out.println("--input=DIR                   The directory containing the input files for the");
            System.out.println("                              WordCount Hadoop job");
            System.out.println("--output=DIR                  The directory where the results of the WordCount");
            System.out.println("                              Hadoop job will be stored");
            System.out.println("--hdfsHost=HOST               The host<:port> of the HDFS service");
            System.out.println("                              e.g.- localhost:9000");
            System.out.println("--jobTrackerHost=HOST         The host<:port> of the job tracker service");
            System.out.println("                              e.g.- localhost:9001");
            System.out.println("--fsPrefix=PREFIX             The prefix to use for for the filesystem");
            System.out.println("                              e.g.- hdfs");
            System.out.println();
            System.out.println();
            System.out.println("If an option is not provided through the command prompt the following defaults");
            System.out.println("will be used:");
            System.out.println("--input='/wordcount/input'");
            System.out.println("--output='/wordcount/output'");
            System.out.println("--hdfsHost=localhost:9000");
            System.out.println("--jobTrackerHost=localhost:9001");
            System.out.println("--fsPrefix=hdfs");

        } else {
            if (args.length > 0) {
                for (String arg : args) {
                    if (arg.startsWith("--input=")) {
                        dirInput = WordCount.getArgValue(arg);
                    } else if (arg.startsWith("--output=")) {
                        dirOutput = WordCount.getArgValue(arg);
                    } else if (arg.startsWith("--hdfsHost=")) {
                        hdfsHost = WordCount.getArgValue(arg);
                    } else if (arg.startsWith("--jobTrackerHost=")) {
                        jobTrackerHost = WordCount.getArgValue(arg);
                    } else if (arg.startsWith("--fsPrefix=")) {
                        fsPrefix = WordCount.getArgValue(arg);
                    }
                }
            }

            JobConf conf = new JobConf(WordCount.class);
            conf.setJobName("WordCount");

            String hdfsBaseUrl = fsPrefix + "://" + hdfsHost;
            conf.set("fs.default.name", hdfsBaseUrl + "/");
            if (jobTrackerHost != null && jobTrackerHost.length() > 0) {
                conf.set("mapred.job.tracker", jobTrackerHost);
            }

            FileInputFormat.setInputPaths(conf, new Path[] { new Path(hdfsBaseUrl + dirInput) });
            FileOutputFormat.setOutputPath(conf, new Path(hdfsBaseUrl + dirOutput));

            conf.setMapperClass(WordCountMapper.class);
            conf.setReducerClass(WordCountReducer.class);

            conf.setMapOutputKeyClass(Text.class);
            conf.setMapOutputValueClass(IntWritable.class);

            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(IntWritable.class);

            JobClient.runJob(conf);
        }
    }

    private static String getArgValue(String arg) {
        String result = null;

        String[] tokens = arg.split("=");
        if (tokens.length > 1) {
            result = tokens[1].replace("'", "").replace("\"", "");
        }
        System.out.println(arg + " parses to " + result);
        return result;
    }
}