co.cask.cdap.examples.fileset.WordCount.java Source code

Introduction

Here is the source code for co.cask.cdap.examples.fileset.WordCount.java. A sketch of how this MapReduce could be wired into a CDAP application follows the listing.

Source

/*
 * Copyright © 2014-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.examples.fileset;

import co.cask.cdap.api.Resources;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * A simple word counter. It reads its input from the "lines" FileSet and writes its output to
 * the "counts" FileSet. The dataset names themselves can be overridden with the "input" and
 * "output" runtime arguments. The paths within the FileSets are configured as runtime arguments:
 * <ul>
 * <li>"dataset.lines.input.paths" for the input. Multiple paths can be given, separated by commas.</li>
 * <li>"dataset.counts.output.path" for the output.</li>
 * </ul>
 */
public class WordCount extends AbstractMapReduce {

    @Override
    public void configure() {
        // Request 1024 MB of memory for each mapper and each reducer container
        setMapperResources(new Resources(1024));
        setReducerResources(new Resources(1024));
    }

    @Override
    public void beforeSubmit(MapReduceContext context) throws Exception {
        Job job = context.getHadoopJob();
        job.setMapperClass(Tokenizer.class);
        job.setReducerClass(Counter.class);
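        // A single reduce task means all counts land in one output file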
        job.setNumReduceTasks(1);

        // The input and output dataset names default to "lines" and "counts",
        // but can be overridden with the "input" and "output" runtime arguments
        String inputDataset = context.getRuntimeArguments().get("input");
        inputDataset = inputDataset != null ? inputDataset : "lines";

        String outputDataset = context.getRuntimeArguments().get("output");
        outputDataset = outputDataset != null ? outputDataset : "counts";

        context.addInput(Input.ofDataset(inputDataset));
        context.addOutput(Output.ofDataset(outputDataset));
    }

    /**
     * A mapper that tokenizes each input line and emits each token with a value of 1.
     */
    public static class Tokenizer extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(LongWritable key, Text data, Context context) throws IOException, InterruptedException {
            // Split on runs of whitespace and skip empty tokens, so that
            // consecutive or leading spaces are not counted as words
            for (String token : data.toString().split("\\s+")) {
                if (token.isEmpty()) {
                    continue;
                }
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    /**
     * A reducer that sums up the counts for each key. It emits plain String/Long pairs;
     * this works when the output FileSet is configured with TextOutputFormat, which
     * serializes keys and values via toString().
     */
    public static class Counter extends Reducer<Text, IntWritable, String, Long> {

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0L;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key.toString(), sum);
        }
    }
}
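
Example usage

The listing assumes that the "lines" and "counts" FileSets exist and that the MapReduce is registered in an application. As a rough orientation, the wiring could look like the following minimal sketch. It is an assumption modeled on the CDAP FileSet example pattern, not the exact companion source; the class name WordCountApp is hypothetical.

package co.cask.cdap.examples.fileset;

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.dataset.lib.FileSet;
import co.cask.cdap.api.dataset.lib.FileSetProperties;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Hypothetical application that registers the WordCount MapReduce and creates
 * the two FileSets it reads from and writes to.
 */
public class WordCountApp extends AbstractApplication {

    @Override
    public void configure() {
        setName("WordCountApp");
        addMapReduce(new WordCount());

        // Input FileSet: plain text files, read line by line with TextInputFormat
        createDataset("lines", FileSet.class, FileSetProperties.builder()
            .setInputFormat(TextInputFormat.class)
            .build());

        // Output FileSet: TextOutputFormat writes each key/value pair via toString(),
        // which is why the reducer above can emit String/Long
        createDataset("counts", FileSet.class, FileSetProperties.builder()
            .setOutputFormat(TextOutputFormat.class)
            .build());
    }
}

Once deployed, a run can then select files inside the FileSets with runtime arguments such as dataset.lines.input.paths=run1/raw and dataset.counts.output.path=run1/counts, as described in the class Javadoc; such paths are interpreted relative to the respective FileSet's base location.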