co.cask.cdap.examples.wikipedia.TopNMapReduce.java Source code

Introduction

Here is the source code for co.cask.cdap.examples.wikipedia.TopNMapReduce.java, a CDAP MapReduce program from the WikipediaPipeline example application. It counts the occurrences of each word in a normalized Wikipedia dataset and writes out the N most frequent words, where N defaults to 10 and can be overridden through the "topn.rank" runtime argument.

Source

/*
 * Copyright © 2015-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.examples.wikipedia;

import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.workflow.Value;
import co.cask.cdap.api.workflow.WorkflowToken;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * MapReduce program that outputs the top N words from the input text.
 */
public class TopNMapReduce extends AbstractMapReduce {
    public static final String NAME = TopNMapReduce.class.getSimpleName();

    @Override
    protected void configure() {
        setName(NAME);
        setDescription("A MapReduce job that returns the top-n words in a dataset.");
        setMapperResources(new Resources(512));
        setReducerResources(new Resources(512));
    }

    @Override
    public void beforeSubmit(MapReduceContext context) throws Exception {
        Map<String, String> runtimeArguments = context.getRuntimeArguments();
        Job job = context.getHadoopJob();
        WorkflowToken workflowToken = context.getWorkflowToken();
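        // Default to the top 10 words; the "topn.rank" runtime argument overrides this.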
        int topNRank = 10;
        if (runtimeArguments.containsKey("topn.rank")) {
            topNRank = Integer.parseInt(runtimeArguments.get("topn.rank"));
        }
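        // Record the rank in the WorkflowToken so the TopNReducer (and any downstream workflow nodes) can read it.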
        if (workflowToken != null) {
            workflowToken.put("topn.rank", Value.of(topNRank));
        }
        int numReduceTasks = 1;
        if (runtimeArguments.containsKey("num.reduce.tasks")) {
            numReduceTasks = Integer.parseInt(runtimeArguments.get("num.reduce.tasks"));
        }
        job.setNumReduceTasks(numReduceTasks);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(TopNReducer.class);
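        // Read from the normalized Wikipedia dataset and write the top-N results to the output dataset.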
        context.addInput(Input.ofDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET));
        context.addOutput(Output.ofDataset(WikipediaPipelineApp.MAPREDUCE_TOPN_OUTPUT));
    }

    @Override
    public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
        WorkflowToken workflowToken = context.getWorkflowToken();
        if (workflowToken != null) {
            workflowToken.put("result", Value.of(succeeded));
        }
    }

    /**
     * Mapper that splits each record into whitespace-delimited tokens and emits a (token, 1) pair for each one.
     */
    public static class TokenizerMapper extends Mapper<byte[], byte[], Text, IntWritable> {
        private final Text outputKey = new Text();
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(byte[] key, byte[] value, Context context) throws IOException, InterruptedException {
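            // StringTokenizer splits on whitespace by default; each token is emitted with a count of 1.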
            StringTokenizer tokenizer = new StringTokenizer(Bytes.toString(value));
            while (tokenizer.hasMoreTokens()) {
                String word = tokenizer.nextToken().trim();
                outputKey.set(word);
                context.write(outputKey, one);
            }
        }
    }

    /**
     * Reducer that outputs the top N tokens by count. Implements {@link ProgramLifecycle} to demonstrate
     * accessing the {@link WorkflowToken} from within a Mapper or Reducer class.
     */
    public static class TopNReducer extends Reducer<Text, IntWritable, byte[], byte[]>
            implements ProgramLifecycle<MapReduceContext> {
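        // countMap accumulates the total count of every distinct word seen by this reducer; it is sorted
        // and truncated to the top n entries in cleanup(). n defaults to 10 and may be overridden from
        // the WorkflowToken in initialize().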
        private final Map<Text, Integer> countMap = new HashMap<>();
        private int n = 10;

        @Override
        public void initialize(MapReduceContext context) throws Exception {
            WorkflowToken workflowToken = context.getWorkflowToken();
            if (workflowToken != null) {
                Value value = workflowToken.get("topn.rank");
                if (value != null) {
                    n = value.getAsInt();
                }
            }
        }

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the counts for this word; summing (rather than counting the number of values)
            // stays correct even if a combiner is later configured for the job.
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            // Need to create a new Text instance because Hadoop reuses the key instance across records.
            countMap.put(new Text(key), sum);
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            List<Map.Entry<Text, Integer>> entries = new ArrayList<>(countMap.entrySet());
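            // Sort by count in descending order.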
            Collections.sort(entries, new Comparator<Map.Entry<Text, Integer>>() {
                @Override
                public int compare(Map.Entry<Text, Integer> o1, Map.Entry<Text, Integer> o2) {
                    return o2.getValue().compareTo(o1.getValue());
                }
            });
            // Emit at most n entries; guard against the dataset containing fewer than n distinct words.
            int limit = Math.min(n, entries.size());
            for (int i = 0; i < limit; i++) {
                Map.Entry<Text, Integer> entry = entries.get(i);
                context.write(Bytes.toBytes(entry.getKey().toString()), Bytes.toBytes(entry.getValue()));
                context.getCounter("custom", "num.records").increment(1);
            }
        }

        @Override
        public void destroy() {
            // no-op
        }
    }
}
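
Example

The reducer above buffers every distinct word in memory and sorts the whole map in cleanup(), which is fine for this example's vocabulary but grows with the number of distinct words. The sketch below shows the same top-N selection with a bounded min-heap, so that only n entries are retained during selection. It is a minimal, self-contained illustration: the class and method names (TopNSketch, topN) are not part of the CDAP example.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

/** Illustrative sketch: top-N selection with a bounded min-heap instead of a full sort. */
public class TopNSketch {

    /** Returns the n highest-count entries, highest first. */
    static List<Map.Entry<String, Integer>> topN(Map<String, Integer> counts, int n) {
        // Min-heap ordered by count: the head is the smallest of the current top n,
        // so it is the entry to evict when a larger count arrives.
        PriorityQueue<Map.Entry<String, Integer>> heap =
                new PriorityQueue<>(Math.max(1, n), Map.Entry.comparingByValue());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            heap.offer(entry);
            if (heap.size() > n) {
                heap.poll(); // evict the current minimum, keeping at most n entries
            }
        }
        List<Map.Entry<String, Integer>> result = new ArrayList<>(heap);
        // Heap iteration order is unspecified, so sort descending by count for output.
        result.sort(Map.Entry.<String, Integer>comparingByValue().reversed());
        return result;
    }

    public static void main(String[] args) {
        Map<String, Integer> counts = new HashMap<>();
        counts.put("the", 42);
        counts.put("wikipedia", 17);
        counts.put("mapreduce", 9);
        counts.put("cdap", 5);
        System.out.println(topN(counts, 3)); // prints [the=42, wikipedia=17, mapreduce=9]
    }
}

Because each mapper emits a count of 1 per token and no combiner is configured, the heap-based and sort-based versions select the same words; the heap version just bounds memory to O(n) during selection. Note also that the defaults in the program above (topn.rank = 10, num.reduce.tasks = 1) can be overridden with runtime arguments when the MapReduce is started, for example through the CDAP CLI or REST API.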