co.cask.cdap.examples.clicksandviews.ClicksAndViewsMapReduce.java Source code

Introduction

Here is the source code for co.cask.cdap.examples.clicksandviews.ClicksAndViewsMapReduce.java, a CDAP MapReduce program that joins a views stream with a clicks stream and counts how many clicks each view received.

Source

/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.examples.clicksandviews;

import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * A MapReduce program that reads from a CLICKS stream and a VIEWS stream. It joins the records of the two
 * streams on their viewId, and then counts how many clicks each particular view resulted in.
 */
public class ClicksAndViewsMapReduce extends AbstractMapReduce {
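    // Worked example of the join (hypothetical record layout, for illustration only):
    // a views record "1\t<view fields>" together with two clicks records for viewId 1
    // produces the single output line "1\t<view fields>\t2" -- the original view
    // record with its click count appended.
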
    static final String NAME = "ClicksAndViewsMapReduce";

    private static final Joiner TAB_JOINER = Joiner.on("\t");

    @Override
    public void configure() {
        setName(NAME);
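        // request 1024 MB of memory for each mapper and each reducer container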
        setMapperResources(new Resources(1024));
        setReducerResources(new Resources(1024));
    }

    @Override
    public void beforeSubmit(MapReduceContext context) throws Exception {
        context.addInput(Input.ofStream(ClicksAndViews.CLICKS));
        context.addInput(Input.ofStream(ClicksAndViews.VIEWS));

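        // The output partition can also be supplied through runtime arguments. With
        // CDAP's dataset-scoped argument convention, that would look roughly like
        // (assumed key format; it may vary by CDAP version):
        //   dataset.joined.output.partition.key.runtime=1468001700000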
        PartitionedFileSet joinedPFS = context.getDataset(ClicksAndViews.JOINED);
        PartitionKey outputPartitionKey = PartitionedFileSetArguments
                .getOutputPartitionKey(context.getRuntimeArguments(), joinedPFS.getPartitioning());

        if (outputPartitionKey == null) {
            outputPartitionKey = PartitionKey.builder().addLongField("runtime", context.getLogicalStartTime())
                    .build();
        }

        Map<String, String> outputArgs = new HashMap<>();
        PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputPartitionKey);
        context.addOutput(Output.ofDataset(ClicksAndViews.JOINED, outputArgs));

        Job job = context.getHadoopJob();
        job.setMapperClass(ImpressionKeyingMapper.class);
        job.setReducerClass(JoiningReducer.class);
    }

    /**
     * A Mapper which tags each record with the name of the input it came from.
     * It also keys every output record by viewId, so that a single reducer receives all of the records
     * for a particular viewId.
     */
    public static class ImpressionKeyingMapper extends Mapper<LongWritable, Text, LongWritable, Text>
            implements ProgramLifecycle<MapReduceTaskContext<LongWritable, Text>> {
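        // For example (illustrative only, assuming the VIEWS constant is "views"):
        // the views-stream record "1\t<view fields>" is emitted as key 1 with value
        // "views\t1\t<view fields>", so the reducer can tell the sources apart.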

        private String inputName;

        @Override
        public void initialize(MapReduceTaskContext<LongWritable, Text> context) throws Exception {
            inputName = context.getInputName();
            Preconditions.checkNotNull(inputName, "input name must be set in the task context");
            Preconditions.checkArgument(
                    ClicksAndViews.CLICKS.equals(inputName) || ClicksAndViews.VIEWS.equals(inputName),
                    "unexpected input name: %s", inputName);
        }

        @Override
        public void destroy() {
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // the viewId is the first tab-separated field of every record
            String viewId = value.toString().split("\t")[0];
            // tag each record with the input name, so the reducer is simpler
            context.write(new LongWritable(Long.parseLong(viewId)),
                    new Text(TAB_JOINER.join(inputName, value.toString())));
        }
    }

    /**
     * A Reducer which receives all of the records for a particular viewId.
     * It outputs the view record with the number of click records for that view appended to it.
     */
    public static class JoiningReducer extends Reducer<LongWritable, Text, NullWritable, String> {
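        // For example (illustrative only): key 1 with values "views\t1\t<view fields>",
        // "clicks\t1", "clicks\t1" produces the output "1\t<view fields>\t2".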

        @Override
        public void reduce(LongWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String viewData = null;
            int totalClicks = 0;

            for (Text value : values) {
                String[] parts = value.toString().split("\t", 2);
                String source = parts[0];
                if (ClicksAndViews.CLICKS.equals(source)) {
                    totalClicks += 1;
                } else if (ClicksAndViews.VIEWS.equals(source)) {
                    viewData = parts[1];
                }
            }
            Preconditions.checkNotNull(viewData, "there must be a view record for each view id");
            // the viewId (which we key on from the mapper output) is already in the viewData
            context.write(NullWritable.get(), TAB_JOINER.join(viewData, totalClicks));
        }
    }
}
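
For context, here is a minimal sketch of a companion application class that wires up the two streams, the joined PartitionedFileSet, and this MapReduce program. The constant values, the single long "runtime" partition field, and the TextOutputFormat are assumptions inferred from how beforeSubmit builds its partition key; the actual ClicksAndViews application in the CDAP examples may differ.

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.data.stream.Stream;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Sketch of an application that deploys the streams, the joined dataset, and the
 * MapReduce program above. Constant values are assumptions, not the original source.
 */
public class ClicksAndViews extends AbstractApplication {
    static final String CLICKS = "clicks";
    static final String VIEWS = "views";
    static final String JOINED = "joined";

    @Override
    public void configure() {
        setName("ClicksAndViews");
        addStream(new Stream(CLICKS));
        addStream(new Stream(VIEWS));
        addMapReduce(new ClicksAndViewsMapReduce());
        // one long "runtime" partition field, matching the key built in beforeSubmit
        createDataset(JOINED, PartitionedFileSet.class, PartitionedFileSetProperties.builder()
                .setPartitioning(Partitioning.builder().addLongField("runtime").build())
                .setOutputFormat(TextOutputFormat.class)
                .build());
    }
}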