Java tutorial
/*
 * Copyright 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.etl.batch.mapreduce;

import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import co.cask.cdap.api.metrics.Metrics;
import co.cask.cdap.etl.api.Aggregator;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.StageMetrics;
import co.cask.cdap.etl.api.Transformation;
import co.cask.cdap.etl.api.batch.BatchAggregator;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.batch.PipelinePluginInstantiator;
import co.cask.cdap.etl.batch.TransformExecutorFactory;
import co.cask.cdap.etl.batch.conversion.WritableConversion;
import co.cask.cdap.etl.batch.conversion.WritableConversions;
import co.cask.cdap.etl.common.DatasetContextLookupProvider;
import co.cask.cdap.etl.common.DefaultEmitter;
import co.cask.cdap.etl.common.DefaultStageMetrics;
import co.cask.cdap.etl.common.TrackedTransform;
import com.google.common.base.Function;
import com.google.common.collect.Iterators;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * Creates transform executors for mapreduce programs.
 *
 * @param <T> the type of input for the created transform executors
 */
public class MapReduceTransformExecutorFactory<T> extends TransformExecutorFactory<T> {
  private final Map<String, Map<String, String>> pluginRuntimeArgs;
  private final MapReduceTaskContext taskContext;
  private final String mapOutputKeyClassName;
  private final String mapOutputValClassName;
  private final boolean isMapper;

  public MapReduceTransformExecutorFactory(MapReduceTaskContext taskContext,
                                           PipelinePluginInstantiator pluginInstantiator,
                                           Metrics metrics,
                                           Map<String, Map<String, String>> pluginRuntimeArgs) {
    super(pluginInstantiator, metrics);
    this.taskContext = taskContext;
    this.pluginRuntimeArgs = pluginRuntimeArgs;
    JobContext hadoopContext = (JobContext) taskContext.getHadoopContext();
    Configuration hConf = hadoopContext.getConfiguration();
    this.mapOutputKeyClassName = hConf.get(ETLMapReduce.GROUP_KEY_CLASS);
    this.mapOutputValClassName = hConf.get(ETLMapReduce.GROUP_VAL_CLASS);
    this.isMapper = hadoopContext instanceof Mapper.Context;
  }

  @Override
  protected BatchRuntimeContext createRuntimeContext(String stageName) {
    Map<String, String> stageRuntimeArgs = pluginRuntimeArgs.get(stageName);
    if (stageRuntimeArgs == null) {
      stageRuntimeArgs = new HashMap<>();
    }
    return new MapReduceRuntimeContext(taskContext, metrics, new DatasetContextLookupProvider(taskContext),
                                       stageName, stageRuntimeArgs);
  }

  @SuppressWarnings("unchecked")
  @Override
  protected TrackedTransform getTransformation(String pluginType, String stageName) throws Exception {
    if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
      BatchAggregator<?, ?, ?> batchAggregator = pluginInstantiator.newPluginInstance(stageName);
      BatchRuntimeContext runtimeContext = createRuntimeContext(stageName);
      batchAggregator.initialize(runtimeContext);
      StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
      if (isMapper) {
        return getTrackedGroupStep(
          new MapperAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName),
          stageMetrics);
      } else {
        return getTrackedAggregateStep(
          new ReducerAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName),
          stageMetrics);
      }
    }
    return super.getTransformation(pluginType, stageName);
  }

  /**
   * A Transformation that uses an aggregator's groupBy method. Supports applying a function to the types
   * returned by the aggregator. These functions are used when the aggregator outputs group keys that are not
   * WritableComparable and values that are not Writable. For example, aggregators that output StructuredRecord
   * will need some function to change a StructuredRecord to a StructuredRecordWritable so that we can use it
   * in mapreduce.
   *
   * @param <GROUP_KEY> type of group key output by the aggregator
   * @param <GROUP_VAL> type of group value used by the aggregator
   * @param <OUT_KEY> type of output key for mapreduce. Must implement WritableComparable
   * @param <OUT_VAL> type of output value for mapreduce. Must implement Writable
   */
  private static class MapperAggregatorTransformation<GROUP_KEY, GROUP_VAL, OUT_KEY extends Writable,
    OUT_VAL extends Writable> implements Transformation<GROUP_VAL, KeyValue<OUT_KEY, OUT_VAL>> {
    private final Aggregator<GROUP_KEY, GROUP_VAL, ?> aggregator;
    private final DefaultEmitter<GROUP_KEY> groupKeyEmitter;
    private final WritableConversion<GROUP_KEY, OUT_KEY> keyConversion;
    private final WritableConversion<GROUP_VAL, OUT_VAL> valConversion;

    public MapperAggregatorTransformation(Aggregator<GROUP_KEY, GROUP_VAL, ?> aggregator,
                                          String groupKeyClassName,
                                          String groupValClassName) {
      this.aggregator = aggregator;
      this.groupKeyEmitter = new DefaultEmitter<>();
      WritableConversion<GROUP_KEY, OUT_KEY> keyConversion = WritableConversions.getConversion(groupKeyClassName);
      WritableConversion<GROUP_VAL, OUT_VAL> valConversion = WritableConversions.getConversion(groupValClassName);
      // if the conversion is null, it means the user is using a Writable already
      this.keyConversion = keyConversion == null ? new CastConversion<GROUP_KEY, OUT_KEY>() : keyConversion;
      this.valConversion = valConversion == null ? new CastConversion<GROUP_VAL, OUT_VAL>() : valConversion;
    }

    @Override
    public void transform(GROUP_VAL input, Emitter<KeyValue<OUT_KEY, OUT_VAL>> emitter) throws Exception {
      groupKeyEmitter.reset();
      aggregator.groupBy(input, groupKeyEmitter);
      // emit one (key, value) pair per group key, pairing each key with the same input value
      for (GROUP_KEY groupKey : groupKeyEmitter.getEntries()) {
        emitter.emit(new KeyValue<>(keyConversion.toWritable(groupKey), valConversion.toWritable(input)));
      }
    }
  }

  /**
   * A Transformation that uses an aggregator's aggregate method. Supports applying a function to the types
   * sent as input to the aggregator. These functions are used when the aggregator takes group keys that are not
   * WritableComparable and values that are not Writable. For example, aggregators that aggregate StructuredRecords
   * will need some function to change a StructuredRecordWritable to a StructuredRecord so that we can use it
   * in mapreduce.
   *
   * @param <GROUP_KEY> type of group key used by the aggregator
   * @param <GROUP_VAL> type of group value used by the aggregator
   * @param <OUT> type of output object from the aggregator
   * @param <REDUCE_KEY> type of reduce key for mapreduce. Must implement WritableComparable
   * @param <REDUCE_VAL> type of reduce value for mapreduce. Must implement Writable
   */
  private static class ReducerAggregatorTransformation<GROUP_KEY, GROUP_VAL, OUT,
    REDUCE_KEY extends WritableComparable, REDUCE_VAL extends Writable>
    implements Transformation<KeyValue<REDUCE_KEY, Iterator<REDUCE_VAL>>, OUT> {
    private final Aggregator<GROUP_KEY, GROUP_VAL, OUT> aggregator;
    private final WritableConversion<GROUP_KEY, REDUCE_KEY> keyConversion;
    private final WritableConversion<GROUP_VAL, REDUCE_VAL> valConversion;

    public ReducerAggregatorTransformation(Aggregator<GROUP_KEY, GROUP_VAL, OUT> aggregator,
                                           String groupKeyClassName,
                                           String groupValClassName) {
      this.aggregator = aggregator;
      WritableConversion<GROUP_KEY, REDUCE_KEY> keyConversion = WritableConversions.getConversion(groupKeyClassName);
      WritableConversion<GROUP_VAL, REDUCE_VAL> valConversion = WritableConversions.getConversion(groupValClassName);
      // if the conversion is null, it means the user is using a Writable already
      this.keyConversion = keyConversion == null ? new CastConversion<GROUP_KEY, REDUCE_KEY>() : keyConversion;
      this.valConversion = valConversion == null ? new CastConversion<GROUP_VAL, REDUCE_VAL>() : valConversion;
    }

    @Override
    public void transform(KeyValue<REDUCE_KEY, Iterator<REDUCE_VAL>> input, Emitter<OUT> emitter) throws Exception {
      GROUP_KEY groupKey = keyConversion.fromWritable(input.getKey());
      // lazily convert each reduce value back to the aggregator's value type as it is iterated
      Iterator<GROUP_VAL> iter = Iterators.transform(input.getValue(), new Function<REDUCE_VAL, GROUP_VAL>() {
        @Nullable
        @Override
        public GROUP_VAL apply(@Nullable REDUCE_VAL input) {
          return valConversion.fromWritable(input);
        }
      });
      aggregator.aggregate(groupKey, iter, emitter);
    }
  }

  /**
   * Conversion that doesn't do anything but cast types to each other.
   * This is used in the MapperAggregatorTransformation and ReducerAggregatorTransformation when the user is already
   * using a Writable class and we don't need to do any conversion.
   *
   * @param <T> type of object to convert to a Writable
   * @param <W> the Writable type to convert to
   */
  @SuppressWarnings("unchecked")
  private static class CastConversion<T, W extends Writable> extends WritableConversion<T, W> {
    @Override
    public W toWritable(T val) {
      return (W) val;
    }

    @Override
    public T fromWritable(W val) {
      return (T) val;
    }
  }
}
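The class above is easier to follow with the two key behaviors pulled out of the CDAP plumbing. Below is a minimal, self-contained sketch using only the JDK, with no CDAP or Hadoop dependencies; the Emitter, Aggregator, and Conversion interfaces are simplified stand-ins for co.cask.cdap.etl.api.Emitter, co.cask.cdap.etl.api.Aggregator, and WritableConversion, so their shapes here are illustrative assumptions rather than the real signatures. It demonstrates the mapper-side fan-out (one output pair per group key returned by groupBy) and the conversion fallback (use a registered conversion when one exists, otherwise an identity conversion standing in for CastConversion):

import java.util.ArrayList;
import java.util.List;

public class MapperFanOutSketch {

  /** Simplified stand-in for co.cask.cdap.etl.api.Emitter. */
  interface Emitter<T> {
    void emit(T value);
  }

  /** Simplified stand-in for the groupBy half of co.cask.cdap.etl.api.Aggregator. */
  interface Aggregator<GROUP_KEY, GROUP_VAL> {
    void groupBy(GROUP_VAL input, Emitter<GROUP_KEY> emitter);
  }

  /** Simplified stand-in for WritableConversion: maps a pipeline type to a serializable type. */
  interface Conversion<T, W> {
    W toWritable(T val);
  }

  /** Groups each word under its first letter and under its length, i.e. a multi-key groupBy. */
  static class WordAggregator implements Aggregator<String, String> {
    @Override
    public void groupBy(String word, Emitter<String> emitter) {
      emitter.emit("letter:" + word.charAt(0));
      emitter.emit("length:" + word.length());
    }
  }

  public static void main(String[] args) {
    Aggregator<String, String> aggregator = new WordAggregator();

    // Mirrors the constructor logic: use a registered conversion if one exists,
    // otherwise fall back to a do-nothing conversion (CastConversion in the real class).
    Conversion<String, String> registered = null; // pretend no conversion is registered
    Conversion<String, String> keyConversion =
      registered != null ? registered : val -> val; // identity stands in for the cast

    // Mirrors MapperAggregatorTransformation.transform(): reset the key collector,
    // run groupBy, then emit one (key, value) pair per group key for the same input.
    List<String> groupKeys = new ArrayList<>();
    String input = "hadoop";
    groupKeys.clear();                        // reset, like groupKeyEmitter.reset()
    aggregator.groupBy(input, groupKeys::add);
    for (String key : groupKeys) {
      System.out.println(keyConversion.toWritable(key) + " -> " + input);
    }
    // Prints:
    // letter:h -> hadoop
    // length:6 -> hadoop
  }
}

In the real class the fallback is CastConversion, which performs unchecked casts rather than an identity mapping; that is safe there because, as the inline comment notes, the null-conversion path is only taken when the pipeline is already producing Writable keys and values.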