co.cask.cdap.examples.datacleansing.DataCleansingMapReduce.java Source code

Introduction

Here is the source code for co.cask.cdap.examples.datacleansing.DataCleansingMapReduce.java
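As its class Javadoc describes, the MapReduce reads records from the rawRecords PartitionedFileSet, writes records that match a configurable Schema to the cleanRecords PartitionedFileSet, routes non-matching records to the invalidRecords PartitionedFileSet, and remembers which input partitions it has already processed so that each run consumes only new data.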

Source

/*
 * Copyright © 2015-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.examples.datacleansing;

import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.DynamicPartitioner;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.dataset.lib.partitioned.KVTableStatePersistor;
import co.cask.cdap.api.dataset.lib.partitioned.PartitionBatchInput;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import co.cask.cdap.api.metrics.Metrics;
import com.google.common.collect.ImmutableMap;
import com.google.gson.JsonParser;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * A simple MapReduce that reads records from the rawRecords PartitionedFileSet and writes every record
 * that matches a particular {@link Schema} to the cleanRecords PartitionedFileSet; records that do not
 * match are written to the invalidRecords PartitionedFileSet. It also keeps track of which partitions it
 * has already processed, so that each run consumes only new partitions of data.
 */
public class DataCleansingMapReduce extends AbstractMapReduce {
    protected static final String NAME = "DataCleansingMapReduce";
    protected static final String OUTPUT_PARTITION_KEY = "output.partition.key";
    protected static final String SCHEMA_KEY = "schema.key";

    private PartitionBatchInput.BatchPartitionCommitter partitionCommitter;

    @Override
    public void configure() {
        setName(NAME);
        setMapperResources(new Resources(1024));
        setReducerResources(new Resources(1024));
    }

    @Override
    public void beforeSubmit(MapReduceContext context) throws Exception {
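        // Consume only partitions of rawRecords that have not been processed by a previous run;
        // the consumption state is persisted in the consuming-state KeyValueTable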
        partitionCommitter = PartitionBatchInput.setInput(context, DataCleansing.RAW_RECORDS,
                new KVTableStatePersistor(DataCleansing.CONSUMING_STATE, "state.key"));

        // Each run writes its output to a partition keyed by the time passed in through
        // the required 'output.partition.key' runtime argument
        Long timeKey = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
        PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();

        Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");

        // set up two outputs - one for invalid records and one for valid records
        Map<String, String> invalidRecordsArgs = new HashMap<>();
        PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
        PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
        context.addOutput(Output.ofDataset(DataCleansing.INVALID_RECORDS, invalidRecordsArgs));

        Map<String, String> cleanRecordsArgs = new HashMap<>();
        PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
        PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
        context.addOutput(Output.ofDataset(DataCleansing.CLEAN_RECORDS, cleanRecordsArgs));

        Job job = context.getHadoopJob();
        job.setMapperClass(SchemaMatchingFilter.class);
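        // map-only job: the mapper filters and writes records directly to the two outputs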
        job.setNumReduceTasks(0);

        // simply propagate the schema (if any) to be used by the mapper
        String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
        if (schemaJson != null) {
            job.getConfiguration().set(SCHEMA_KEY, schemaJson);
        }
    }

    @Override
    public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
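        // commit the consumed partitions on success, or leave them unconsumed so a later run can retry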
        partitionCommitter.onFinish(succeeded);
    }

    /**
     * Partitions the records based on a runtime argument (time) and a field extracted from each record being written (zip).
     */
    public static final class TimeAndZipPartitioner extends DynamicPartitioner<NullWritable, Text> {

        private Long time;
        private JsonParser jsonParser;

        @Override
        public void initialize(MapReduceTaskContext<NullWritable, Text> mapReduceTaskContext) {
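            // the same 'output.partition.key' runtime argument read in beforeSubmit supplies the time field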
            this.time = Long.valueOf(mapReduceTaskContext.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
            this.jsonParser = new JsonParser();
        }

        @Override
        public PartitionKey getPartitionKey(NullWritable key, Text value) {
            int zip = jsonParser.parse(value.toString()).getAsJsonObject().get("zip").getAsInt();
            return PartitionKey.builder().addLongField("time", time).addIntField("zip", zip).build();
        }
    }

    /**
     * A Mapper which skips text that doesn't match a given schema.
     */
    public static class SchemaMatchingFilter extends Mapper<LongWritable, Text, NullWritable, Text>
            implements ProgramLifecycle<MapReduceTaskContext<NullWritable, Text>> {
        public static final Schema DEFAULT_SCHEMA = Schema.recordOf("person",
                Schema.Field.of("pid", Schema.of(Schema.Type.LONG)),
                Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
                Schema.Field.of("dob", Schema.of(Schema.Type.STRING)),
                Schema.Field.of("zip", Schema.of(Schema.Type.INT)));

        private SimpleSchemaMatcher schemaMatcher;
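        // injected by the CDAP runtime; used to emit custom counters from the mapper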
        private Metrics mapMetrics;
        private MapReduceTaskContext<NullWritable, Text> mapReduceTaskContext;

        @Override
        public void initialize(MapReduceTaskContext<NullWritable, Text> context) throws Exception {
            this.mapReduceTaskContext = context;
        }

        @Override
        public void destroy() {
        }

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // setup the schema to be used by the mapper
            String schemaJson = context.getConfiguration().get(SCHEMA_KEY);
            if (schemaJson == null) {
                schemaMatcher = new SimpleSchemaMatcher(DEFAULT_SCHEMA);
            } else {
                schemaMatcher = new SimpleSchemaMatcher(Schema.parseJson(schemaJson));
            }
        }

        public void map(LongWritable key, Text data, MapReduceTaskContext<NullWritable, Text> context)
                throws IOException, InterruptedException {
            if (!schemaMatcher.matches(data.toString())) {
                context.write(DataCleansing.INVALID_RECORDS, NullWritable.get(), data);
                mapMetrics.count("records.invalid", 1);
            } else {
                context.write(DataCleansing.CLEAN_RECORDS, NullWritable.get(), data);
                mapMetrics.count("records.valid", 1);
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
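            // delegate to the CDAP-aware overload above so records can be routed to the named outputs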
            map(key, value, this.mapReduceTaskContext);
        }
    }
}
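
Example

The mapper delegates the schema check to SimpleSchemaMatcher, a small helper class that ships with the example but is not shown on this page. Here is a minimal, hypothetical sketch of such a matcher, assuming a plain Gson-based check that every field of the schema is present as a primitive of a compatible type; the class distributed with the example may be implemented differently.

package co.cask.cdap.examples.datacleansing;

import co.cask.cdap.api.data.schema.Schema;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonSyntaxException;

/**
 * Hypothetical stand-in for the SimpleSchemaMatcher shipped with the example: a record
 * matches if it is a JSON object that contains every field of the schema as a primitive
 * of a compatible type. The real helper may be implemented differently.
 */
public class SimpleSchemaMatcher {
    private final Schema schema;

    public SimpleSchemaMatcher(Schema schema) {
        this.schema = schema;
    }

    public boolean matches(String data) {
        try {
            JsonObject record = new JsonParser().parse(data).getAsJsonObject();
            for (Schema.Field field : schema.getFields()) {
                JsonElement value = record.get(field.getName());
                if (value == null || !value.isJsonPrimitive()) {
                    return false;
                }
                // numeric schema fields (e.g. pid, zip) must hold JSON numbers
                Schema.Type type = field.getSchema().getType();
                if ((type == Schema.Type.INT || type == Schema.Type.LONG)
                        && !value.getAsJsonPrimitive().isNumber()) {
                    return false;
                }
            }
            return true;
        } catch (JsonSyntaxException | IllegalStateException e) {
            // not valid JSON, or not a JSON object
            return false;
        }
    }
}

A quick check of the sketch against the default "person" schema (the record values below are made up for illustration):

SimpleSchemaMatcher matcher =
        new SimpleSchemaMatcher(DataCleansingMapReduce.SchemaMatchingFilter.DEFAULT_SCHEMA);
matcher.matches("{\"pid\":1,\"name\":\"bob\",\"dob\":\"02-12-1983\",\"zip\":84125}"); // true
matcher.matches("{\"pid\":2,\"name\":\"alice\"}");                                   // false: dob and zip missing

To run the MapReduce, the "output.partition.key" runtime argument (a long that becomes the time field of the output partition keys) is required, while "schema.key" may optionally carry a schema JSON string that overrides DEFAULT_SCHEMA. Below is a hedged sketch of driving one run from a CDAP unit test; DataCleansingMapReduceTest is a hypothetical name, and the sketch assumes the enclosing DataCleansing application class and the CDAP test framework's TestBase.

package co.cask.cdap.examples.datacleansing;

import co.cask.cdap.test.ApplicationManager;
import co.cask.cdap.test.MapReduceManager;
import co.cask.cdap.test.TestBase;
import com.google.common.collect.ImmutableMap;
import org.junit.Test;

import java.util.Map;
import java.util.concurrent.TimeUnit;

public class DataCleansingMapReduceTest extends TestBase {

    @Test
    public void testDataCleansing() throws Exception {
        ApplicationManager app = deployApplication(DataCleansing.class);

        // "output.partition.key" (OUTPUT_PARTITION_KEY above) is required;
        // "schema.key" (SCHEMA_KEY) is omitted here, so the mapper falls back to DEFAULT_SCHEMA
        Map<String, String> args = ImmutableMap.of(
                "output.partition.key", Long.toString(System.currentTimeMillis()));

        MapReduceManager mapReduce = app.getMapReduceManager("DataCleansingMapReduce");
        mapReduce.start(args);
        mapReduce.waitForFinish(5, TimeUnit.MINUTES);
    }
}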