co.cask.cdap.examples.wikipedia.StreamToDataset.java Source code

Java tutorial

Introduction

Here is the source code for co.cask.cdap.examples.wikipedia.StreamToDataset.java.

Source

/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.examples.wikipedia;

import co.cask.cdap.api.Resources;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.stream.StreamBatchReadable;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.workflow.Value;
import co.cask.cdap.api.workflow.WorkflowToken;
import com.google.common.annotations.VisibleForTesting;
import com.google.gson.Gson;
import com.google.gson.annotations.SerializedName;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.UUID;

/**
 * MapReduce program that dumps events from a stream to a dataset.
 */
public class StreamToDataset extends AbstractMapReduce {
    private static final Logger LOG = LoggerFactory.getLogger(StreamToDataset.class);

    private final String name;

    public StreamToDataset(String name) {
        this.name = name;
    }

    @Override
    public void configure() {
        setName(name);
        setDescription("A MapReduce program that dumps events from a stream to a dataset.");
        setMapperResources(new Resources(512));
    }

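    /**
     * Chooses the mapper, input stream, and output dataset just before job submission,
     * based on the result of the previous MapReduce program recorded in the workflow token.
     */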
    @Override
    public void beforeSubmit(MapReduceContext context) throws Exception {
        Job job = context.getHadoopJob();
        job.setNumReduceTasks(0);
        WorkflowToken workflowToken = context.getWorkflowToken();
        Class<? extends Mapper> mapper = PageTitleToDatasetMapper.class;
        String inputStream = WikipediaPipelineApp.PAGE_TITLES_STREAM;
        String outputDataset = WikipediaPipelineApp.PAGE_TITLES_DATASET;
        if (workflowToken != null) {
            Value likesToDatasetResult = workflowToken.get("result", WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME);
            if (likesToDatasetResult != null && likesToDatasetResult.getAsBoolean()) {
                // The "likes" stream to the dataset has already run and has been successful in this run so far.
                // Now run raw wikipedia stream to dataset.
                mapper = RawWikiDataToDatasetMapper.class;
                inputStream = WikipediaPipelineApp.RAW_WIKIPEDIA_STREAM;
                outputDataset = WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET;
            }
        }
        LOG.info("Using '{}' as the input stream and '{}' as the output dataset.", inputStream, outputDataset);
        job.setMapperClass(mapper);
        StreamBatchReadable.useStreamInput(context, inputStream);
        context.addOutput(outputDataset);
    }

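    /**
     * Records whether this MapReduce program succeeded in the workflow token, so that
     * later workflow nodes and tests can act on the result.
     */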
    @Override
    public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
        WorkflowToken workflowToken = context.getWorkflowToken();
        if (workflowToken != null) {
            workflowToken.put("result", Value.of(succeeded));
        }
    }

    /**
     * Mapper that dumps stream events to a {@link KeyValueTable}.
     */
    public static final class PageTitleToDatasetMapper extends Mapper<LongWritable, StreamEvent, byte[], byte[]> {
        private final Gson gson = new Gson();

        @Override
        protected void map(LongWritable timestamp, StreamEvent streamEvent, Context context)
                throws IOException, InterruptedException {
            String contents = Bytes.toString(streamEvent.getBody());
            Page page = gson.fromJson(contents, Page.class);
            context.write(Bytes.toBytes(page.getId()), Bytes.toBytes(page.getName()));
            // Increment the same counter in all MapReduce programs so it can be used for verification
            // via the Workflow Token in tests, as well as in Condition Node predicates where applicable.
            context.getCounter("custom", "num.records").increment(1);
        }

        @VisibleForTesting
        static class Page {
            private final String name;
            private final String id;
            @SuppressWarnings("unused")
            @SerializedName("created_time")
            private final String createdTime;

            Page(String name, String id, String createdTime) {
                this.name = name;
                this.id = id;
                this.createdTime = createdTime;
            }

            public String getName() {
                return name;
            }

            public String getId() {
                return id;
            }
        }
    }

    /**
     * Mapper that dumps raw Wikipedia data from a stream to a {@link KeyValueTable}.
     */
    public static final class RawWikiDataToDatasetMapper extends Mapper<LongWritable, StreamEvent, byte[], byte[]> {
        @Override
        protected void map(LongWritable key, StreamEvent streamEvent, Context context)
                throws IOException, InterruptedException {
            String contents = Bytes.toString(streamEvent.getBody());
            // Use a random UUID (as a string) for the key so each raw event gets a unique entry.
            context.write(Bytes.toBytes(UUID.randomUUID().toString()), Bytes.toBytes(contents));
            context.getCounter("custom", "num.records").increment(1);
        }
    }
}
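
Example usage

StreamToDataset is a reusable MapReduce program: the application that owns it passes a
program name to the constructor and adds it (possibly more than once) to its configure()
method. The sketch below shows how it might be wired into an application class. It is
only an illustration: the real WikipediaPipelineApp is not shown on this page, so the
stream and dataset string values and the WIKI_DATA_TO_DATASET_MR_NAME program name are
assumptions; only the constant identifiers referenced in the listing above are taken
from the source.

package co.cask.cdap.examples.wikipedia;

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.data.stream.Stream;
import co.cask.cdap.api.dataset.lib.KeyValueTable;

/**
 * Illustrative sketch only; the constant values below are assumed, not taken from the real app.
 */
public class WikipediaPipelineApp extends AbstractApplication {
    static final String PAGE_TITLES_STREAM = "pageTitleStream";             // assumed value
    static final String RAW_WIKIPEDIA_STREAM = "wikiStream";                // assumed value
    static final String PAGE_TITLES_DATASET = "pages";                      // assumed value
    static final String RAW_WIKIPEDIA_DATASET = "wikidata";                 // assumed value
    static final String LIKES_TO_DATASET_MR_NAME = "likesToDataset";        // assumed value
    static final String WIKI_DATA_TO_DATASET_MR_NAME = "wikiDataToDataset"; // assumed name and value

    @Override
    public void configure() {
        setName("WikipediaPipeline");
        setDescription("Pipeline that dumps stream events into datasets.");
        // Streams that StreamToDataset reads from.
        addStream(new Stream(PAGE_TITLES_STREAM));
        addStream(new Stream(RAW_WIKIPEDIA_STREAM));
        // KeyValueTable datasets that the mappers write to.
        createDataset(PAGE_TITLES_DATASET, KeyValueTable.class);
        createDataset(RAW_WIKIPEDIA_DATASET, KeyValueTable.class);
        // The same MapReduce class is added twice under different names; at runtime its
        // beforeSubmit() uses the workflow token to pick the stream, mapper, and dataset.
        addMapReduce(new StreamToDataset(LIKES_TO_DATASET_MR_NAME));
        addMapReduce(new StreamToDataset(WIKI_DATA_TO_DATASET_MR_NAME));
    }
}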