org.apache.spark.examples.streaming.JavaKinesisWordCountASL.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.spark.examples.streaming.JavaKinesisWordCountASL.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.examples.streaming;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kinesis.KinesisUtils;

import scala.Tuple2;

import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
import com.amazonaws.services.kinesis.AmazonKinesisClient;
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream;

/**
 * Consumes messages from an Amazon Kinesis stream and does a word count.
 *
 * This example spins up 1 Kinesis Receiver per shard for the given stream.
 * It then starts pulling from the last checkpointed sequence number of the given stream.
 *
 * Usage: JavaKinesisWordCountASL [app-name] [stream-name] [endpoint-url]
 *   [app-name] is the name of the consumer app, used to track the read data in DynamoDB
 *   [stream-name] name of the Kinesis stream (e.g. mySparkStream)
 *   [endpoint-url] endpoint of the Kinesis service
 *     (e.g. https://kinesis.us-east-1.amazonaws.com)
 *   The region name is not an argument; it is derived from [endpoint-url].
 *
 *
 * Example:
 *      # export AWS keys if necessary
 *      $ export AWS_ACCESS_KEY_ID=[your-access-key]
 *      $ export AWS_SECRET_KEY=[your-secret-key]
 *
 *      # run the example
 *      $ $SPARK_HOME/bin/run-example streaming.JavaKinesisWordCountASL myAppName mySparkStream \
 *             https://kinesis.us-east-1.amazonaws.com
 *
 * There is a companion helper class called KinesisWordProducerASL which puts dummy data
 * onto the Kinesis stream.
 *
 * This code uses the DefaultAWSCredentialsProviderChain to find credentials
 * in the following order:
 *    Environment Variables - AWS_ACCESS_KEY_ID and AWS_SECRET_KEY
 *    Java System Properties - aws.accessKeyId and aws.secretKey
 *    Credential profiles file - default location (~/.aws/credentials) shared by all AWS SDKs
 *    Instance profile credentials - delivered through the Amazon EC2 metadata service
 * For more information, see
 * http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/credentials.html
 *
 * See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more details on
 * the Kinesis Spark Streaming integration.
 */
public final class JavaKinesisWordCountASL { // needs to be public for access from run-example
    /** Records are split into words on a single space character. Compiled once and reused. */
    private static final Pattern WORD_SEPARATOR = Pattern.compile(" ");

    /**
     * Runs the word count: looks up the shard count of the given Kinesis stream, creates one
     * Kinesis input DStream per shard, unions them, splits every record into words and prints
     * the per-batch word counts until the streaming context terminates.
     *
     * @param args exactly three values, in order: {@code <app-name>}, {@code <stream-name>},
     *             {@code <endpoint-url>}; any other count prints the usage text and exits
     *             with status 1
     * @throws IllegalStateException if the stream reports no shards, so no receiver can be created
     * @throws Exception any other failure raised while setting up or running the job
     */
    public static void main(String[] args) throws Exception {
        // Check that all required args were passed in.
        if (args.length != 3) {
            // The usage line must list all three positional arguments that the check above requires.
            System.err.println("Usage: JavaKinesisWordCountASL <app-name> <stream-name> <endpoint-url>\n\n"
                    + "    <app-name> is the name of the app, used to track the read data in DynamoDB\n"
                    + "    <stream-name> is the name of the Kinesis stream\n"
                    + "    <endpoint-url> is the endpoint of the Kinesis service\n"
                    + "                   (e.g. https://kinesis.us-east-1.amazonaws.com)\n"
                    + "Generate data for the Kinesis stream using the example KinesisWordProducerASL.\n"
                    + "See http://spark.apache.org/docs/latest/streaming-kinesis-integration.html for more\n"
                    + "details.\n");
            System.exit(1);
        }

        // Set default log4j logging level to WARN to hide Spark logs
        StreamingExamples.setStreamingLogLevels();

        // Populate the appropriate variables from the given args
        String kinesisAppName = args[0];
        String streamName = args[1];
        String endpointUrl = args[2];

        // Create a Kinesis client in order to determine the number of shards for the given stream
        AmazonKinesisClient kinesisClient = new AmazonKinesisClient(new DefaultAWSCredentialsProviderChain());
        kinesisClient.setEndpoint(endpointUrl);
        int numShards = kinesisClient.describeStream(streamName).getStreamDescription().getShards().size();

        // Fail fast with a clear message: with zero shards no DStream would be created and the
        // union step below would otherwise die with a bare IndexOutOfBoundsException.
        if (numShards < 1) {
            throw new IllegalStateException(
                    "Kinesis stream '" + streamName + "' reports no shards; cannot create any receiver.");
        }

        // In this example, we're going to create 1 Kinesis Receiver/input DStream for each shard.
        // This is not a necessity; if there are fewer receivers/DStreams than the number of shards,
        // then the shards will be automatically distributed among the receivers and each receiver
        // will receive data from multiple shards.
        int numStreams = numShards;

        // Spark Streaming batch interval (milliseconds)
        Duration batchInterval = new Duration(2000);

        // Kinesis checkpoint interval.  Same as batchInterval for this example.
        Duration kinesisCheckpointInterval = batchInterval;

        // Get the region name from the endpoint URL to save Kinesis Client Library metadata in
        // DynamoDB of the same region as the Kinesis stream
        String regionName = KinesisExampleUtils.getRegionNameByEndpoint(endpointUrl);

        // Setup the Spark config and StreamingContext
        SparkConf sparkConfig = new SparkConf().setAppName("JavaKinesisWordCountASL");
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval);

        // Create the Kinesis DStreams, one per shard
        List<JavaDStream<byte[]>> streamsList = new ArrayList<>(numStreams);
        for (int i = 0; i < numStreams; i++) {
            streamsList.add(KinesisUtils.createStream(jssc, kinesisAppName, streamName, endpointUrl, regionName,
                    InitialPositionInStream.LATEST, kinesisCheckpointInterval, StorageLevel.MEMORY_AND_DISK_2()));
        }

        // Union all the streams if there is more than 1 stream
        JavaDStream<byte[]> unionStreams;
        if (streamsList.size() > 1) {
            unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
        } else {
            // Otherwise, just use the 1 stream (guaranteed to exist by the shard-count check above)
            unionStreams = streamsList.get(0);
        }

        // Decode each record's bytes as UTF-8 text and split it into words
        JavaDStream<String> words = unionStreams.flatMap(new FlatMapFunction<byte[], String>() {
            @Override
            public Iterator<String> call(byte[] line) {
                String s = new String(line, StandardCharsets.UTF_8);
                return Arrays.asList(WORD_SEPARATOR.split(s)).iterator();
            }
        });

        // Map each word to a (word, 1) tuple so we can reduce by key to count the words
        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) {
                return new Tuple2<>(s, 1);
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });

        // Print the first 10 wordCounts
        wordCounts.print();

        // Start the streaming context and await termination
        jssc.start();
        jssc.awaitTermination();
    }
}