com.sparkz.streamcount.WordCount.java Source code

Java tutorial

Introduction

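Here is the source code for com.sparkz.streamcount.WordCount.java, a Spark Streaming job that keeps a running count of the words read from a socket text stream and prints the words whose count exceeds a threshold given on the command line.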

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sparkz.streamcount;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.StorageLevels;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function3;
import org.apache.spark.api.java.function.PairFunction;
//import org.apache.spark.wordcount.State;
//import org.apache.spark.wordcount.StateSpec;
//import org.apache.spark.ex.util.*;
//import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.State;
import org.apache.spark.streaming.StateSpec;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

import com.google.common.base.Optional;

/**
 * Spark Streaming driver that keeps a cumulative count per word.
 *
 * <p>Lines are read from a text socket on {@code 127.0.0.1:37337}, split on single
 * spaces, and folded into a per-word running total with {@code mapWithState}. The
 * state is seeded with {@code ("hello", 1)} and {@code ("world", 1)}. Two streams are
 * printed for every batch: all updated {@code (word, total)} pairs, and only those
 * pairs whose total is strictly greater than the threshold given as the first
 * command-line argument.
 */
public class WordCount {
    private static final Pattern SPACE = Pattern.compile(" ");

    /**
     * Starts the streaming job and blocks until it terminates.
     *
     * @param args {@code args[0]} is the integer threshold; a word is emitted on the
     *             filtered stream only when its cumulative count exceeds it
     * @throws IllegalArgumentException if the threshold is missing or not an integer
     */
    public static void main(String[] args) {
        // Validate the command line first so a bad invocation fails before any
        // Spark context has been created.
        final int threshold = parseThreshold(args);

        SparkConf config = new SparkConf();
        config.setAppName("Word Count");
        JavaSparkContext ctx = new JavaSparkContext(config);
        // NOTE(review): machine-specific absolute path; presumably this should come
        // from configuration or the submit command instead - confirm with the owner.
        ctx.addFile("/home/cloudera/Downloads/spark-streaming_2.10-1.6.0.jar");

        // One-second batches.
        JavaStreamingContext jssc = new JavaStreamingContext(ctx, new Duration(1000));
        try {
            // Checkpoint into the current working directory.
            jssc.checkpoint(".");

            // Initial state RDD input to mapWithState
            @SuppressWarnings("unchecked")
            List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<String, Integer>("hello", 1),
                    new Tuple2<String, Integer>("world", 1));
            JavaPairRDD<String, Integer> initialRDD = jssc.sparkContext().parallelizePairs(tuples);

            JavaReceiverInputDStream<String> lines = jssc.socketTextStream("127.0.0.1", 37337,
                    StorageLevels.MEMORY_AND_DISK_SER_2);

            // Split each line into words. Splitting on a single space produces empty
            // tokens for blank lines, leading spaces and runs of spaces; drop them so
            // the empty string is never counted as a word.
            JavaDStream<String> tokenized = lines.flatMap(new FlatMapFunction<String, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterable<String> call(String s) {
                    return Arrays.asList(SPACE.split(s));
                }
            }).filter(new Function<String, Boolean>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Boolean call(String token) {
                    return !token.isEmpty();
                }
            });

            // Pair every word with a count of 1; the totals are accumulated below.
            JavaPairDStream<String, Integer> wordsDstream = tokenized
                    .mapToPair(new PairFunction<String, String, Integer>() {
                        private static final long serialVersionUID = 1L;

                        @Override
                        public Tuple2<String, Integer> call(String s) {
                            return new Tuple2<String, Integer>(s, 1);
                        }
                    });

            // Adds the incoming count (0 when absent) to the stored total (0 when the
            // word has no state yet), stores the new total and emits (word, total).
            final Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
                    int sum = one.or(0) + (state.exists() ? state.get() : 0);
                    Tuple2<String, Integer> output = new Tuple2<String, Integer>(word, sum);
                    state.update(sum);
                    return output;
                }
            };

            // DStream of cumulative counts, updated in every batch.
            JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordsDstream
                    .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

            stateDstream.print();

            // Keep only the words whose cumulative count is strictly above the threshold.
            JavaDStream<Tuple2<String, Integer>> filteredStream = stateDstream
                    .filter(new Function<Tuple2<String, Integer>, Boolean>() {

                        private static final long serialVersionUID = 1L;

                        @Override
                        public Boolean call(Tuple2<String, Integer> state) throws Exception {
                            return state._2() > threshold;
                        }
                    });

            filteredStream.print();

            jssc.start();
            jssc.awaitTermination();
        } finally {
            // Close the streaming context even when setup, start() or
            // awaitTermination() fails.
            jssc.close();
        }
    }

    /**
     * Reads the threshold from the first command-line argument.
     *
     * @param args the raw command-line arguments
     * @return the parsed threshold
     * @throws IllegalArgumentException if no argument was supplied or it is not an integer
     */
    private static int parseThreshold(String[] args) {
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException("Usage: WordCount <threshold>");
        }
        try {
            return Integer.parseInt(args[0]);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
                    "Usage: WordCount <threshold> - threshold must be an integer, got: " + args[0], e);
        }
    }
}