com.dataartisans.flink.dataflow.examples.streaming.KafkaWindowedWordCountExample.java Source code

Java tutorial

Introduction

Here is the source code for com.dataartisans.flink.dataflow.examples.streaming.KafkaWindowedWordCountExample.java

Source

/*
 * Copyright 2015 Data Artisans GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.dataartisans.flink.dataflow.examples.streaming;

import com.dataartisans.flink.dataflow.FlinkPipelineRunner;
import com.dataartisans.flink.dataflow.translation.wrappers.streaming.io.UnboundedFlinkSource;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.Read;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.*;
import com.google.cloud.dataflow.sdk.transforms.windowing.*;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer08;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.joda.time.Duration;

import java.util.Properties;

public class KafkaWindowedWordCountExample {

    static final String KAFKA_TOPIC = "test"; // Default kafka topic to read from
    static final String KAFKA_BROKER = "localhost:9092"; // Default kafka broker to contact
    static final String GROUP_ID = "myGroup"; // Default groupId
    static final String ZOOKEEPER = "localhost:2181"; // Default zookeeper to connect to for Kafka

    public static class ExtractWordsFn extends DoFn<String, String> {
        private final Aggregator<Long, Long> emptyLines = createAggregator("emptyLines", new Sum.SumLongFn());

        @Override
        public void processElement(ProcessContext c) {
            if (c.element().trim().isEmpty()) {
                emptyLines.addValue(1L);
            }

            // Split the line into words.
            String[] words = c.element().split("[^a-zA-Z']+");

            // Output each word encountered into the output PCollection.
            for (String word : words) {
                if (!word.isEmpty()) {
                    c.output(word);
                }
            }
        }
    }

    public static class FormatAsStringFn extends DoFn<KV<String, Long>, String> {
        @Override
        public void processElement(ProcessContext c) {
            String row = c.element().getKey() + " - " + c.element().getValue() + " @ " + c.timestamp().toString();
            System.out.println(row);
            c.output(row);
        }
    }

    public interface KafkaStreamingWordCountOptions extends WindowedWordCount.StreamingWordCountOptions {
        @Description("The Kafka topic to read from")
        @Default.String(KAFKA_TOPIC)
        String getKafkaTopic();

        void setKafkaTopic(String value);

        @Description("The Kafka Broker to read from")
        @Default.String(KAFKA_BROKER)
        String getBroker();

        void setBroker(String value);

        @Description("The Zookeeper server to connect to")
        @Default.String(ZOOKEEPER)
        String getZookeeper();

        void setZookeeper(String value);

        @Description("The groupId")
        @Default.String(GROUP_ID)
        String getGroup();

        void setGroup(String value);

    }

    public static void main(String[] args) {
        PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
        KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args)
                .as(KafkaStreamingWordCountOptions.class);
        options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
        options.setStreaming(true);
        options.setCheckpointingInterval(1000L);
        options.setNumberOfExecutionRetries(5);
        options.setExecutionRetryDelay(3000L);
        options.setRunner(FlinkPipelineRunner.class);

        System.out.println(options.getKafkaTopic() + " " + options.getZookeeper() + " " + options.getBroker() + " "
                + options.getGroup());
        Pipeline pipeline = Pipeline.create(options);

        Properties p = new Properties();
        p.setProperty("zookeeper.connect", options.getZookeeper());
        p.setProperty("bootstrap.servers", options.getBroker());
        p.setProperty("group.id", options.getGroup());

        // this is the Flink consumer that reads the input to
        // the program from a kafka topic.
        FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(options.getKafkaTopic(),
                new SimpleStringSchema(), p);

        PCollection<String> words = pipeline
                .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
                .apply(ParDo.of(new ExtractWordsFn()))
                .apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
                        .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
                        .discardingFiredPanes());

        PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

        wordCounts.apply(ParDo.of(new FormatAsStringFn())).apply(TextIO.Write.to("./outputKafka.txt"));

        pipeline.run();
    }
}