Java tutorial: a walk through Apache Flink's KafkaITCase, the integration test for the Flink streaming Kafka connector (KafkaSource, PersistentKafkaSource, and KafkaSink), run against an embedded ZooKeeper instance and a three-broker Kafka cluster.
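Before diving into the full source, here is the pattern that every test in the class repeats: build a Kafka high-level consumer ConsumerConfig (with auto.commit.enable=false and auto.offset.reset=smallest, so offsets are managed by the connector and reading starts from the beginning of the topic), add a PersistentKafkaSource to a local StreamExecutionEnvironment, and write results back through a KafkaSink. The following is a minimal sketch of that wiring, not part of the test itself; the ZooKeeper address, broker list, and topic name are placeholders (the test derives them from its embedded cluster).

import java.util.Properties;

import kafka.consumer.ConsumerConfig;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.api.KafkaSink;
import org.apache.flink.streaming.connectors.kafka.api.persistent.PersistentKafkaSource;
import org.apache.flink.streaming.util.serialization.JavaDefaultStringSchema;

public class KafkaWiringSketch {

	public static void main(String[] args) throws Exception {
		// Consumer configuration, mirroring the "standardCC" built in prepare() below.
		Properties cProps = new Properties();
		cProps.setProperty("zookeeper.connect", "localhost:2181"); // placeholder ZooKeeper address
		cProps.setProperty("group.id", "flink-tests");
		cProps.setProperty("auto.commit.enable", "false"); // offsets are committed by the source itself
		cProps.setProperty("auto.offset.reset", "smallest"); // read from the beginning
		ConsumerConfig consumerConfig = new ConsumerConfig(cProps);

		StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1);

		// Consume a (hypothetical) topic with the ZooKeeper-backed, checkpointable source ...
		DataStream<String> fromKafka = env.addSource(
				new PersistentKafkaSource<String>("exampleTopic", new JavaDefaultStringSchema(), consumerConfig));

		// ... and write the stream back out through the connector's sink.
		// "localhost:9092" stands in for the comma-separated broker connection strings.
		fromKafka.addSink(new KafkaSink<String>("localhost:9092", "exampleTopic", new JavaDefaultStringSchema()));

		env.execute("Kafka wiring sketch");
	}
}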
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.streaming.connectors.kafka; import static org.junit.Assert.assertEquals; import java.io.File; import java.io.IOException; import java.lang.reflect.Field; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Random; import java.util.UUID; import kafka.admin.AdminUtils; import kafka.api.PartitionMetadata; import kafka.consumer.Consumer; import kafka.consumer.ConsumerConfig; import kafka.consumer.ConsumerIterator; import kafka.consumer.KafkaStream; import kafka.javaapi.consumer.ConsumerConnector; import kafka.message.MessageAndMetadata; import kafka.network.SocketServer; import kafka.server.KafkaConfig; import kafka.server.KafkaServer; import org.I0Itec.zkclient.ZkClient; import org.apache.commons.collections.map.LinkedMap; import org.apache.curator.test.TestingServer; import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.common.functions.MapFunction; import org.apache.flink.api.common.functions.RichFlatMapFunction; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.client.JobExecutionException; import org.apache.flink.runtime.net.NetUtils; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; import org.apache.flink.streaming.api.functions.sink.SinkFunction; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; import org.apache.flink.streaming.api.functions.source.RichSourceFunction; import org.apache.flink.streaming.api.functions.source.SourceFunction; import org.apache.flink.streaming.connectors.kafka.api.KafkaSink; import org.apache.flink.streaming.connectors.kafka.api.KafkaSource; import org.apache.flink.streaming.connectors.kafka.api.persistent.PersistentKafkaSource; import org.apache.flink.streaming.connectors.kafka.partitioner.SerializableKafkaPartitioner; import org.apache.flink.streaming.connectors.kafka.util.KafkaLocalSystemTime; import org.apache.flink.streaming.util.serialization.DeserializationSchema; import org.apache.flink.streaming.util.serialization.JavaDefaultStringSchema; import org.apache.flink.util.Collector; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Ignore; import 
org.junit.Test; import org.junit.rules.TemporaryFolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import scala.collection.Seq; /** * Code in this test is based on the following GitHub repository: * (as per commit bc6b2b2d5f6424d5f377aa6c0871e82a956462ef) * <p/> * https://github.com/sakserv/hadoop-mini-clusters (ASL licensed) */ public class KafkaITCase { private static final Logger LOG = LoggerFactory.getLogger(KafkaITCase.class); private static final int NUMBER_OF_KAFKA_SERVERS = 3; private static int zkPort; private static String kafkaHost; private static String zookeeperConnectionString; @ClassRule public static TemporaryFolder tempFolder = new TemporaryFolder(); public static File tmpZkDir; public static List<File> tmpKafkaDirs; private static TestingServer zookeeper; private static List<KafkaServer> brokers; private static String brokerConnectionStrings = ""; private static ConsumerConfig standardCC; private static ZkClient zkClient; @BeforeClass public static void prepare() throws IOException { LOG.info("Starting KafkaITCase.prepare()"); tmpZkDir = tempFolder.newFolder(); tmpKafkaDirs = new ArrayList<File>(NUMBER_OF_KAFKA_SERVERS); for (int i = 0; i < NUMBER_OF_KAFKA_SERVERS; i++) { tmpKafkaDirs.add(tempFolder.newFolder()); } kafkaHost = InetAddress.getLocalHost().getHostName(); zkPort = NetUtils.getAvailablePort(); zookeeperConnectionString = "localhost:" + zkPort; zookeeper = null; brokers = null; try { LOG.info("Starting Zookeeper"); zookeeper = getZookeeper(); LOG.info("Starting KafkaServer"); brokers = new ArrayList<KafkaServer>(NUMBER_OF_KAFKA_SERVERS); for (int i = 0; i < NUMBER_OF_KAFKA_SERVERS; i++) { brokers.add(getKafkaServer(i, tmpKafkaDirs.get(i))); SocketServer socketServer = brokers.get(i).socketServer(); String host = "localhost"; if (socketServer.host() != null) { host = socketServer.host(); } brokerConnectionStrings += host + ":" + socketServer.port() + ","; } LOG.info("ZK and KafkaServer started."); } catch (Throwable t) { LOG.warn("Test failed with exception", t); Assert.fail("Test failed with: " + t.getMessage()); } Properties cProps = new Properties(); cProps.setProperty("zookeeper.connect", zookeeperConnectionString); cProps.setProperty("group.id", "flink-tests"); cProps.setProperty("auto.commit.enable", "false"); cProps.setProperty("auto.offset.reset", "smallest"); // read from the beginning. 
standardCC = new ConsumerConfig(cProps); zkClient = new ZkClient(standardCC.zkConnect(), standardCC.zkSessionTimeoutMs(), standardCC.zkConnectionTimeoutMs(), new PersistentKafkaSource.KafkaZKStringSerializer()); } @AfterClass public static void shutDownServices() { LOG.info("Shutting down all services"); for (KafkaServer broker : brokers) { if (broker != null) { broker.shutdown(); } } if (zookeeper != null) { try { zookeeper.stop(); } catch (IOException e) { LOG.warn("ZK.stop() failed", e); } } zkClient.close(); } // -------------------------- test checkpointing ------------------------ @Test public void testCheckpointing() throws Exception { createTestTopic("testCheckpointing", 1, 1); Properties props = new Properties(); props.setProperty("zookeeper.connect", zookeeperConnectionString); props.setProperty("group.id", "testCheckpointing"); props.setProperty("auto.commit.enable", "false"); ConsumerConfig cc = new ConsumerConfig(props); PersistentKafkaSource<String> source = new PersistentKafkaSource<String>("testCheckpointing", new FakeDeserializationSchema(), cc); Field pendingCheckpointsField = PersistentKafkaSource.class.getDeclaredField("pendingCheckpoints"); pendingCheckpointsField.setAccessible(true); LinkedMap pendingCheckpoints = (LinkedMap) pendingCheckpointsField.get(source); Assert.assertEquals(0, pendingCheckpoints.size()); // first restore source.restoreState(new long[] { 1337 }); // then open source.open(new Configuration()); long[] state1 = source.snapshotState(1, 15); Assert.assertArrayEquals(new long[] { 1337 }, state1); long[] state2 = source.snapshotState(2, 30); Assert.assertArrayEquals(new long[] { 1337 }, state2); Assert.assertEquals(2, pendingCheckpoints.size()); source.notifyCheckpointComplete(1); Assert.assertEquals(1, pendingCheckpoints.size()); source.notifyCheckpointComplete(2); Assert.assertEquals(0, pendingCheckpoints.size()); source.notifyCheckpointComplete(666); // invalid checkpoint Assert.assertEquals(0, pendingCheckpoints.size()); // create 500 snapshots for (int i = 0; i < 500; i++) { source.snapshotState(i, 15 * i); } Assert.assertEquals(500, pendingCheckpoints.size()); // commit only the second last source.notifyCheckpointComplete(498); Assert.assertEquals(1, pendingCheckpoints.size()); // access invalid checkpoint source.notifyCheckpointComplete(490); // and the last source.notifyCheckpointComplete(499); Assert.assertEquals(0, pendingCheckpoints.size()); } private static class FakeDeserializationSchema implements DeserializationSchema<String> { @Override public String deserialize(byte[] message) { return null; } @Override public boolean isEndOfStream(String nextElement) { return false; } @Override public TypeInformation<String> getProducedType() { return null; } } // --------------------------------------------------------------- @Test public void testOffsetManipulation() { ZkClient zk = new ZkClient(standardCC.zkConnect(), standardCC.zkSessionTimeoutMs(), standardCC.zkConnectionTimeoutMs(), new PersistentKafkaSource.KafkaZKStringSerializer()); final String topicName = "testOffsetManipulation"; // create topic Properties topicConfig = new Properties(); LOG.info("Creating topic {}", topicName); AdminUtils.createTopic(zk, topicName, 3, 2, topicConfig); PersistentKafkaSource.setOffset(zk, standardCC.groupId(), topicName, 0, 1337); Assert.assertEquals(1337L, PersistentKafkaSource.getOffset(zk, standardCC.groupId(), topicName, 0)); zk.close(); } public static class TestPersistentKafkaSource<OUT> extends PersistentKafkaSource<OUT> { private static Object sync 
= new Object(); public static long[] finalOffset; public TestPersistentKafkaSource(String topicName, DeserializationSchema<OUT> deserializationSchema, ConsumerConfig consumerConfig) { super(topicName, deserializationSchema, consumerConfig); } @Override public void close() { super.close(); LOG.info("Starting close " + Arrays.toString(commitedOffsets)); synchronized (sync) { if (finalOffset == null) { finalOffset = new long[commitedOffsets.length]; } for (int i = 0; i < commitedOffsets.length; i++) { if (commitedOffsets[i] > 0) { if (finalOffset[i] > 0) { throw new RuntimeException("This is unexpected on i = " + i); } finalOffset[i] = commitedOffsets[i]; } } } LOG.info("Finished closing. Final " + Arrays.toString(finalOffset)); } } /** * We want to use the high-level Java consumer API but manage the offsets in ZooKeeper manually. */ @Test @Ignore public void testPersistentSourceWithOffsetUpdates() throws Exception { LOG.info("Starting testPersistentSourceWithOffsetUpdates()"); ZkClient zk = new ZkClient(standardCC.zkConnect(), standardCC.zkSessionTimeoutMs(), standardCC.zkConnectionTimeoutMs(), new PersistentKafkaSource.KafkaZKStringSerializer()); final String topicName = "testOffsetHacking"; StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(3); env.getConfig().disableSysoutLogging(); env.enableCheckpointing(50); env.setNumberOfExecutionRetries(0); // create topic
Properties topicConfig = new Properties(); LOG.info("Creating topic {}", topicName); AdminUtils.createTopic(zk, topicName, 3, 2, topicConfig); // write a sequence from 0 to 99 to each of the three partitions.
writeSequence(env, topicName, 0, 99); readSequence(env, standardCC, topicName, 0, 100, 300); LOG.info("State in persistent kafka sources {}", Arrays.toString(TestPersistentKafkaSource.finalOffset)); // check that the offsets have been advanced to at least 50.
// Correctly, we would expect them to be set to 99, but right now there is no way of stopping a topology once all pending
// checkpoints have been committed.
// To work around that limitation, the persistent Kafka consumer is throttled with a Thread.sleep(). 
long o1 = -1, o2 = -1, o3 = -1; if (TestPersistentKafkaSource.finalOffset[0] > 0) { o1 = PersistentKafkaSource.getOffset(zk, standardCC.groupId(), topicName, 0); Assert.assertTrue("The offset seems incorrect, got " + o1, o1 == TestPersistentKafkaSource.finalOffset[0]); } if (TestPersistentKafkaSource.finalOffset[1] > 0) { o2 = PersistentKafkaSource.getOffset(zk, standardCC.groupId(), topicName, 1); Assert.assertTrue("The offset seems incorrect, got " + o2, o2 == TestPersistentKafkaSource.finalOffset[1]); } if (TestPersistentKafkaSource.finalOffset[2] > 0) { o3 = PersistentKafkaSource.getOffset(zk, standardCC.groupId(), topicName, 2); Assert.assertTrue("The offset seems incorrect, got " + o3, o3 == TestPersistentKafkaSource.finalOffset[2]); } Assert.assertFalse("no offset has been set", TestPersistentKafkaSource.finalOffset[0] == 0 && TestPersistentKafkaSource.finalOffset[1] == 0 && TestPersistentKafkaSource.finalOffset[2] == 0); LOG.info("Got final offsets from zookeeper o1={}, o2={}, o3={}", o1, o2, o3); LOG.info("Manipulating offsets"); // set the offset to 50 for the three partitions PersistentKafkaSource.setOffset(zk, standardCC.groupId(), topicName, 0, 50); PersistentKafkaSource.setOffset(zk, standardCC.groupId(), topicName, 1, 50); PersistentKafkaSource.setOffset(zk, standardCC.groupId(), topicName, 2, 50); // create new env env = StreamExecutionEnvironment.createLocalEnvironment(3); env.getConfig().disableSysoutLogging(); readSequence(env, standardCC, topicName, 50, 50, 150); zk.close(); LOG.info("Finished testPersistentSourceWithOffsetUpdates()"); } private void readSequence(StreamExecutionEnvironment env, ConsumerConfig cc, final String topicName, final int valuesStartFrom, final int valuesCount, final int finalCount) throws Exception { LOG.info("Reading sequence for verification until final count {}", finalCount); TestPersistentKafkaSource<Tuple2<Integer, Integer>> pks = new TestPersistentKafkaSource<Tuple2<Integer, Integer>>( topicName, new Utils.TypeInformationSerializationSchema<Tuple2<Integer, Integer>>( new Tuple2<Integer, Integer>(1, 1), env.getConfig()), cc); DataStream<Tuple2<Integer, Integer>> source = env.addSource(pks) .map(new MapFunction<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() { // we need to slow down the source so that it can participate in a few checkpoints. // Otherwise it would write its data into buffers and shut down. 
@Override public Tuple2<Integer, Integer> map(Tuple2<Integer, Integer> value) throws Exception { Thread.sleep(50); return value; } }); // verify data DataStream<Integer> validIndexes = source .flatMap(new RichFlatMapFunction<Tuple2<Integer, Integer>, Integer>() { private static final long serialVersionUID = 1L; int[] values = new int[valuesCount]; int count = 0; @Override public void flatMap(Tuple2<Integer, Integer> value, Collector<Integer> out) throws Exception { values[value.f1 - valuesStartFrom]++; count++; LOG.info("Reader " + getRuntimeContext().getIndexOfThisSubtask() + " got " + value + " count=" + count + "/" + finalCount); // verify if we've seen everything if (count == finalCount) { LOG.info("Received all values"); for (int i = 0; i < values.length; i++) { int v = values[i]; if (v != 3) { LOG.warn("Test is going to fail"); printTopic(topicName, valuesCount, this.getRuntimeContext().getExecutionConfig()); throw new RuntimeException("Expected v to be 3, but was " + v + " on element " + i + " array=" + Arrays.toString(values)); } } // test has passed throw new SuccessException(); } } }).setParallelism(1); tryExecute(env, "Read data from Kafka"); LOG.info("Successfully read sequence for verification"); } private void writeSequence(StreamExecutionEnvironment env, String topicName, final int from, final int to) throws Exception { LOG.info("Writing sequence from {} to {} to topic {}", from, to, topicName); DataStream<Tuple2<Integer, Integer>> stream = env .addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() { private static final long serialVersionUID = 1L; boolean running = true; @Override public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception { LOG.info("Starting source."); int cnt = from; int partition = getRuntimeContext().getIndexOfThisSubtask(); while (running) { LOG.info("Writing " + cnt + " to partition " + partition); ctx.collect( new Tuple2<Integer, Integer>(getRuntimeContext().getIndexOfThisSubtask(), cnt)); if (cnt == to) { LOG.info("Writer reached end."); return; } cnt++; } } @Override public void cancel() { LOG.info("Source got cancel()"); running = false; } }).setParallelism(3); stream.addSink(new KafkaSink<Tuple2<Integer, Integer>>(brokerConnectionStrings, topicName, new Utils.TypeInformationSerializationSchema<Tuple2<Integer, Integer>>( new Tuple2<Integer, Integer>(1, 1), env.getConfig()), new T2Partitioner())).setParallelism(3); env.execute("Write sequence from " + from + " to " + to + " to topic " + topicName); LOG.info("Finished writing sequence"); } private static class T2Partitioner implements SerializableKafkaPartitioner { private static final long serialVersionUID = 1L; @Override public int partition(Object key, int numPartitions) { if (numPartitions != 3) { throw new IllegalArgumentException("Expected three partitions"); } Tuple2<Integer, Integer> element = (Tuple2<Integer, Integer>) key; return element.f0; } } @Test public void regularKafkaSourceTest() throws Exception { LOG.info("Starting KafkaITCase.regularKafkaSourceTest()"); String topic = "regularKafkaSourceTestTopic"; createTestTopic(topic, 1, 1); final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1); // add consuming topology: DataStreamSource<Tuple2<Long, String>> consuming = env .addSource(new KafkaSource<Tuple2<Long, String>>(zookeeperConnectionString, topic, "myFlinkGroup", new Utils.TypeInformationSerializationSchema<Tuple2<Long, String>>( new Tuple2<Long, String>(1L, ""), env.getConfig()), 5000)); consuming.addSink(new 
SinkFunction<Tuple2<Long, String>>() { private static final long serialVersionUID = 1L; int elCnt = 0; int start = -1; BitSet validator = new BitSet(101); @Override public void invoke(Tuple2<Long, String> value) throws Exception { LOG.debug("Got value = " + value); String[] sp = value.f1.split("-"); int v = Integer.parseInt(sp[1]); assertEquals(value.f0 - 1000, (long) v); if (start == -1) { start = v; } Assert.assertFalse("Received tuple twice", validator.get(v - start)); validator.set(v - start); elCnt++; if (elCnt == 100) { // check if everything in the bitset is set to true int nc; if ((nc = validator.nextClearBit(0)) != 100) { throw new RuntimeException("The bitset was not set to 1 on all elements. Next clear:" + nc + " Set: " + validator); } throw new SuccessException(); } } }); // add producing topology DataStream<Tuple2<Long, String>> stream = env.addSource(new SourceFunction<Tuple2<Long, String>>() { private static final long serialVersionUID = 1L; boolean running = true; @Override public void run(SourceContext<Tuple2<Long, String>> ctx) throws Exception { LOG.info("Starting source."); int cnt = 0; while (running) { ctx.collect(new Tuple2<Long, String>(1000L + cnt, "kafka-" + cnt++)); try { Thread.sleep(100); } catch (InterruptedException ignored) { } } } @Override public void cancel() { LOG.info("Source got cancel()"); running = false; } }); stream.addSink(new KafkaSink<Tuple2<Long, String>>(brokerConnectionStrings, topic, new Utils.TypeInformationSerializationSchema<Tuple2<Long, String>>(new Tuple2<Long, String>(1L, ""), env.getConfig()))); tryExecute(env, "regular kafka source test"); LOG.info("Finished KafkaITCase.regularKafkaSourceTest()"); } @Test public void tupleTestTopology() throws Exception { LOG.info("Starting KafkaITCase.tupleTestTopology()"); String topic = "tupleTestTopic"; createTestTopic(topic, 1, 1); final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1); // add consuming topology: DataStreamSource<Tuple2<Long, String>> consuming = env .addSource(new PersistentKafkaSource<Tuple2<Long, String>>(topic, new Utils.TypeInformationSerializationSchema<Tuple2<Long, String>>( new Tuple2<Long, String>(1L, ""), env.getConfig()), standardCC)); consuming.addSink(new RichSinkFunction<Tuple2<Long, String>>() { private static final long serialVersionUID = 1L; int elCnt = 0; int start = -1; BitSet validator = new BitSet(101); @Override public void invoke(Tuple2<Long, String> value) throws Exception { LOG.info("Got value " + value); String[] sp = value.f1.split("-"); int v = Integer.parseInt(sp[1]); assertEquals(value.f0 - 1000, (long) v); if (start == -1) { start = v; } Assert.assertFalse("Received tuple twice", validator.get(v - start)); validator.set(v - start); elCnt++; if (elCnt == 100) { // check if everything in the bitset is set to true int nc; if ((nc = validator.nextClearBit(0)) != 100) { throw new RuntimeException("The bitset was not set to 1 on all elements. 
Next clear:" + nc + " Set: " + validator); } throw new SuccessException(); } } @Override public void close() throws Exception { super.close(); Assert.assertTrue("No element received", elCnt > 0); } }); // add producing topology DataStream<Tuple2<Long, String>> stream = env.addSource(new SourceFunction<Tuple2<Long, String>>() { private static final long serialVersionUID = 1L; boolean running = true; @Override public void run(SourceContext<Tuple2<Long, String>> ctx) throws Exception { LOG.info("Starting source."); int cnt = 0; while (running) { ctx.collect(new Tuple2<Long, String>(1000L + cnt, "kafka-" + cnt++)); LOG.info("Produced " + cnt); try { Thread.sleep(100); } catch (InterruptedException ignored) { } } } @Override public void cancel() { LOG.info("Source got cancel()"); running = false; } }); stream.addSink(new KafkaSink<Tuple2<Long, String>>(brokerConnectionStrings, topic, new Utils.TypeInformationSerializationSchema<Tuple2<Long, String>>(new Tuple2<Long, String>(1L, ""), env.getConfig()))); tryExecute(env, "tupletesttopology"); LOG.info("Finished KafkaITCase.tupleTestTopology()"); } /** * Test Flink's Kafka integration also with very big records (30MB) * * see http://stackoverflow.com/questions/21020347/kafka-sending-a-15mb-message * * @throws Exception */ @Test public void bigRecordTestTopology() throws Exception { LOG.info("Starting KafkaITCase.bigRecordTestTopology()"); String topic = "bigRecordTestTopic"; createTestTopic(topic, 1, 1); final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1); // add consuming topology: Utils.TypeInformationSerializationSchema<Tuple2<Long, byte[]>> serSchema = new Utils.TypeInformationSerializationSchema<Tuple2<Long, byte[]>>( new Tuple2<Long, byte[]>(0L, new byte[] { 0 }), env.getConfig()); Properties consumerProps = new Properties(); consumerProps.setProperty("fetch.message.max.bytes", Integer.toString(1024 * 1024 * 30)); consumerProps.setProperty("zookeeper.connect", zookeeperConnectionString); consumerProps.setProperty("group.id", "test"); consumerProps.setProperty("auto.commit.enable", "false"); consumerProps.setProperty("auto.offset.reset", "smallest"); ConsumerConfig cc = new ConsumerConfig(consumerProps); DataStreamSource<Tuple2<Long, byte[]>> consuming = env .addSource(new PersistentKafkaSource<Tuple2<Long, byte[]>>(topic, serSchema, cc)); consuming.addSink(new SinkFunction<Tuple2<Long, byte[]>>() { private static final long serialVersionUID = 1L; int elCnt = 0; @Override public void invoke(Tuple2<Long, byte[]> value) throws Exception { LOG.info("Received {}", value.f0); elCnt++; if (value.f0 == -1) { // we should have seen 11 elements now. 
if (elCnt == 11) { throw new SuccessException(); } else { throw new RuntimeException("There have been " + elCnt + " elements"); } } if (elCnt > 10) { throw new RuntimeException("More than 10 elements seen: " + elCnt); } } }).setParallelism(1); // add producing topology
DataStream<Tuple2<Long, byte[]>> stream = env.addSource(new RichSourceFunction<Tuple2<Long, byte[]>>() { private static final long serialVersionUID = 1L; boolean running; @Override public void open(Configuration parameters) throws Exception { super.open(parameters); running = true; } @Override public void run(SourceContext<Tuple2<Long, byte[]>> ctx) throws Exception { LOG.info("Starting source."); long cnt = 0; Random rnd = new Random(1337); while (running) { byte[] wl = new byte[Math.abs(rnd.nextInt(1024 * 1024 * 30))]; ctx.collect(new Tuple2<Long, byte[]>(cnt++, wl)); LOG.info("Emitted cnt=" + (cnt - 1) + " with byte.length = " + wl.length); try { Thread.sleep(100); } catch (InterruptedException ignored) { } if (cnt == 10) { LOG.info("Sending end signal"); // signal end
ctx.collect(new Tuple2<Long, byte[]>(-1L, new byte[] { 1 })); running = false; } } } @Override public void cancel() { LOG.info("Source got cancel()"); running = false; } }); stream.addSink(new KafkaSink<Tuple2<Long, byte[]>>(brokerConnectionStrings, topic, new Utils.TypeInformationSerializationSchema<Tuple2<Long, byte[]>>( new Tuple2<Long, byte[]>(0L, new byte[] { 0 }), env.getConfig()))); tryExecute(env, "big topology test"); LOG.info("Finished KafkaITCase.bigRecordTestTopology()"); } @Test public void customPartitioningTestTopology() throws Exception { LOG.info("Starting KafkaITCase.customPartitioningTestTopology()"); String topic = "customPartitioningTestTopic"; createTestTopic(topic, 3, 1); final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1); // add consuming topology:
DataStreamSource<Tuple2<Long, String>> consuming = env .addSource(new PersistentKafkaSource<Tuple2<Long, String>>(topic, new Utils.TypeInformationSerializationSchema<Tuple2<Long, String>>( new Tuple2<Long, String>(1L, ""), env.getConfig()), standardCC)); consuming.addSink(new SinkFunction<Tuple2<Long, String>>() { private static final long serialVersionUID = 1L; int start = -1; BitSet validator = new BitSet(101); boolean gotPartition1 = false; boolean gotPartition2 = false; boolean gotPartition3 = false; @Override public void invoke(Tuple2<Long, String> value) throws Exception { LOG.debug("Got " + value); String[] sp = value.f1.split("-"); int v = Integer.parseInt(sp[1]); assertEquals(value.f0 - 1000, (long) v); switch (v) { case 9: gotPartition1 = true; break; case 19: gotPartition2 = true; break; case 99: gotPartition3 = true; break; } if (start == -1) { start = v; } Assert.assertFalse("Received tuple twice", validator.get(v - start)); validator.set(v - start); if (gotPartition1 && gotPartition2 && gotPartition3) { // check if everything in the bitset is set to true
int nc; if ((nc = validator.nextClearBit(0)) != 100) { throw new RuntimeException("The bitset was not set to 1 on all elements. 
Next clear:" + nc + " Set: " + validator); } throw new SuccessException(); } } }); // add producing topology DataStream<Tuple2<Long, String>> stream = env.addSource(new SourceFunction<Tuple2<Long, String>>() { private static final long serialVersionUID = 1L; boolean running = true; @Override public void run(SourceContext<Tuple2<Long, String>> ctx) throws Exception { LOG.info("Starting source."); int cnt = 0; while (running) { ctx.collect(new Tuple2<Long, String>(1000L + cnt, "kafka-" + cnt++)); try { Thread.sleep(100); } catch (InterruptedException ignored) { } } } @Override public void cancel() { LOG.info("Source got cancel()"); running = false; } }); stream.addSink(new KafkaSink<Tuple2<Long, String>>(brokerConnectionStrings, topic, new Utils.TypeInformationSerializationSchema<Tuple2<Long, String>>(new Tuple2<Long, String>(1L, ""), env.getConfig()), new CustomPartitioner())); tryExecute(env, "custom partitioning test"); LOG.info("Finished KafkaITCase.customPartitioningTestTopology()"); } /** * This is for a topic with 3 partitions and Tuple2<Long, String> */ private static class CustomPartitioner implements SerializableKafkaPartitioner { private static final long serialVersionUID = 1L; @Override public int partition(Object key, int numPartitions) { @SuppressWarnings("unchecked") Tuple2<Long, String> tuple = (Tuple2<Long, String>) key; if (tuple.f0 < 10) { return 0; } else if (tuple.f0 < 20) { return 1; } else { return 2; } } } @Test public void simpleTestTopology() throws Exception { String topic = "simpleTestTopic"; createTestTopic(topic, 1, 1); final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1); // add consuming topology: DataStreamSource<String> consuming = env .addSource(new PersistentKafkaSource<String>(topic, new JavaDefaultStringSchema(), standardCC)); consuming.addSink(new SinkFunction<String>() { private static final long serialVersionUID = 1L; int elCnt = 0; int start = -1; BitSet validator = new BitSet(101); @Override public void invoke(String value) throws Exception { LOG.debug("Got " + value); String[] sp = value.split("-"); int v = Integer.parseInt(sp[1]); if (start == -1) { start = v; } Assert.assertFalse("Received tuple twice", validator.get(v - start)); validator.set(v - start); elCnt++; if (elCnt == 100) { // check if everything in the bitset is set to true int nc; if ((nc = validator.nextClearBit(0)) != 100) { throw new RuntimeException("The bitset was not set to 1 on all elements. 
Next clear:" + nc + " Set: " + validator); } throw new SuccessException(); } } }); // add producing topology DataStream<String> stream = env.addSource(new SourceFunction<String>() { private static final long serialVersionUID = 1L; boolean running = true; @Override public void run(SourceContext<String> ctx) throws Exception { LOG.info("Starting source."); int cnt = 0; while (running) { ctx.collect("kafka-" + cnt++); try { Thread.sleep(100); } catch (InterruptedException ignored) { } } } @Override public void cancel() { LOG.info("Source got cancel()"); running = false; } }); stream.addSink(new KafkaSink<String>(brokerConnectionStrings, topic, new JavaDefaultStringSchema())); tryExecute(env, "simpletest"); } private static boolean leaderHasShutDown = false; private static boolean shutdownKafkaBroker; @Test(timeout = 60000) public void brokerFailureTest() throws Exception { String topic = "brokerFailureTestTopic"; createTestTopic(topic, 2, 2); // --------------------------- write data to topic --------------------- LOG.info("Writing data to topic {}", topic); StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1); DataStream<String> stream = env.addSource(new SourceFunction<String>() { private static final long serialVersionUID = 1L; boolean running = true; @Override public void run(SourceContext<String> ctx) throws Exception { LOG.info("Starting source."); int cnt = 0; while (running) { String msg = "kafka-" + cnt++; ctx.collect(msg); LOG.info("sending message = " + msg); if ((cnt - 1) % 20 == 0) { LOG.debug("Sending message #{}", cnt - 1); } if (cnt == 200) { LOG.info("Stopping to produce after 200 msgs"); break; } } } @Override public void cancel() { LOG.info("Source got chancel()"); running = false; } }); stream.addSink(new KafkaSink<String>(brokerConnectionStrings, topic, new JavaDefaultStringSchema())) .setParallelism(1); tryExecute(env, "broker failure test - writer"); // --------------------------- read and let broker fail --------------------- LOG.info("Reading data from topic {} and let a broker fail", topic); PartitionMetadata firstPart = null; do { if (firstPart != null) { LOG.info("Unable to find leader. error code {}", firstPart.errorCode()); // not the first try. 
// Sleep a bit before retrying the metadata lookup.
Thread.sleep(150); } Seq<PartitionMetadata> partitionMetadata = AdminUtils.fetchTopicMetadataFromZk(topic, zkClient) .partitionsMetadata(); firstPart = partitionMetadata.head(); } while (firstPart.errorCode() != 0); final String leaderToShutDown = firstPart.leader().get().connectionString(); LOG.info("Leader to shutdown {}", leaderToShutDown); final Thread brokerShutdown = new Thread(new Runnable() { @Override public void run() { shutdownKafkaBroker = false; while (!shutdownKafkaBroker) { try { Thread.sleep(10); } catch (InterruptedException e) { LOG.warn("Interruption", e); } } for (KafkaServer kafkaServer : brokers) { if (leaderToShutDown.equals(kafkaServer.config().advertisedHostName() + ":" + kafkaServer.config().advertisedPort())) { LOG.info("Killing Kafka Server {}", leaderToShutDown); kafkaServer.shutdown(); leaderHasShutDown = true; break; } } } }); brokerShutdown.start(); // add consuming topology:
DataStreamSource<String> consuming = env .addSource(new PersistentKafkaSource<String>(topic, new JavaDefaultStringSchema(), standardCC)); consuming.setParallelism(1); consuming.addSink(new SinkFunction<String>() { private static final long serialVersionUID = 1L; int elCnt = 0; int start = 0; int numOfMessagesToBeCorrect = 100; int stopAfterMessages = 150; BitSet validator = new BitSet(numOfMessagesToBeCorrect + 1); @Override public void invoke(String value) throws Exception { LOG.info("Got message = " + value + " leader has shut down " + leaderHasShutDown + " el cnt = " + elCnt + " to receive " + numOfMessagesToBeCorrect); String[] sp = value.split("-"); int v = Integer.parseInt(sp[1]); if (start == -1) { start = v; } int offset = v - start; Assert.assertFalse("Received tuple with value " + offset + " twice", validator.get(offset)); if (v - start < 0 && LOG.isWarnEnabled()) { LOG.warn("Not in order: {}", value); } validator.set(offset); elCnt++; if (elCnt == 20) { LOG.info("Asking leader broker to shut down"); // shut down a Kafka broker
shutdownKafkaBroker = true; } if (shutdownKafkaBroker) { // we slow down a bit because the shutdown takes some time and we have
// only a fixed number of elements to read
Thread.sleep(20); } if (leaderHasShutDown) { // it only makes sense to check once the shutdown is completed
if (elCnt >= stopAfterMessages) { // check if everything in the bitset is set to true
int nc; if ((nc = validator.nextClearBit(0)) < numOfMessagesToBeCorrect) { throw new RuntimeException( "The bitset was not set to 1 on all elements to be checked. 
Next clear:" + nc + " Set: " + validator); } throw new SuccessException(); } } } }); tryExecute(env, "broker failure test - reader"); } public static void tryExecute(StreamExecutionEnvironment see, String name) throws Exception { try { see.execute(name); } catch (JobExecutionException good) { Throwable t = good.getCause(); int limit = 0; while (!(t instanceof SuccessException)) { if (t == null) { LOG.warn("Test failed with exception", good); Assert.fail("Test failed with: " + good.getMessage()); } t = t.getCause(); if (limit++ == 20) { LOG.warn("Test failed with exception", good); Assert.fail("Test failed with: " + good.getMessage()); } } } } private void createTestTopic(String topic, int numberOfPartitions, int replicationFactor) { // create topic Properties topicConfig = new Properties(); LOG.info("Creating topic {}", topic); AdminUtils.createTopic(zkClient, topic, numberOfPartitions, replicationFactor, topicConfig); } private static TestingServer getZookeeper() throws Exception { return new TestingServer(zkPort, tmpZkDir); } /** * Copied from com.github.sakserv.minicluster.KafkaLocalBrokerIntegrationTest (ASL licensed) */ private static KafkaServer getKafkaServer(int brokerId, File tmpFolder) throws UnknownHostException { Properties kafkaProperties = new Properties(); int kafkaPort = NetUtils.getAvailablePort(); // properties have to be Strings kafkaProperties.put("advertised.host.name", kafkaHost); kafkaProperties.put("port", Integer.toString(kafkaPort)); kafkaProperties.put("broker.id", Integer.toString(brokerId)); kafkaProperties.put("log.dir", tmpFolder.toString()); kafkaProperties.put("zookeeper.connect", zookeeperConnectionString); kafkaProperties.put("message.max.bytes", "" + (35 * 1024 * 1024)); kafkaProperties.put("replica.fetch.max.bytes", "" + (35 * 1024 * 1024)); KafkaConfig kafkaConfig = new KafkaConfig(kafkaProperties); KafkaServer server = new KafkaServer(kafkaConfig, new KafkaLocalSystemTime()); server.startup(); return server; } public static class SuccessException extends Exception { private static final long serialVersionUID = 1L; } // ----------------------- Debugging utilities -------------------- /** * Read topic to list, only using Kafka code. * @return */ private static List<MessageAndMetadata<byte[], byte[]>> readTopicToList(String topicName, ConsumerConfig config, final int stopAfter) { ConsumerConnector consumerConnector = Consumer.createJavaConsumerConnector(config); // we request only one stream per consumer instance. Kafka will make sure that each consumer group // will see each message only once. Map<String, Integer> topicCountMap = Collections.singletonMap(topicName, 1); Map<String, List<KafkaStream<byte[], byte[]>>> streams = consumerConnector .createMessageStreams(topicCountMap); if (streams.size() != 1) { throw new RuntimeException("Expected only one message stream but got " + streams.size()); } List<KafkaStream<byte[], byte[]>> kafkaStreams = streams.get(topicName); if (kafkaStreams == null) { throw new RuntimeException("Requested stream not available. 
Available streams: " + streams.toString()); } if (kafkaStreams.size() != 1) { throw new RuntimeException( "Requested 1 stream from Kafka, bot got " + kafkaStreams.size() + " streams"); } LOG.info("Opening Consumer instance for topic '{}' on group '{}'", topicName, config.groupId()); ConsumerIterator<byte[], byte[]> iteratorToRead = kafkaStreams.get(0).iterator(); List<MessageAndMetadata<byte[], byte[]>> result = new ArrayList<MessageAndMetadata<byte[], byte[]>>(); int read = 0; while (iteratorToRead.hasNext()) { read++; result.add(iteratorToRead.next()); if (read == stopAfter) { LOG.info("Read " + read + " elements"); return result; } } return result; } private static void printTopic(String topicName, ConsumerConfig config, DeserializationSchema deserializationSchema, int stopAfter) { List<MessageAndMetadata<byte[], byte[]>> contents = readTopicToList(topicName, config, stopAfter); LOG.info("Printing contents of topic {} in consumer grouo {}", topicName, config.groupId()); for (MessageAndMetadata<byte[], byte[]> message : contents) { Object out = deserializationSchema.deserialize(message.message()); LOG.info("Message: partition: {} offset: {} msg: {}", message.partition(), message.offset(), out.toString()); } } private static void printTopic(String topicName, int elements, ExecutionConfig ec) { // write the sequence to log for debugging purposes Properties stdProps = standardCC.props().props(); Properties newProps = new Properties(stdProps); newProps.setProperty("group.id", "topic-printer" + UUID.randomUUID().toString()); newProps.setProperty("auto.offset.reset", "smallest"); newProps.setProperty("zookeeper.connect", standardCC.zkConnect()); ConsumerConfig printerConfig = new ConsumerConfig(newProps); DeserializationSchema deserializer = new Utils.TypeInformationSerializationSchema<Tuple2<Integer, Integer>>( new Tuple2<Integer, Integer>(1, 1), ec); printTopic(topicName, printerConfig, deserializer, elements); } }