Java tutorial
/**
 * Copyright 2014 Conductor, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package com.baynote.kafka.hadoop;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.TimeUnit;

import kafka.api.PartitionOffsetRequestInfo;
import kafka.common.TopicAndPartition;
import kafka.javaapi.OffsetRequest;
import kafka.javaapi.OffsetResponse;
import kafka.javaapi.consumer.SimpleConsumer;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.baynote.kafka.Broker;
import com.baynote.kafka.Partition;
import com.baynote.kafka.zk.ZkUtils;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.*;

/**
 * An {@link InputFormat} that splits up Kafka {@link Broker}-{@link Partition}s further into a set of offsets.
 *
 * <p>
 * Specifically, it will call {@link SimpleConsumer#getOffsetsBefore} to retrieve a list of valid offsets, and create
 * {@code N} number of {@link InputSplit}s per {@link Broker}-{@link Partition}, where {@code N} is the number of
 * offsets returned by {@link SimpleConsumer#getOffsetsBefore}.
 *
 * <p>
 * Thanks to <a href="https://github.com/miniway">Dongmin Yu</a> for providing the inspiration for this code.
 *
 * <p>
 * The original source code can be found <a target="_blank" href="https://github.com/miniway/kafka-hadoop-consumer">on
 * Github</a>.
 *
 * @see KafkaInputSplit
 * @see KafkaRecordReader
 *
 * @author <a href="mailto:cgreen@conductor.com">Casey Green</a>
 */
public class KafkaInputFormat extends InputFormat<LongWritable, BytesWritable> {

    private static final Logger LOG = LoggerFactory.getLogger(KafkaInputFormat.class);

    /**
     * Default Kafka fetch size, 1MB.
     */
    public static final int DEFAULT_FETCH_SIZE_BYTES = 1024 * 1024; // 1MB
    /**
     * Default Kafka socket timeout, 10 seconds.
     */
    public static final int DEFAULT_SOCKET_TIMEOUT_MS = (int) TimeUnit.SECONDS.toMillis(10);
    /**
     * Default Kafka buffer size, 64KB.
     */
    public static final int DEFAULT_BUFFER_SIZE_BYTES = 64 * 1024; // 64 KB
    /**
     * Default Zookeeper session timeout, 10 seconds.
     */
    public static final int DEFAULT_ZK_SESSION_TIMEOUT_MS = (int) TimeUnit.SECONDS.toMillis(10);
    /**
     * Default Zookeeper connection timeout, 10 seconds.
     */
    public static final int DEFAULT_ZK_CONNECTION_TIMEOUT_MS = (int) TimeUnit.SECONDS.toMillis(10);
    /**
     * Default Zookeeper root, '/'.
     */
    public static final String DEFAULT_ZK_ROOT = "/";
    /**
     * Default maximum number of splits per partition, {@link Integer#MAX_VALUE}.
     */
    public static final int DEFAULT_MAX_SPLITS_PER_PARTITION = Integer.MAX_VALUE;
    /**
     * Default timestamp after which offsets are included, {@code 0} (include all offsets).
     */
    public static final long DEFAULT_INCLUDE_OFFSETS_AFTER_TIMESTAMP = 0;

    @Override
    public RecordReader<LongWritable, BytesWritable> createRecordReader(final InputSplit inputSplit,
            final TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        return new KafkaRecordReader();
    }

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        final Configuration conf = context.getConfiguration();
        final String topic = getTopic(conf);
        final String group = getConsumerGroup(conf);
        return getInputSplits(conf, topic, group);
    }

    /**
     * Returns the {@code topic} splits of the consumer {@code group} that would be input to a {@link Job} configured
     * with the provided {@code conf}.
     * <p>
     * This information may be useful for calculating the number of reducers your job will need.
     * <p>
     * <em>Note:</em> At the very least, {@code kafka.zk.connect} must be set in {@code conf}.
     *
     * @param conf
     *            the conf, containing at least the {@code kafka.zk.connect} setting.
     * @param topic
     *            the Kafka topic of the hypothetical job.
     * @param group
     *            the consumer group of the hypothetical job.
     * @return the splits the hypothetical job would get.
     * @throws IOException IO error
     */
    public static List<InputSplit> getSplits(final Configuration conf, final String topic, final String group)
            throws IOException {
        return new KafkaInputFormat().getInputSplits(conf, topic, group);
    }

    /**
     * Returns all of the {@code topic} splits that would be input to a {@link Job} configured with the provided
     * {@code conf}.
     * <p>
     * This information may be useful for calculating the number of reducers your job will need.
     * <p>
     * <em>Note:</em> At the very least, {@code kafka.zk.connect} must be set in {@code conf}.
     *
     * @param conf
     *            the conf, containing at least the {@code kafka.zk.connect} setting.
     * @param topic
     *            the Kafka topic of the hypothetical job.
     * @return the splits the hypothetical job would get.
     * @throws IOException IO error
     */
    public static List<InputSplit> getAllSplits(final Configuration conf, final String topic) throws IOException {
        // use a random UUID as the consumer group to (basically) guarantee a non-existent consumer
        return new KafkaInputFormat().getInputSplits(conf, topic, UUID.randomUUID().toString());
    }
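    /*
     * Illustrative sketch only: the two static helpers above can be called from a driver before the job is
     * submitted, e.g. to size the reduce phase. The topic name and the "splits / 4" policy below are assumptions
     * for the example, not part of this class:
     *
     *   List<InputSplit> splits = KafkaInputFormat.getAllSplits(job.getConfiguration(), "my-topic");
     *   job.setNumReduceTasks(Math.max(1, splits.size() / 4));
     */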
    /**
     * Gets all of the input splits for the {@code topic}, filtering out any {@link InputSplit}s already consumed by
     * the {@code group}.
     *
     * @param conf
     *            the job configuration.
     * @param topic
     *            the topic.
     * @param group
     *            the consumer group.
     * @return input splits for the job.
     * @throws IOException IO error
     */
    List<InputSplit> getInputSplits(final Configuration conf, final String topic, final String group)
            throws IOException {
        final List<InputSplit> splits = Lists.newArrayList();
        final ZkUtils zk = getZk(conf);
        final Map<Broker, SimpleConsumer> consumers = Maps.newHashMap();
        try {
            for (final Partition partition : zk.getPartitions(topic)) {
                // cache the consumer connections - each partition will make use of each broker consumer
                final Broker broker = partition.getBroker();
                if (!consumers.containsKey(broker)) {
                    consumers.put(broker, getConsumer(broker, conf));
                }

                // grab all valid offsets
                final List<Long> offsets = getOffsets(consumers.get(broker), topic, partition.getPartId(),
                        zk.getLastCommit(group, partition), getIncludeOffsetsAfterTimestamp(conf),
                        getMaxSplitsPerPartition(conf), conf);
                LOG.info("Topic Offsets: " + offsets.toString());
                for (int i = 0; i < offsets.size() - 1; i++) {
                    // (offsets are in descending order)
                    final long start = offsets.get(i + 1);
                    final long end = offsets.get(i);
                    // since the offsets are in descending order, the first offset in the list is the largest offset
                    // for the current partition. This split will be in charge of committing the offset for this
                    // partition.
                    final boolean partitionCommitter = (i == 0);
                    final InputSplit split = new KafkaInputSplit(partition, start, end, partitionCommitter);
                    LOG.debug("Created input split: " + split);
                    splits.add(split);
                }
            }
        } finally {
            // close resources
            IOUtils.closeQuietly(zk);
            for (final SimpleConsumer consumer : consumers.values()) {
                consumer.close();
            }
        }
        return splits;
    }
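    /*
     * A worked example of the loop above, using hypothetical numbers: if getOffsets() returns [900, 600, 300, 0]
     * for a partition (descending order), three splits are created - (600, 900), (300, 600) and (0, 300) - and the
     * first of them, covering the most recent offsets, is the split responsible for committing this partition's
     * offset.
     */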
    @VisibleForTesting
    List<Long> getOffsets(final SimpleConsumer consumer, final String topic, final int partitionNum,
            final long lastCommit, final long asOfTime, final int maxSplitsPerPartition, final Configuration conf) {
        // TODO: take advantage of the new API, which allows you to request offsets for multiple topic-partitions.
        // all offsets that exist for this partition (in descending order)
        final OffsetRequest allReq = toOffsetRequest(topic, partitionNum, kafka.api.OffsetRequest.LatestTime(),
                Integer.MAX_VALUE, conf);
        final OffsetResponse allOffsetsResponse = consumer.getOffsetsBefore(allReq);
        final long[] allOffsets = allOffsetsResponse.offsets(topic, partitionNum);
        // this gets us an offset that is strictly before 'asOfTime', or zero if none exist before that time
        final OffsetRequest requestBeforeAsOf = toOffsetRequest(topic, partitionNum, asOfTime, 1, conf);
        final OffsetResponse offsetsBeforeAsOfResponse = consumer.getOffsetsBefore(requestBeforeAsOf);
        final long[] offsetsBeforeAsOf = offsetsBeforeAsOfResponse.offsets(topic, partitionNum);
        final long includeAfter = offsetsBeforeAsOf.length == 1 ? offsetsBeforeAsOf[0] : -1;
        // note that the offsets are in descending order
        List<Long> result = Lists.newArrayList();
        LOG.info("Last commit: " + lastCommit);
        LOG.info("includeAfter: " + includeAfter);
        LOG.info("Offsets returned by SimpleConsumer: " + Arrays.toString(allOffsets));
        for (final long offset : allOffsets) {
            if (offset > lastCommit && offset > includeAfter) {
                result.add(offset);
            } else if (lastCommit == -1L && offset > includeAfter) {
                // nothing committed yet, so consume everything
                result.add(offset);
            } else {
                // we add "lastCommit" if it is after "includeAfter"
                if (lastCommit > includeAfter) {
                    result.add(lastCommit + 1);
                }
                // we can break out of the loop here because the offsets are in descending order and we've hit the
                // latest one to include
                break;
            }
        }
        // to get maxSplitsPerPartition number of splits, you need (maxSplitsPerPartition + 1) number of offsets.
        if (result.size() - 1 > maxSplitsPerPartition) {
            result = result.subList(result.size() - maxSplitsPerPartition - 1, result.size());
        }
        LOG.info(String.format("Offsets for %s:%d:%d = %s", consumer.host(), consumer.port(), partitionNum, result));
        return result;
    }

    @VisibleForTesting
    static OffsetRequest toOffsetRequest(final String topic, final int partitionNum, final long asOfTime,
            final int numOffsets, final Configuration conf) {
        final TopicAndPartition topicAndPartition = new TopicAndPartition(topic, partitionNum);
        final PartitionOffsetRequestInfo partitionInfoReq = new PartitionOffsetRequestInfo(asOfTime, numOffsets);
        final Map<TopicAndPartition, PartitionOffsetRequestInfo> requestInfo = ImmutableMap.of(topicAndPartition,
                partitionInfoReq);
        return new OffsetRequest(requestInfo, kafka.api.OffsetRequest.CurrentVersion(),
                KafkaInputFormat.getConsumerGroup(conf));
    }

    /*
     * We make the following two methods visible for testing so that we can mock these components out in unit tests.
     */
    @VisibleForTesting
    SimpleConsumer getConsumer(final Broker broker, final Configuration conf) {
        return new SimpleConsumer(broker.getHost(), broker.getPort(), DEFAULT_SOCKET_TIMEOUT_MS,
                DEFAULT_BUFFER_SIZE_BYTES, KafkaInputFormat.getConsumerGroup(conf));
    }

    @VisibleForTesting
    ZkUtils getZk(final Configuration conf) {
        return new ZkUtils(conf);
    }

    /**
     * Sets the Zookeeper connection string (required).
     *
     * @param job
     *            the job being configured.
     * @param zkConnect
     *            the Zookeeper connection string.
     */
    public static void setZkConnect(final Job job, final String zkConnect) {
        job.getConfiguration().set("kafka.zk.connect", zkConnect);
    }

    /**
     * Gets the Zookeeper connection string set by {@link #setZkConnect(Job, String)}.
     *
     * @param conf
     *            the job conf.
     * @return the Zookeeper connection string.
     */
    public static String getZkConnect(final Configuration conf) {
        return conf.get("kafka.zk.connect");
    }

    /**
     * Sets the Zookeeper session timeout for Kafka.
     *
     * @param job
     *            the job being configured.
     * @param sessionTimeout
     *            the session timeout in milliseconds.
     */
    public static void setZkSessionTimeoutMs(final Job job, final int sessionTimeout) {
        job.getConfiguration().setInt("kafka.zk.session.timeout.ms", sessionTimeout);
    }

    /**
     * Gets the Zookeeper session timeout set by {@link #setZkSessionTimeoutMs(Job, int)}, defaulting to
     * {@link #DEFAULT_ZK_SESSION_TIMEOUT_MS} if it has not been set.
     *
     * @param conf
     *            the job conf.
     * @return the Zookeeper session timeout.
     */
    public static int getZkSessionTimeoutMs(final Configuration conf) {
        return conf.getInt("kafka.zk.session.timeout.ms", DEFAULT_ZK_SESSION_TIMEOUT_MS);
    }
    /**
     * Sets the Zookeeper connection timeout for Kafka.
     *
     * @param job
     *            the job being configured.
     * @param connectionTimeout
     *            the connection timeout in milliseconds.
     */
    public static void setZkConnectionTimeoutMs(final Job job, final int connectionTimeout) {
        job.getConfiguration().setInt("kafka.zk.connection.timeout.ms", connectionTimeout);
    }

    /**
     * Gets the Zookeeper connection timeout set by {@link #setZkConnectionTimeoutMs(Job, int)}, defaulting to
     * {@link #DEFAULT_ZK_CONNECTION_TIMEOUT_MS} if it has not been set.
     *
     * @param conf
     *            the job conf.
     * @return the Zookeeper connection timeout.
     */
    public static int getZkConnectionTimeoutMs(final Configuration conf) {
        return conf.getInt("kafka.zk.connection.timeout.ms", DEFAULT_ZK_CONNECTION_TIMEOUT_MS);
    }

    /**
     * Sets the Zookeeper root for Kafka.
     *
     * @param job
     *            the job being configured.
     * @param root
     *            the Zookeeper root path.
     */
    public static void setZkRoot(final Job job, final String root) {
        job.getConfiguration().set("kafka.zk.root", root);
    }

    /**
     * Gets the Zookeeper root of Kafka set by {@link #setZkRoot(Job, String)}, defaulting to {@link #DEFAULT_ZK_ROOT}
     * if it has not been set.
     *
     * @param conf
     *            the job conf.
     * @return the Zookeeper root of Kafka.
     */
    public static String getZkRoot(final Configuration conf) {
        return conf.get("kafka.zk.root", DEFAULT_ZK_ROOT);
    }

    /**
     * Sets the input topic (required).
     *
     * @param job
     *            the job being configured.
     * @param topic
     *            the topic name.
     */
    public static void setTopic(final Job job, final String topic) {
        job.getConfiguration().set("kafka.topic", topic);
    }

    /**
     * Gets the input topic.
     *
     * @param conf
     *            the job conf.
     * @return the input topic.
     */
    public static String getTopic(final Configuration conf) {
        return conf.get("kafka.topic");
    }

    /**
     * Sets the consumer group of the input reader (required).
     *
     * @param job
     *            the job being configured.
     * @param consumerGroup
     *            the consumer group name.
     */
    public static void setConsumerGroup(final Job job, final String consumerGroup) {
        job.getConfiguration().set("kafka.groupid", consumerGroup);
    }

    /**
     * Gets the consumer group.
     *
     * @param conf
     *            the job conf.
     * @return the consumer group.
     */
    public static String getConsumerGroup(final Configuration conf) {
        return conf.get("kafka.groupid");
    }

    /**
     * Only consider offsets created <em>approximately</em> on or after {@code timestamp}.
     * <p>
     * Note that you are only guaranteed to get all data on or after {@code timestamp}; you may also get <i>some</i>
     * data before the specified timestamp.
     *
     * @param job
     *            the job being configured.
     * @param timestamp
     *            the timestamp.
     * @see SimpleConsumer#getOffsetsBefore
     */
    public static void setIncludeOffsetsAfterTimestamp(final Job job, final long timestamp) {
        job.getConfiguration().setLong("kafka.timestamp.offset", timestamp);
    }

    /**
     * Gets the offset timestamp set by {@link #setIncludeOffsetsAfterTimestamp(Job, long)}, returning {@code 0} by
     * default.
     *
     * @param conf
     *            the job conf.
     * @return the offset timestamp, {@code 0} by default.
     */
    public static long getIncludeOffsetsAfterTimestamp(final Configuration conf) {
        return conf.getLong("kafka.timestamp.offset", DEFAULT_INCLUDE_OFFSETS_AFTER_TIMESTAMP);
    }

    /**
     * Limits the number of splits to create per partition.
     * <p>
     * Note that if a partition has more potential splits than {@code maxSplits}, the input format will take the
     * <em>earliest</em> {@code maxSplits} splits.
     *
     * @param job
     *            the job to configure.
     * @param maxSplits
     *            the maximum number of splits to create from each Kafka partition.
     */
    public static void setMaxSplitsPerPartition(final Job job, final int maxSplits) {
        job.getConfiguration().setInt("kafka.max.splits.per.partition", maxSplits);
    }

    /**
     * Gets the maximum number of splits per partition set by {@link #setMaxSplitsPerPartition(Job, int)}, returning
     * {@link Integer#MAX_VALUE} by default.
     *
     * @param conf
     *            the job conf.
     * @return the maximum number of splits, {@link Integer#MAX_VALUE} by default.
     */
    public static int getMaxSplitsPerPartition(final Configuration conf) {
        return conf.getInt("kafka.max.splits.per.partition", DEFAULT_MAX_SPLITS_PER_PARTITION);
    }

    /**
     * Sets the fetch size of the {@link RecordReader}. Note that your mapper should have enough memory allocated to
     * handle the specified size, or it will likely throw {@link OutOfMemoryError}s.
     *
     * @param job
     *            the job being configured.
     * @param fetchSize
     *            the fetch size (bytes).
     */
    public static void setKafkaFetchSizeBytes(final Job job, final int fetchSize) {
        job.getConfiguration().setInt("kafka.fetch.size", fetchSize);
    }

    /**
     * Gets the Kafka fetch size set by {@link #setKafkaFetchSizeBytes(Job, int)}, defaulting to
     * {@link #DEFAULT_FETCH_SIZE_BYTES} if it has not been set.
     *
     * @param conf
     *            the job conf.
     * @return the Kafka fetch size.
     */
    public static int getKafkaFetchSizeBytes(final Configuration conf) {
        return conf.getInt("kafka.fetch.size", DEFAULT_FETCH_SIZE_BYTES);
    }

    /**
     * Sets the buffer size of the {@link SimpleConsumer} inside of the {@link KafkaRecordReader}.
     *
     * @param job
     *            the job being configured.
     * @param bufferSize
     *            the buffer size (bytes).
     */
    public static void setKafkaBufferSizeBytes(final Job job, final int bufferSize) {
        job.getConfiguration().setInt("kafka.socket.buffersize", bufferSize);
    }

    /**
     * Gets the Kafka buffer size set by {@link #setKafkaBufferSizeBytes(Job, int)}, defaulting to
     * {@link #DEFAULT_BUFFER_SIZE_BYTES} if it has not been set.
     *
     * @param conf
     *            the job conf.
     * @return the Kafka buffer size.
     */
    public static int getKafkaBufferSizeBytes(final Configuration conf) {
        return conf.getInt("kafka.socket.buffersize", DEFAULT_BUFFER_SIZE_BYTES);
    }

    /**
     * Sets the socket timeout of the {@link SimpleConsumer} inside of the {@link KafkaRecordReader}.
     *
     * @param job
     *            the job being configured.
     * @param timeout
     *            the socket timeout (milliseconds).
     */
    public static void setKafkaSocketTimeoutMs(final Job job, final int timeout) {
        job.getConfiguration().setInt("kafka.socket.timeout.ms", timeout);
    }

    /**
     * Gets the Kafka socket timeout set by {@link #setKafkaSocketTimeoutMs(Job, int)}, defaulting to
     * {@link #DEFAULT_SOCKET_TIMEOUT_MS} if it has not been set.
     *
     * @param conf
     *            the job conf.
     * @return the Kafka socket timeout.
     */
    public static int getKafkaSocketTimeoutMs(final Configuration conf) {
        return conf.getInt("kafka.socket.timeout.ms", DEFAULT_SOCKET_TIMEOUT_MS);
    }
}
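Below is a minimal driver sketch showing how the configuration helpers above fit together in a map-only job. It is an illustration under assumptions, not part of the class: the Zookeeper quorum, topic, consumer group, tuning values, output path, and the mapper are all hypothetical placeholders; only the KafkaInputFormat calls come from the listing above. The key/value types handed to the mapper (offset as LongWritable, raw message bytes as BytesWritable) follow from the class declaration InputFormat<LongWritable, BytesWritable>.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.baynote.kafka.hadoop.KafkaInputFormat;

public class KafkaJobDriver {

    // Hypothetical mapper: keys are Kafka offsets, values are raw message bytes.
    public static class MessageMapper extends Mapper<LongWritable, BytesWritable, Text, NullWritable> {
        @Override
        protected void map(final LongWritable offset, final BytesWritable message, final Context context)
                throws IOException, InterruptedException {
            // Decode the message bytes however your data requires; plain UTF-8 text is assumed here.
            final String decoded = new String(message.getBytes(), 0, message.getLength(), "UTF-8");
            context.write(new Text(decoded), NullWritable.get());
        }
    }

    public static void main(final String[] args) throws Exception {
        final Job job = Job.getInstance(new Configuration(), "kafka-input-format-example");
        job.setJarByClass(KafkaJobDriver.class);

        // Required settings: Zookeeper connection string, topic, and consumer group (placeholder values).
        KafkaInputFormat.setZkConnect(job, "zkhost1:2181,zkhost2:2181");
        KafkaInputFormat.setTopic(job, "my-topic");
        KafkaInputFormat.setConsumerGroup(job, "my-hadoop-consumer");

        // Optional tuning via the setters defined above (example values only).
        KafkaInputFormat.setIncludeOffsetsAfterTimestamp(job, System.currentTimeMillis() - 24L * 60 * 60 * 1000);
        KafkaInputFormat.setMaxSplitsPerPartition(job, 10);
        KafkaInputFormat.setKafkaFetchSizeBytes(job, 2 * 1024 * 1024);

        job.setInputFormatClass(KafkaInputFormat.class);
        job.setMapperClass(MessageMapper.class);
        job.setNumReduceTasks(0); // map-only for this example
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/kafka-output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}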