com.uber.hoodie.utilities.sources.KafkaSource.java Source code

Introduction

Here is the source code for com.uber.hoodie.utilities.sources.KafkaSource.java, a Hoodie DeltaStreamer source that incrementally reads new records from a Kafka topic, converts them to Avro GenericRecords, and returns them together with a checkpoint string of the offsets that were read.

Source

/*
 *  Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 */

package com.uber.hoodie.utilities.sources;

import com.uber.hoodie.exception.HoodieNotSupportedException;
import com.uber.hoodie.utilities.UtilHelpers;
import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException;
import com.uber.hoodie.utilities.schema.SchemaProvider;

import org.apache.avro.generic.GenericRecord;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import kafka.common.TopicAndPartition;

import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import kafka.serializer.DefaultDecoder;
import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConverters;
import scala.collection.immutable.Map;
import scala.collection.immutable.Set;
import scala.collection.mutable.ArrayBuffer;
import scala.collection.mutable.StringBuilder;
import scala.util.Either;

/**
 * Source to read data from Kafka, incrementally
 */
public class KafkaSource extends Source {

    private static volatile Logger log = LogManager.getLogger(KafkaSource.class);

    static class CheckpointUtils {

        /**
         * Reconstruct checkpoint from string.
         *
         * @param checkpointStr checkpoint string in the topic,partition:offset,... format produced by offsetsToStr
         * @return map from topic-partition to the offset to resume from
         */
        public static HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> strToOffsets(String checkpointStr) {
            HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> offsetMap = new HashMap<>();
            String[] splits = checkpointStr.split(",");
            String topic = splits[0];
            for (int i = 1; i < splits.length; i++) {
                String[] subSplits = splits[i].split(":");
                offsetMap.put(new TopicAndPartition(topic, Integer.parseInt(subSplits[0])),
                        new KafkaCluster.LeaderOffset("", -1, Long.parseLong(subSplits[1])));
            }
            return offsetMap;
        }

        /**
         * String representation of a checkpoint.
         *
         * Format:
         * topic1,0:offset0,1:offset1,2:offset2, .....
         *
         * @param offsetMap map from topic-partition to offset; at least one entry must be present
         * @return the checkpoint string
         */
        public static String offsetsToStr(HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> offsetMap) {
            StringBuilder sb = new StringBuilder();
            // at least 1 partition will be present.
            sb.append(offsetMap.entrySet().stream().findFirst().get().getKey().topic() + ",");
            sb.append(offsetMap.entrySet().stream()
                    .map(e -> String.format("%s:%d", e.getKey().partition(), e.getValue().offset()))
                    .collect(Collectors.joining(",")));
            return sb.toString();
        }
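
        // Illustrative round trip (values are hypothetical): with partition 0 at offset 100
        // and partition 1 at offset 200 for topic "topic1", offsetsToStr produces a string
        // like "topic1,0:100,1:200" (partition order follows map iteration order), and
        // strToOffsets parses it back, filling the leader host/port with "" / -1 placeholders.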

        public static OffsetRange[] computeOffsetRanges(
                HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> fromOffsetMap,
                HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> toOffsetMap) {
            Comparator<OffsetRange> byPartition = Comparator.comparingInt(OffsetRange::partition);
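            // A partition present only in toOffsetMap (e.g. a newly added partition with no
            // checkpointed offset) has no entry to resume from and falls back to a fromOffset of -1.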
            List<OffsetRange> offsetRanges = toOffsetMap.entrySet().stream().map(e -> {
                TopicAndPartition tp = e.getKey();
                long fromOffset = -1;
                if (fromOffsetMap.containsKey(tp)) {
                    fromOffset = fromOffsetMap.get(tp).offset();
                }
                return OffsetRange.create(tp, fromOffset, e.getValue().offset());
            }).sorted(byPartition).collect(Collectors.toList());

            OffsetRange[] ranges = new OffsetRange[offsetRanges.size()];
            return offsetRanges.toArray(ranges);
        }

        public static long totalNewMessages(OffsetRange[] ranges) {
            long totalMsgs = 0;
            for (OffsetRange range : ranges) {
                totalMsgs += Math.max(range.untilOffset() - range.fromOffset(), 0);
            }
            return totalMsgs;
        }
    }

    /**
     * Helpers to deal with tricky scala <=> java conversions. (oh my!)
     */
    static class ScalaHelpers {
        public static <K, V> Map<K, V> toScalaMap(HashMap<K, V> m) {
            return JavaConverters.mapAsScalaMapConverter(m).asScala().toMap(Predef.<Tuple2<K, V>>conforms());
        }

        public static Set<String> toScalaSet(HashSet<String> s) {
            return JavaConverters.asScalaSetConverter(s).asScala().<String>toSet();
        }

        public static <K, V> java.util.Map<K, V> toJavaMap(Map<K, V> m) {
            return JavaConverters.<K, V>mapAsJavaMapConverter(m).asJava();
        }
    }

    /**
     * Configs to be passed for this source. All standard Kafka consumer configs are also
     * respected.
     */
    static class Config {
        private final static String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic";
        private final static String DEFAULT_AUTO_RESET_OFFSET = "largest";
    }

    private HashMap<String, String> kafkaParams;

    private final String topicName;

    public KafkaSource(PropertiesConfiguration config, JavaSparkContext sparkContext, SourceDataFormat dataFormat,
            SchemaProvider schemaProvider) {
        super(config, sparkContext, dataFormat, schemaProvider);

        kafkaParams = new HashMap<>();
        Stream<String> keys = StreamSupport
                .stream(Spliterators.spliteratorUnknownSize(config.getKeys(), Spliterator.NONNULL), false);
        keys.forEach(k -> kafkaParams.put(k, config.getString(k)));

        UtilHelpers.checkRequiredProperties(config, Arrays.asList(Config.KAFKA_TOPIC_NAME));
        topicName = config.getString(Config.KAFKA_TOPIC_NAME);
    }

    @Override
    public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(Optional<String> lastCheckpointStr,
            long maxInputBytes) {

        // Obtain current metadata for the topic
        KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams));
        Either<ArrayBuffer<Throwable>, Set<TopicAndPartition>> either = cluster
                .getPartitions(ScalaHelpers.toScalaSet(new HashSet<>(Arrays.asList(topicName))));
        if (either.isLeft()) {
            // Log errors and bail out.
            throw new HoodieDeltaStreamerException("Error obtaining partition metadata",
                    either.left().get().head());
        }
        Set<TopicAndPartition> topicPartitions = either.right().get();

        // Determine the offset ranges to read from
        HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> fromOffsets;
        if (lastCheckpointStr.isPresent()) {
            fromOffsets = CheckpointUtils.strToOffsets(lastCheckpointStr.get());
        } else {
            String autoResetValue = config.getString("auto.offset.reset", Config.DEFAULT_AUTO_RESET_OFFSET);
            if (autoResetValue.equals("smallest")) {
                fromOffsets = new HashMap<>(
                        ScalaHelpers.toJavaMap(cluster.getEarliestLeaderOffsets(topicPartitions).right().get()));
            } else if (autoResetValue.equals("largest")) {
                fromOffsets = new HashMap<>(
                        ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get()));
            } else {
                throw new HoodieNotSupportedException("Auto reset value must be one of 'smallest' or 'largest' ");
            }
        }

        // Always read until the latest offset
        HashMap<TopicAndPartition, KafkaCluster.LeaderOffset> toOffsets = new HashMap<>(
                ScalaHelpers.toJavaMap(cluster.getLatestLeaderOffsets(topicPartitions).right().get()));

        // Come up with final set of OffsetRanges to read (account for new partitions)
        // TODO(vc): Respect maxInputBytes, by estimating number of messages to read each batch from partition size
        OffsetRange[] offsetRanges = CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets);
        long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
        if (totalNewMsgs <= 0) {
            return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get()
                    : CheckpointUtils.offsetsToStr(toOffsets));
        } else {
            log.info("About to read " + totalNewMsgs + " new messages from Kafka for topic: " + topicName);
        }

        // Perform the actual read from Kafka
        JavaRDD<byte[]> kafkaRDD = KafkaUtils.createRDD(sparkContext, byte[].class, byte[].class,
                DefaultDecoder.class, DefaultDecoder.class, kafkaParams, offsetRanges).values();

        // Produce a RDD[GenericRecord]
        final AvroConvertor avroConvertor = new AvroConvertor(schemaProvider.getSourceSchema().toString());
        JavaRDD<GenericRecord> newDataRDD;
        if (dataFormat == SourceDataFormat.AVRO) {
            newDataRDD = kafkaRDD.map(bytes -> avroConvertor.fromAvroBinary(bytes));
        } else if (dataFormat == SourceDataFormat.JSON) {
            newDataRDD = kafkaRDD.map(bytes -> avroConvertor.fromJson(new String(bytes, Charset.forName("utf-8"))));
        } else {
            throw new HoodieNotSupportedException("Unsupport data format :" + dataFormat);
        }

        return new ImmutablePair<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(toOffsets));
    }
}
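
Example usage

The sketch below is a minimal, hypothetical driver for this source. It assumes a reachable Kafka broker and some SchemaProvider implementation that can return the Avro schema of the records on the topic; the KafkaSourceExample class, the createSchemaProvider helper, the topic name "impressions" and the broker address are illustrative assumptions, not part of the file above. Only the KafkaSource constructor, the hoodie.deltastreamer.source.kafka.topic key, the auto.offset.reset handling and the fetchNewData contract come from the listing itself.

import java.util.Optional;

import org.apache.avro.generic.GenericRecord;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import com.uber.hoodie.utilities.schema.SchemaProvider;
import com.uber.hoodie.utilities.sources.KafkaSource;
import com.uber.hoodie.utilities.sources.SourceDataFormat;

public class KafkaSourceExample {

    public static void main(String[] args) {
        // Local Spark context, for illustration only.
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("kafka-source-example").setMaster("local[2]"));

        // Every key of this configuration is forwarded to the Kafka consumer;
        // the topic key is the one property KafkaSource itself requires.
        PropertiesConfiguration config = new PropertiesConfiguration();
        config.addProperty("hoodie.deltastreamer.source.kafka.topic", "impressions"); // hypothetical topic
        config.addProperty("metadata.broker.list", "localhost:9092");                 // assumed broker config
        config.addProperty("auto.offset.reset", "smallest");                          // start from the earliest offsets

        // Assumption: some SchemaProvider implementation is available for the topic's Avro schema.
        SchemaProvider schemaProvider = createSchemaProvider(config, jsc);

        KafkaSource source = new KafkaSource(config, jsc, SourceDataFormat.AVRO, schemaProvider);

        // First call: no checkpoint yet, so offsets start according to auto.offset.reset.
        Pair<Optional<JavaRDD<GenericRecord>>, String> batch =
                source.fetchNewData(Optional.empty(), Long.MAX_VALUE);
        String checkpoint = batch.getRight();
        batch.getLeft().ifPresent(rdd -> System.out.println("Read " + rdd.count() + " records"));

        // Subsequent calls resume from the returned checkpoint string,
        // e.g. "impressions,0:120,1:95".
        Pair<Optional<JavaRDD<GenericRecord>>, String> nextBatch =
                source.fetchNewData(Optional.of(checkpoint), Long.MAX_VALUE);
        System.out.println("Next checkpoint: " + nextBatch.getRight());

        jsc.stop();
    }

    // Placeholder (hypothetical): wire in a real SchemaProvider implementation here.
    private static SchemaProvider createSchemaProvider(PropertiesConfiguration config, JavaSparkContext jsc) {
        throw new UnsupportedOperationException("Provide a SchemaProvider implementation");
    }
}

Each call returns the new records (empty if nothing arrived since the checkpoint) together with a checkpoint string in the topic,partition:offset,... format built by CheckpointUtils; passing that string back on the next call makes the read incremental.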