com.hurence.logisland.connect.source.KafkaConnectStreamSource.java Source code

Introduction

Here is the source code for com.hurence.logisland.connect.source.KafkaConnectStreamSource.java, a bridge that exposes a Kafka Connect source connector as a Spark SQL streaming source.

Source

/**
 * Copyright (C) 2016 Hurence (support@hurence.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.hurence.logisland.connect.source;

import com.hurence.logisland.connect.AbstractKafkaConnectComponent;
import com.hurence.logisland.stream.spark.provider.StreamOptions;
import com.hurence.logisland.util.spark.SparkPlatform;
import org.apache.commons.lang3.StringUtils;
import org.apache.kafka.connect.runtime.WorkerSourceTaskContext;
import org.apache.kafka.connect.source.SourceConnector;
import org.apache.kafka.connect.source.SourceRecord;
import org.apache.kafka.connect.source.SourceTask;
import org.apache.kafka.connect.storage.Converter;
import org.apache.kafka.connect.storage.OffsetBackingStore;
import org.apache.kafka.connect.storage.OffsetStorageReaderImpl;
import org.apache.kafka.connect.storage.OffsetStorageWriter;
import org.apache.kafka.connect.util.ConnectorTaskId;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.streaming.Offset;
import org.apache.spark.sql.execution.streaming.SerializedOffset;
import org.apache.spark.sql.execution.streaming.Source;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.Tuple2;
import scala.collection.JavaConversions;

import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * Kafka Connect to Spark SQL streaming bridge.
 *
 * @author amarziali
 */
public class KafkaConnectStreamSource extends AbstractKafkaConnectComponent<SourceConnector, SourceTask>
        implements Source {

    /**
     * The schema used for this source (see {@link #DATA_SCHEMA} for the schema of the emitted rows).
     */
    public final static StructType SCHEMA = new StructType(new StructField[] {
            new StructField(StreamOptions.KAFKA_CONNECT_CONNECTOR_PROPERTIES().getName(),
                    DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()),
            new StructField(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER().getName(), DataTypes.StringType, false,
                    Metadata.empty()),
            new StructField(StreamOptions.KAFKA_CONNECT_KEY_CONVERTER_PROPERTIES().getName(),
                    DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()),
            new StructField(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER().getName(), DataTypes.StringType, false,
                    Metadata.empty()),
            new StructField(StreamOptions.KAFKA_CONNECT_VALUE_CONVERTER_PROPERTIES().getName(),
                    DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), false, Metadata.empty()),
            new StructField(StreamOptions.KAFKA_CONNECT_MAX_TASKS().getName(),
                    DataTypes.createMapType(DataTypes.IntegerType, DataTypes.StringType), false,
                    Metadata.empty()) });
    /**
     * The schema used to represent the outgoing dataframe.
     */
    public final static StructType DATA_SCHEMA = new StructType(
            new StructField[] { new StructField("topic", DataTypes.StringType, false, Metadata.empty()),
                    new StructField("sourcePartition", DataTypes.StringType, false, Metadata.empty()),
                    new StructField("sourceOffset", DataTypes.StringType, false, Metadata.empty()),
                    new StructField("key", DataTypes.BinaryType, true, Metadata.empty()),
                    new StructField("value", DataTypes.BinaryType, false, Metadata.empty())

            });
    private final static Logger LOGGER = LoggerFactory.getLogger(KafkaConnectStreamSource.class);

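    /** Generates monotonically increasing batch offsets (counter) and sequential task ids (taskCounter). */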
    private final AtomicLong counter = new AtomicLong();
    private final AtomicInteger taskCounter = new AtomicInteger();

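    /** One offset writer per task, used to persist source offsets when a batch is committed. */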
    private final Map<SourceTask, OffsetStorageWriter> offsetWriterMap = new IdentityHashMap<>();
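    /** Records polled from the tasks, keyed by batch offset, waiting to be delivered through getBatch(). */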
    private final SortedMap<Long, List<Tuple2<SourceTask, SourceRecord>>> bufferedRecords = Collections
            .synchronizedSortedMap(new TreeMap<>());
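    /** Records already handed out in a batch but not yet acknowledged through commit(). */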
    private final SortedMap<Long, List<Tuple2<SourceTask, SourceRecord>>> uncommittedRecords = Collections
            .synchronizedSortedMap(new TreeMap<>());
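    /** Tasks that currently have a background poll thread running, so the same task is never polled twice concurrently. */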
    private final Map<SourceTask, Thread> busyTasks = Collections.synchronizedMap(new IdentityHashMap<>());

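    /**
     * Version-specific Spark bridge discovered through the {@link ServiceLoader} SPI; fails fast
     * when no implementation is available on the classpath.
     */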
    private final SparkPlatform sparkPlatform = StreamSupport
            .stream(Spliterators.spliteratorUnknownSize(ServiceLoader.load(SparkPlatform.class).iterator(),
                    Spliterator.ORDERED), false)
            .findFirst().orElseThrow(() -> new IllegalStateException(
                    "SparkPlatform service spi not defined. " + "Unable to continue"));

    /**
     * Base constructor. Should be called by {@link KafkaConnectStreamSourceProvider}.
     *
     * @param sqlContext          the Spark SQL context.
     * @param connectorProperties the connector-related properties.
     * @param keyConverter        the converter for the record key.
     * @param valueConverter      the converter for the record value (body).
     * @param offsetBackingStore  the offset backing store implementation (can be in-memory, file based, Kafka based, etc.).
     * @param maxTasks            the maximum number of tasks this source should spawn.
     * @param connectorClass      the class of the Kafka Connect source connector to wrap.
     * @param streamId            the id of the underlying stream.
     */
    public KafkaConnectStreamSource(SQLContext sqlContext, Map<String, String> connectorProperties,
            Converter keyConverter, Converter valueConverter, OffsetBackingStore offsetBackingStore, int maxTasks,
            String connectorClass, String streamId) {
        super(sqlContext, connectorProperties, keyConverter, valueConverter, offsetBackingStore, maxTasks,
                connectorClass, streamId);

    }

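    /**
     * Prepares a newly created task: assigns it a {@link ConnectorTaskId}, hands it a
     * {@link WorkerSourceTaskContext} backed by an offset reader, and registers a dedicated
     * {@link OffsetStorageWriter} so its source offsets can be persisted on commit.
     */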
    @Override
    protected void initialize(SourceTask task) {
        int taskId = taskCounter.incrementAndGet();
        ConnectorTaskId connectorTaskId = new ConnectorTaskId(
                StringUtils.join(new String[] { streamId, connectorName }, '#'), taskId);
        task.initialize(new WorkerSourceTaskContext(new OffsetStorageReaderImpl(offsetBackingStore,
                connectorTaskId.toString(), createInternalConverter(true), createInternalConverter(false))));
        offsetWriterMap.put(task, new OffsetStorageWriter(offsetBackingStore, connectorTaskId.toString(),
                createInternalConverter(true), createInternalConverter(false)));

    }

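    /** Schema advertised by this source (see {@link #SCHEMA}). */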
    @Override
    public StructType schema() {
        return SCHEMA;
    }

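    /**
     * Resets the per-run state (counters, buffers and offset writers) before delegating to the
     * parent class to create and start the connector tasks.
     */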
    @Override
    protected void createAndStartAllTasks()
            throws IllegalAccessException, InstantiationException, ClassNotFoundException {
        counter.set(0);
        taskCounter.set(0);
        busyTasks.clear();
        bufferedRecords.clear();
        offsetWriterMap.clear();
        super.createAndStartAllTasks();
    }

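    /**
     * Computes the offset up to which data is available. If records are still awaiting commit, a
     * fresh offset is advertised so the engine keeps processing; otherwise, when nothing is
     * buffered, every idle task is polled on a background thread and no offset is returned yet.
     * Once records have been buffered, the highest buffered key is returned.
     */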
    @Override
    public synchronized Option<Offset> getOffset() {
        if (!uncommittedRecords.isEmpty()) {
            return Option.apply(SerializedOffset.apply(Long.toString(counter.incrementAndGet())));
        }
        if (bufferedRecords.isEmpty()) {
            tasks.forEach(t -> busyTasks.computeIfAbsent(t, sourceTask -> {
                Thread thread = new Thread(() -> {
                    try {
                        List<Tuple2<SourceTask, SourceRecord>> tmp = sourceTask.poll().stream()
                                .map(sourceRecord -> Tuple2.apply(sourceTask, sourceRecord))
                                .collect(Collectors.toList());
                        if (!tmp.isEmpty()) {
                            bufferedRecords.put(counter.incrementAndGet(), tmp);
                        }
                    } catch (InterruptedException ie) {
                        LOGGER.warn("Task {} interrupted while waiting.", sourceTask.getClass().getCanonicalName());
                    } finally {
                        busyTasks.remove(t);
                    }
                });
                thread.start();
                return thread;
            }));
        } else {
            return Option.apply(SerializedOffset.apply(bufferedRecords.lastKey().toString()));

        }
        return Option.empty();
    }

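    /**
     * Builds the batch between the start and end offsets: drains the matching entries from the
     * buffer, parks them as uncommitted, converts each {@link SourceRecord} into an
     * {@link InternalRow} matching {@link #DATA_SCHEMA} (key and value serialized with the
     * configured converters), and groups the rows by source partition before wrapping them into a
     * streaming dataframe.
     */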
    @Override
    public Dataset<Row> getBatch(Option<Offset> start, Offset end) {
        Long startOff = start.isDefined() ? Long.parseLong(start.get().json())
                : !bufferedRecords.isEmpty() ? bufferedRecords.firstKey() : 0L;

        Map<Integer, List<InternalRow>> current = new LinkedHashMap<>(
                bufferedRecords.subMap(startOff, Long.parseLong(end.json()) + 1)).keySet().stream()
                        .flatMap(offset -> {
                            List<Tuple2<SourceTask, SourceRecord>> srl = bufferedRecords.remove(offset);
                            if (srl != null) {
                                uncommittedRecords.put(offset, srl);
                                return srl.stream();
                            }
                            return Stream.empty();
                        }).map(Tuple2::_2).map(
                                sourceRecord -> InternalRow.fromSeq(JavaConversions
                                        .<Object>asScalaBuffer(Arrays.asList(toUTFString(sourceRecord.topic()),
                                                toUTFString(sourceRecord.sourcePartition()),
                                                toUTFString(sourceRecord.sourceOffset()),
                                                keyConverter.fromConnectData(sourceRecord.topic(),
                                                        sourceRecord.keySchema(), sourceRecord.key()),
                                                valueConverter.fromConnectData(sourceRecord.topic(),
                                                        sourceRecord.valueSchema(), sourceRecord.value())))
                                        .toSeq()))
                        .collect(Collectors.groupingBy(row -> Objects.hashCode((row.getString(1)))));
        return sparkPlatform.createStreamingDataFrame(sqlContext, new SimpleRDD(sqlContext.sparkContext(), current),
                DATA_SCHEMA);

    }

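    /** Null-safe conversion to {@link UTF8String}; returns the empty string for a null input. */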
    private UTF8String toUTFString(Object o) {
        if (o != null) {
            return UTF8String.fromString(o.toString());
        }
        return UTF8String.EMPTY_UTF8;
    }

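    /**
     * Acknowledges every record up to the given end offset: each record is committed back to its
     * task and its offset queued on the task's {@link OffsetStorageWriter}, commit() is then
     * invoked once per task, and finally every offset writer is flushed (with a 30 second timeout)
     * so the offsets reach the backing store.
     */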
    @Override
    public void commit(Offset end) {
        if (uncommittedRecords.isEmpty()) {
            return;
        }
        //first commit all offsets already given
        List<Tuple2<SourceTask, SourceRecord>> recordsToCommit = new LinkedHashMap<>(
                uncommittedRecords.subMap(uncommittedRecords.firstKey(), Long.parseLong(end.json()) + 1)).keySet()
                        .stream().flatMap(key -> uncommittedRecords.remove(key).stream())
                        .collect(Collectors.toList());

        recordsToCommit.forEach(tuple -> {
            try {
                offsetWriterMap.get(tuple._1()).offset(tuple._2().sourcePartition(), tuple._2().sourceOffset());
                tuple._1().commitRecord(tuple._2());
            } catch (Exception e) {
                LOGGER.warn("Unable to commit record " + tuple._2(), e);
            }
        });
        recordsToCommit.stream().map(Tuple2::_1).distinct().forEach(sourceTask -> {
            try {
                sourceTask.commit();
            } catch (Exception e) {
                LOGGER.warn("Unable to bulk commit offset for connector " + connectorName, e);
            }
        });
        //now flush offset writer
        offsetWriterMap.values().forEach(offsetStorageWriter -> {
            try {
                if (offsetStorageWriter.beginFlush()) {
                    offsetStorageWriter.doFlush((error, result) -> {
                        if (error == null) {
                            LOGGER.debug("Flushing till offset {} with result {}", end, result);
                        } else {
                            LOGGER.error("Unable to commit records till source offset " + end, error);

                        }
                    }).get(30, TimeUnit.SECONDS);
                }
            } catch (Exception e) {
                LOGGER.error("Unable to commit records till source offset " + end, e);
            }
        });
    }

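    /** Stops the connector and all of its tasks (fully delegated to the parent class). */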
    @Override
    public void stop() {
        super.stop();
    }

}
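
For illustration, here is a minimal sketch of how such a source could be instantiated by hand, for example in a local test. It is not part of the file above: in Logisland the source is normally created by KafkaConnectStreamSourceProvider, the connector class org.example.MySourceConnector is a placeholder, and JsonConverter plus MemoryOffsetBackingStore are just one possible choice of converter and offset backing store.

package com.hurence.logisland.connect.source;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.connect.json.JsonConverter;
import org.apache.kafka.connect.storage.MemoryOffsetBackingStore;
import org.apache.spark.sql.SparkSession;

public class KafkaConnectStreamSourceSketch {

    public static void main(String[] args) throws Exception {
        // Local Spark session, only for the purpose of this sketch.
        SparkSession spark = SparkSession.builder()
                .master("local[2]")
                .appName("kafka-connect-source-sketch")
                .getOrCreate();

        // Connector configuration, as it would normally come from the stream options.
        Map<String, String> connectorProperties = new HashMap<>();
        connectorProperties.put("connector.class", "org.example.MySourceConnector"); // placeholder connector

        // Key/value converters; a schema-less JsonConverter is one possible choice.
        JsonConverter keyConverter = new JsonConverter();
        keyConverter.configure(Collections.singletonMap("schemas.enable", "false"), true);
        JsonConverter valueConverter = new JsonConverter();
        valueConverter.configure(Collections.singletonMap("schemas.enable", "false"), false);

        // In-memory offset store: offsets are lost on restart, which is acceptable for a test.
        // Depending on the Kafka version it may also need to be configured and started explicitly.
        MemoryOffsetBackingStore offsetStore = new MemoryOffsetBackingStore();

        KafkaConnectStreamSource source = new KafkaConnectStreamSource(
                spark.sqlContext(),
                connectorProperties,
                keyConverter,
                valueConverter,
                offsetStore,
                1,                                // maxTasks
                "org.example.MySourceConnector",  // placeholder connector class
                "my-stream");                     // stream id

        // Spark's streaming engine would then drive getOffset()/getBatch()/commit() on this source.
        System.out.println("Source schema: " + source.schema());
    }
}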