gobblin.source.extractor.extract.kafka.KafkaDeserializerExtractor.java Source code


Introduction

Here is the source code for gobblin.source.extractor.extract.kafka.KafkaDeserializerExtractor.java, a Gobblin extractor that decodes Kafka messages using Kafka's Deserializer API.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.source.extractor.extract.kafka;

import io.confluent.kafka.serializers.KafkaAvroDeserializer;
import io.confluent.kafka.serializers.KafkaJsonDeserializer;

import java.io.IOException;
import java.util.Properties;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.lang3.reflect.ConstructorUtils;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;
import org.apache.kafka.common.serialization.Deserializer;
import org.apache.kafka.common.serialization.StringDeserializer;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Enums;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;

import gobblin.annotation.Alias;
import gobblin.configuration.WorkUnitState;
import gobblin.kafka.client.ByteArrayBasedKafkaRecord;
import gobblin.metrics.kafka.KafkaSchemaRegistry;
import gobblin.metrics.kafka.SchemaRegistryException;
import gobblin.util.AvroUtils;
import gobblin.util.PropertiesUtils;

/**
 * <p>
 *   Extension of {@link KafkaExtractor} that wraps Kafka's {@link Deserializer} API. Kafka's {@link Deserializer} provides
 *   a generic way of converting Kafka {@link kafka.message.Message}s to {@link Object}. Typically, a {@link Deserializer}
 *   will be used along with a {@link org.apache.kafka.common.serialization.Serializer} which is responsible for converting
 *   an {@link Object} to a Kafka {@link kafka.message.Message}. These APIs are useful for reading from and writing to
 *   Kafka, since it is primarily a byte-oriented system.
 * </p>
 *
 * <p>
 *   This class wraps the {@link Deserializer} API allowing any existing classes that implement the {@link Deserializer}
 *   API to integrate seamlessly with Gobblin. The deserializer can be specified in the following ways:
 *
 *   <ul>
 *     <li>{@link #KAFKA_DESERIALIZER_TYPE} can be set either to a pre-defined enum constant from {@link Deserializers}
 *     or to the fully-qualified name of a {@link Class} that implements the {@link Deserializer}
 *     interface. If this property is set to a class name, a {@link KafkaSchemaRegistry} can also be specified
 *     using the {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_CLASS} config key; otherwise
 *     {@link SimpleKafkaSchemaRegistry} is used</li>
 *   </ul>
 * </p>
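 *
 * <p>
 *   For example, a job reading Confluent Avro records might set the following properties (a minimal sketch;
 *   the key names correspond to {@link #KAFKA_DESERIALIZER_TYPE} and
 *   {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_URL}, and the URL is illustrative):
 * </p>
 *
 * <pre>
 *   kafka.deserializer.type=CONFLUENT_AVRO
 *   kafka.schema.registry.url=http://localhost:8081
 * </pre>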
 */
@Getter(AccessLevel.PACKAGE)
@Alias(value = "DESERIALIZER")
public class KafkaDeserializerExtractor extends KafkaExtractor<Object, Object> {

    public static final String KAFKA_DESERIALIZER_TYPE = "kafka.deserializer.type";

    private static final String CONFLUENT_SCHEMA_REGISTRY_URL = "schema.registry.url";

    private final Deserializer<?> kafkaDeserializer;
    private final KafkaSchemaRegistry<?, ?> kafkaSchemaRegistry;
    private final Schema latestSchema;

    public KafkaDeserializerExtractor(WorkUnitState state) throws ReflectiveOperationException {
        this(state, getDeserializerType(state.getProperties()));
    }

    private KafkaDeserializerExtractor(WorkUnitState state, Optional<Deserializers> deserializerType)
            throws ReflectiveOperationException {
        this(state, deserializerType, getDeserializer(getProps(state), deserializerType),
                getKafkaSchemaRegistry(getProps(state)));
    }

    @VisibleForTesting
    KafkaDeserializerExtractor(WorkUnitState state, Optional<Deserializers> deserializerType,
            Deserializer<?> kafkaDeserializer, KafkaSchemaRegistry<?, ?> kafkaSchemaRegistry) {
        super(state);
        this.kafkaDeserializer = kafkaDeserializer;
        this.kafkaSchemaRegistry = kafkaSchemaRegistry;
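        // Confluent's Avro deserializer is the only pre-defined type that needs the latest registered
        // schema; decodeRecord(...) uses it to up-convert records across schema versions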
        this.latestSchema = (deserializerType.equals(Optional.of(Deserializers.CONFLUENT_AVRO)))
                ? (Schema) getSchema()
                : null;
    }

    @Override
    protected Object decodeRecord(ByteArrayBasedKafkaRecord messageAndOffset) throws IOException {
        Object deserialized = kafkaDeserializer.deserialize(this.topicName, messageAndOffset.getMessageBytes());

        // For Confluent's Schema Registry the read schema is the latest registered schema to support schema evolution
        return (this.latestSchema == null) ? deserialized
                : AvroUtils.convertRecordSchema((GenericRecord) deserialized, this.latestSchema);
    }

    @Override
    public Object getSchema() {
        try {
            return this.kafkaSchemaRegistry.getLatestSchemaByTopic(this.topicName);
        } catch (SchemaRegistryException e) {
            throw new RuntimeException(e);
        }
    }

    private static Optional<Deserializers> getDeserializerType(Properties props) {
        Preconditions.checkArgument(props.containsKey(KAFKA_DESERIALIZER_TYPE),
                "Missing required property " + KAFKA_DESERIALIZER_TYPE);
        return Enums.getIfPresent(Deserializers.class, props.getProperty(KAFKA_DESERIALIZER_TYPE).toUpperCase());
    }

    /**
     * Constructs a {@link Deserializer} based on the value of {@link #KAFKA_DESERIALIZER_TYPE}, which may name a
     * pre-defined {@link Deserializers} constant or a fully-qualified {@link Deserializer} class.
     */
    private static Deserializer<?> getDeserializer(Properties props, Optional<Deserializers> deserializerType)
            throws ReflectiveOperationException {

        Deserializer<?> deserializer;
        if (deserializerType.isPresent()) {
            deserializer = ConstructorUtils.invokeConstructor(deserializerType.get().getDeserializerClass());
        } else {
            deserializer = Deserializer.class.cast(
                    ConstructorUtils.invokeConstructor(Class.forName(props.getProperty(KAFKA_DESERIALIZER_TYPE))));
        }
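        // Honor Kafka's Deserializer contract: hand the job properties to the deserializer before use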
        deserializer.configure(PropertiesUtils.propsToStringKeyMap(props), false);
        return deserializer;
    }

    /**
     * Constructs a {@link KafkaSchemaRegistry} based on the value of {@link #KAFKA_DESERIALIZER_TYPE}. If the value
     * is not a pre-defined {@link Deserializers} constant, the class configured via
     * {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_CLASS} is used, defaulting to
     * {@link SimpleKafkaSchemaRegistry} when that key is also absent.
     */
    private static KafkaSchemaRegistry<?, ?> getKafkaSchemaRegistry(Properties props)
            throws ReflectiveOperationException {

        Optional<Deserializers> deserializerType = Enums.getIfPresent(Deserializers.class,
                props.getProperty(KAFKA_DESERIALIZER_TYPE).toUpperCase());

        if (deserializerType.isPresent()) {
            return ConstructorUtils.invokeConstructor(deserializerType.get().getSchemaRegistryClass(), props);
        }
        if (props.containsKey(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS)) {
            return KafkaSchemaRegistry.get(props);
        }
        return new SimpleKafkaSchemaRegistry(props);
    }

    /**
     * Gets {@link Properties} from a {@link WorkUnitState} and sets the config <code>schema.registry.url</code> to the
     * value of {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_URL}, if set. This way users don't need to specify
     * both properties, since <code>schema.registry.url</code> is required by the {@link ConfluentKafkaSchemaRegistry}.
     */
    private static Properties getProps(WorkUnitState workUnitState) {
        Properties properties = workUnitState.getProperties();
        if (properties.containsKey(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_URL)) {
            properties.setProperty(CONFLUENT_SCHEMA_REGISTRY_URL,
                    properties.getProperty(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_URL));
        }
        return properties;
    }

    /**
     * Pre-defined {@link Deserializer}s that can be referenced by enum name, each paired with the
     * {@link KafkaSchemaRegistry} implementation it requires.
     */
    @AllArgsConstructor
    @Getter
    public enum Deserializers {

        /**
         * Confluent's Avro {@link Deserializer}
         *
         * @see KafkaAvroDeserializer
         */
        CONFLUENT_AVRO(KafkaAvroDeserializer.class, ConfluentKafkaSchemaRegistry.class),

        /**
         * Confluent's JSON {@link Deserializer}
         *
         * @see KafkaJsonDeserializer
         */
        CONFLUENT_JSON(KafkaJsonDeserializer.class, SimpleKafkaSchemaRegistry.class),

        /**
         * A custom {@link Deserializer} for converting <code>byte[]</code> to {@link com.google.gson.JsonElement}s
         *
         * @see KafkaGsonDeserializer
         */
        GSON(KafkaGsonDeserializer.class, SimpleKafkaSchemaRegistry.class),

        /**
         * A standard Kafka {@link Deserializer} that does nothing; it simply returns the raw <code>byte[]</code>
         */
        BYTE_ARRAY(ByteArrayDeserializer.class, SimpleKafkaSchemaRegistry.class),

        /**
         * A standard Kafka {@link Deserializer} for converting <code>byte[]</code> to {@link String}s
         */
        STRING(StringDeserializer.class, SimpleKafkaSchemaRegistry.class);

        private final Class<? extends Deserializer> deserializerClass;
        private final Class<? extends KafkaSchemaRegistry> schemaRegistryClass;
    }
}
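
Usage

KafkaDeserializerExtractor accepts any implementation of Kafka's Deserializer interface, referenced either by a Deserializers enum constant or by a fully-qualified class name. Below is a minimal sketch of a custom Deserializer that could be plugged in via the kafka.deserializer.type property; the package, class name, and upper-casing logic are hypothetical and serve only to illustrate the contract.

package com.example.kafka;

import java.nio.charset.StandardCharsets;
import java.util.Map;

import org.apache.kafka.common.serialization.Deserializer;

/**
 * A hypothetical Deserializer that decodes each Kafka message as a UTF-8 string and upper-cases it.
 */
public class UpperCaseStringDeserializer implements Deserializer<String> {

    @Override
    public void configure(Map<String, ?> configs, boolean isKey) {
        // KafkaDeserializerExtractor passes the job properties here as a string-keyed map;
        // this sketch needs no configuration
    }

    @Override
    public String deserialize(String topic, byte[] data) {
        return new String(data, StandardCharsets.UTF_8).toUpperCase();
    }

    @Override
    public void close() {
        // No resources to release
    }
}

A job would then set kafka.deserializer.type=com.example.kafka.UpperCaseStringDeserializer. Since that value is not a Deserializers constant, getKafkaSchemaRegistry(Properties) falls back to the class named by KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS if set, or to SimpleKafkaSchemaRegistry otherwise.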