gobblin.source.extractor.extract.kafka.KafkaAvroExtractor.java Source code

Introduction

Here is the source code for gobblin.source.extractor.extract.kafka.KafkaAvroExtractor.java, an implementation of Gobblin's Extractor for Kafka where events are in Avro format.

Source

/*
 * Copyright (C) 2014-2015 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package gobblin.source.extractor.extract.kafka;

import java.io.IOException;
import java.util.Arrays;

import kafka.message.MessageAndOffset;

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.commons.codec.binary.Hex;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Optional;

import gobblin.configuration.WorkUnitState;
import gobblin.metrics.kafka.KafkaAvroSchemaRegistry;
import gobblin.metrics.kafka.SchemaNotFoundException;
import gobblin.source.extractor.DataRecordException;
import gobblin.source.extractor.Extractor;
import gobblin.util.AvroUtils;

/**
 * An implementation of {@link Extractor} for Kafka, where events are in Avro format.
 *
 * @author ziliu
 */
public class KafkaAvroExtractor extends KafkaExtractor<Schema, GenericRecord> {

    private static final Logger LOG = LoggerFactory.getLogger(KafkaAvroExtractor.class);
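    // Placeholder schema returned by getSchema() for topics whose latest schema
    // could not be retrieved from the schema registry.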
    private static final Schema DEFAULT_SCHEMA = SchemaBuilder.record("DefaultSchema").fields().name("header")
            .type(SchemaBuilder.record("header").fields().name("time").type("long").withDefault(0).endRecord())
            .noDefault().endRecord();

    private final Optional<Schema> schema;
    private final KafkaAvroSchemaRegistry schemaRegistry;
    private final Optional<GenericDatumReader<Record>> reader;

    /**
     * @param state state should contain property "kafka.schema.registry.url", and optionally
     * "kafka.schema.registry.max.cache.size" (default = 1000) and
     * "kafka.schema.registry.cache.expire.after.write.min" (default = 10).
     * @throws SchemaNotFoundException if the latest schema of the topic cannot be retrieved
     * from the schema registry.
     */
    public KafkaAvroExtractor(WorkUnitState state) {
        super(state);
        this.schemaRegistry = new KafkaAvroSchemaRegistry(state.getProperties());
        this.schema = Optional.fromNullable(getLatestSchemaByTopic());
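        // If the registry lookup failed, schema is absent, no reader is created,
        // and readRecordImpl() will skip every record of this topic.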
        if (this.schema.isPresent()) {
            this.reader = Optional.of(new GenericDatumReader<Record>(this.schema.get()));
        } else {
            this.reader = Optional.absent();
        }
    }

    private Schema getLatestSchemaByTopic() {
        try {
            return this.schemaRegistry.getLatestSchemaByTopic(this.topicName);
        } catch (SchemaNotFoundException e) {
            LOG.error(String.format("Cannot find latest schema for topic %s. This topic will be skipped",
                    this.topicName), e);
            return null;
        }
    }

    @Override
    public GenericRecord readRecordImpl(GenericRecord reuse) throws DataRecordException, IOException {
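        // A topic whose schema could not be fetched is skipped: emit no records for it.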
        if (!this.schema.isPresent()) {
            return null;
        }
        return super.readRecordImpl(reuse);
    }

    @Override
    public Schema getSchema() {
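        // Fall back to DEFAULT_SCHEMA for skipped topics so callers never receive null.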
        return this.schema.or(DEFAULT_SCHEMA);
    }

    @Override
    protected GenericRecord decodeRecord(MessageAndOffset messageAndOffset)
            throws SchemaNotFoundException, IOException {
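        // Payload layout: [1 magic byte][SCHEMA_ID_LENGTH_BYTE bytes of schema ID][Avro binary record].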
        byte[] payload = getBytes(messageAndOffset.message().payload());
        if (payload[0] != KafkaAvroSchemaRegistry.MAGIC_BYTE) {
            throw new RuntimeException(
                    String.format("Unknown magic byte for partition %s", this.getCurrentPartition()));
        }

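        // Look up the writer's schema in the registry by the hex-encoded schema ID.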
        byte[] schemaIdByteArray = Arrays.copyOfRange(payload, 1,
                1 + KafkaAvroSchemaRegistry.SCHEMA_ID_LENGTH_BYTE);
        String schemaId = Hex.encodeHexString(schemaIdByteArray);
        Schema schema = this.schemaRegistry.getSchemaById(schemaId);
        reader.get().setSchema(schema);
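        // Binary-decode only the Avro body, skipping the magic byte and schema ID prefix.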
        Decoder binaryDecoder = DecoderFactory.get().binaryDecoder(payload,
                1 + KafkaAvroSchemaRegistry.SCHEMA_ID_LENGTH_BYTE,
                payload.length - 1 - KafkaAvroSchemaRegistry.SCHEMA_ID_LENGTH_BYTE, null);
        try {
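            // Records written with an older writer schema are converted to the
            // topic's latest schema before being returned.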
            GenericRecord record = reader.get().read(null, binaryDecoder);
            if (!record.getSchema().equals(this.schema.get())) {
                record = AvroUtils.convertRecordSchema(record, this.schema.get());
            }
            return record;
        } catch (IOException e) {
            LOG.error(String.format("Error during decoding record for partition %s: ", this.getCurrentPartition()));
            throw e;
        }
    }
}
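
For context, decodeRecord() above implies a simple wire format: one magic byte, then a fixed-length schema ID, then the Avro binary body. Below is a minimal sketch of the matching producer-side encoding, assuming a hypothetical MAGIC_BYTE value of 0x0 and a hex-encoded schema ID string; the real constants are defined in KafkaAvroSchemaRegistry, which is not part of this listing.

import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.EncoderFactory;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Hex;

public class AvroWireFormatSketch {

    // Hypothetical value; the actual constant is KafkaAvroSchemaRegistry.MAGIC_BYTE.
    private static final byte MAGIC_BYTE = 0x0;

    /**
     * Builds a payload in the layout decodeRecord() expects:
     * [MAGIC_BYTE][schema ID bytes][Avro binary record].
     */
    public static byte[] encode(GenericRecord record, String schemaIdHex)
            throws IOException, DecoderException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        out.write(MAGIC_BYTE);                               // 1-byte marker
        out.write(Hex.decodeHex(schemaIdHex.toCharArray())); // schema ID, decoded back to raw bytes
        BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        new GenericDatumWriter<GenericRecord>(record.getSchema()).write(record, encoder);
        encoder.flush();                                     // Avro binary body follows the header
        return out.toByteArray();
    }
}

decodeRecord() reverses these steps: it verifies the magic byte, hex-encodes the schema ID bytes to look up the writer's schema in the registry, and binary-decodes the remainder of the payload.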