Java tutorial
/*
 * Copyright 2017 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.plugin.batchSource;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Macro;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.format.FormatSpecification;
import co.cask.cdap.api.data.format.RecordFormat;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.cdap.format.RecordFormats;
import co.cask.hydrator.common.KeyValueListParser;
import co.cask.hydrator.common.ReferencePluginConfig;
import co.cask.hydrator.common.SourceInputFormatProvider;
import co.cask.hydrator.common.batch.JobUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import kafka.common.TopicAndPartition;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;

/**
 * Kafka batch source.
 */
@Plugin(type = BatchSource.PLUGIN_TYPE)
@Name(KafkaBatchSource.NAME)
@Description("Kafka batch source.")
public class KafkaBatchSource extends BatchSource<KafkaKey, KafkaMessage, StructuredRecord> {
  public static final String NAME = "Kafka";

  private final KafkaBatchConfig config;

  private KeyValueTable table;
  private List<KafkaRequest> kafkaRequests;
  private Schema schema;
  private RecordFormat<StreamEvent, StructuredRecord> recordFormat;
  private String messageField;

  /**
   * Config properties for the plugin.
   */
  public static class KafkaBatchConfig extends ReferencePluginConfig {

    @Description("Kafka topic to read from.")
    @Macro
    private String topic;

    @Description("List of Kafka brokers specified in host1:port1,host2:port2 form. For example, " +
      "host1.example.com:9092,host2.example.com:9092.")
    @Macro
    private String kafkaBrokers;

    @Description("Name of the table that tracks the latest offsets read from Kafka. It is recommended to give it " +
      "the same name as the pipeline to avoid table name conflicts.")
    private String tableName;

    @Description("The topic partitions to read from. If not specified, all partitions will be read.")
    @Macro
    @Nullable
    private String partitions;

    @Description("The initial offset for each topic partition, given as partition:offset pairs. For example, " +
      "'0:100,1:200' starts partition 0 at offset 100 and partition 1 at offset 200. These offsets are only used " +
      "for the first run of the pipeline; any subsequent run reads from the latest offsets of the previous run. " +
      "Offsets are inclusive: if an offset of 5 is used, the message at offset 5 will be read.")
    @Nullable
    @Macro
    private String initialPartitionOffsets;

    @Description("Output schema of the source, including the keyField, partitionField, and offsetField if they are " +
      "used. The remaining fields are used in conjunction with the format to parse Kafka payloads.")
    private String schema;

    @Description("Optional format of the Kafka event. Any format supported by CDAP is supported. " +
      "For example, a value of 'csv' will attempt to parse Kafka payloads as comma-separated values. " +
      "If no format is given, Kafka message payloads will be treated as bytes.")
    @Nullable
    private String format;

    @Description("Optional name of the field containing the message key. " +
      "If this is not set, no key field will be added to output records. " +
      "If set, this field must be present in the schema property and must be bytes.")
    @Nullable
    private String keyField;

    @Description("Optional name of the field containing the Kafka partition that was read from. " +
      "If this is not set, no partition field will be added to output records. " +
      "If set, this field must be present in the schema property and must be an integer.")
    @Nullable
    private String partitionField;

    @Description("Optional name of the field containing the Kafka offset that the message was read from. " +
      "If this is not set, no offset field will be added to output records. " +
      "If set, this field must be present in the schema property and must be a long.")
    @Nullable
    private String offsetField;

    public KafkaBatchConfig() {
      super("");
    }

    public KafkaBatchConfig(String brokers, String partitions, String topic, String initialPartitionOffsets,
                            Long defaultOffset) {
      super(String.format("Kafka_%s", topic));
      this.kafkaBrokers = brokers;
      this.partitions = partitions;
      this.topic = topic;
      this.initialPartitionOffsets = initialPartitionOffsets;
    }

    // Accessors

    public String getTopic() {
      return topic;
    }

    public String getBrokers() {
      return kafkaBrokers;
    }

    public String getTableName() {
      return tableName;
    }

    public Set<Integer> getPartitions() {
      Set<Integer> partitionSet = new HashSet<>();
      if (partitions == null) {
        return partitionSet;
      }
      for (String partition : Splitter.on(',').trimResults().split(partitions)) {
        try {
          partitionSet.add(Integer.parseInt(partition));
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
            String.format("Invalid partition '%s'. Partitions must be integers.", partition));
        }
      }
      return partitionSet;
    }

    @Nullable
    public String getKeyField() {
      return Strings.isNullOrEmpty(keyField) ? null : keyField;
    }

    @Nullable
    public String getPartitionField() {
      return Strings.isNullOrEmpty(partitionField) ? null : partitionField;
    }

    @Nullable
    public String getOffsetField() {
      return Strings.isNullOrEmpty(offsetField) ? null : offsetField;
    }

    @Nullable
    public String getFormat() {
      return Strings.isNullOrEmpty(format) ? null : format;
    }

    public Schema getSchema() {
      try {
        return Strings.isNullOrEmpty(schema) ? null : Schema.parseJson(schema);
      } catch (IOException e) {
        throw new IllegalArgumentException("Unable to parse schema: " + e.getMessage());
      }
    }

    // Gets the message schema from the schema field. If the key, partition, or offset fields are in the configured
    // schema, they will be removed.
    public Schema getMessageSchema() {
      Schema schema = getSchema();
      List<Schema.Field> messageFields = new ArrayList<>();
      boolean keyFieldExists = false;
      boolean partitionFieldExists = false;
      boolean offsetFieldExists = false;
      for (Schema.Field field : schema.getFields()) {
        String fieldName = field.getName();
        Schema fieldSchema = field.getSchema();
        Schema.Type fieldType = fieldSchema.isNullable()
          ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
        // if the field is not the key, partition, or offset field, it is a message field.
        if (fieldName.equals(keyField)) {
          if (fieldType != Schema.Type.BYTES) {
            throw new IllegalArgumentException("The key field must be of type bytes or nullable bytes.");
          }
          keyFieldExists = true;
        } else if (fieldName.equals(partitionField)) {
          if (fieldType != Schema.Type.INT) {
            throw new IllegalArgumentException("The partition field must be of type int.");
          }
          partitionFieldExists = true;
        } else if (fieldName.equals(offsetField)) {
          if (fieldType != Schema.Type.LONG) {
            throw new IllegalArgumentException("The offset field must be of type long.");
          }
          offsetFieldExists = true;
        } else {
          messageFields.add(field);
        }
      }
      if (messageFields.isEmpty()) {
        throw new IllegalArgumentException(
          "Schema must contain at least one message field besides the key, partition, and offset fields.");
      }
      if (getKeyField() != null && !keyFieldExists) {
        throw new IllegalArgumentException(String.format(
          "keyField '%s' does not exist in the schema. Please add it to the schema.", keyField));
      }
      if (getPartitionField() != null && !partitionFieldExists) {
        throw new IllegalArgumentException(String.format(
          "partitionField '%s' does not exist in the schema. Please add it to the schema.", partitionField));
      }
      if (getOffsetField() != null && !offsetFieldExists) {
        throw new IllegalArgumentException(String.format(
          "offsetField '%s' does not exist in the schema. Please add it to the schema.", offsetField));
      }
      return Schema.recordOf("kafka.message", messageFields);
    }

    /**
     * @return broker host to broker port mapping.
     */
    public Map<String, Integer> getBrokerMap() {
      Map<String, Integer> brokerMap = new HashMap<>();
      for (KeyValue<String, String> hostAndPort : KeyValueListParser.DEFAULT.parse(kafkaBrokers)) {
        String host = hostAndPort.getKey();
        String portStr = hostAndPort.getValue();
        try {
          brokerMap.put(host, Integer.parseInt(portStr));
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(String.format("Invalid port '%s' for host '%s'.", portStr, host));
        }
      }
      if (brokerMap.isEmpty()) {
        throw new IllegalArgumentException("Must specify kafka brokers.");
      }
      return brokerMap;
    }

    public Map<TopicAndPartition, Long> getInitialPartitionOffsets() {
      Map<TopicAndPartition, Long> partitionOffsets = new HashMap<>();
      // if initial partition offsets are specified, overwrite the defaults.
      if (initialPartitionOffsets != null) {
        for (KeyValue<String, String> partitionAndOffset : KeyValueListParser.DEFAULT.parse(initialPartitionOffsets)) {
          String partitionStr = partitionAndOffset.getKey();
          String offsetStr = partitionAndOffset.getValue();
          int partition;
          try {
            partition = Integer.parseInt(partitionStr);
          } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
              String.format("Invalid partition '%s' in initialPartitionOffsets.", partitionStr));
          }
          long offset;
          try {
            offset = Long.parseLong(offsetStr);
          } catch (NumberFormatException e) {
            throw new IllegalArgumentException(
              String.format("Invalid offset '%s' in initialPartitionOffsets for partition %d.", offsetStr, partition));
          }
          partitionOffsets.put(new TopicAndPartition(topic, partition), offset);
        }
      }
      return partitionOffsets;
    }

    public void validate() {
      getBrokerMap();
      getPartitions();
      getInitialPartitionOffsets();
      Schema messageSchema = getMessageSchema();
      // if format is empty, there must be just a single message field of type bytes or nullable bytes.
      if (Strings.isNullOrEmpty(format)) {
        List<Schema.Field> messageFields = messageSchema.getFields();
        if (messageFields.size() > 1) {
          List<String> fieldNames = new ArrayList<>();
          for (Schema.Field messageField : messageFields) {
            fieldNames.add(messageField.getName());
          }
          throw new IllegalArgumentException(String.format(
            "Without a format, the schema must contain just a single message field of type bytes or nullable bytes. " +
              "Found %s message fields (%s).", messageFields.size(), Joiner.on(',').join(fieldNames)));
        }
        Schema.Field messageField = messageFields.get(0);
        Schema messageFieldSchema = messageField.getSchema();
        Schema.Type messageFieldType = messageFieldSchema.isNullable()
          ? messageFieldSchema.getNonNullable().getType() : messageFieldSchema.getType();
        if (messageFieldType != Schema.Type.BYTES) {
          throw new IllegalArgumentException(String.format(
            "Without a format, the message field must be of type bytes or nullable bytes, " +
              "but field %s is of type %s.", messageField.getName(), messageField.getSchema()));
        }
      } else {
        // otherwise, if there is a format, make sure we can instantiate it.
        FormatSpecification formatSpec = new FormatSpecification(format, messageSchema,
                                                                 new HashMap<String, String>());
        try {
          RecordFormats.createInitializedFormat(formatSpec);
        } catch (Exception e) {
          throw new IllegalArgumentException(String.format(
            "Unable to instantiate a message parser from format '%s' and message schema '%s': %s",
            format, messageSchema, e.getMessage()), e);
        }
      }
    }
  }

  public KafkaBatchSource(KafkaBatchConfig config) {
    this.config = config;
  }

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    pipelineConfigurer.createDataset(config.getTableName(), KeyValueTable.class, DatasetProperties.EMPTY);
    config.validate();
    pipelineConfigurer.getStageConfigurer().setOutputSchema(config.getSchema());
  }

  @Override
  public void prepareRun(BatchSourceContext context) throws Exception {
    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();
    table = context.getDataset(config.getTableName());
    kafkaRequests = KafkaInputFormat.saveKafkaRequests(conf, config.getTopic(), config.getBrokerMap(),
                                                       config.getPartitions(),
                                                       config.getInitialPartitionOffsets(), table);
    context.setInput(Input.of(config.referenceName, new SourceInputFormatProvider(KafkaInputFormat.class, conf)));
  }

  @Override
  public void onRunFinish(boolean succeeded, BatchSourceContext context) {
    if (succeeded) {
      for (KafkaRequest kafkaRequest : kafkaRequests) {
        TopicAndPartition topicAndPartition = new TopicAndPartition(kafkaRequest.getTopic(),
                                                                    kafkaRequest.getPartition());
        table.write(topicAndPartition.toString(), Bytes.toBytes(kafkaRequest.getLastOffset()));
      }
    }
  }

  @Override
  public void initialize(BatchRuntimeContext context) throws Exception {
    super.initialize(context);
    schema = config.getSchema();
    Schema messageSchema = config.getMessageSchema();
    for (Schema.Field field : schema.getFields()) {
      String name = field.getName();
      if (!name.equals(config.getKeyField()) && !name.equals(config.getPartitionField())
        && !name.equals(config.getOffsetField())) {
        messageField = name;
        break;
      }
    }
    if (config.getFormat() != null) {
      FormatSpecification spec = new FormatSpecification(config.getFormat(), messageSchema,
                                                         new HashMap<String, String>());
      recordFormat = RecordFormats.createInitializedFormat(spec);
    }
  }

  @Override
  public void transform(KeyValue<KafkaKey, KafkaMessage> input,
                        Emitter<StructuredRecord> emitter) throws Exception {
    StructuredRecord.Builder builder = StructuredRecord.builder(schema);
    if (config.getKeyField() != null) {
      builder.set(config.getKeyField(), input.getValue().getKey().array());
    }
    if (config.getPartitionField() != null) {
      builder.set(config.getPartitionField(), input.getKey().getPartition());
    }
    if (config.getOffsetField() != null) {
      builder.set(config.getOffsetField(), input.getKey().getOffset());
    }
    if (config.getFormat() == null) {
      builder.set(messageField, input.getValue().getPayload().array());
    } else {
      StructuredRecord messageRecord = recordFormat.read(new StreamEvent(input.getValue().getPayload()));
      for (Schema.Field field : messageRecord.getSchema().getFields()) {
        String fieldName = field.getName();
        builder.set(fieldName, messageRecord.get(fieldName));
      }
    }
    emitter.emit(builder.build());
  }
}
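To see how the configuration parsing above behaves, here is a minimal sketch that constructs a KafkaBatchConfig by hand and prints the parsed results. It is illustrative only: the class name KafkaBatchConfigExample, the broker addresses, the topic, and the offsets are all hypothetical, and it assumes the plugin's compile-time dependencies (hydrator-common and the Kafka client that provides kafka.common.TopicAndPartition) are on the classpath. It would live in its own file in the same package as the plugin.

package co.cask.hydrator.plugin.batchSource;

import kafka.common.TopicAndPartition;

import java.util.Map;

/**
 * Illustrative driver for KafkaBatchConfig's parsing helpers. All values are hypothetical.
 */
class KafkaBatchConfigExample {

  public static void main(String[] args) {
    KafkaBatchSource.KafkaBatchConfig config = new KafkaBatchSource.KafkaBatchConfig(
      "host1.example.com:9092,host2.example.com:9092", // kafkaBrokers
      "0,1",                                           // partitions to read
      "purchases",                                     // topic
      "0:100,1:200",                                   // initial partition offsets
      null);                                           // defaultOffset (unused by this constructor)

    // The broker string is parsed into a host -> port map; a malformed port throws IllegalArgumentException.
    Map<String, Integer> brokers = config.getBrokerMap();
    System.out.println(brokers); // e.g. {host1.example.com=9092, host2.example.com=9092}

    // The partition list is parsed into a set of integers.
    System.out.println(config.getPartitions()); // e.g. [0, 1]

    // Initial offsets become a TopicAndPartition -> offset map, consulted only on the pipeline's first run.
    for (Map.Entry<TopicAndPartition, Long> entry : config.getInitialPartitionOffsets().entrySet()) {
      System.out.println(entry.getKey() + " starts at offset " + entry.getValue());
    }
  }
}

Note that partition:offset pairs use the same key-value syntax as the broker list because both are parsed with KeyValueListParser.DEFAULT.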