/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.source.extractor.extract.kafka;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import org.apache.gobblin.dataset.DatasetConstants;
import org.apache.gobblin.dataset.DatasetDescriptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.codahale.metrics.Timer;
import com.google.common.base.Joiner;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.typesafe.config.Config;

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.SourceState;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient;
import org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient.GobblinKafkaConsumerClientFactory;
import org.apache.gobblin.metrics.event.lineage.LineageInfo;
import org.apache.gobblin.source.extractor.extract.EventBasedSource;
import org.apache.gobblin.source.extractor.extract.kafka.workunit.packer.KafkaWorkUnitPacker;
import org.apache.gobblin.source.extractor.limiter.LimiterConfigurationKeys;
import org.apache.gobblin.source.workunit.Extract;
import org.apache.gobblin.source.workunit.MultiWorkUnit;
import org.apache.gobblin.source.workunit.WorkUnit;
import org.apache.gobblin.util.ClassAliasResolver;
import org.apache.gobblin.util.ConfigUtils;
import org.apache.gobblin.util.DatasetFilterUtils;
import org.apache.gobblin.util.ExecutorsUtils;
import org.apache.gobblin.util.dataset.DatasetUtils;
import org.apache.gobblin.instrumented.Instrumented;
import org.apache.gobblin.metrics.MetricContext;

import lombok.Getter;
import lombok.Setter;

import static java.util.stream.Collectors.toList;
import static java.util.stream.Collectors.toSet;


/**
 * A {@link org.apache.gobblin.source.Source} implementation for Kafka source.
 *
 * @author Ziyang Liu
 */
public abstract class KafkaSource<S, D> extends EventBasedSource<S, D> {

  private static final Logger LOG = LoggerFactory.getLogger(KafkaSource.class);

  public static final String TOPIC_BLACKLIST = "topic.blacklist";
  public static final String TOPIC_WHITELIST = "topic.whitelist";
  public static final String LATEST_OFFSET = "latest";
  public static final String EARLIEST_OFFSET = "earliest";
  public static final String NEAREST_OFFSET = "nearest";
  public static final String BOOTSTRAP_WITH_OFFSET = "bootstrap.with.offset";
  public static final String DEFAULT_BOOTSTRAP_WITH_OFFSET = LATEST_OFFSET;
  public static final String TOPICS_MOVE_TO_LATEST_OFFSET = "topics.move.to.latest.offset";
  public static final String RESET_ON_OFFSET_OUT_OF_RANGE = "reset.on.offset.out.of.range";
  public static final String DEFAULT_RESET_ON_OFFSET_OUT_OF_RANGE = NEAREST_OFFSET;
  public static final String TOPIC_NAME = "topic.name";
  public static final String PARTITION_ID = "partition.id";
  public static final String LEADER_ID = "leader.id";
  public static final String LEADER_HOSTANDPORT = "leader.hostandport";
  public static final Extract.TableType DEFAULT_TABLE_TYPE = Extract.TableType.APPEND_ONLY;
  public static final String DEFAULT_NAMESPACE_NAME = "KAFKA";
  public static final String ALL_TOPICS = "all";
  public static final String AVG_RECORD_SIZE = "avg.record.size";
  public static final String AVG_RECORD_MILLIS = "avg.record.millis";
  public static final String PREVIOUS_LATEST_OFFSET = "previousLatestOffset";
  public static final String OFFSET_FETCH_EPOCH_TIME = "offsetFetchEpochTime";
  public static final String PREVIOUS_OFFSET_FETCH_EPOCH_TIME = "previousOffsetFetchEpochTime";
  public static final String GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS = "gobblin.kafka.consumerClient.class";
  public static final String GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION =
      "gobblin.kafka.extract.allowTableTypeAndNamspaceCustomization";
  public static final String DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS =
      "org.apache.gobblin.kafka.client.Kafka08ConsumerClient$Factory";
  public static final String GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE =
      "gobblin.kafka.shouldEnableDatasetStateStore";
  public static final boolean DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE = false;
  public static final String OFFSET_FETCH_TIMER = "offsetFetchTimer";

  private final Set<String> moveToLatestTopics = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
  private final Map<KafkaPartition, Long> previousOffsets = Maps.newConcurrentMap();
  private final Map<KafkaPartition, Long> previousExpectedHighWatermarks = Maps.newConcurrentMap();
  private final Map<KafkaPartition, Long> previousOffsetFetchEpochTimes = Maps.newConcurrentMap();
  private final Set<KafkaPartition> partitionsToBeProcessed = Sets.newConcurrentHashSet();
  private final AtomicInteger failToGetOffsetCount = new AtomicInteger(0);
  private final AtomicInteger offsetTooEarlyCount = new AtomicInteger(0);
  private final AtomicInteger offsetTooLateCount = new AtomicInteger(0);

  // sharing the kafka consumer may result in contention, so support thread local consumers
  private final ConcurrentLinkedQueue<GobblinKafkaConsumerClient> kafkaConsumerClientPool =
      new ConcurrentLinkedQueue<>();
  private static final ThreadLocal<GobblinKafkaConsumerClient> kafkaConsumerClient =
      new ThreadLocal<GobblinKafkaConsumerClient>();
  private GobblinKafkaConsumerClient sharedKafkaConsumerClient = null;
  private final ClassAliasResolver<GobblinKafkaConsumerClientFactory> kafkaConsumerClientResolver =
      new ClassAliasResolver<>(GobblinKafkaConsumerClientFactory.class);
  private volatile boolean doneGettingAllPreviousOffsets = false;
  private Extract.TableType tableType;
  private String extractNamespace;
  private boolean isFullExtract;
  private String kafkaBrokers;
  private boolean shouldEnableDatasetStateStore;
  private AtomicBoolean isDatasetStateEnabled = new AtomicBoolean(false);
  private Set<String> topicsToProcess;

  private MetricContext metricContext;

  protected Optional<LineageInfo> lineageInfo;

  private List<String> getLimiterExtractorReportKeys() {
    List<String> keyNames = new ArrayList<>();
    keyNames.add(KafkaSource.TOPIC_NAME);
    keyNames.add(KafkaSource.PARTITION_ID);
    return keyNames;
  }

  private void setLimiterReportKeyListToWorkUnits(List<WorkUnit> workUnits, List<String> keyNameList) {
    if (keyNameList.isEmpty()) {
      return;
    }
    String keyList = Joiner.on(',').join(keyNameList.iterator());
    for (WorkUnit workUnit : workUnits) {
      workUnit.setProp(LimiterConfigurationKeys.LIMITER_REPORT_KEY_LIST, keyList);
    }
  }
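  /**
   * Creates the work units for a run: discovers the filtered topics, creates work units for their partitions
   * in a thread pool, adds empty work units for partitions that have previous offsets but are not processed
   * in this run, and finally packs the result with {@link KafkaWorkUnitPacker}.
   * (Descriptive summary; see the method body for the authoritative behavior.)
   */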
  @Override
  public List<WorkUnit> getWorkunits(SourceState state) {
    this.metricContext = Instrumented.getMetricContext(state, KafkaSource.class);
    this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());

    Map<String, List<WorkUnit>> workUnits = Maps.newConcurrentMap();
    if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
      String tableTypeStr =
          state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, KafkaSource.DEFAULT_TABLE_TYPE.toString());
      tableType = Extract.TableType.valueOf(tableTypeStr);
      extractNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, KafkaSource.DEFAULT_NAMESPACE_NAME);
    } else {
      // For backward compatibility, ignore the table type and namespace configuration keys, as in the
      // previous implementation.
      tableType = KafkaSource.DEFAULT_TABLE_TYPE;
      extractNamespace = KafkaSource.DEFAULT_NAMESPACE_NAME;
    }
    isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
    kafkaBrokers = state.getProp(ConfigurationKeys.KAFKA_BROKERS, "");
    this.shouldEnableDatasetStateStore = state.getPropAsBoolean(GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE,
        DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE);

    try {
      Config config = ConfigUtils.propertiesToConfig(state.getProperties());
      GobblinKafkaConsumerClientFactory kafkaConsumerClientFactory = kafkaConsumerClientResolver
          .resolveClass(state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS,
              DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS))
          .newInstance();

      this.kafkaConsumerClient.set(kafkaConsumerClientFactory.create(config));

      List<KafkaTopic> topics = getFilteredTopics(state);
      this.topicsToProcess = topics.stream().map(KafkaTopic::getName).collect(toSet());

      for (String topic : this.topicsToProcess) {
        LOG.info("Discovered topic " + topic);
      }

      Map<String, State> topicSpecificStateMap =
          DatasetUtils.getDatasetSpecificProps(Iterables.transform(topics, new Function<KafkaTopic, String>() {
            @Override
            public String apply(KafkaTopic topic) {
              return topic.getName();
            }
          }), state);

      int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS,
          ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT);
      ExecutorService threadPool =
          Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));

      if (state.getPropAsBoolean(ConfigurationKeys.KAFKA_SOURCE_SHARE_CONSUMER_CLIENT,
          ConfigurationKeys.DEFAULT_KAFKA_SOURCE_SHARE_CONSUMER_CLIENT)) {
        this.sharedKafkaConsumerClient = this.kafkaConsumerClient.get();
      } else {
        // preallocate one client per thread
        for (int i = 0; i < numOfThreads; i++) {
          kafkaConsumerClientPool.offer(kafkaConsumerClientFactory.create(config));
        }
      }

      Stopwatch createWorkUnitStopwatch = Stopwatch.createStarted();

      for (KafkaTopic topic : topics) {
        threadPool.submit(new WorkUnitCreator(topic, state,
            Optional.fromNullable(topicSpecificStateMap.get(topic.getName())), workUnits));
      }

      ExecutorsUtils.shutdownExecutorService(threadPool, Optional.of(LOG), 1L, TimeUnit.HOURS);
      LOG.info(String.format("Created workunits for %d topics in %d seconds", workUnits.size(),
          createWorkUnitStopwatch.elapsed(TimeUnit.SECONDS)));

      // Create empty WorkUnits for skipped partitions (i.e., partitions that have previous offsets,
      // but aren't processed).
      createEmptyWorkUnitsForSkippedPartitions(workUnits, topicSpecificStateMap, state);

      int numOfMultiWorkunits = state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY,
          ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
      List<WorkUnit> workUnitList = KafkaWorkUnitPacker.getInstance(this, state).pack(workUnits, numOfMultiWorkunits);
      addTopicSpecificPropsToWorkUnits(workUnitList, topicSpecificStateMap);
      setLimiterReportKeyListToWorkUnits(workUnitList, getLimiterExtractorReportKeys());
      return workUnitList;
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
      throw new RuntimeException(e);
    } finally {
      try {
        if (this.kafkaConsumerClient.get() != null) {
          this.kafkaConsumerClient.get().close();
        }
        // cleanup clients from pool
        for (GobblinKafkaConsumerClient client : kafkaConsumerClientPool) {
          client.close();
        }
      } catch (IOException e) {
        throw new RuntimeException("Exception closing kafkaConsumerClient", e);
      }
    }
  }

  private void addTopicSpecificPropsToWorkUnits(List<WorkUnit> workUnits, Map<String, State> topicSpecificStateMap) {
    for (WorkUnit workUnit : workUnits) {
      addTopicSpecificPropsToWorkUnit(workUnit, topicSpecificStateMap);
    }
  }

  private void addTopicSpecificPropsToWorkUnit(WorkUnit workUnit, Map<String, State> topicSpecificStateMap) {
    if (workUnit instanceof MultiWorkUnit) {
      for (WorkUnit wu : ((MultiWorkUnit) workUnit).getWorkUnits()) {
        addTopicSpecificPropsToWorkUnit(wu, topicSpecificStateMap);
      }
    } else if (!workUnit.contains(TOPIC_NAME)) {
      return;
    } else {
      addDatasetUrnOptionally(workUnit);
      if (topicSpecificStateMap == null) {
        return;
      } else if (!topicSpecificStateMap.containsKey(workUnit.getProp(TOPIC_NAME))) {
        return;
      } else {
        workUnit.addAll(topicSpecificStateMap.get(workUnit.getProp(TOPIC_NAME)));
      }
    }
  }

  private void addDatasetUrnOptionally(WorkUnit workUnit) {
    if (!this.shouldEnableDatasetStateStore) {
      return;
    }
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, workUnit.getProp(TOPIC_NAME));
  }
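  /**
   * For partitions that have a previous offset but are not picked up in this run, create empty work units so
   * that their previous offsets and fetch epoch times are carried over into the next state.
   */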
  private void createEmptyWorkUnitsForSkippedPartitions(Map<String, List<WorkUnit>> workUnits,
      Map<String, State> topicSpecificStateMap, SourceState state) {

    // in case the previous offsets have not been loaded yet
    getAllPreviousOffsetState(state);

    // For each partition that has a previous offset, create an empty WorkUnit for it if
    // it is not in this.partitionsToBeProcessed.
    for (Map.Entry<KafkaPartition, Long> entry : this.previousOffsets.entrySet()) {
      KafkaPartition partition = entry.getKey();

      if (!this.partitionsToBeProcessed.contains(partition)) {
        String topicName = partition.getTopicName();
        if (!this.isDatasetStateEnabled.get() || this.topicsToProcess.contains(topicName)) {
          long previousOffset = entry.getValue();
          WorkUnit emptyWorkUnit = createEmptyWorkUnit(partition, previousOffset,
              this.previousOffsetFetchEpochTimes.get(partition),
              Optional.fromNullable(topicSpecificStateMap.get(partition.getTopicName())));

          if (workUnits.containsKey(topicName)) {
            workUnits.get(topicName).add(emptyWorkUnit);
          } else {
            workUnits.put(topicName, Lists.newArrayList(emptyWorkUnit));
          }
        }
      }
    }
  }

  /*
   * This method needs to be thread safe since it is called from the Runnable.
   */
  private List<WorkUnit> getWorkUnitsForTopic(KafkaTopic topic, SourceState state, Optional<State> topicSpecificState) {
    Timer.Context context = this.metricContext.timer("isTopicQualifiedTimer").time();
    boolean topicQualified = isTopicQualified(topic);
    context.close();

    List<WorkUnit> workUnits = Lists.newArrayList();
    for (KafkaPartition partition : topic.getPartitions()) {
      WorkUnit workUnit = getWorkUnitForTopicPartition(partition, state, topicSpecificState);
      this.partitionsToBeProcessed.add(partition);
      if (workUnit != null) {
        // For disqualified topics, set the high watermark of each work unit to be the same
        // as the low watermark, so that it will be skipped.
        if (!topicQualified) {
          skipWorkUnit(workUnit);
        }
        workUnits.add(workUnit);
      }
    }

    return workUnits;
  }

  /**
   * Whether a {@link KafkaTopic} is qualified to be pulled.
   *
   * This method can be overridden by subclasses for verifying topic eligibility, e.g., one may want to
   * skip a topic if its schema cannot be found in the schema registry.
   */
  protected boolean isTopicQualified(KafkaTopic topic) {
    return true;
  }

  @SuppressWarnings("deprecation")
  private static void skipWorkUnit(WorkUnit workUnit) {
    workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, workUnit.getLowWaterMark());
  }
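  /**
   * Creates a work unit for a single partition: fetches the earliest and latest offsets from Kafka, looks up
   * the previous offset from the last run, and determines the start offset. When no previous offset exists,
   * the behavior is controlled by {@link #BOOTSTRAP_WITH_OFFSET}; when the previous offset is out of range,
   * it is controlled by {@link #RESET_ON_OFFSET_OUT_OF_RANGE}. Returns {@code null} if the partition should
   * be skipped and there is no offset to persist.
   */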
  private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, SourceState state,
      Optional<State> topicSpecificState) {
    Offsets offsets = new Offsets();

    boolean failedToGetKafkaOffsets = false;

    try (Timer.Context context = this.metricContext.timer(OFFSET_FETCH_TIMER).time()) {
      offsets.setOffsetFetchEpochTime(System.currentTimeMillis());
      offsets.setEarliestOffset(this.kafkaConsumerClient.get().getEarliestOffset(partition));
      offsets.setLatestOffset(this.kafkaConsumerClient.get().getLatestOffset(partition));
    } catch (KafkaOffsetRetrievalFailureException e) {
      failedToGetKafkaOffsets = true;
    }

    long previousOffset = 0;
    long previousOffsetFetchEpochTime = 0;
    boolean previousOffsetNotFound = false;
    try {
      previousOffset = getPreviousOffsetForPartition(partition, state);
      offsets.setPreviousLatestOffset(getPreviousExpectedHighWatermark(partition, state));
      previousOffsetFetchEpochTime = getPreviousOffsetFetchEpochTimeForPartition(partition, state);
      offsets.setPreviousOffsetFetchEpochTime(previousOffsetFetchEpochTime);
    } catch (PreviousOffsetNotFoundException e) {
      previousOffsetNotFound = true;
    }

    if (failedToGetKafkaOffsets) {
      // Increment counts, which will be reported as job metrics
      this.failToGetOffsetCount.incrementAndGet();

      // When unable to get earliest/latest offsets from Kafka, skip the partition and create an empty workunit,
      // so that previousOffset is persisted.
      LOG.warn(String.format(
          "Failed to retrieve earliest and/or latest offset for partition %s. This partition will be skipped.",
          partition));

      return previousOffsetNotFound ? null
          : createEmptyWorkUnit(partition, previousOffset, previousOffsetFetchEpochTime, topicSpecificState);
    }

    if (shouldMoveToLatestOffset(partition, state)) {
      offsets.startAtLatestOffset();
    } else if (previousOffsetNotFound) {
      // When previous offset cannot be found, either start at earliest offset or latest offset, or skip the partition
      // (no need to create an empty workunit in this case since there's no offset to persist).
      String offsetNotFoundMsg = String.format("Previous offset for partition %s does not exist. ", partition);
      String offsetOption = state.getProp(BOOTSTRAP_WITH_OFFSET, DEFAULT_BOOTSTRAP_WITH_OFFSET).toLowerCase();
      if (offsetOption.equals(LATEST_OFFSET)) {
        LOG.warn(offsetNotFoundMsg + "This partition will start from the latest offset: " + offsets.getLatestOffset());
        offsets.startAtLatestOffset();
      } else if (offsetOption.equals(EARLIEST_OFFSET)) {
        LOG.warn(
            offsetNotFoundMsg + "This partition will start from the earliest offset: " + offsets.getEarliestOffset());
        offsets.startAtEarliestOffset();
      } else {
        LOG.warn(offsetNotFoundMsg + "This partition will be skipped.");
        return null;
      }
    } else {
      try {
        offsets.startAt(previousOffset);
      } catch (StartOffsetOutOfRangeException e) {
        // Increment counts, which will be reported as job metrics
        if (offsets.getStartOffset() <= offsets.getLatestOffset()) {
          this.offsetTooEarlyCount.incrementAndGet();
        } else {
          this.offsetTooLateCount.incrementAndGet();
        }

        // When previous offset is out of range, either start at earliest, latest or nearest offset, or skip the
        // partition. If skipping, need to create an empty workunit so that previousOffset is persisted.
        String offsetOutOfRangeMsg = String.format(
            "Start offset for partition %s is out of range. Start offset = %d, earliest offset = %d, latest offset = %d.",
            partition, offsets.getStartOffset(), offsets.getEarliestOffset(), offsets.getLatestOffset());
        String offsetOption =
            state.getProp(RESET_ON_OFFSET_OUT_OF_RANGE, DEFAULT_RESET_ON_OFFSET_OUT_OF_RANGE).toLowerCase();
        if (offsetOption.equals(LATEST_OFFSET)
            || (offsetOption.equals(NEAREST_OFFSET) && offsets.getStartOffset() >= offsets.getLatestOffset())) {
          LOG.warn(
              offsetOutOfRangeMsg + "This partition will start from the latest offset: " + offsets.getLatestOffset());
          offsets.startAtLatestOffset();
        } else if (offsetOption.equals(EARLIEST_OFFSET) || offsetOption.equals(NEAREST_OFFSET)) {
          LOG.warn(offsetOutOfRangeMsg + "This partition will start from the earliest offset: "
              + offsets.getEarliestOffset());
          offsets.startAtEarliestOffset();
        } else {
          LOG.warn(offsetOutOfRangeMsg + "This partition will be skipped.");
          return createEmptyWorkUnit(partition, previousOffset, previousOffsetFetchEpochTime, topicSpecificState);
        }
      }
    }

    return getWorkUnitForTopicPartition(partition, offsets, topicSpecificState);
  }
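  // The three lookups below lazily load all previous offset state on first use (via getAllPreviousOffsetState)
  // and throw PreviousOffsetNotFoundException if no state exists for the given partition.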
  private long getPreviousOffsetFetchEpochTimeForPartition(KafkaPartition partition, SourceState state)
      throws PreviousOffsetNotFoundException {
    getAllPreviousOffsetState(state);
    if (this.previousOffsetFetchEpochTimes.containsKey(partition)) {
      return this.previousOffsetFetchEpochTimes.get(partition);
    }

    throw new PreviousOffsetNotFoundException(
        String.format("Previous offset fetch epoch time for topic %s, partition %s not found.",
            partition.getTopicName(), partition.getId()));
  }

  private long getPreviousOffsetForPartition(KafkaPartition partition, SourceState state)
      throws PreviousOffsetNotFoundException {
    getAllPreviousOffsetState(state);
    if (this.previousOffsets.containsKey(partition)) {
      return this.previousOffsets.get(partition);
    }
    throw new PreviousOffsetNotFoundException(
        String.format("Previous offset for topic %s, partition %s not found.",
            partition.getTopicName(), partition.getId()));
  }

  private long getPreviousExpectedHighWatermark(KafkaPartition partition, SourceState state)
      throws PreviousOffsetNotFoundException {
    getAllPreviousOffsetState(state);
    if (this.previousExpectedHighWatermarks.containsKey(partition)) {
      return this.previousExpectedHighWatermarks.get(partition);
    }
    throw new PreviousOffsetNotFoundException(
        String.format("Previous expected high watermark for topic %s, partition %s not found.",
            partition.getTopicName(), partition.getId()));
  }

  // needs to be synchronized as this.previousOffsets, this.previousExpectedHighWatermarks, and
  // this.previousOffsetFetchEpochTimes must be initialized only once
  private synchronized void getAllPreviousOffsetState(SourceState state) {
    if (this.doneGettingAllPreviousOffsets) {
      return;
    }
    this.previousOffsets.clear();
    this.previousExpectedHighWatermarks.clear();
    this.previousOffsetFetchEpochTimes.clear();
    Map<String, Iterable<WorkUnitState>> workUnitStatesByDatasetUrns = state.getPreviousWorkUnitStatesByDatasetUrns();

    if (!workUnitStatesByDatasetUrns.isEmpty() && !(workUnitStatesByDatasetUrns.size() == 1
        && workUnitStatesByDatasetUrns.keySet().iterator().next().equals(""))) {
      this.isDatasetStateEnabled.set(true);
    }

    for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
      List<KafkaPartition> partitions = KafkaUtils.getPartitions(workUnitState);
      MultiLongWatermark watermark = workUnitState.getActualHighWatermark(MultiLongWatermark.class);
      MultiLongWatermark previousExpectedHighWatermark =
          workUnitState.getWorkunit().getExpectedHighWatermark(MultiLongWatermark.class);
      Preconditions.checkArgument(partitions.size() == watermark.size(), String.format(
          "Num of partitions doesn't match number of watermarks: partitions=%s, watermarks=%s", partitions, watermark));

      for (int i = 0; i < partitions.size(); i++) {
        KafkaPartition partition = partitions.get(i);
        if (watermark.get(i) != ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
          this.previousOffsets.put(partition, watermark.get(i));
        }
        if (previousExpectedHighWatermark.get(i) != ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
          this.previousExpectedHighWatermarks.put(partition, previousExpectedHighWatermark.get(i));
        }
        this.previousOffsetFetchEpochTimes.put(partition, Long.valueOf(
            workUnitState.getProp(KafkaUtils.getPartitionPropName(KafkaSource.OFFSET_FETCH_EPOCH_TIME, i), "0")));
      }
    }

    this.doneGettingAllPreviousOffsets = true;
  }
  /**
   * A topic can be configured to move to the latest offset in {@link #TOPICS_MOVE_TO_LATEST_OFFSET}.
   *
   * Needs to be synchronized as it is accessed by multiple threads.
   */
  private synchronized boolean shouldMoveToLatestOffset(KafkaPartition partition, SourceState state) {
    if (!state.contains(TOPICS_MOVE_TO_LATEST_OFFSET)) {
      return false;
    }
    if (this.moveToLatestTopics.isEmpty()) {
      this.moveToLatestTopics.addAll(
          Splitter.on(',').trimResults().omitEmptyStrings().splitToList(state.getProp(TOPICS_MOVE_TO_LATEST_OFFSET)));
    }
    return this.moveToLatestTopics.contains(partition.getTopicName()) || this.moveToLatestTopics.contains(ALL_TOPICS);
  }

  // thread safe
  private WorkUnit createEmptyWorkUnit(KafkaPartition partition, long previousOffset, long previousFetchEpochTime,
      Optional<State> topicSpecificState) {
    Offsets offsets = new Offsets();
    offsets.setEarliestOffset(previousOffset);
    offsets.setLatestOffset(previousOffset);
    offsets.startAtEarliestOffset();
    offsets.setOffsetFetchEpochTime(previousFetchEpochTime);
    return getWorkUnitForTopicPartition(partition, offsets, topicSpecificState);
  }
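  /**
   * Builds the {@link WorkUnit} for a partition once its offsets have been resolved, applying topic-specific
   * Extract overrides when present and recording watermarks, offset fetch epoch times and lineage metadata.
   */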
  private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, Offsets offsets,
      Optional<State> topicSpecificState) {
    // Default to job level configurations
    Extract.TableType currentTableType = tableType;
    String currentExtractNamespace = extractNamespace;
    String currentExtractTableName = partition.getTopicName();
    boolean isCurrentFullExtract = isFullExtract;
    // Update to topic specific configurations if any
    if (topicSpecificState.isPresent()) {
      State topicState = topicSpecificState.get();
      if (topicState.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)) {
        currentTableType = Extract.TableType.valueOf(topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY));
      }
      currentExtractNamespace = topicState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, extractNamespace);
      currentExtractTableName = topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, partition.getTopicName());
      isCurrentFullExtract = topicState.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY, isFullExtract);
    }

    Extract extract = this.createExtract(currentTableType, currentExtractNamespace, currentExtractTableName);
    if (isCurrentFullExtract) {
      extract.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
    }

    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(TOPIC_NAME, partition.getTopicName());
    addDatasetUrnOptionally(workUnit);
    workUnit.setProp(PARTITION_ID, partition.getId());
    workUnit.setProp(LEADER_ID, partition.getLeader().getId());
    workUnit.setProp(LEADER_HOSTANDPORT, partition.getLeader().getHostAndPort().toString());
    workUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, offsets.getStartOffset());
    workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, offsets.getLatestOffset());
    workUnit.setProp(PREVIOUS_OFFSET_FETCH_EPOCH_TIME, offsets.getPreviousOffsetFetchEpochTime());
    workUnit.setProp(OFFSET_FETCH_EPOCH_TIME, offsets.getOffsetFetchEpochTime());
    workUnit.setProp(PREVIOUS_LATEST_OFFSET, offsets.getPreviousLatestOffset());

    // Add lineage info
    DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_KAFKA, partition.getTopicName());
    source.addMetadata(DatasetConstants.BROKERS, kafkaBrokers);
    if (this.lineageInfo.isPresent()) {
      this.lineageInfo.get().setSource(source, workUnit);
    }

    LOG.info(String.format("Created workunit for partition %s: lowWatermark=%d, highWatermark=%d, range=%d", partition,
        offsets.getStartOffset(), offsets.getLatestOffset(), offsets.getLatestOffset() - offsets.getStartOffset()));
    return workUnit;
  }

  /**
   * If the config store is enabled, the topics obtained via blacklisting/whitelisting are intersected with
   * the topics from the config store.
   */
  private List<KafkaTopic> getFilteredTopics(SourceState state) {
    List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, TOPIC_BLACKLIST);
    List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, TOPIC_WHITELIST);
    List<KafkaTopic> topics = this.kafkaConsumerClient.get().getFilteredTopics(blacklist, whitelist);
    Optional<String> configStoreUri = ConfigStoreUtils.getConfigStoreUri(state.getProperties());
    if (configStoreUri.isPresent()) {
      List<KafkaTopic> topicsFromConfigStore = ConfigStoreUtils.getTopicsFromConfigStore(
          state.getProperties(), configStoreUri.get(), this.kafkaConsumerClient.get());
      return topics.stream()
          .filter((KafkaTopic p) -> (topicsFromConfigStore.stream()
              .anyMatch((KafkaTopic q) -> q.getName().equalsIgnoreCase(p.getName()))))
          .collect(toList());
    }
    return topics;
  }

  @Override
  public void shutdown(SourceState state) {
    state.setProp(ConfigurationKeys.OFFSET_TOO_EARLY_COUNT, this.offsetTooEarlyCount);
    state.setProp(ConfigurationKeys.OFFSET_TOO_LATE_COUNT, this.offsetTooLateCount);
    state.setProp(ConfigurationKeys.FAIL_TO_GET_OFFSET_COUNT, this.failToGetOffsetCount);
  }
  /**
   * This class contains startOffset, earliestOffset and latestOffset for a Kafka partition.
   */
  private static class Offsets {

    @Getter
    private long startOffset = 0;

    @Getter
    @Setter
    private long earliestOffset = 0;

    @Getter
    @Setter
    private long latestOffset = 0;

    @Getter
    @Setter
    private long offsetFetchEpochTime = 0;

    @Getter
    @Setter
    private long previousOffsetFetchEpochTime = 0;

    @Getter
    @Setter
    private long previousLatestOffset = 0;

    private void startAt(long offset) throws StartOffsetOutOfRangeException {
      if (offset < this.earliestOffset || offset > this.latestOffset) {
        throw new StartOffsetOutOfRangeException(
            String.format("start offset = %d, earliest offset = %d, latest offset = %d", offset, this.earliestOffset,
                this.latestOffset));
      }
      this.startOffset = offset;
    }

    private void startAtEarliestOffset() {
      this.startOffset = this.earliestOffset;
    }

    private void startAtLatestOffset() {
      this.startOffset = this.latestOffset;
    }
  }

  private class WorkUnitCreator implements Runnable {
    public static final String WORK_UNITS_FOR_TOPIC_TIMER = "workUnitsForTopicTimer";

    private final KafkaTopic topic;
    private final SourceState state;
    private final Optional<State> topicSpecificState;
    private final Map<String, List<WorkUnit>> allTopicWorkUnits;

    WorkUnitCreator(KafkaTopic topic, SourceState state, Optional<State> topicSpecificState,
        Map<String, List<WorkUnit>> workUnits) {
      this.topic = topic;
      this.state = state;
      this.topicSpecificState = topicSpecificState;
      this.allTopicWorkUnits = workUnits;
    }

    @Override
    public void run() {
      try (Timer.Context context = metricContext.timer(WORK_UNITS_FOR_TOPIC_TIMER).time()) {
        // use the shared client if configured, otherwise set a thread-local one from the pool
        if (KafkaSource.this.sharedKafkaConsumerClient != null) {
          KafkaSource.this.kafkaConsumerClient.set(KafkaSource.this.sharedKafkaConsumerClient);
        } else {
          GobblinKafkaConsumerClient client = KafkaSource.this.kafkaConsumerClientPool.poll();
          Preconditions.checkNotNull(client, "Unexpectedly ran out of preallocated consumer clients");
          KafkaSource.this.kafkaConsumerClient.set(client);
        }

        this.allTopicWorkUnits.put(this.topic.getName(),
            KafkaSource.this.getWorkUnitsForTopic(this.topic, this.state, this.topicSpecificState));
      } catch (Throwable t) {
        LOG.error("Caught error in creating work unit for " + this.topic.getName(), t);
        throw new RuntimeException(t);
      } finally {
        // return the client to the pool
        if (KafkaSource.this.sharedKafkaConsumerClient == null) {
          KafkaSource.this.kafkaConsumerClientPool.offer(KafkaSource.this.kafkaConsumerClient.get());
          KafkaSource.this.kafkaConsumerClient.remove();
        }
      }
    }
  }
}
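// A minimal sketch of job properties that exercise the keys defined in this class. The topic filter and
// offset bootstrap keys come from KafkaSource itself; the concrete source class and job-level keys such as
// source.class, kafka.brokers and mr.job.max.mappers depend on the Gobblin version and deployment, so treat
// the values below as assumptions rather than a canonical configuration.
//
//   source.class=org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleSource
//   kafka.brokers=localhost:9092
//   topic.whitelist=events\..*
//   topic.blacklist=events\.debug.*
//   bootstrap.with.offset=earliest
//   reset.on.offset.out.of.range=nearest
//   mr.job.max.mappers=10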