Java tutorial: inside Apache Druid's KafkaSupervisor (org.apache.druid.indexing.kafka.supervisor.KafkaSupervisor)
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.druid.indexing.kafka.supervisor; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.MapperFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.base.Throwables; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.primitives.Longs; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import org.apache.commons.codec.digest.DigestUtils; import org.apache.druid.indexer.TaskLocation; import org.apache.druid.indexer.TaskStatus; import org.apache.druid.indexing.common.TaskInfoProvider; import org.apache.druid.indexing.common.stats.RowIngestionMetersFactory; import org.apache.druid.indexing.common.task.RealtimeIndexTask; import org.apache.druid.indexing.common.task.Task; import org.apache.druid.indexing.common.task.TaskResource; import org.apache.druid.indexing.kafka.KafkaDataSourceMetadata; import org.apache.druid.indexing.kafka.KafkaIOConfig; import org.apache.druid.indexing.kafka.KafkaIndexTask; import org.apache.druid.indexing.kafka.KafkaIndexTaskClient; import org.apache.druid.indexing.kafka.KafkaIndexTaskClientFactory; import org.apache.druid.indexing.kafka.KafkaPartitions; import org.apache.druid.indexing.kafka.KafkaTuningConfig; import org.apache.druid.indexing.overlord.DataSourceMetadata; import org.apache.druid.indexing.overlord.IndexerMetadataStorageCoordinator; import org.apache.druid.indexing.overlord.TaskMaster; import org.apache.druid.indexing.overlord.TaskQueue; import org.apache.druid.indexing.overlord.TaskRunner; import org.apache.druid.indexing.overlord.TaskRunnerListener; import org.apache.druid.indexing.overlord.TaskRunnerWorkItem; import org.apache.druid.indexing.overlord.TaskStorage; import org.apache.druid.indexing.overlord.supervisor.Supervisor; import org.apache.druid.indexing.overlord.supervisor.SupervisorReport; import org.apache.druid.java.util.common.DateTimes; import org.apache.druid.java.util.common.IAE; import org.apache.druid.java.util.common.ISE; import 
org.apache.druid.java.util.common.Pair; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.concurrent.Execs; import org.apache.druid.java.util.emitter.EmittingLogger; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.emitter.service.ServiceMetricEvent; import org.apache.druid.metadata.EntryExistsException; import org.apache.druid.server.metrics.DruidMonitorSchedulerConfig; import org.apache.kafka.clients.consumer.KafkaConsumer; import org.apache.kafka.common.PartitionInfo; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.common.serialization.ByteArrayDeserializer; import org.joda.time.DateTime; import javax.annotation.Nullable; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import java.util.stream.Stream; /** * Supervisor responsible for managing the KafkaIndexTasks for a single dataSource. At a high level, the class accepts a * {@link KafkaSupervisorSpec} which includes the Kafka topic and configuration as well as an ingestion spec which will * be used to generate the indexing tasks. The run loop periodically refreshes its view of the Kafka topic's partitions * and the list of running indexing tasks and ensures that all partitions are being read from and that there are enough * tasks to satisfy the desired number of replicas. As tasks complete, new tasks are queued to process the next range of * Kafka offsets. */ public class KafkaSupervisor implements Supervisor { private static final EmittingLogger log = new EmittingLogger(KafkaSupervisor.class); private static final long MAX_RUN_FREQUENCY_MILLIS = 1000; // prevent us from running too often in response to events private static final long NOT_SET = -1; private static final long MINIMUM_FUTURE_TIMEOUT_IN_SECONDS = 120; private static final long MINIMUM_GET_OFFSET_PERIOD_MILLIS = 5000; private static final long INITIAL_GET_OFFSET_DELAY_MILLIS = 15000; private static final long INITIAL_EMIT_LAG_METRIC_DELAY_MILLIS = 25000; private static final CopyOnWriteArrayList EMPTY_LIST = Lists.newCopyOnWriteArrayList(); public static final String IS_INCREMENTAL_HANDOFF_SUPPORTED = "IS_INCREMENTAL_HANDOFF_SUPPORTED"; // Internal data structures // -------------------------------------------------------- /** * A TaskGroup is the main data structure used by KafkaSupervisor to organize and monitor Kafka partitions and * indexing tasks. 
All the tasks in a TaskGroup should always be doing the same thing (reading the same partitions and * starting from the same offset) and if [replicas] is configured to be 1, a TaskGroup will contain a single task (the * exception being if the supervisor started up and discovered and adopted some already running tasks). At any given * time, there should only be up to a maximum of [taskCount] actively-reading task groups (tracked in the [taskGroups] * map) + zero or more pending-completion task groups (tracked in [pendingCompletionTaskGroups]). */ private class TaskGroup { final int groupId; // This specifies the partitions and starting offsets for this task group. It is set on group creation from the data // in [partitionGroups] and never changes during the lifetime of this task group, which will live until a task in // this task group has completed successfully, at which point this will be destroyed and a new task group will be // created with new starting offsets. This allows us to create replacement tasks for failed tasks that process the // same offsets, even if the values in [partitionGroups] has been changed. final ImmutableMap<Integer, Long> partitionOffsets; final ConcurrentHashMap<String, TaskData> tasks = new ConcurrentHashMap<>(); final Optional<DateTime> minimumMessageTime; final Optional<DateTime> maximumMessageTime; DateTime completionTimeout; // is set after signalTasksToFinish(); if not done by timeout, take corrective action final TreeMap<Integer, Map<Integer, Long>> sequenceOffsets = new TreeMap<>(); final String baseSequenceName; TaskGroup(int groupId, ImmutableMap<Integer, Long> partitionOffsets, Optional<DateTime> minimumMessageTime, Optional<DateTime> maximumMessageTime) { this.groupId = groupId; this.partitionOffsets = partitionOffsets; this.minimumMessageTime = minimumMessageTime; this.maximumMessageTime = maximumMessageTime; this.sequenceOffsets.put(0, partitionOffsets); this.baseSequenceName = generateSequenceName(partitionOffsets, minimumMessageTime, maximumMessageTime); } int addNewCheckpoint(Map<Integer, Long> checkpoint) { sequenceOffsets.put(sequenceOffsets.lastKey() + 1, checkpoint); return sequenceOffsets.lastKey(); } Set<String> taskIds() { return tasks.keySet(); } } private static class TaskData { @Nullable volatile TaskStatus status; @Nullable volatile DateTime startTime; volatile Map<Integer, Long> currentOffsets = new HashMap<>(); @Override public String toString() { return "TaskData{" + "status=" + status + ", startTime=" + startTime + ", currentOffsets=" + currentOffsets + '}'; } } // Map<{group ID}, {actively reading task group}>; see documentation for TaskGroup class private final ConcurrentHashMap<Integer, TaskGroup> taskGroups = new ConcurrentHashMap<>(); // After telling a taskGroup to stop reading and begin publishing a segment, it is moved from [taskGroups] to here so // we can monitor its status while we queue new tasks to read the next range of offsets. This is a list since we could // have multiple sets of tasks publishing at once if time-to-publish > taskDuration. // Map<{group ID}, List<{pending completion task groups}>> private final ConcurrentHashMap<Integer, CopyOnWriteArrayList<TaskGroup>> pendingCompletionTaskGroups = new ConcurrentHashMap<>(); // The starting offset for a new partition in [partitionGroups] is initially set to NOT_SET. 
When a new task group // is created and is assigned partitions, if the offset in [partitionGroups] is NOT_SET it will take the starting // offset value from the metadata store, and if it can't find it there, from Kafka. Once a task begins // publishing, the offset in partitionGroups will be updated to the ending offset of the publishing-but-not-yet- // completed task, which will cause the next set of tasks to begin reading from where the previous task left // off. If that previous task now fails, we will set the offset in [partitionGroups] back to NOT_SET which will // cause successive tasks to again grab their starting offset from metadata store. This mechanism allows us to // start up successive tasks without waiting for the previous tasks to succeed and still be able to handle task // failures during publishing. // Map<{group ID}, Map<{partition ID}, {startingOffset}>> private final ConcurrentHashMap<Integer, ConcurrentHashMap<Integer, Long>> partitionGroups = new ConcurrentHashMap<>(); // -------------------------------------------------------- private final TaskStorage taskStorage; private final TaskMaster taskMaster; private final IndexerMetadataStorageCoordinator indexerMetadataStorageCoordinator; private final KafkaIndexTaskClient taskClient; private final ObjectMapper sortingMapper; private final KafkaSupervisorSpec spec; private final ServiceEmitter emitter; private final DruidMonitorSchedulerConfig monitorSchedulerConfig; private final String dataSource; private final KafkaSupervisorIOConfig ioConfig; private final KafkaSupervisorTuningConfig tuningConfig; private final KafkaTuningConfig taskTuningConfig; private final String supervisorId; private final TaskInfoProvider taskInfoProvider; private final long futureTimeoutInSeconds; // how long to wait for async operations to complete private final RowIngestionMetersFactory rowIngestionMetersFactory; private final ExecutorService exec; private final ScheduledExecutorService scheduledExec; private final ScheduledExecutorService reportingExec; private final ListeningExecutorService workerExec; private final BlockingQueue<Notice> notices = new LinkedBlockingDeque<>(); private final Object stopLock = new Object(); private final Object stateChangeLock = new Object(); private final Object consumerLock = new Object(); private boolean listenerRegistered = false; private long lastRunTime; private volatile DateTime firstRunTime; private volatile KafkaConsumer consumer; private volatile boolean started = false; private volatile boolean stopped = false; private volatile Map<Integer, Long> latestOffsetsFromKafka; private volatile DateTime offsetsLastUpdated; public KafkaSupervisor(final TaskStorage taskStorage, final TaskMaster taskMaster, final IndexerMetadataStorageCoordinator indexerMetadataStorageCoordinator, final KafkaIndexTaskClientFactory taskClientFactory, final ObjectMapper mapper, final KafkaSupervisorSpec spec, final RowIngestionMetersFactory rowIngestionMetersFactory) { this.taskStorage = taskStorage; this.taskMaster = taskMaster; this.indexerMetadataStorageCoordinator = indexerMetadataStorageCoordinator; this.sortingMapper = mapper.copy().configure(MapperFeature.SORT_PROPERTIES_ALPHABETICALLY, true); this.spec = spec; this.emitter = spec.getEmitter(); this.monitorSchedulerConfig = spec.getMonitorSchedulerConfig(); this.rowIngestionMetersFactory = rowIngestionMetersFactory; this.dataSource = spec.getDataSchema().getDataSource(); this.ioConfig = spec.getIoConfig(); this.tuningConfig = spec.getTuningConfig(); this.taskTuningConfig = 
KafkaTuningConfig.copyOf(this.tuningConfig); this.supervisorId = StringUtils.format("KafkaSupervisor-%s", dataSource); this.exec = Execs.singleThreaded(supervisorId); this.scheduledExec = Execs.scheduledSingleThreaded(supervisorId + "-Scheduler-%d"); this.reportingExec = Execs.scheduledSingleThreaded(supervisorId + "-Reporting-%d"); int workerThreads = (this.tuningConfig.getWorkerThreads() != null ? this.tuningConfig.getWorkerThreads() : Math.min(10, this.ioConfig.getTaskCount())); this.workerExec = MoreExecutors .listeningDecorator(Execs.multiThreaded(workerThreads, supervisorId + "-Worker-%d")); log.info("Created worker pool with [%d] threads for dataSource [%s]", workerThreads, this.dataSource); this.taskInfoProvider = new TaskInfoProvider() { @Override public TaskLocation getTaskLocation(final String id) { Preconditions.checkNotNull(id, "id"); Optional<TaskRunner> taskRunner = taskMaster.getTaskRunner(); if (taskRunner.isPresent()) { Optional<? extends TaskRunnerWorkItem> item = Iterables.tryFind( taskRunner.get().getRunningTasks(), (Predicate<TaskRunnerWorkItem>) taskRunnerWorkItem -> id .equals(taskRunnerWorkItem.getTaskId())); if (item.isPresent()) { return item.get().getLocation(); } } else { log.error("Failed to get task runner because I'm not the leader!"); } return TaskLocation.unknown(); } @Override public Optional<TaskStatus> getTaskStatus(String id) { return taskStorage.getStatus(id); } }; this.futureTimeoutInSeconds = Math.max(MINIMUM_FUTURE_TIMEOUT_IN_SECONDS, tuningConfig.getChatRetries() * (tuningConfig.getHttpTimeout().getStandardSeconds() + KafkaIndexTaskClient.MAX_RETRY_WAIT_SECONDS)); int chatThreads = (this.tuningConfig.getChatThreads() != null ? this.tuningConfig.getChatThreads() : Math.min(10, this.ioConfig.getTaskCount() * this.ioConfig.getReplicas())); this.taskClient = taskClientFactory.build(taskInfoProvider, dataSource, chatThreads, this.tuningConfig.getHttpTimeout(), this.tuningConfig.getChatRetries()); log.info("Created taskClient with dataSource[%s] chatThreads[%d] httpTimeout[%s] chatRetries[%d]", dataSource, chatThreads, this.tuningConfig.getHttpTimeout(), this.tuningConfig.getChatRetries()); } @Override public void start() { synchronized (stateChangeLock) { Preconditions.checkState(!started, "already started"); Preconditions.checkState(!exec.isShutdown(), "already stopped"); try { consumer = getKafkaConsumer(); exec.submit(() -> { try { long pollTimeout = Math.max(ioConfig.getPeriod().getMillis(), MAX_RUN_FREQUENCY_MILLIS); while (!Thread.currentThread().isInterrupted() && !stopped) { final Notice notice = notices.poll(pollTimeout, TimeUnit.MILLISECONDS); if (notice == null) { continue; } try { notice.handle(); } catch (Throwable e) { log.makeAlert(e, "KafkaSupervisor[%s] failed to handle notice", dataSource) .addData("noticeClass", notice.getClass().getSimpleName()).emit(); } } } catch (InterruptedException e) { log.info("KafkaSupervisor[%s] interrupted, exiting", dataSource); } }); firstRunTime = DateTimes.nowUtc().plus(ioConfig.getStartDelay()); scheduledExec.scheduleAtFixedRate(buildRunTask(), ioConfig.getStartDelay().getMillis(), Math.max(ioConfig.getPeriod().getMillis(), MAX_RUN_FREQUENCY_MILLIS), TimeUnit.MILLISECONDS); reportingExec.scheduleAtFixedRate(updateCurrentAndLatestOffsets(), ioConfig.getStartDelay().getMillis() + INITIAL_GET_OFFSET_DELAY_MILLIS, // wait for tasks to start up Math.max(tuningConfig.getOffsetFetchPeriod().getMillis(), MINIMUM_GET_OFFSET_PERIOD_MILLIS), TimeUnit.MILLISECONDS); reportingExec.scheduleAtFixedRate(emitLag(), 
ioConfig.getStartDelay().getMillis() + INITIAL_EMIT_LAG_METRIC_DELAY_MILLIS, // wait for tasks to start up monitorSchedulerConfig.getEmitterPeriod().getMillis(), TimeUnit.MILLISECONDS); started = true; log.info("Started KafkaSupervisor[%s], first run in [%s], with spec: [%s]", dataSource, ioConfig.getStartDelay(), spec.toString()); } catch (Exception e) { if (consumer != null) { consumer.close(); } log.makeAlert(e, "Exception starting KafkaSupervisor[%s]", dataSource).emit(); throw Throwables.propagate(e); } } } @Override public void stop(boolean stopGracefully) { synchronized (stateChangeLock) { Preconditions.checkState(started, "not started"); log.info("Beginning shutdown of KafkaSupervisor[%s]", dataSource); try { scheduledExec.shutdownNow(); // stop recurring executions reportingExec.shutdownNow(); Optional<TaskRunner> taskRunner = taskMaster.getTaskRunner(); if (taskRunner.isPresent()) { taskRunner.get().unregisterListener(supervisorId); } // Stopping gracefully will synchronize the end offsets of the tasks and signal them to publish, and will block // until the tasks have acknowledged or timed out. We want this behavior when we're explicitly shut down through // the API, but if we shut down for other reasons (e.g. we lose leadership) we want to just stop and leave the // tasks as they are. synchronized (stopLock) { if (stopGracefully) { log.info( "Posting GracefulShutdownNotice, signalling managed tasks to complete and publish"); notices.add(new GracefulShutdownNotice()); } else { log.info("Posting ShutdownNotice"); notices.add(new ShutdownNotice()); } long shutdownTimeoutMillis = tuningConfig.getShutdownTimeout().getMillis(); long endTime = System.currentTimeMillis() + shutdownTimeoutMillis; while (!stopped) { long sleepTime = endTime - System.currentTimeMillis(); if (sleepTime <= 0) { log.info("Timed out while waiting for shutdown (timeout [%,dms])", shutdownTimeoutMillis); stopped = true; break; } stopLock.wait(sleepTime); } } log.info("Shutdown notice handled"); taskClient.close(); workerExec.shutdownNow(); exec.shutdownNow(); started = false; log.info("KafkaSupervisor[%s] has stopped", dataSource); } catch (Exception e) { log.makeAlert(e, "Exception stopping KafkaSupervisor[%s]", dataSource).emit(); } } } @Override public SupervisorReport getStatus() { return generateReport(true); } @Override public Map<String, Map<String, Object>> getStats() { try { return getCurrentTotalStats(); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); log.error(ie, "getStats() interrupted."); throw new RuntimeException(ie); } catch (ExecutionException | TimeoutException eete) { throw new RuntimeException(eete); } } @Override public void reset(DataSourceMetadata dataSourceMetadata) { log.info("Posting ResetNotice"); notices.add(new ResetNotice(dataSourceMetadata)); } @Override public void checkpoint(@Nullable Integer taskGroupId, @Deprecated String baseSequenceName, DataSourceMetadata previousCheckPoint, DataSourceMetadata currentCheckPoint) { Preconditions.checkNotNull(previousCheckPoint, "previousCheckpoint"); Preconditions.checkNotNull(currentCheckPoint, "current checkpoint cannot be null"); Preconditions.checkArgument( ioConfig.getTopic() .equals(((KafkaDataSourceMetadata) currentCheckPoint).getKafkaPartitions().getTopic()), "Supervisor topic [%s] and topic in checkpoint [%s] does not match", ioConfig.getTopic(), ((KafkaDataSourceMetadata) currentCheckPoint).getKafkaPartitions().getTopic()); log.info("Checkpointing [%s] for taskGroup [%s]", currentCheckPoint, taskGroupId); 
notices.add(new CheckpointNotice(taskGroupId, baseSequenceName, (KafkaDataSourceMetadata) previousCheckPoint, (KafkaDataSourceMetadata) currentCheckPoint)); } public void possiblyRegisterListener() { // getTaskRunner() sometimes fails if the task queue is still being initialized so retry later until we succeed if (listenerRegistered) { return; } Optional<TaskRunner> taskRunner = taskMaster.getTaskRunner(); if (taskRunner.isPresent()) { taskRunner.get().registerListener(new TaskRunnerListener() { @Override public String getListenerId() { return supervisorId; } @Override public void locationChanged(final String taskId, final TaskLocation newLocation) { // do nothing } @Override public void statusChanged(String taskId, TaskStatus status) { notices.add(new RunNotice()); } }, MoreExecutors.sameThreadExecutor()); listenerRegistered = true; } } private interface Notice { void handle() throws ExecutionException, InterruptedException, TimeoutException, JsonProcessingException; } private class RunNotice implements Notice { @Override public void handle() throws ExecutionException, InterruptedException, TimeoutException, JsonProcessingException { long nowTime = System.currentTimeMillis(); if (nowTime - lastRunTime < MAX_RUN_FREQUENCY_MILLIS) { return; } lastRunTime = nowTime; runInternal(); } } private class GracefulShutdownNotice extends ShutdownNotice { @Override public void handle() throws InterruptedException, ExecutionException, TimeoutException { gracefulShutdownInternal(); super.handle(); } } private class ShutdownNotice implements Notice { @Override public void handle() throws InterruptedException, ExecutionException, TimeoutException { consumer.close(); synchronized (stopLock) { stopped = true; stopLock.notifyAll(); } } } private class ResetNotice implements Notice { final DataSourceMetadata dataSourceMetadata; ResetNotice(DataSourceMetadata dataSourceMetadata) { this.dataSourceMetadata = dataSourceMetadata; } @Override public void handle() { resetInternal(dataSourceMetadata); } } private class CheckpointNotice implements Notice { @Nullable private final Integer nullableTaskGroupId; @Deprecated private final String baseSequenceName; private final KafkaDataSourceMetadata previousCheckpoint; private final KafkaDataSourceMetadata currentCheckpoint; CheckpointNotice(@Nullable Integer nullableTaskGroupId, @Deprecated String baseSequenceName, KafkaDataSourceMetadata previousCheckpoint, KafkaDataSourceMetadata currentCheckpoint) { this.baseSequenceName = baseSequenceName; this.nullableTaskGroupId = nullableTaskGroupId; this.previousCheckpoint = previousCheckpoint; this.currentCheckpoint = currentCheckpoint; } @Override public void handle() throws ExecutionException, InterruptedException { // Find taskGroupId using taskId if it's null. It can be null while rolling update. final int taskGroupId; if (nullableTaskGroupId == null) { // We search taskId in taskGroups and pendingCompletionTaskGroups sequentially. This should be fine because // 1) a taskGroup can be moved from taskGroups to pendingCompletionTaskGroups in RunNotice // (see checkTaskDuration()). // 2) Notices are proceesed by a single thread. So, CheckpointNotice and RunNotice cannot be processed at the // same time. 
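The correctness argument in the comment above rests on the supervisor's notice queue: every external event (run, graceful shutdown, shutdown, reset, checkpoint) becomes a Notice that is handled by one dedicated thread, so no two notice handlers ever run concurrently. Below is a minimal standalone sketch of that single-consumer pattern using only java.util.concurrent; the class and method names are illustrative and are not Druid's API.

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.LinkedBlockingDeque;
    import java.util.concurrent.TimeUnit;

    public class NoticeQueueSketch
    {
      interface Notice
      {
        void handle() throws Exception;
      }

      private final BlockingQueue<Notice> notices = new LinkedBlockingDeque<>();
      private final ExecutorService exec = Executors.newSingleThreadExecutor();
      private volatile boolean stopped = false;

      void start(final long pollTimeoutMillis)
      {
        exec.submit(() -> {
          try {
            while (!Thread.currentThread().isInterrupted() && !stopped) {
              // Poll with a timeout so the loop can observe the stopped flag even when idle.
              final Notice notice = notices.poll(pollTimeoutMillis, TimeUnit.MILLISECONDS);
              if (notice == null) {
                continue;
              }
              try {
                // Handlers run strictly one at a time on this thread, which is why a
                // CheckpointNotice can never interleave with a RunNotice.
                notice.handle();
              }
              catch (Exception e) {
                // A failing notice must not kill the loop; the real supervisor emits an alert here.
                e.printStackTrace();
              }
            }
          }
          catch (InterruptedException e) {
            Thread.currentThread().interrupt();
          }
        });
      }

      void post(Notice notice)
      {
        notices.add(notice); // may be called from any thread, e.g. API endpoints or task listeners
      }

      void stop()
      {
        stopped = true;
        exec.shutdownNow();
      }
    }
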
final java.util.Optional<Integer> maybeGroupId = taskGroups.entrySet().stream().filter(entry -> { final TaskGroup taskGroup = entry.getValue(); return taskGroup.baseSequenceName.equals(baseSequenceName); }).findAny().map(Entry::getKey); taskGroupId = maybeGroupId.orElse(pendingCompletionTaskGroups.entrySet().stream().filter(entry -> { final List<TaskGroup> taskGroups = entry.getValue(); return taskGroups.stream().anyMatch(group -> group.baseSequenceName.equals(baseSequenceName)); }).findAny() .orElseThrow( () -> new ISE("Cannot find taskGroup for baseSequenceName[%s]", baseSequenceName)) .getKey()); } else { taskGroupId = nullableTaskGroupId; } // check for consistency // if already received request for this sequenceName and dataSourceMetadata combination then return final TaskGroup taskGroup = taskGroups.get(taskGroupId); if (isValidTaskGroup(taskGroupId, taskGroup)) { final TreeMap<Integer, Map<Integer, Long>> checkpoints = taskGroup.sequenceOffsets; // check validity of previousCheckpoint int index = checkpoints.size(); for (int sequenceId : checkpoints.descendingKeySet()) { Map<Integer, Long> checkpoint = checkpoints.get(sequenceId); // We have already verified the topic of the current checkpoint is same with that in ioConfig. // See checkpoint(). if (checkpoint.equals(previousCheckpoint.getKafkaPartitions().getPartitionOffsetMap())) { break; } index--; } if (index == 0) { throw new ISE("No such previous checkpoint [%s] found", previousCheckpoint); } else if (index < checkpoints.size()) { // if the found checkpoint is not the latest one then already checkpointed by a replica Preconditions.checkState(index == checkpoints.size() - 1, "checkpoint consistency failure"); log.info("Already checkpointed with offsets [%s]", checkpoints.lastEntry().getValue()); return; } final Map<Integer, Long> newCheckpoint = checkpointTaskGroup(taskGroup, false).get(); taskGroup.addNewCheckpoint(newCheckpoint); log.info("Handled checkpoint notice, new checkpoint is [%s] for taskGroup [%s]", newCheckpoint, taskGroupId); } } private boolean isValidTaskGroup(int taskGroupId, @Nullable TaskGroup taskGroup) { if (taskGroup == null) { // taskGroup might be in pendingCompletionTaskGroups or partitionGroups if (pendingCompletionTaskGroups.containsKey(taskGroupId)) { log.warn( "Ignoring checkpoint request because taskGroup[%d] has already stopped indexing and is waiting for " + "publishing segments", taskGroupId); return false; } else if (partitionGroups.containsKey(taskGroupId)) { log.warn("Ignoring checkpoint request because taskGroup[%d] is inactive", taskGroupId); return false; } else { throw new ISE("WTH?! cannot find taskGroup [%s] among all taskGroups [%s]", taskGroupId, taskGroups); } } return true; } } @VisibleForTesting void resetInternal(DataSourceMetadata dataSourceMetadata) { if (dataSourceMetadata == null) { // Reset everything boolean result = indexerMetadataStorageCoordinator.deleteDataSourceMetadata(dataSource); log.info("Reset dataSource[%s] - dataSource metadata entry deleted? 
[%s]", dataSource, result); taskGroups.values().forEach(this::killTasksInGroup); taskGroups.clear(); partitionGroups.clear(); } else if (!(dataSourceMetadata instanceof KafkaDataSourceMetadata)) { throw new IAE("Expected KafkaDataSourceMetadata but found instance of [%s]", dataSourceMetadata.getClass()); } else { // Reset only the partitions in dataSourceMetadata if it has not been reset yet final KafkaDataSourceMetadata resetKafkaMetadata = (KafkaDataSourceMetadata) dataSourceMetadata; if (resetKafkaMetadata.getKafkaPartitions().getTopic().equals(ioConfig.getTopic())) { // metadata can be null final DataSourceMetadata metadata = indexerMetadataStorageCoordinator .getDataSourceMetadata(dataSource); if (metadata != null && !(metadata instanceof KafkaDataSourceMetadata)) { throw new IAE("Expected KafkaDataSourceMetadata from metadata store but found instance of [%s]", metadata.getClass()); } final KafkaDataSourceMetadata currentMetadata = (KafkaDataSourceMetadata) metadata; // defend against consecutive reset requests from replicas // as well as the case where the metadata store do not have an entry for the reset partitions boolean doReset = false; for (Entry<Integer, Long> resetPartitionOffset : resetKafkaMetadata.getKafkaPartitions() .getPartitionOffsetMap().entrySet()) { final Long partitionOffsetInMetadataStore = currentMetadata == null ? null : currentMetadata.getKafkaPartitions().getPartitionOffsetMap() .get(resetPartitionOffset.getKey()); final TaskGroup partitionTaskGroup = taskGroups .get(getTaskGroupIdForPartition(resetPartitionOffset.getKey())); final boolean isSameOffset = partitionTaskGroup != null && partitionTaskGroup.partitionOffsets .get(resetPartitionOffset.getKey()).equals(resetPartitionOffset.getValue()); if (partitionOffsetInMetadataStore != null || isSameOffset) { doReset = true; break; } } if (!doReset) { log.info("Ignoring duplicate reset request [%s]", dataSourceMetadata); return; } boolean metadataUpdateSuccess = false; if (currentMetadata == null) { metadataUpdateSuccess = true; } else { final DataSourceMetadata newMetadata = currentMetadata.minus(resetKafkaMetadata); try { metadataUpdateSuccess = indexerMetadataStorageCoordinator .resetDataSourceMetadata(dataSource, newMetadata); } catch (IOException e) { log.error("Resetting DataSourceMetadata failed [%s]", e.getMessage()); Throwables.propagate(e); } } if (metadataUpdateSuccess) { resetKafkaMetadata.getKafkaPartitions().getPartitionOffsetMap().keySet().forEach(partition -> { final int groupId = getTaskGroupIdForPartition(partition); killTaskGroupForPartitions(ImmutableSet.of(partition)); taskGroups.remove(groupId); partitionGroups.get(groupId).replaceAll((partitionId, offset) -> NOT_SET); }); } else { throw new ISE("Unable to reset metadata"); } } else { log.warn("Reset metadata topic [%s] and supervisor's topic [%s] do not match", resetKafkaMetadata.getKafkaPartitions().getTopic(), ioConfig.getTopic()); } } } private void killTaskGroupForPartitions(Set<Integer> partitions) { for (Integer partition : partitions) { killTasksInGroup(taskGroups.get(getTaskGroupIdForPartition(partition))); } } private void killTasksInGroup(TaskGroup taskGroup) { if (taskGroup != null) { for (String taskId : taskGroup.tasks.keySet()) { log.info("Killing task [%s] in the task group", taskId); killTask(taskId); } } } @VisibleForTesting void gracefulShutdownInternal() throws ExecutionException, InterruptedException, TimeoutException { // Prepare for shutdown by 1) killing all tasks that haven't been assigned to a worker yet, and 2) 
causing all // running tasks to begin publishing by setting their startTime to a very long time ago so that the logic in // checkTaskDuration() will be triggered. This is better than just telling these tasks to publish whatever they // have, as replicas that are supposed to publish the same segment may not have read the same set of offsets. for (TaskGroup taskGroup : taskGroups.values()) { for (Entry<String, TaskData> entry : taskGroup.tasks.entrySet()) { if (taskInfoProvider.getTaskLocation(entry.getKey()).equals(TaskLocation.unknown())) { killTask(entry.getKey()); } else { entry.getValue().startTime = DateTimes.EPOCH; } } } checkTaskDuration(); } @VisibleForTesting void runInternal() throws ExecutionException, InterruptedException, TimeoutException, JsonProcessingException { possiblyRegisterListener(); updatePartitionDataFromKafka(); discoverTasks(); updateTaskStatus(); checkTaskDuration(); checkPendingCompletionTasks(); checkCurrentTaskState(); // if supervisor is not suspended, ensure required tasks are running // if suspended, ensure tasks have been requested to gracefully stop if (!spec.isSuspended()) { log.info("[%s] supervisor is running.", dataSource); createNewTasks(); } else { log.info("[%s] supervisor is suspended.", dataSource); gracefulShutdownInternal(); } if (log.isDebugEnabled()) { log.debug(generateReport(true).toString()); } else { log.info(generateReport(false).toString()); } } String generateSequenceName(Map<Integer, Long> startPartitions, Optional<DateTime> minimumMessageTime, Optional<DateTime> maximumMessageTime) { StringBuilder sb = new StringBuilder(); for (Entry<Integer, Long> entry : startPartitions.entrySet()) { sb.append(StringUtils.format("+%d(%d)", entry.getKey(), entry.getValue())); } String partitionOffsetStr = sb.toString().substring(1); String minMsgTimeStr = (minimumMessageTime.isPresent() ? String.valueOf(minimumMessageTime.get().getMillis()) : ""); String maxMsgTimeStr = (maximumMessageTime.isPresent() ? 
String.valueOf(maximumMessageTime.get().getMillis()) : ""); String dataSchema, tuningConfig; try { dataSchema = sortingMapper.writeValueAsString(spec.getDataSchema()); tuningConfig = sortingMapper.writeValueAsString(taskTuningConfig); } catch (JsonProcessingException e) { throw Throwables.propagate(e); } String hashCode = DigestUtils .sha1Hex(dataSchema + tuningConfig + partitionOffsetStr + minMsgTimeStr + maxMsgTimeStr) .substring(0, 15); return Joiner.on("_").join("index_kafka", dataSource, hashCode); } private KafkaConsumer<byte[], byte[]> getKafkaConsumer() { final Properties props = new Properties(); props.setProperty("metadata.max.age.ms", "10000"); props.setProperty("group.id", StringUtils.format("kafka-supervisor-%s", RealtimeIndexTask.makeRandomId())); props.putAll(ioConfig.getConsumerProperties()); props.setProperty("enable.auto.commit", "false"); ClassLoader currCtxCl = Thread.currentThread().getContextClassLoader(); try { Thread.currentThread().setContextClassLoader(getClass().getClassLoader()); return new KafkaConsumer<>(props, new ByteArrayDeserializer(), new ByteArrayDeserializer()); } finally { Thread.currentThread().setContextClassLoader(currCtxCl); } } private void updatePartitionDataFromKafka() { Map<String, List<PartitionInfo>> topics; try { synchronized (consumerLock) { topics = consumer.listTopics(); // updates the consumer's list of partitions from the brokers } } catch (Exception e) { // calls to the consumer throw NPEs when the broker doesn't respond log.warn(e, "Unable to get partition data from Kafka for brokers [%s], are the brokers up?", ioConfig.getConsumerProperties().get(KafkaSupervisorIOConfig.BOOTSTRAP_SERVERS_KEY)); return; } List<PartitionInfo> partitions = topics.get(ioConfig.getTopic()); if (partitions == null) { log.warn("No such topic [%s] found, list of discovered topics [%s]", ioConfig.getTopic(), topics.keySet()); } int numPartitions = (partitions != null ? partitions.size() : 0); log.debug("Found [%d] Kafka partitions for topic [%s]", numPartitions, ioConfig.getTopic()); for (int partition = 0; partition < numPartitions; partition++) { int taskGroupId = getTaskGroupIdForPartition(partition); ConcurrentHashMap<Integer, Long> partitionMap = partitionGroups.computeIfAbsent(taskGroupId, k -> new ConcurrentHashMap<>()); // The starting offset for a new partition in [partitionGroups] is initially set to NOT_SET; when a new task group // is created and is assigned partitions, if the offset in [partitionGroups] is NOT_SET it will take the starting // offset value from the metadata store, and if it can't find it there, from Kafka. Once a task begins // publishing, the offset in partitionGroups will be updated to the ending offset of the publishing-but-not-yet- // completed task, which will cause the next set of tasks to begin reading from where the previous task left // off. If that previous task now fails, we will set the offset in [partitionGroups] back to NOT_SET which will // cause successive tasks to again grab their starting offset from metadata store. This mechanism allows us to // start up successive tasks without waiting for the previous tasks to succeed and still be able to handle task // failures during publishing. 
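The comment block above (repeated from the field declarations earlier) describes how [partitionGroups] tracks the next starting offset for every partition. The following is a small self-contained sketch of that bookkeeping, assuming -1 as the NOT_SET sentinel as in the constants above; the class and method names are illustrative, not Druid's.

    import java.util.concurrent.ConcurrentHashMap;

    public class PartitionOffsetTracker
    {
      static final long NOT_SET = -1;

      // Map<taskGroupId, Map<partitionId, next starting offset>>
      private final ConcurrentHashMap<Integer, ConcurrentHashMap<Integer, Long>> partitionGroups =
          new ConcurrentHashMap<>();

      /** A newly discovered partition starts as NOT_SET; the real offset is resolved later. */
      boolean discoverPartition(int taskGroupId, int partition)
      {
        return partitionGroups
            .computeIfAbsent(taskGroupId, k -> new ConcurrentHashMap<>())
            .putIfAbsent(partition, NOT_SET) == null;
      }

      /** When a task begins publishing, its end offsets become the next tasks' start offsets. */
      void onTaskPublishing(int taskGroupId, int partition, long endOffset)
      {
        partitionGroups.get(taskGroupId).put(partition, endOffset);
      }

      /** If the publishing task fails, fall back to whatever the metadata store says. */
      void onPublishFailed(int taskGroupId)
      {
        partitionGroups.get(taskGroupId).replaceAll((partition, offset) -> NOT_SET);
      }

      /** NOT_SET means: read the starting offset from the metadata store, or from Kafka if absent. */
      boolean needsOffsetLookup(int taskGroupId, int partition)
      {
        Long offset = partitionGroups.getOrDefault(taskGroupId, new ConcurrentHashMap<>()).get(partition);
        return offset == null || offset == NOT_SET;
      }
    }

The line that follows in updatePartitionDataFromKafka() is the discovery step of this scheme: putIfAbsent returns null only for a partition seen for the first time, which is exactly when the new partition is logged and assigned to a task group.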
if (partitionMap.putIfAbsent(partition, NOT_SET) == null) { log.info("New partition [%d] discovered for topic [%s], added to task group [%d]", partition, ioConfig.getTopic(), taskGroupId); } } } private void discoverTasks() throws ExecutionException, InterruptedException, TimeoutException { int taskCount = 0; List<String> futureTaskIds = Lists.newArrayList(); List<ListenableFuture<Boolean>> futures = Lists.newArrayList(); List<Task> tasks = taskStorage.getActiveTasks(); final Map<Integer, TaskGroup> taskGroupsToVerify = new HashMap<>(); for (Task task : tasks) { if (!(task instanceof KafkaIndexTask) || !dataSource.equals(task.getDataSource())) { continue; } taskCount++; final KafkaIndexTask kafkaTask = (KafkaIndexTask) task; final String taskId = task.getId(); // Determine which task group this task belongs to based on one of the partitions handled by this task. If we // later determine that this task is actively reading, we will make sure that it matches our current partition // allocation (getTaskGroupIdForPartition(partition) should return the same value for every partition being read // by this task) and kill it if it is not compatible. If the task is instead found to be in the publishing // state, we will permit it to complete even if it doesn't match our current partition allocation to support // seamless schema migration. Iterator<Integer> it = kafkaTask.getIOConfig().getStartPartitions().getPartitionOffsetMap().keySet() .iterator(); final Integer taskGroupId = (it.hasNext() ? getTaskGroupIdForPartition(it.next()) : null); if (taskGroupId != null) { // check to see if we already know about this task, either in [taskGroups] or in [pendingCompletionTaskGroups] // and if not add it to taskGroups or pendingCompletionTaskGroups (if status = PUBLISHING) TaskGroup taskGroup = taskGroups.get(taskGroupId); if (!isTaskInPendingCompletionGroups(taskId) && (taskGroup == null || !taskGroup.tasks.containsKey(taskId))) { futureTaskIds.add(taskId); futures.add(Futures.transform(taskClient.getStatusAsync(taskId), new Function<KafkaIndexTask.Status, Boolean>() { @Override public Boolean apply(KafkaIndexTask.Status status) { try { log.debug("Task [%s], status [%s]", taskId, status); if (status == KafkaIndexTask.Status.PUBLISHING) { kafkaTask.getIOConfig().getStartPartitions().getPartitionOffsetMap() .keySet().forEach( partition -> addDiscoveredTaskToPendingCompletionTaskGroups( getTaskGroupIdForPartition(partition), taskId, kafkaTask.getIOConfig().getStartPartitions() .getPartitionOffsetMap())); // update partitionGroups with the publishing task's offsets (if they are greater than what is // existing) so that the next tasks will start reading from where this task left off Map<Integer, Long> publishingTaskEndOffsets = taskClient .getEndOffsets(taskId); for (Entry<Integer, Long> entry : publishingTaskEndOffsets.entrySet()) { Integer partition = entry.getKey(); Long offset = entry.getValue(); ConcurrentHashMap<Integer, Long> partitionOffsets = partitionGroups .get(getTaskGroupIdForPartition(partition)); boolean succeeded; do { succeeded = true; Long previousOffset = partitionOffsets.putIfAbsent(partition, offset); if (previousOffset != null && previousOffset < offset) { succeeded = partitionOffsets.replace(partition, previousOffset, offset); } } while (!succeeded); } } else { for (Integer partition : kafkaTask.getIOConfig().getStartPartitions() .getPartitionOffsetMap().keySet()) { if (!taskGroupId.equals(getTaskGroupIdForPartition(partition))) { log.warn( "Stopping task [%s] which does not match the 
expected partition allocation", taskId); try { stopTask(taskId, false).get(futureTimeoutInSeconds, TimeUnit.SECONDS); } catch (InterruptedException | ExecutionException | TimeoutException e) { log.warn(e, "Exception while stopping task"); } return false; } } // make sure the task's io and tuning configs match with the supervisor config // if it is current then only create corresponding taskGroup if it does not exist if (!isTaskCurrent(taskGroupId, taskId)) { log.info( "Stopping task [%s] which does not match the expected parameters and ingestion spec", taskId); try { stopTask(taskId, false).get(futureTimeoutInSeconds, TimeUnit.SECONDS); } catch (InterruptedException | ExecutionException | TimeoutException e) { log.warn(e, "Exception while stopping task"); } return false; } else { final TaskGroup taskGroup = taskGroups.computeIfAbsent(taskGroupId, k -> { log.info( "Creating a new task group for taskGroupId[%d]", taskGroupId); return new TaskGroup(taskGroupId, ImmutableMap.copyOf(kafkaTask.getIOConfig() .getStartPartitions() .getPartitionOffsetMap()), kafkaTask.getIOConfig().getMinimumMessageTime(), kafkaTask.getIOConfig() .getMaximumMessageTime()); }); taskGroupsToVerify.put(taskGroupId, taskGroup); final TaskData prevTaskData = taskGroup.tasks.putIfAbsent(taskId, new TaskData()); if (prevTaskData != null) { throw new ISE( "WTH? a taskData[%s] already exists for new task[%s]", prevTaskData, taskId); } } } return true; } catch (Throwable t) { log.error(t, "Something bad while discovering task [%s]", taskId); return null; } } }, workerExec)); } } } List<Boolean> results = Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS); for (int i = 0; i < results.size(); i++) { if (results.get(i) == null) { String taskId = futureTaskIds.get(i); log.warn("Task [%s] failed to return status, killing task", taskId); killTask(taskId); } } log.debug("Found [%d] Kafka indexing tasks for dataSource [%s]", taskCount, dataSource); // make sure the checkpoints are consistent with each other and with the metadata store verifyAndMergeCheckpoints(taskGroupsToVerify.values()); } private void verifyAndMergeCheckpoints(final Collection<TaskGroup> taskGroupsToVerify) { final List<ListenableFuture<?>> futures = new ArrayList<>(); for (TaskGroup taskGroup : taskGroupsToVerify) { futures.add(workerExec.submit(() -> verifyAndMergeCheckpoints(taskGroup))); } try { Futures.allAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS); } catch (InterruptedException | ExecutionException | TimeoutException e) { throw new RuntimeException(e); } } /** * This method does two things - * 1. Makes sure the checkpoints information in the taskGroup is consistent with that of the tasks, if not kill * inconsistent tasks. * 2. truncates the checkpoints in the taskGroup corresponding to which segments have been published, so that any newly * created tasks for the taskGroup start indexing from after the latest published offsets. 
*/ private void verifyAndMergeCheckpoints(final TaskGroup taskGroup) { final int groupId = taskGroup.groupId; final List<Pair<String, TreeMap<Integer, Map<Integer, Long>>>> taskSequences = new ArrayList<>(); final List<ListenableFuture<TreeMap<Integer, Map<Integer, Long>>>> futures = new ArrayList<>(); final List<String> taskIds = new ArrayList<>(); for (String taskId : taskGroup.taskIds()) { final ListenableFuture<TreeMap<Integer, Map<Integer, Long>>> checkpointsFuture = taskClient .getCheckpointsAsync(taskId, true); taskIds.add(taskId); futures.add(checkpointsFuture); } try { List<TreeMap<Integer, Map<Integer, Long>>> futuresResult = Futures.successfulAsList(futures) .get(futureTimeoutInSeconds, TimeUnit.SECONDS); for (int i = 0; i < futuresResult.size(); i++) { final TreeMap<Integer, Map<Integer, Long>> checkpoints = futuresResult.get(i); final String taskId = taskIds.get(i); if (checkpoints == null) { try { // catch the exception in failed futures futures.get(i).get(); } catch (Exception e) { log.error(e, "Problem while getting checkpoints for task [%s], killing the task", taskId); killTask(taskId); taskGroup.tasks.remove(taskId); } } else if (checkpoints.isEmpty()) { log.warn("Ignoring task [%s], as probably it is not started running yet", taskId); } else { taskSequences.add(new Pair<>(taskId, checkpoints)); } } } catch (Exception e) { throw new RuntimeException(e); } final KafkaDataSourceMetadata latestDataSourceMetadata = (KafkaDataSourceMetadata) indexerMetadataStorageCoordinator .getDataSourceMetadata(dataSource); final boolean hasValidOffsetsFromDb = latestDataSourceMetadata != null && latestDataSourceMetadata.getKafkaPartitions() != null && ioConfig.getTopic().equals(latestDataSourceMetadata.getKafkaPartitions().getTopic()); final Map<Integer, Long> latestOffsetsFromDb; if (hasValidOffsetsFromDb) { latestOffsetsFromDb = latestDataSourceMetadata.getKafkaPartitions().getPartitionOffsetMap(); } else { latestOffsetsFromDb = null; } // order tasks of this taskGroup by the latest sequenceId taskSequences.sort((o1, o2) -> o2.rhs.firstKey().compareTo(o1.rhs.firstKey())); final Set<String> tasksToKill = new HashSet<>(); final AtomicInteger earliestConsistentSequenceId = new AtomicInteger(-1); int taskIndex = 0; while (taskIndex < taskSequences.size()) { TreeMap<Integer, Map<Integer, Long>> taskCheckpoints = taskSequences.get(taskIndex).rhs; String taskId = taskSequences.get(taskIndex).lhs; if (earliestConsistentSequenceId.get() == -1) { // find the first replica task with earliest sequenceId consistent with datasource metadata in the metadata // store if (taskCheckpoints.entrySet().stream() .anyMatch(sequenceCheckpoint -> sequenceCheckpoint.getValue().entrySet().stream() .allMatch(partitionOffset -> Longs.compare(partitionOffset.getValue(), latestOffsetsFromDb == null ? 
partitionOffset.getValue() : latestOffsetsFromDb.getOrDefault(partitionOffset.getKey(), partitionOffset.getValue())) == 0) && earliestConsistentSequenceId.compareAndSet(-1, sequenceCheckpoint.getKey())) || (pendingCompletionTaskGroups.getOrDefault(groupId, EMPTY_LIST).size() > 0 && earliestConsistentSequenceId.compareAndSet(-1, taskCheckpoints.firstKey()))) { final SortedMap<Integer, Map<Integer, Long>> latestCheckpoints = new TreeMap<>( taskCheckpoints.tailMap(earliestConsistentSequenceId.get())); log.info("Setting taskGroup sequences to [%s] for group [%d]", latestCheckpoints, groupId); taskGroup.sequenceOffsets.clear(); taskGroup.sequenceOffsets.putAll(latestCheckpoints); } else { log.debug("Adding task [%s] to kill list, checkpoints[%s], latestoffsets from DB [%s]", taskId, taskCheckpoints, latestOffsetsFromDb); tasksToKill.add(taskId); } } else { // check consistency with taskGroup sequences if (taskCheckpoints.get(taskGroup.sequenceOffsets.firstKey()) == null || !(taskCheckpoints.get(taskGroup.sequenceOffsets.firstKey()) .equals(taskGroup.sequenceOffsets.firstEntry().getValue())) || taskCheckpoints.tailMap(taskGroup.sequenceOffsets.firstKey()) .size() != taskGroup.sequenceOffsets.size()) { log.debug("Adding task [%s] to kill list, checkpoints[%s], taskgroup checkpoints [%s]", taskId, taskCheckpoints, taskGroup.sequenceOffsets); tasksToKill.add(taskId); } } taskIndex++; } if ((tasksToKill.size() > 0 && tasksToKill.size() == taskGroup.tasks.size()) || (taskGroup.tasks.size() == 0 && pendingCompletionTaskGroups.getOrDefault(groupId, EMPTY_LIST).size() == 0)) { // killing all tasks or no task left in the group ? // clear state about the taskgroup so that get latest offset information is fetched from metadata store log.warn("Clearing task group [%d] information as no valid tasks left the group", groupId); taskGroups.remove(groupId); partitionGroups.get(groupId).replaceAll((partition, offset) -> NOT_SET); } taskSequences.stream().filter(taskIdSequences -> tasksToKill.contains(taskIdSequences.lhs)) .forEach(sequenceCheckpoint -> { log.warn( "Killing task [%s], as its checkpoints [%s] are not consistent with group checkpoints[%s] or latest " + "persisted offsets in metadata store [%s]", sequenceCheckpoint.lhs, sequenceCheckpoint.rhs, taskGroup.sequenceOffsets, latestOffsetsFromDb); killTask(sequenceCheckpoint.lhs); taskGroup.tasks.remove(sequenceCheckpoint.lhs); }); } private void addDiscoveredTaskToPendingCompletionTaskGroups(int groupId, String taskId, Map<Integer, Long> startingPartitions) { final CopyOnWriteArrayList<TaskGroup> taskGroupList = pendingCompletionTaskGroups.computeIfAbsent(groupId, k -> new CopyOnWriteArrayList<>()); for (TaskGroup taskGroup : taskGroupList) { if (taskGroup.partitionOffsets.equals(startingPartitions)) { if (taskGroup.tasks.putIfAbsent(taskId, new TaskData()) == null) { log.info("Added discovered task [%s] to existing pending task group [%s]", taskId, groupId); } return; } } log.info("Creating new pending completion task group [%s] for discovered task [%s]", groupId, taskId); // reading the minimumMessageTime & maximumMessageTime from the publishing task and setting it here is not necessary as this task cannot // change to a state where it will read any more events TaskGroup newTaskGroup = new TaskGroup(groupId, ImmutableMap.copyOf(startingPartitions), Optional.absent(), Optional.absent()); newTaskGroup.tasks.put(taskId, new TaskData()); newTaskGroup.completionTimeout = DateTimes.nowUtc().plus(ioConfig.getCompletionTimeout()); 
taskGroupList.add(newTaskGroup); } private void updateTaskStatus() throws ExecutionException, InterruptedException, TimeoutException { final List<ListenableFuture<Boolean>> futures = Lists.newArrayList(); final List<String> futureTaskIds = Lists.newArrayList(); // update status (and startTime if unknown) of current tasks in taskGroups for (TaskGroup group : taskGroups.values()) { for (Entry<String, TaskData> entry : group.tasks.entrySet()) { final String taskId = entry.getKey(); final TaskData taskData = entry.getValue(); if (taskData.startTime == null) { futureTaskIds.add(taskId); futures.add(Futures.transform(taskClient.getStartTimeAsync(taskId), new Function<DateTime, Boolean>() { @Nullable @Override public Boolean apply(@Nullable DateTime startTime) { if (startTime == null) { return false; } taskData.startTime = startTime; long millisRemaining = ioConfig.getTaskDuration().getMillis() - (System.currentTimeMillis() - taskData.startTime.getMillis()); if (millisRemaining > 0) { scheduledExec.schedule(buildRunTask(), millisRemaining + MAX_RUN_FREQUENCY_MILLIS, TimeUnit.MILLISECONDS); } return true; } }, workerExec)); } taskData.status = taskStorage.getStatus(taskId).get(); } } // update status of pending completion tasks in pendingCompletionTaskGroups for (List<TaskGroup> taskGroups : pendingCompletionTaskGroups.values()) { for (TaskGroup group : taskGroups) { for (Entry<String, TaskData> entry : group.tasks.entrySet()) { entry.getValue().status = taskStorage.getStatus(entry.getKey()).get(); } } } List<Boolean> results = Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS); for (int i = 0; i < results.size(); i++) { // false means the task hasn't started running yet and that's okay; null means it should be running but the HTTP // request threw an exception so kill the task if (results.get(i) == null) { String taskId = futureTaskIds.get(i); log.warn("Task [%s] failed to return start time, killing task", taskId); killTask(taskId); } } } private void checkTaskDuration() throws InterruptedException, ExecutionException, TimeoutException { final List<ListenableFuture<Map<Integer, Long>>> futures = Lists.newArrayList(); final List<Integer> futureGroupIds = Lists.newArrayList(); for (Entry<Integer, TaskGroup> entry : taskGroups.entrySet()) { Integer groupId = entry.getKey(); TaskGroup group = entry.getValue(); // find the longest running task from this group DateTime earliestTaskStart = DateTimes.nowUtc(); for (TaskData taskData : group.tasks.values()) { // startTime can be null if kafkaSupervisor is stopped gracefully before processing any runNotice if (taskData.startTime != null && earliestTaskStart.isAfter(taskData.startTime)) { earliestTaskStart = taskData.startTime; } } // if this task has run longer than the configured duration, signal all tasks in the group to persist if (earliestTaskStart.plus(ioConfig.getTaskDuration()).isBeforeNow()) { log.info("Task group [%d] has run for [%s]", groupId, ioConfig.getTaskDuration()); futureGroupIds.add(groupId); futures.add(checkpointTaskGroup(group, true)); } } List<Map<Integer, Long>> results = Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS); for (int j = 0; j < results.size(); j++) { Integer groupId = futureGroupIds.get(j); TaskGroup group = taskGroups.get(groupId); Map<Integer, Long> endOffsets = results.get(j); if (endOffsets != null) { // set a timeout and put this group in pendingCompletionTaskGroups so that it can be monitored for completion group.completionTimeout = 
DateTimes.nowUtc().plus(ioConfig.getCompletionTimeout()); pendingCompletionTaskGroups.computeIfAbsent(groupId, k -> new CopyOnWriteArrayList<>()).add(group); // set endOffsets as the next startOffsets for (Entry<Integer, Long> entry : endOffsets.entrySet()) { partitionGroups.get(groupId).put(entry.getKey(), entry.getValue()); } } else { log.warn("All tasks in group [%s] failed to transition to publishing state, killing tasks [%s]", groupId, group.taskIds()); for (String id : group.taskIds()) { killTask(id); } // clear partitionGroups, so that latest offsets from db is used as start offsets not the stale ones // if tasks did some successful incremental handoffs partitionGroups.get(groupId).replaceAll((partition, offset) -> NOT_SET); } // remove this task group from the list of current task groups now that it has been handled taskGroups.remove(groupId); } } private ListenableFuture<Map<Integer, Long>> checkpointTaskGroup(final TaskGroup taskGroup, final boolean finalize) { if (finalize) { // 1) Check if any task completed (in which case we're done) and kill unassigned tasks Iterator<Entry<String, TaskData>> i = taskGroup.tasks.entrySet().iterator(); while (i.hasNext()) { Entry<String, TaskData> taskEntry = i.next(); String taskId = taskEntry.getKey(); TaskData task = taskEntry.getValue(); // task.status can be null if kafkaSupervisor is stopped gracefully before processing any runNotice. if (task.status != null) { if (task.status.isSuccess()) { // If any task in this group has already completed, stop the rest of the tasks in the group and return. // This will cause us to create a new set of tasks next cycle that will start from the offsets in // metadata store (which will have advanced if we succeeded in publishing and will remain the same if // publishing failed and we need to re-ingest) return Futures.transform(stopTasksInGroup(taskGroup), new Function<Object, Map<Integer, Long>>() { @Nullable @Override public Map<Integer, Long> apply(@Nullable Object input) { return null; } }); } if (task.status.isRunnable()) { if (taskInfoProvider.getTaskLocation(taskId).equals(TaskLocation.unknown())) { log.info("Killing task [%s] which hasn't been assigned to a worker", taskId); killTask(taskId); i.remove(); } } } } } // 2) Pause running tasks final List<ListenableFuture<Map<Integer, Long>>> pauseFutures = Lists.newArrayList(); final List<String> pauseTaskIds = ImmutableList.copyOf(taskGroup.taskIds()); for (final String taskId : pauseTaskIds) { pauseFutures.add(taskClient.pauseAsync(taskId)); } return Futures.transform(Futures.successfulAsList(pauseFutures), new Function<List<Map<Integer, Long>>, Map<Integer, Long>>() { @Nullable @Override public Map<Integer, Long> apply(List<Map<Integer, Long>> input) { // 3) Build a map of the highest offset read by any task in the group for each partition final Map<Integer, Long> endOffsets = new HashMap<>(); for (int i = 0; i < input.size(); i++) { Map<Integer, Long> result = input.get(i); if (result == null || result.isEmpty()) { // kill tasks that didn't return a value String taskId = pauseTaskIds.get(i); log.warn("Task [%s] failed to respond to [pause] in a timely manner, killing task", taskId); killTask(taskId); taskGroup.tasks.remove(taskId); } else { // otherwise build a map of the highest offsets seen for (Entry<Integer, Long> offset : result.entrySet()) { if (!endOffsets.containsKey(offset.getKey()) || endOffsets.get(offset.getKey()).compareTo(offset.getValue()) < 0) { endOffsets.put(offset.getKey(), offset.getValue()); } } } } // 4) Set the end offsets 
for each task to the values from step 3 and resume the tasks. All the tasks should // finish reading and start publishing within a short period, depending on how in sync the tasks were. final List<ListenableFuture<Boolean>> setEndOffsetFutures = Lists.newArrayList(); final List<String> setEndOffsetTaskIds = ImmutableList.copyOf(taskGroup.taskIds()); if (setEndOffsetTaskIds.isEmpty()) { log.info("All tasks in taskGroup [%d] have failed, tasks will be re-created", taskGroup.groupId); return null; } try { if (endOffsets.equals(taskGroup.sequenceOffsets.lastEntry().getValue())) { log.warn( "Checkpoint [%s] is same as the start offsets [%s] of latest sequence for the task group [%d]", endOffsets, taskGroup.sequenceOffsets.lastEntry().getValue(), taskGroup.groupId); } log.info("Setting endOffsets for tasks in taskGroup [%d] to %s and resuming", taskGroup.groupId, endOffsets); for (final String taskId : setEndOffsetTaskIds) { setEndOffsetFutures .add(taskClient.setEndOffsetsAsync(taskId, endOffsets, finalize)); } List<Boolean> results = Futures.successfulAsList(setEndOffsetFutures) .get(futureTimeoutInSeconds, TimeUnit.SECONDS); for (int i = 0; i < results.size(); i++) { if (results.get(i) == null || !results.get(i)) { String taskId = setEndOffsetTaskIds.get(i); log.warn( "Task [%s] failed to respond to [set end offsets] in a timely manner, killing task", taskId); killTask(taskId); taskGroup.tasks.remove(taskId); } } } catch (Exception e) { log.error("Something bad happened [%s]", e.getMessage()); Throwables.propagate(e); } if (taskGroup.tasks.isEmpty()) { log.info("All tasks in taskGroup [%d] have failed, tasks will be re-created", taskGroup.groupId); return null; } return endOffsets; } }, workerExec); } /** * Monitors [pendingCompletionTaskGroups] for tasks that have completed. If any task in a task group has completed, we * can safely stop the rest of the tasks in that group. If a task group has exceeded its publishing timeout, then * we need to stop all tasks in not only that task group but also 1) any subsequent task group that is also pending * completion and 2) the current task group that is running, because the assumption that we have handled up to the * starting offset for subsequent task groups is no longer valid, and subsequent tasks would fail as soon as they * attempted to publish because of the contiguous range consistency check. */ private void checkPendingCompletionTasks() throws ExecutionException, InterruptedException, TimeoutException { List<ListenableFuture<?>> futures = Lists.newArrayList(); for (Entry<Integer, CopyOnWriteArrayList<TaskGroup>> pendingGroupList : pendingCompletionTaskGroups .entrySet()) { boolean stopTasksInTaskGroup = false; Integer groupId = pendingGroupList.getKey(); CopyOnWriteArrayList<TaskGroup> taskGroupList = pendingGroupList.getValue(); List<TaskGroup> toRemove = Lists.newArrayList(); for (TaskGroup group : taskGroupList) { boolean foundSuccess = false, entireTaskGroupFailed = false; if (stopTasksInTaskGroup) { // One of the earlier groups that was handling the same partition set timed out before the segments were // published so stop any additional groups handling the same partition set that are pending completion. 
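Steps 2-4 of checkpointTaskGroup() above (pause every replica, merge their offsets, then set end offsets and resume) hinge on taking, for each partition, the highest offset any replica in the group has read. A hypothetical helper showing just that merge step follows; the class and method names are illustrative, and the real logic is inline in the transform callback above.

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class OffsetMergeSketch
    {
      /**
       * Given the current offsets reported by each paused replica (null or empty when a replica
       * failed to respond to the pause request), return the highest offset seen per partition.
       */
      static Map<Integer, Long> highestOffsetPerPartition(List<Map<Integer, Long>> replicaOffsets)
      {
        final Map<Integer, Long> endOffsets = new HashMap<>();
        for (Map<Integer, Long> result : replicaOffsets) {
          if (result == null || result.isEmpty()) {
            continue; // the real code kills replicas that did not return offsets
          }
          for (Map.Entry<Integer, Long> offset : result.entrySet()) {
            endOffsets.merge(offset.getKey(), offset.getValue(), Math::max);
          }
        }
        return endOffsets;
      }
    }

The merged map is then posted back to every surviving replica via setEndOffsetsAsync, which is how all replicas in a group come to read and publish the same offset range.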
          futures.add(stopTasksInGroup(group));
          toRemove.add(group);
          continue;
        }

        Iterator<Entry<String, TaskData>> iTask = group.tasks.entrySet().iterator();

        while (iTask.hasNext()) {
          final Entry<String, TaskData> entry = iTask.next();
          final String taskId = entry.getKey();
          final TaskData taskData = entry.getValue();

          Preconditions.checkNotNull(taskData.status, "WTH? task[%s] has a null status", taskId);

          if (taskData.status.isFailure()) {
            iTask.remove(); // remove failed task
            if (group.tasks.isEmpty()) {
              // if all tasks in the group have failed, just nuke all task groups with this partition set and restart
              entireTaskGroupFailed = true;
              break;
            }
          }

          if (taskData.status.isSuccess()) {
            // If one of the pending completion tasks was successful, stop the rest of the tasks in the group as
            // we no longer need them to publish their segment.
            log.info("Task [%s] completed successfully, stopping tasks %s", taskId, group.taskIds());
            futures.add(stopTasksInGroup(group));
            foundSuccess = true;
            toRemove.add(group); // remove the TaskGroup from the list of pending completion task groups
            break; // skip iterating the rest of the tasks in this group as they've all been stopped now
          }
        }

        if ((!foundSuccess && group.completionTimeout.isBeforeNow()) || entireTaskGroupFailed) {
          if (entireTaskGroupFailed) {
            log.warn("All tasks in group [%d] failed to publish, killing all tasks for these partitions", groupId);
          } else {
            log.makeAlert(
                "No task in [%s] for taskGroup [%d] succeeded before the completion timeout elapsed [%s]!",
                group.taskIds(),
                groupId,
                ioConfig.getCompletionTimeout()
            ).emit();
          }

          // reset partitions offsets for this task group so that they will be re-read from metadata storage
          partitionGroups.get(groupId).replaceAll((partition, offset) -> NOT_SET);
          // kill all the tasks in this pending completion group
          killTasksInGroup(group);
          // set a flag so the other pending completion groups for this set of partitions will also stop
          stopTasksInTaskGroup = true;
          // kill all the tasks in the currently reading task group and remove the bad task group
          killTasksInGroup(taskGroups.remove(groupId));
          toRemove.add(group);
        }
      }

      taskGroupList.removeAll(toRemove);
    }

    // wait for all task shutdowns to complete before returning
    Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS);
  }

  private void checkCurrentTaskState() throws ExecutionException, InterruptedException, TimeoutException
  {
    List<ListenableFuture<?>> futures = Lists.newArrayList();
    Iterator<Entry<Integer, TaskGroup>> iTaskGroups = taskGroups.entrySet().iterator();
    while (iTaskGroups.hasNext()) {
      Entry<Integer, TaskGroup> taskGroupEntry = iTaskGroups.next();
      Integer groupId = taskGroupEntry.getKey();
      TaskGroup taskGroup = taskGroupEntry.getValue();

      // Iterate the list of known tasks in this group and:
      //   1) Kill any tasks which are not "current" (have the partitions, starting offsets, and minimumMessageTime
      //      & maximumMessageTime (if applicable) in [taskGroups])
      //   2) Remove any tasks that have failed from the list
      //   3) If any task completed successfully, stop all the tasks in this group and move to the next group

      log.debug("Task group [%d] pre-pruning: %s", groupId, taskGroup.taskIds());

      Iterator<Entry<String, TaskData>> iTasks = taskGroup.tasks.entrySet().iterator();
      while (iTasks.hasNext()) {
        Entry<String, TaskData> task = iTasks.next();
        String taskId = task.getKey();
        TaskData taskData = task.getValue();

        // stop and remove bad tasks from the task group
        if (!isTaskCurrent(groupId, taskId)) {
          log.info("Stopping task [%s] which does not match the expected offset range and ingestion spec", taskId);
          futures.add(stopTask(taskId, false));
          iTasks.remove();
          continue;
        }

        Preconditions.checkNotNull(taskData.status, "WTH? task[%s] has a null status", taskId);

        // remove failed tasks
        if (taskData.status.isFailure()) {
          iTasks.remove();
          continue;
        }

        // check for successful tasks, and if we find one, stop all tasks in the group and remove the group so it can
        // be recreated with the next set of offsets
        if (taskData.status.isSuccess()) {
          futures.add(stopTasksInGroup(taskGroup));
          iTaskGroups.remove();
          break;
        }
      }
      log.debug("Task group [%d] post-pruning: %s", groupId, taskGroup.taskIds());
    }

    // wait for all task shutdowns to complete before returning
    Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS);
  }

  void createNewTasks() throws JsonProcessingException
  {
    // update the checkpoints in the taskGroup to latest ones so that new tasks do not read what is already published
    verifyAndMergeCheckpoints(
        taskGroups.values()
                  .stream()
                  .filter(taskGroup -> taskGroup.tasks.size() < ioConfig.getReplicas())
                  .collect(Collectors.toList())
    );

    // check that there is a current task group for each group of partitions in [partitionGroups]
    for (Integer groupId : partitionGroups.keySet()) {
      if (!taskGroups.containsKey(groupId)) {
        log.info("Creating new task group [%d] for partitions %s", groupId, partitionGroups.get(groupId).keySet());

        Optional<DateTime> minimumMessageTime = (ioConfig.getLateMessageRejectionPeriod().isPresent()
            ? Optional.of(DateTimes.nowUtc().minus(ioConfig.getLateMessageRejectionPeriod().get()))
            : Optional.absent());

        Optional<DateTime> maximumMessageTime = (ioConfig.getEarlyMessageRejectionPeriod().isPresent()
            ? Optional.of(DateTimes.nowUtc().plus(ioConfig.getTaskDuration())
                                   .plus(ioConfig.getEarlyMessageRejectionPeriod().get()))
            : Optional.absent());

        final TaskGroup taskGroup = new TaskGroup(
            groupId,
            generateStartingOffsetsForPartitionGroup(groupId),
            minimumMessageTime,
            maximumMessageTime
        );
        taskGroups.put(groupId, taskGroup);
      }
    }

    // iterate through all the current task groups and make sure each one has the desired number of replica tasks
    boolean createdTask = false;
    for (Entry<Integer, TaskGroup> entry : taskGroups.entrySet()) {
      TaskGroup taskGroup = entry.getValue();
      Integer groupId = entry.getKey();

      if (ioConfig.getReplicas() > taskGroup.tasks.size()) {
        log.info(
            "Number of tasks [%d] does not match configured numReplicas [%d] in task group [%d], creating more tasks",
            taskGroup.tasks.size(),
            ioConfig.getReplicas(),
            groupId
        );
        createKafkaTasksForGroup(groupId, ioConfig.getReplicas() - taskGroup.tasks.size());
        createdTask = true;
      }
    }

    if (createdTask && firstRunTime.isBeforeNow()) {
      // Schedule a run event after a short delay to update our internal data structures with the new tasks that were
      // just created. This is mainly for the benefit of the status API in situations where the run period is lengthy.
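      // buildRunTask() just enqueues a RunNotice, so this schedules another pass of the run loop roughly five seconds
      // from now instead of waiting for the next regularly scheduled run.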
      scheduledExec.schedule(buildRunTask(), 5000, TimeUnit.MILLISECONDS);
    }
  }

  private void createKafkaTasksForGroup(int groupId, int replicas) throws JsonProcessingException
  {
    Map<Integer, Long> startPartitions = taskGroups.get(groupId).partitionOffsets;
    Map<Integer, Long> endPartitions = new HashMap<>();
    for (Integer partition : startPartitions.keySet()) {
      endPartitions.put(partition, Long.MAX_VALUE);
    }
    TaskGroup group = taskGroups.get(groupId);

    Map<String, String> consumerProperties = Maps.newHashMap(ioConfig.getConsumerProperties());
    DateTime minimumMessageTime = taskGroups.get(groupId).minimumMessageTime.orNull();
    DateTime maximumMessageTime = taskGroups.get(groupId).maximumMessageTime.orNull();

    KafkaIOConfig kafkaIOConfig = new KafkaIOConfig(
        groupId,
        group.baseSequenceName,
        new KafkaPartitions(ioConfig.getTopic(), startPartitions),
        new KafkaPartitions(ioConfig.getTopic(), endPartitions),
        consumerProperties,
        true,
        minimumMessageTime,
        maximumMessageTime,
        ioConfig.isSkipOffsetGaps()
    );

    final String checkpoints = sortingMapper.writerWithType(new TypeReference<TreeMap<Integer, Map<Integer, Long>>>()
    {
    }).writeValueAsString(taskGroups.get(groupId).sequenceOffsets);

    final Map<String, Object> context = spec.getContext() == null
        ? ImmutableMap.of("checkpoints", checkpoints, IS_INCREMENTAL_HANDOFF_SUPPORTED, true)
        : ImmutableMap.<String, Object>builder()
                      .put("checkpoints", checkpoints)
                      .put(IS_INCREMENTAL_HANDOFF_SUPPORTED, true)
                      .putAll(spec.getContext())
                      .build();

    for (int i = 0; i < replicas; i++) {
      String taskId = Joiner.on("_").join(group.baseSequenceName, RealtimeIndexTask.makeRandomId());
      KafkaIndexTask indexTask = new KafkaIndexTask(
          taskId,
          new TaskResource(group.baseSequenceName, 1),
          spec.getDataSchema(),
          taskTuningConfig,
          kafkaIOConfig,
          context,
          null,
          null,
          rowIngestionMetersFactory
      );

      Optional<TaskQueue> taskQueue = taskMaster.getTaskQueue();
      if (taskQueue.isPresent()) {
        try {
          taskQueue.get().add(indexTask);
        }
        catch (EntryExistsException e) {
          log.error("Tried to add task [%s] but it already exists", indexTask.getId());
        }
      } else {
        log.error("Failed to get task queue because I'm not the leader!");
      }
    }
  }

  private ImmutableMap<Integer, Long> generateStartingOffsetsForPartitionGroup(int groupId)
  {
    ImmutableMap.Builder<Integer, Long> builder = ImmutableMap.builder();
    for (Entry<Integer, Long> entry : partitionGroups.get(groupId).entrySet()) {
      Integer partition = entry.getKey();
      Long offset = entry.getValue();

      if (offset != null && offset != NOT_SET) {
        // if we are given a startingOffset (set by a previous task group which is pending completion) then use it
        builder.put(partition, offset);
      } else {
        // if we don't have a startingOffset (first run or we had some previous failures and reset the offsets) then
        // get the offset from metadata storage (if available) or Kafka (otherwise)
        builder.put(partition, getOffsetFromStorageForPartition(partition));
      }
    }
    return builder.build();
  }

  /**
   * Queries the dataSource metadata table to see if there is a previous ending offset for this partition. If it
   * doesn't find any data, it will retrieve the latest or earliest Kafka offset depending on the useEarliestOffset
   * config.
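   * <p>
   * For example, with {@code useEarliestOffset=false} and no stored offset for a partition, the consumer is seeked to
   * the end of that partition and ingestion starts from the returned position; if the stored offset is ever ahead of
   * the latest Kafka offset (e.g. the topic was deleted and re-created), the method fails fast with an exception that
   * points at the supervisor reset API.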
   */
  private long getOffsetFromStorageForPartition(int partition)
  {
    long offset;
    final Map<Integer, Long> metadataOffsets = getOffsetsFromMetadataStorage();

    if (metadataOffsets.get(partition) != null) {
      offset = metadataOffsets.get(partition);
      log.debug("Getting offset [%,d] from metadata storage for partition [%d]", offset, partition);

      long latestKafkaOffset = getOffsetFromKafkaForPartition(partition, false);
      if (offset > latestKafkaOffset) {
        throw new ISE(
            "Offset in metadata storage [%,d] > latest Kafka offset [%,d] for partition[%d] dataSource[%s]. If these "
            + "messages are no longer available (perhaps you deleted and re-created your Kafka topic) you can use the "
            + "supervisor reset API to restart ingestion.",
            offset,
            latestKafkaOffset,
            partition,
            dataSource
        );
      }
    } else {
      offset = getOffsetFromKafkaForPartition(partition, ioConfig.isUseEarliestOffset());
      log.debug("Getting offset [%,d] from Kafka for partition [%d]", offset, partition);
    }

    return offset;
  }

  private Map<Integer, Long> getOffsetsFromMetadataStorage()
  {
    final DataSourceMetadata dataSourceMetadata = indexerMetadataStorageCoordinator.getDataSourceMetadata(dataSource);
    if (dataSourceMetadata instanceof KafkaDataSourceMetadata) {
      KafkaPartitions partitions = ((KafkaDataSourceMetadata) dataSourceMetadata).getKafkaPartitions();
      if (partitions != null) {
        if (!ioConfig.getTopic().equals(partitions.getTopic())) {
          log.warn(
              "Topic in metadata storage [%s] doesn't match spec topic [%s], ignoring stored offsets",
              partitions.getTopic(),
              ioConfig.getTopic()
          );
          return Collections.emptyMap();
        } else if (partitions.getPartitionOffsetMap() != null) {
          return partitions.getPartitionOffsetMap();
        }
      }
    }

    return Collections.emptyMap();
  }

  private long getOffsetFromKafkaForPartition(int partition, boolean useEarliestOffset)
  {
    synchronized (consumerLock) {
      TopicPartition topicPartition = new TopicPartition(ioConfig.getTopic(), partition);
      if (!consumer.assignment().contains(topicPartition)) {
        consumer.assign(Collections.singletonList(topicPartition));
      }

      if (useEarliestOffset) {
        consumer.seekToBeginning(Collections.singletonList(topicPartition));
      } else {
        consumer.seekToEnd(Collections.singletonList(topicPartition));
      }

      return consumer.position(topicPartition);
    }
  }

  /**
   * Compares the sequence name from the task with one generated for the task's group ID and returns false if they do
   * not match. The sequence name is generated from a hash of the dataSchema, tuningConfig, starting offsets, and the
   * minimumMessageTime or maximumMessageTime if set.
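   * <p>
   * Replicas of the same task group share a base sequence name, so any task whose sequence name does not match the
   * one derived from the group's current state is considered stale and is stopped by the caller
   * (see checkCurrentTaskState).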
   */
  private boolean isTaskCurrent(int taskGroupId, String taskId)
  {
    Optional<Task> taskOptional = taskStorage.getTask(taskId);
    if (!taskOptional.isPresent() || !(taskOptional.get() instanceof KafkaIndexTask)) {
      return false;
    }

    String taskSequenceName = ((KafkaIndexTask) taskOptional.get()).getIOConfig().getBaseSequenceName();
    if (taskGroups.get(taskGroupId) != null) {
      return Preconditions
          .checkNotNull(taskGroups.get(taskGroupId), "null taskGroup for taskId[%s]", taskGroupId)
          .baseSequenceName
          .equals(taskSequenceName);
    } else {
      return generateSequenceName(
          ((KafkaIndexTask) taskOptional.get()).getIOConfig().getStartPartitions().getPartitionOffsetMap(),
          ((KafkaIndexTask) taskOptional.get()).getIOConfig().getMinimumMessageTime(),
          ((KafkaIndexTask) taskOptional.get()).getIOConfig().getMaximumMessageTime()
      ).equals(taskSequenceName);
    }
  }

  private ListenableFuture<?> stopTasksInGroup(@Nullable TaskGroup taskGroup)
  {
    if (taskGroup == null) {
      return Futures.immediateFuture(null);
    }

    final List<ListenableFuture<Void>> futures = Lists.newArrayList();
    for (Entry<String, TaskData> entry : taskGroup.tasks.entrySet()) {
      final String taskId = entry.getKey();
      final TaskData taskData = entry.getValue();
      if (taskData.status == null) {
        killTask(taskId);
      } else if (!taskData.status.isComplete()) {
        futures.add(stopTask(taskId, false));
      }
    }

    return Futures.successfulAsList(futures);
  }

  private ListenableFuture<Void> stopTask(final String id, final boolean publish)
  {
    return Futures.transform(
        taskClient.stopAsync(id, publish),
        new Function<Boolean, Void>()
        {
          @Nullable
          @Override
          public Void apply(@Nullable Boolean result)
          {
            if (result == null || !result) {
              log.info("Task [%s] failed to stop in a timely manner, killing task", id);
              killTask(id);
            }
            return null;
          }
        }
    );
  }

  private void killTask(final String id)
  {
    Optional<TaskQueue> taskQueue = taskMaster.getTaskQueue();
    if (taskQueue.isPresent()) {
      taskQueue.get().shutdown(id);
    } else {
      log.error("Failed to get task queue because I'm not the leader!");
    }
  }

  protected int getTaskGroupIdForPartition(int partition)
  {
    return partition % ioConfig.getTaskCount();
  }

  private boolean isTaskInPendingCompletionGroups(String taskId)
  {
    for (List<TaskGroup> taskGroups : pendingCompletionTaskGroups.values()) {
      for (TaskGroup taskGroup : taskGroups) {
        if (taskGroup.tasks.containsKey(taskId)) {
          return true;
        }
      }
    }
    return false;
  }

  private SupervisorReport<KafkaSupervisorReportPayload> generateReport(boolean includeOffsets)
  {
    int numPartitions = partitionGroups.values().stream().mapToInt(Map::size).sum();
    Map<Integer, Long> partitionLag = getLagPerPartition(getHighestCurrentOffsets());
    final KafkaSupervisorReportPayload payload = new KafkaSupervisorReportPayload(
        dataSource,
        ioConfig.getTopic(),
        numPartitions,
        ioConfig.getReplicas(),
        ioConfig.getTaskDuration().getMillis() / 1000,
        includeOffsets ? latestOffsetsFromKafka : null,
        includeOffsets ? partitionLag : null,
        includeOffsets ? partitionLag.values().stream().mapToLong(x -> Math.max(x, 0)).sum() : null,
        includeOffsets ?
            offsetsLastUpdated : null,
        spec.isSuspended()
    );

    SupervisorReport<KafkaSupervisorReportPayload> report = new SupervisorReport<>(
        dataSource,
        DateTimes.nowUtc(),
        payload
    );

    List<TaskReportData> taskReports = Lists.newArrayList();

    try {
      for (TaskGroup taskGroup : taskGroups.values()) {
        for (Entry<String, TaskData> entry : taskGroup.tasks.entrySet()) {
          String taskId = entry.getKey();
          @Nullable DateTime startTime = entry.getValue().startTime;
          Map<Integer, Long> currentOffsets = entry.getValue().currentOffsets;
          Long remainingSeconds = null;
          if (startTime != null) {
            remainingSeconds = Math.max(
                0,
                ioConfig.getTaskDuration().getMillis() - (System.currentTimeMillis() - startTime.getMillis())
            ) / 1000;
          }

          taskReports.add(
              new TaskReportData(
                  taskId,
                  includeOffsets ? taskGroup.partitionOffsets : null,
                  includeOffsets ? currentOffsets : null,
                  startTime,
                  remainingSeconds,
                  TaskReportData.TaskType.ACTIVE,
                  includeOffsets ? getLagPerPartition(currentOffsets) : null
              )
          );
        }
      }

      for (List<TaskGroup> taskGroups : pendingCompletionTaskGroups.values()) {
        for (TaskGroup taskGroup : taskGroups) {
          for (Entry<String, TaskData> entry : taskGroup.tasks.entrySet()) {
            String taskId = entry.getKey();
            @Nullable DateTime startTime = entry.getValue().startTime;
            Map<Integer, Long> currentOffsets = entry.getValue().currentOffsets;
            Long remainingSeconds = null;
            if (taskGroup.completionTimeout != null) {
              remainingSeconds = Math.max(0, taskGroup.completionTimeout.getMillis() - System.currentTimeMillis()) / 1000;
            }

            taskReports.add(
                new TaskReportData(
                    taskId,
                    includeOffsets ? taskGroup.partitionOffsets : null,
                    includeOffsets ? currentOffsets : null,
                    startTime,
                    remainingSeconds,
                    TaskReportData.TaskType.PUBLISHING,
                    null
                )
            );
          }
        }
      }

      taskReports.forEach(payload::addTask);
    }
    catch (Exception e) {
      log.warn(e, "Failed to generate status report");
    }

    return report;
  }

  private Runnable buildRunTask()
  {
    return () -> notices.add(new RunNotice());
  }

  private void updateLatestOffsetsFromKafka()
  {
    synchronized (consumerLock) {
      final Map<String, List<PartitionInfo>> topics = consumer.listTopics();

      if (topics == null || !topics.containsKey(ioConfig.getTopic())) {
        throw new ISE("Could not retrieve partitions for topic [%s]", ioConfig.getTopic());
      }

      final Set<TopicPartition> topicPartitions = topics.get(ioConfig.getTopic())
                                                        .stream()
                                                        .map(x -> new TopicPartition(x.topic(), x.partition()))
                                                        .collect(Collectors.toSet());

      consumer.assign(topicPartitions);
      consumer.seekToEnd(topicPartitions);

      latestOffsetsFromKafka = topicPartitions.stream()
                                              .collect(Collectors.toMap(TopicPartition::partition, consumer::position));
    }
  }

  private Map<Integer, Long> getHighestCurrentOffsets()
  {
    return taskGroups.values()
                     .stream()
                     .flatMap(taskGroup -> taskGroup.tasks.entrySet().stream())
                     .flatMap(taskData -> taskData.getValue().currentOffsets.entrySet().stream())
                     .collect(Collectors.toMap(Entry::getKey, Entry::getValue, Long::max));
  }

  private Map<Integer, Long> getLagPerPartition(Map<Integer, Long> currentOffsets)
  {
    return currentOffsets.entrySet()
                         .stream()
                         .collect(Collectors.toMap(
                             Entry::getKey,
                             e -> latestOffsetsFromKafka != null
                                  && latestOffsetsFromKafka.get(e.getKey()) != null
                                  && e.getValue() != null ?
                                  latestOffsetsFromKafka.get(e.getKey()) - e.getValue()
                                  : Integer.MIN_VALUE
                         ));
  }

  private Runnable emitLag()
  {
    return () -> {
      try {
        Map<Integer, Long> highestCurrentOffsets = getHighestCurrentOffsets();

        if (latestOffsetsFromKafka == null) {
          throw new ISE("Latest offsets from Kafka have not been fetched");
        }

        if (!latestOffsetsFromKafka.keySet().equals(highestCurrentOffsets.keySet())) {
          log.warn(
              "Lag metric: Kafka partitions %s do not match task partitions %s",
              latestOffsetsFromKafka.keySet(),
              highestCurrentOffsets.keySet()
          );
        }

        long lag = getLagPerPartition(highestCurrentOffsets).values().stream().mapToLong(x -> Math.max(x, 0)).sum();
        emitter.emit(
            ServiceMetricEvent.builder()
                              .setDimension("dataSource", dataSource)
                              .build("ingest/kafka/lag", lag)
        );
      }
      catch (Exception e) {
        log.warn(e, "Unable to compute Kafka lag");
      }
    };
  }

  private void updateCurrentOffsets() throws InterruptedException, ExecutionException, TimeoutException
  {
    final List<ListenableFuture<Void>> futures = Stream
        .concat(
            taskGroups.values().stream().flatMap(taskGroup -> taskGroup.tasks.entrySet().stream()),
            pendingCompletionTaskGroups.values()
                                       .stream()
                                       .flatMap(List::stream)
                                       .flatMap(taskGroup -> taskGroup.tasks.entrySet().stream())
        )
        .map(task -> Futures.transform(
            taskClient.getCurrentOffsetsAsync(task.getKey(), false),
            (Function<Map<Integer, Long>, Void>) (currentOffsets) -> {
              if (currentOffsets != null && !currentOffsets.isEmpty()) {
                task.getValue().currentOffsets = currentOffsets;
              }
              return null;
            }
        ))
        .collect(Collectors.toList());

    Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS);
  }

  @VisibleForTesting
  Runnable updateCurrentAndLatestOffsets()
  {
    return () -> {
      try {
        updateCurrentOffsets();
        updateLatestOffsetsFromKafka();
        offsetsLastUpdated = DateTimes.nowUtc();
      }
      catch (Exception e) {
        log.warn(e, "Exception while getting current/latest offsets");
      }
    };
  }

  /**
   * Collect row ingestion stats from all tasks managed by this supervisor.
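   * Stats are fetched asynchronously via {@code taskClient.getMovingAveragesAsync} for both actively reading tasks and
   * tasks that are publishing (pending completion).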
   *
   * @return A map of groupId->taskId->task row stats
   *
   * @throws InterruptedException
   * @throws ExecutionException
   * @throws TimeoutException
   */
  private Map<String, Map<String, Object>> getCurrentTotalStats()
      throws InterruptedException, ExecutionException, TimeoutException
  {
    Map<String, Map<String, Object>> allStats = Maps.newHashMap();
    final List<ListenableFuture<StatsFromTaskResult>> futures = new ArrayList<>();
    final List<Pair<Integer, String>> groupAndTaskIds = new ArrayList<>();

    for (int groupId : taskGroups.keySet()) {
      TaskGroup group = taskGroups.get(groupId);
      for (String taskId : group.taskIds()) {
        futures.add(Futures.transform(
            taskClient.getMovingAveragesAsync(taskId),
            (Function<Map<String, Object>, StatsFromTaskResult>) (currentStats) -> {
              return new StatsFromTaskResult(groupId, taskId, currentStats);
            }
        ));
        groupAndTaskIds.add(new Pair<>(groupId, taskId));
      }
    }

    for (int groupId : pendingCompletionTaskGroups.keySet()) {
      // look the groups up in pendingCompletionTaskGroups (not taskGroups): a group that has moved entirely to
      // pending completion no longer has an entry in taskGroups
      for (TaskGroup group : pendingCompletionTaskGroups.get(groupId)) {
        for (String taskId : group.taskIds()) {
          futures.add(Futures.transform(
              taskClient.getMovingAveragesAsync(taskId),
              (Function<Map<String, Object>, StatsFromTaskResult>) (currentStats) -> {
                return new StatsFromTaskResult(groupId, taskId, currentStats);
              }
          ));
          groupAndTaskIds.add(new Pair<>(groupId, taskId));
        }
      }
    }

    List<StatsFromTaskResult> results = Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS);
    for (int i = 0; i < results.size(); i++) {
      StatsFromTaskResult result = results.get(i);
      if (result != null) {
        Map<String, Object> groupMap = allStats.computeIfAbsent(result.getGroupId(), k -> Maps.newHashMap());
        groupMap.put(result.getTaskId(), result.getStats());
      } else {
        Pair<Integer, String> groupAndTaskId = groupAndTaskIds.get(i);
        log.error("Failed to get stats for group[%d]-task[%s]", groupAndTaskId.lhs, groupAndTaskId.rhs);
      }
    }

    return allStats;
  }

  @VisibleForTesting
  @Nullable
  TaskGroup removeTaskGroup(int taskGroupId)
  {
    return taskGroups.remove(taskGroupId);
  }

  @VisibleForTesting
  void moveTaskGroupToPendingCompletion(int taskGroupId)
  {
    final TaskGroup taskGroup = taskGroups.remove(taskGroupId);
    if (taskGroup != null) {
      pendingCompletionTaskGroups.computeIfAbsent(taskGroupId, k -> new CopyOnWriteArrayList<>()).add(taskGroup);
    }
  }

  @VisibleForTesting
  int getNoticesQueueSize()
  {
    return notices.size();
  }

  private static class StatsFromTaskResult
  {
    private final String groupId;
    private final String taskId;
    private final Map<String, Object> stats;

    public StatsFromTaskResult(int groupId, String taskId, Map<String, Object> stats)
    {
      this.groupId = String.valueOf(groupId);
      this.taskId = taskId;
      this.stats = stats;
    }

    public String getGroupId()
    {
      return groupId;
    }

    public String getTaskId()
    {
      return taskId;
    }

    public Map<String, Object> getStats()
    {
      return stats;
    }
  }
}
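Two small rules drive most of the bookkeeping above: a partition is assigned to a task group by taking its id modulo the configured taskCount (getTaskGroupIdForPartition), and when several replica tasks report offsets for the same partition the supervisor keeps the highest value seen per partition (getHighestCurrentOffsets, and the end-offset merge in checkpointTaskGroup). The standalone sketch below reproduces just those two rules with plain collections so they can be run in isolation; the class and method names here are illustrative and are not part of Druid.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class SupervisorBookkeepingSketch
{
  // Mirrors getTaskGroupIdForPartition(): partitions are spread across task groups by modulo of taskCount.
  static int taskGroupIdForPartition(int partition, int taskCount)
  {
    return partition % taskCount;
  }

  // Mirrors getHighestCurrentOffsets(): when several replica tasks report offsets for the same partition,
  // keep the highest offset seen per partition.
  static Map<Integer, Long> highestOffsets(List<Map<Integer, Long>> perTaskOffsets)
  {
    return perTaskOffsets.stream()
                         .flatMap(m -> m.entrySet().stream())
                         .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, Long::max));
  }

  public static void main(String[] args)
  {
    // With taskCount = 2, partitions 0..3 map to task groups 0, 1, 0, 1.
    for (int partition = 0; partition < 4; partition++) {
      System.out.println("partition " + partition + " -> taskGroup " + taskGroupIdForPartition(partition, 2));
    }

    // Two replicas reporting slightly different positions; the merged view keeps the maximum per partition.
    Map<Integer, Long> replicaA = new HashMap<>();
    replicaA.put(0, 100L);
    replicaA.put(1, 250L);
    Map<Integer, Long> replicaB = new HashMap<>();
    replicaB.put(0, 120L);
    replicaB.put(1, 240L);
    System.out.println(highestOffsets(Arrays.asList(replicaA, replicaB))); // prints {0=120, 1=250}
  }
}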