/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.druid.indexing.kafka;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.Longs;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.SettableFuture;
import org.apache.druid.data.input.Committer;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.impl.InputRowParser;
import org.apache.druid.discovery.DiscoveryDruidNode;
import org.apache.druid.discovery.DruidNodeDiscoveryProvider;
import org.apache.druid.discovery.LookupNodeService;
import org.apache.druid.indexer.IngestionState;
import org.apache.druid.indexer.TaskStatus;
import org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReport;
import org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData;
import org.apache.druid.indexing.common.TaskRealtimeMetricsMonitorBuilder;
import org.apache.druid.indexing.common.TaskReport;
import org.apache.druid.indexing.common.TaskToolbox;
import org.apache.druid.indexing.common.actions.CheckPointDataSourceMetadataAction;
import org.apache.druid.indexing.common.actions.ResetDataSourceMetadataAction;
import org.apache.druid.indexing.common.actions.SegmentTransactionalInsertAction;
import org.apache.druid.indexing.common.stats.RowIngestionMeters;
import org.apache.druid.indexing.common.stats.RowIngestionMetersFactory;
import org.apache.druid.indexing.common.task.IndexTaskUtils;
import org.apache.druid.indexing.common.task.RealtimeIndexTask;
import org.apache.druid.indexing.kafka.KafkaIndexTask.Status;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.java.util.common.collect.Utils;
import org.apache.druid.java.util.common.parsers.ParseException;
import org.apache.druid.java.util.emitter.EmittingLogger;
import org.apache.druid.segment.indexing.RealtimeIOConfig;
import org.apache.druid.segment.realtime.FireDepartment;
import org.apache.druid.segment.realtime.FireDepartmentMetrics;
import org.apache.druid.segment.realtime.appenderator.Appenderator;
import org.apache.druid.segment.realtime.appenderator.AppenderatorDriverAddResult;
import org.apache.druid.segment.realtime.appenderator.SegmentsAndMetadata;
import org.apache.druid.segment.realtime.appenderator.StreamAppenderatorDriver;
import org.apache.druid.segment.realtime.appenderator.TransactionalSegmentPublisher;
import org.apache.druid.segment.realtime.firehose.ChatHandlerProvider;
import org.apache.druid.server.security.Access;
import org.apache.druid.server.security.Action;
import org.apache.druid.server.security.AuthorizerMapper;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.utils.CircularBuffer;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetOutOfRangeException;
import org.apache.kafka.common.TopicPartition;
import org.joda.time.DateTime;

import javax.annotation.Nullable;
import javax.servlet.http.HttpServletRequest;
import javax.ws.rs.Consumes;
import javax.ws.rs.DefaultValue;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.QueryParam;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

/**
 * Kafka indexing task runner supporting incremental segment publishing.
 */
public class IncrementalPublishingKafkaIndexTaskRunner implements KafkaIndexTaskRunner
{
  private static final EmittingLogger log = new EmittingLogger(IncrementalPublishingKafkaIndexTaskRunner.class);
  private static final String METADATA_NEXT_PARTITIONS = "nextPartitions";
  private static final String METADATA_PUBLISH_PARTITIONS = "publishPartitions";

  private final Map<Integer, Long> endOffsets;
  private final Map<Integer, Long> nextOffsets = new ConcurrentHashMap<>();
  private final Map<Integer, Long> lastPersistedOffsets = new ConcurrentHashMap<>();

  // The pause lock and associated conditions are to support coordination between the Jetty threads and the main
  // ingestion loop. The goal is to provide callers of the API a guarantee that if pause() returns successfully
  // the ingestion loop has been stopped at the returned offsets and will not ingest any more data until resumed. The
  // fields are used as follows (every step requires acquiring [pauseLock]):
  //
  // Pausing:
  // - In pause(), [pauseRequested] is set to true and then execution waits for [status] to change to PAUSED, with
  //   the condition checked when [hasPaused] is signalled.
  // - In possiblyPause() called from the main loop, if [pauseRequested] is true, [status] is set to PAUSED,
  //   [hasPaused] is signalled, and execution pauses until [pauseRequested] becomes false, either by being set or by
  //   the [pauseMillis] timeout elapsing. [pauseRequested] is checked when [shouldResume] is signalled.
  //
  // Resuming:
  // - In resume(), [pauseRequested] is set to false, [shouldResume] is signalled, and execution waits for [status]
  //   to change to something other than PAUSED, with the condition checked when [shouldResume] is signalled.
  // - In possiblyPause(), when [shouldResume] is signalled, if [pauseRequested] has become false the pause loop
  //   ends, [status] is changed to STARTING and [shouldResume] is signalled.
  private final Lock pauseLock = new ReentrantLock();
  private final Condition hasPaused = pauseLock.newCondition();
  private final Condition shouldResume = pauseLock.newCondition();

  private final AtomicBoolean stopRequested = new AtomicBoolean(false);
  private final AtomicBoolean publishOnStop = new AtomicBoolean(false);
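
  // A condensed view of the pause handshake described above, abridged from pause() and possiblyPause() further
  // down in this class (timeouts and pausable-state checks omitted; see those methods for the full logic):
  //
  //   // Jetty thread, in pause():
  //   pauseLock.lockInterruptibly();
  //   try {
  //     pauseRequested = true;
  //     while (!isPaused()) {
  //       hasPaused.await();        // possiblyPause() signals this once the main loop has parked
  //     }
  //   } finally {
  //     pauseLock.unlock();
  //   }
  //
  //   // Main loop, in possiblyPause():
  //   pauseLock.lockInterruptibly();
  //   try {
  //     if (pauseRequested) {
  //       status = Status.PAUSED;
  //       hasPaused.signalAll();    // wake pause() callers
  //       while (pauseRequested) {
  //         shouldResume.await();   // resume() clears the flag and signals shouldResume
  //       }
  //       status = Status.READING;
  //       shouldResume.signalAll(); // wake resume() callers waiting for the state change
  //     }
  //   } finally {
  //     pauseLock.unlock();
  //   }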

  // [statusLock] is used to synchronize the Jetty thread calling stopGracefully() with the main run thread. It
  // prevents the main run thread from switching into a publishing state while the stopGracefully() thread thinks
  // it's still in a pre-publishing state. This is important because stopGracefully() will try to use the
  // [stopRequested] flag to stop the main thread where possible, but this flag is not honored once publishing has
  // begun so in this case we must interrupt the thread. The lock ensures that if the run thread is about to
  // transition into publishing state, it blocks until after stopGracefully() has set [stopRequested] and then does
  // a final check on [stopRequested] before transitioning to publishing state.
  private final Object statusLock = new Object();

  private final Lock pollRetryLock = new ReentrantLock();
  private final Condition isAwaitingRetry = pollRetryLock.newCondition();

  private final KafkaIndexTask task;
  private final KafkaIOConfig ioConfig;
  private final KafkaTuningConfig tuningConfig;
  private final InputRowParser<ByteBuffer> parser;
  private final AuthorizerMapper authorizerMapper;
  private final Optional<ChatHandlerProvider> chatHandlerProvider;
  private final CircularBuffer<Throwable> savedParseExceptions;
  private final String topic;
  private final RowIngestionMeters rowIngestionMeters;

  private final Set<String> publishingSequences = Sets.newConcurrentHashSet();
  private final List<ListenableFuture<SegmentsAndMetadata>> publishWaitList = new ArrayList<>();
  private final List<ListenableFuture<SegmentsAndMetadata>> handOffWaitList = new ArrayList<>();

  private volatile DateTime startTime;
  private volatile Status status = Status.NOT_STARTED; // this is only ever set by the task runner thread (runThread)
  private volatile TaskToolbox toolbox;
  private volatile Thread runThread;
  private volatile Appenderator appenderator;
  private volatile StreamAppenderatorDriver driver;
  private volatile FireDepartmentMetrics fireDepartmentMetrics;
  private volatile IngestionState ingestionState;

  private volatile boolean pauseRequested = false;
  private volatile long nextCheckpointTime;

  private volatile CopyOnWriteArrayList<SequenceMetadata> sequences;
  private volatile Throwable backgroundThreadException;

  public IncrementalPublishingKafkaIndexTaskRunner(
      KafkaIndexTask task,
      InputRowParser<ByteBuffer> parser,
      AuthorizerMapper authorizerMapper,
      Optional<ChatHandlerProvider> chatHandlerProvider,
      CircularBuffer<Throwable> savedParseExceptions,
      RowIngestionMetersFactory rowIngestionMetersFactory
  )
  {
    this.task = task;
    this.ioConfig = task.getIOConfig();
    this.tuningConfig = task.getTuningConfig();
    this.parser = parser;
    this.authorizerMapper = authorizerMapper;
    this.chatHandlerProvider = chatHandlerProvider;
    this.savedParseExceptions = savedParseExceptions;
    this.topic = ioConfig.getStartPartitions().getTopic();
    this.rowIngestionMeters = rowIngestionMetersFactory.createRowIngestionMeters();
    this.endOffsets = new ConcurrentHashMap<>(ioConfig.getEndPartitions().getPartitionOffsetMap());
    this.sequences = new CopyOnWriteArrayList<>();
    this.ingestionState = IngestionState.NOT_STARTED;

    resetNextCheckpointTime();
  }

  @Override
  public TaskStatus run(TaskToolbox toolbox)
  {
    try {
      return runInternal(toolbox);
    }
    catch (Exception e) {
      log.error(e, "Encountered exception while running task.");
      final String errorMsg = Throwables.getStackTraceAsString(e);
      toolbox.getTaskReportFileWriter().write(getTaskCompletionReports(errorMsg));
      return TaskStatus.failure(task.getId(), errorMsg);
    }
  }
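
  /**
   * Main lifecycle of the task, summarized from the method body below: restore persisted sequences (or build the
   * initial {@link SequenceMetadata} from the task context), announce this node, consume records until every
   * assigned partition reaches its end offset or a stop/checkpoint is requested, then publish the remaining
   * sequences and wait for the publish and handoff futures before reporting success.
   */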
  private TaskStatus runInternal(TaskToolbox toolbox) throws Exception
  {
    log.info("Starting up!");

    startTime = DateTimes.nowUtc();
    status = Status.STARTING;
    this.toolbox = toolbox;

    if (!restoreSequences()) {
      final TreeMap<Integer, Map<Integer, Long>> checkpoints = getCheckPointsFromContext(toolbox, task);
      if (checkpoints != null) {
        Iterator<Entry<Integer, Map<Integer, Long>>> sequenceOffsets = checkpoints.entrySet().iterator();
        Map.Entry<Integer, Map<Integer, Long>> previous = sequenceOffsets.next();
        while (sequenceOffsets.hasNext()) {
          Map.Entry<Integer, Map<Integer, Long>> current = sequenceOffsets.next();
          sequences.add(new SequenceMetadata(
              previous.getKey(),
              StringUtils.format("%s_%s", ioConfig.getBaseSequenceName(), previous.getKey()),
              previous.getValue(),
              current.getValue(),
              true
          ));
          previous = current;
        }
        sequences.add(new SequenceMetadata(
            previous.getKey(),
            StringUtils.format("%s_%s", ioConfig.getBaseSequenceName(), previous.getKey()),
            previous.getValue(),
            endOffsets,
            false
        ));
      } else {
        sequences.add(new SequenceMetadata(
            0,
            StringUtils.format("%s_%s", ioConfig.getBaseSequenceName(), 0),
            ioConfig.getStartPartitions().getPartitionOffsetMap(),
            endOffsets,
            false
        ));
      }
    }

    log.info("Starting with sequences: %s", sequences);

    if (chatHandlerProvider.isPresent()) {
      log.info("Found chat handler of class[%s]", chatHandlerProvider.get().getClass().getName());
      chatHandlerProvider.get().register(task.getId(), this, false);
    } else {
      log.warn("No chat handler detected");
    }

    runThread = Thread.currentThread();

    // Set up FireDepartmentMetrics
    final FireDepartment fireDepartmentForMetrics = new FireDepartment(
        task.getDataSchema(),
        new RealtimeIOConfig(null, null, null),
        null
    );
    fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
    toolbox.getMonitorScheduler().addMonitor(
        TaskRealtimeMetricsMonitorBuilder.build(task, fireDepartmentForMetrics, rowIngestionMeters)
    );

    final String lookupTier = task.getContextValue(RealtimeIndexTask.CTX_KEY_LOOKUP_TIER);
    LookupNodeService lookupNodeService = lookupTier == null
        ? toolbox.getLookupNodeService()
        : new LookupNodeService(lookupTier);
    DiscoveryDruidNode discoveryDruidNode = new DiscoveryDruidNode(
        toolbox.getDruidNode(),
        DruidNodeDiscoveryProvider.NODE_TYPE_PEON,
        ImmutableMap.of(
            toolbox.getDataNodeService().getName(), toolbox.getDataNodeService(),
            lookupNodeService.getName(), lookupNodeService
        )
    );

    Throwable caughtExceptionOuter = null;
    try (final KafkaConsumer<byte[], byte[]> consumer = task.newConsumer()) {
      toolbox.getDataSegmentServerAnnouncer().announce();
      toolbox.getDruidNodeAnnouncer().announce(discoveryDruidNode);

      appenderator = task.newAppenderator(fireDepartmentMetrics, toolbox);
      driver = task.newDriver(appenderator, toolbox, fireDepartmentMetrics);

      final String topic = ioConfig.getStartPartitions().getTopic();

      // Start up, set up initial offsets.
      final Object restoredMetadata = driver.startJob();
      if (restoredMetadata == null) {
        // no persist has happened so far
        // so either this is a brand new task or replacement of a failed task
        Preconditions.checkState(
            sequences.get(0).startOffsets.entrySet().stream().allMatch(
                partitionOffsetEntry -> Longs.compare(
                    partitionOffsetEntry.getValue(),
                    ioConfig.getStartPartitions().getPartitionOffsetMap().get(partitionOffsetEntry.getKey())
                ) >= 0
            ),
            "Sequence offsets are not compatible with start offsets of task"
        );
        nextOffsets.putAll(sequences.get(0).startOffsets);
      } else {
        final Map<String, Object> restoredMetadataMap = (Map) restoredMetadata;
        final KafkaPartitions restoredNextPartitions = toolbox.getObjectMapper().convertValue(
            restoredMetadataMap.get(METADATA_NEXT_PARTITIONS),
            KafkaPartitions.class
        );
        nextOffsets.putAll(restoredNextPartitions.getPartitionOffsetMap());

        // Sanity checks.
        if (!restoredNextPartitions.getTopic().equals(ioConfig.getStartPartitions().getTopic())) {
          throw new ISE(
              "WTF?! Restored topic[%s] but expected topic[%s]",
              restoredNextPartitions.getTopic(),
              ioConfig.getStartPartitions().getTopic()
          );
        }

        if (!nextOffsets.keySet().equals(ioConfig.getStartPartitions().getPartitionOffsetMap().keySet())) {
          throw new ISE(
              "WTF?! Restored partitions[%s] but expected partitions[%s]",
              nextOffsets.keySet(),
              ioConfig.getStartPartitions().getPartitionOffsetMap().keySet()
          );
        }

        // sequences size can be 0 only when all sequences got published and task stopped before it could finish
        // which is super rare
        if (sequences.size() == 0 || sequences.get(sequences.size() - 1).isCheckpointed()) {
          this.endOffsets.putAll(
              sequences.size() == 0 ? nextOffsets : sequences.get(sequences.size() - 1).getEndOffsets()
          );
          log.info("End offsets changed to [%s]", endOffsets);
        }
      }

      // Set up committer.
      final Supplier<Committer> committerSupplier = () -> {
        final Map<Integer, Long> snapshot = ImmutableMap.copyOf(nextOffsets);
        lastPersistedOffsets.clear();
        lastPersistedOffsets.putAll(snapshot);

        return new Committer()
        {
          @Override
          public Object getMetadata()
          {
            return ImmutableMap.of(
                METADATA_NEXT_PARTITIONS,
                new KafkaPartitions(ioConfig.getStartPartitions().getTopic(), snapshot)
            );
          }

          @Override
          public void run()
          {
            // Do nothing.
          }
        };
      };
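
      // Note on the committer above: Committer.run() is deliberately a no-op. Offsets are never committed back to
      // Kafka; instead the snapshot captured in getMetadata() is persisted along with the segments, which is why
      // driver.startJob() above is able to restore nextOffsets from METADATA_NEXT_PARTITIONS after a task restart.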

      // restart publishing of sequences (if any)
      maybePersistAndPublishSequences(committerSupplier);

      Set<Integer> assignment = assignPartitionsAndSeekToNext(consumer, topic);

      ingestionState = IngestionState.BUILD_SEGMENTS;

      // Main loop.
      // Could eventually support leader/follower mode (for keeping replicas more in sync)
      boolean stillReading = !assignment.isEmpty();
      status = Status.READING;
      Throwable caughtExceptionInner = null;
      try {
        while (stillReading) {
          if (possiblyPause()) {
            // The partition assignments may have changed while paused by a call to setEndOffsets() so reassign
            // partitions upon resuming. This is safe even if the end offsets have not been modified.
            assignment = assignPartitionsAndSeekToNext(consumer, topic);

            if (assignment.isEmpty()) {
              log.info("All partitions have been fully read");
              publishOnStop.set(true);
              stopRequested.set(true);
            }
          }

          // if stop is requested, or the task's end offsets were set by a call to setEndOffsets with finish=true
          if (stopRequested.get() || sequences.get(sequences.size() - 1).isCheckpointed()) {
            status = Status.PUBLISHING;
            break;
          }

          if (backgroundThreadException != null) {
            throw new RuntimeException(backgroundThreadException);
          }

          checkPublishAndHandoffFailure();

          maybePersistAndPublishSequences(committerSupplier);

          // The retrying business is because the KafkaConsumer throws OffsetOutOfRangeException if the seeked-to
          // offset is not present in the topic-partition. This can happen if we're asking a task to read from data
          // that has not been written yet (which is totally legitimate). So let's wait for it to show up.
          ConsumerRecords<byte[], byte[]> records = ConsumerRecords.empty();
          try {
            records = consumer.poll(KafkaIndexTask.POLL_TIMEOUT_MILLIS);
          }
          catch (OffsetOutOfRangeException e) {
            log.warn("OffsetOutOfRangeException with message [%s]", e.getMessage());
            possiblyResetOffsetsOrWait(e.offsetOutOfRangePartitions(), consumer, toolbox);
            stillReading = !assignment.isEmpty();
          }

          SequenceMetadata sequenceToCheckpoint = null;
          for (ConsumerRecord<byte[], byte[]> record : records) {
            log.trace(
                "Got topic[%s] partition[%d] offset[%,d].",
                record.topic(),
                record.partition(),
                record.offset()
            );

            if (record.offset() < endOffsets.get(record.partition())) {
              if (record.offset() != nextOffsets.get(record.partition())) {
                if (ioConfig.isSkipOffsetGaps()) {
                  log.warn(
                      "Skipped to offset[%,d] after offset[%,d] in partition[%d].",
                      record.offset(),
                      nextOffsets.get(record.partition()),
                      record.partition()
                  );
                } else {
                  throw new ISE(
                      "WTF?! Got offset[%,d] after offset[%,d] in partition[%d].",
                      record.offset(),
                      nextOffsets.get(record.partition()),
                      record.partition()
                  );
                }
              }

              try {
                final byte[] valueBytes = record.value();
                final List<InputRow> rows = valueBytes == null
                    ? Utils.nullableListOf((InputRow) null)
                    : parser.parseBatch(ByteBuffer.wrap(valueBytes));
                boolean isPersistRequired = false;
                final SequenceMetadata sequenceToUse = sequences
                    .stream()
                    .filter(sequenceMetadata -> sequenceMetadata.canHandle(record))
                    .findFirst()
                    .orElse(null);

                if (sequenceToUse == null) {
                  throw new ISE(
                      "WTH?! cannot find any valid sequence for record with partition [%d] and offset [%d]. Current sequences: %s",
                      record.partition(),
                      record.offset(),
                      sequences
                  );
                }

                for (InputRow row : rows) {
                  if (row != null && task.withinMinMaxRecordTime(row)) {
                    final AppenderatorDriverAddResult addResult = driver.add(
                        row,
                        sequenceToUse.getSequenceName(),
                        committerSupplier,
                        // skip segment lineage check as there will always be one segment
                        // for combination of sequence and segment granularity.
                        // It is necessary to skip it as the task puts messages polled from all the
                        // assigned Kafka partitions into a single Druid segment, thus ordering of
                        // messages among replica tasks across assigned partitions is not guaranteed
                        // which may cause replica tasks to ask for segments with different interval
                        // in different order which might cause SegmentAllocateAction to fail.
                        true,
                        // do not allow incremental persists to happen until all the rows from this batch
                        // of rows are indexed
                        false
                    );

                    if (addResult.isOk()) {
                      // If the number of rows in the segment exceeds the threshold after adding a row,
                      // move the segment out from the active segments of BaseAppenderatorDriver to make a new segment.
                      if (addResult.isPushRequired(tuningConfig) && !sequenceToUse.isCheckpointed()) {
                        sequenceToCheckpoint = sequenceToUse;
                      }
                      isPersistRequired |= addResult.isPersistRequired();
                    } else {
                      // Failure to allocate segment puts determinism at risk, bail out to be safe.
                      // May want configurable behavior here at some point.
                      // If we allow continuing, then consider blacklisting the interval for a while to avoid
                      // constant checks.
                      throw new ISE("Could not allocate segment for row with timestamp[%s]", row.getTimestamp());
                    }

                    if (addResult.getParseException() != null) {
                      handleParseException(addResult.getParseException(), record);
                    } else {
                      rowIngestionMeters.incrementProcessed();
                    }
                  } else {
                    rowIngestionMeters.incrementThrownAway();
                  }
                }

                if (isPersistRequired) {
                  Futures.addCallback(
                      driver.persistAsync(committerSupplier.get()),
                      new FutureCallback<Object>()
                      {
                        @Override
                        public void onSuccess(@Nullable Object result)
                        {
                          log.info("Persist completed with metadata [%s]", result);
                        }

                        @Override
                        public void onFailure(Throwable t)
                        {
                          log.error("Persist failed, dying");
                          backgroundThreadException = t;
                        }
                      }
                  );
                }
              }
              catch (ParseException e) {
                handleParseException(e, record);
              }

              nextOffsets.put(record.partition(), record.offset() + 1);
            }

            if (nextOffsets.get(record.partition()).equals(endOffsets.get(record.partition()))
                && assignment.remove(record.partition())) {
              log.info("Finished reading topic[%s], partition[%,d].", record.topic(), record.partition());
              KafkaIndexTask.assignPartitions(consumer, topic, assignment);
              stillReading = !assignment.isEmpty();
            }
          }

          if (System.currentTimeMillis() > nextCheckpointTime) {
            sequenceToCheckpoint = sequences.get(sequences.size() - 1);
          }

          if (sequenceToCheckpoint != null && stillReading) {
            Preconditions.checkArgument(
                sequences.get(sequences.size() - 1).getSequenceName().equals(sequenceToCheckpoint.getSequenceName()),
                "Cannot checkpoint a sequence [%s] which is not the latest one, sequences %s",
                sequenceToCheckpoint,
                sequences
            );
            requestPause();
            final CheckPointDataSourceMetadataAction checkpointAction = new CheckPointDataSourceMetadataAction(
                task.getDataSource(),
                ioConfig.getTaskGroupId(),
                task.getIOConfig().getBaseSequenceName(),
                new KafkaDataSourceMetadata(new KafkaPartitions(topic, sequenceToCheckpoint.getStartOffsets())),
                new KafkaDataSourceMetadata(new KafkaPartitions(topic, nextOffsets))
            );
            if (!toolbox.getTaskActionClient().submit(checkpointAction)) {
              throw new ISE("Checkpoint request with offsets [%s] failed, dying", nextOffsets);
            }
          }
        }
        ingestionState = IngestionState.COMPLETED;
      }
      catch (Exception e) {
        // (1) catch all exceptions while reading from kafka
        caughtExceptionInner = e;
        log.error(e, "Encountered exception in run() before persisting.");
        throw e;
      }
      finally {
        log.info("Persisting all pending data");
        try {
          driver.persist(committerSupplier.get()); // persist pending data
        }
        catch (Exception e) {
          if (caughtExceptionInner != null) {
            caughtExceptionInner.addSuppressed(e);
          } else {
            throw e;
          }
        }
      }

      synchronized (statusLock) {
        if (stopRequested.get() && !publishOnStop.get()) {
          throw new InterruptedException("Stopping without publishing");
        }

        status = Status.PUBLISHING;
      }

      for (SequenceMetadata sequenceMetadata : sequences) {
        if (!publishingSequences.contains(sequenceMetadata.getSequenceName())) {
          // this is done to prevent checks in sequence specific commit supplier from failing
          sequenceMetadata.setEndOffsets(nextOffsets);
          sequenceMetadata.updateAssignments(nextOffsets);
          publishingSequences.add(sequenceMetadata.getSequenceName());
          // persist already done in finally, so directly add to publish queue
          publishAndRegisterHandoff(sequenceMetadata);
        }
      }

      if (backgroundThreadException != null) {
        throw new RuntimeException(backgroundThreadException);
      }

      // Wait for publish futures to complete.
      Futures.allAsList(publishWaitList).get();

      // Wait for handoff futures to complete.
      // Note that every publishing task (created by calling AppenderatorDriver.publish()) has a corresponding
      // handoffFuture. handoffFuture can throw an exception if 1) the corresponding publishFuture failed or 2) it
      // failed to persist sequences. It might also return null if handoff failed, but was recoverable.
      // See publishAndRegisterHandoff() for details.
      List<SegmentsAndMetadata> handedOffList = Collections.emptyList();
      if (tuningConfig.getHandoffConditionTimeout() == 0) {
        handedOffList = Futures.allAsList(handOffWaitList).get();
      } else {
        try {
          handedOffList = Futures.allAsList(handOffWaitList)
                                 .get(tuningConfig.getHandoffConditionTimeout(), TimeUnit.MILLISECONDS);
        }
        catch (TimeoutException e) {
          // Handoff timeout is not an indexing failure, but a coordination failure. We simply ignore the timeout
          // exception here.
          log.makeAlert("Timed out after [%d] millis waiting for handoffs", tuningConfig.getHandoffConditionTimeout())
             .addData("TaskId", task.getId())
             .emit();
        }
      }

      for (SegmentsAndMetadata handedOff : handedOffList) {
        log.info(
            "Handoff completed for segments[%s] with metadata[%s].",
            Joiner.on(", ").join(
                handedOff.getSegments().stream().map(DataSegment::getIdentifier).collect(Collectors.toList())
            ),
            Preconditions.checkNotNull(handedOff.getCommitMetadata(), "commitMetadata")
        );
      }

      appenderator.close();
    }
    catch (InterruptedException | RejectedExecutionException e) {
      // (2) catch InterruptedException and RejectedExecutionException thrown for the whole ingestion steps including
      // the final publishing.
      caughtExceptionOuter = e;
      try {
        Futures.allAsList(publishWaitList).cancel(true);
        Futures.allAsList(handOffWaitList).cancel(true);
        if (appenderator != null) {
          appenderator.closeNow();
        }
      }
      catch (Exception e2) {
        e.addSuppressed(e2);
      }

      // handle the InterruptedException that gets wrapped in a RejectedExecutionException
      if (e instanceof RejectedExecutionException
          && (e.getCause() == null || !(e.getCause() instanceof InterruptedException))) {
        throw e;
      }

      // if we were interrupted because we were asked to stop, handle the exception and return success, else rethrow
      if (!stopRequested.get()) {
        Thread.currentThread().interrupt();
        throw e;
      }

      log.info("The task was asked to stop before completing");
    }
    catch (Exception e) {
      // (3) catch all other exceptions thrown for the whole ingestion steps including the final publishing.
      caughtExceptionOuter = e;
      try {
        Futures.allAsList(publishWaitList).cancel(true);
        Futures.allAsList(handOffWaitList).cancel(true);
        if (appenderator != null) {
          appenderator.closeNow();
        }
      }
      catch (Exception e2) {
        e.addSuppressed(e2);
      }
      throw e;
    }
    finally {
      try {
        if (driver != null) {
          driver.close();
        }
        if (chatHandlerProvider.isPresent()) {
          chatHandlerProvider.get().unregister(task.getId());
        }

        toolbox.getDruidNodeAnnouncer().unannounce(discoveryDruidNode);
        toolbox.getDataSegmentServerAnnouncer().unannounce();
      }
      catch (Exception e) {
        if (caughtExceptionOuter != null) {
          caughtExceptionOuter.addSuppressed(e);
        } else {
          throw e;
        }
      }
    }

    toolbox.getTaskReportFileWriter().write(getTaskCompletionReports(null));
    return TaskStatus.success(task.getId());
  }

  private void checkPublishAndHandoffFailure() throws ExecutionException, InterruptedException
  {
    // Check if any publishFuture failed.
    final List<ListenableFuture<SegmentsAndMetadata>> publishFinished = publishWaitList
        .stream()
        .filter(Future::isDone)
        .collect(Collectors.toList());

    for (ListenableFuture<SegmentsAndMetadata> publishFuture : publishFinished) {
      // If a publishFuture failed, the get() call below will throw an exception, which is caught by (1) and then
      // by (2) or (3).
      publishFuture.get();
    }

    publishWaitList.removeAll(publishFinished);

    // Check if any handoffFuture failed.
    final List<ListenableFuture<SegmentsAndMetadata>> handoffFinished = handOffWaitList
        .stream()
        .filter(Future::isDone)
        .collect(Collectors.toList());

    for (ListenableFuture<SegmentsAndMetadata> handoffFuture : handoffFinished) {
      // If a handoffFuture failed, the get() call below will throw an exception, which is caught by (1) and then
      // by (2) or (3).
      handoffFuture.get();
    }

    handOffWaitList.removeAll(handoffFinished);
  }

  private void publishAndRegisterHandoff(SequenceMetadata sequenceMetadata)
  {
    log.info("Publishing segments for sequence [%s]", sequenceMetadata);

    final ListenableFuture<SegmentsAndMetadata> publishFuture = Futures.transform(
        driver.publish(
            sequenceMetadata.createPublisher(toolbox, ioConfig.isUseTransaction()),
            sequenceMetadata.getCommitterSupplier(topic, lastPersistedOffsets).get(),
            Collections.singletonList(sequenceMetadata.getSequenceName())
        ),
        (Function<SegmentsAndMetadata, SegmentsAndMetadata>) publishedSegmentsAndMetadata -> {
          if (publishedSegmentsAndMetadata == null) {
            throw new ISE("Transaction failure publishing segments for sequence [%s]", sequenceMetadata);
          } else {
            return publishedSegmentsAndMetadata;
          }
        }
    );
    publishWaitList.add(publishFuture);

    // Create a handoffFuture for every publishFuture. The created handoffFuture must fail if publishFuture fails.
    final SettableFuture<SegmentsAndMetadata> handoffFuture = SettableFuture.create();
    handOffWaitList.add(handoffFuture);

    Futures.addCallback(
        publishFuture,
        new FutureCallback<SegmentsAndMetadata>()
        {
          @Override
          public void onSuccess(SegmentsAndMetadata publishedSegmentsAndMetadata)
          {
            log.info(
                "Published segments[%s] with metadata[%s].",
                publishedSegmentsAndMetadata.getSegments()
                                            .stream()
                                            .map(DataSegment::getIdentifier)
                                            .collect(Collectors.toList()),
                Preconditions.checkNotNull(publishedSegmentsAndMetadata.getCommitMetadata(), "commitMetadata")
            );

            sequences.remove(sequenceMetadata);
            publishingSequences.remove(sequenceMetadata.getSequenceName());
            try {
              persistSequences();
            }
            catch (IOException e) {
              log.error(e, "Unable to persist state, dying");
              handoffFuture.setException(e);
              throw new RuntimeException(e);
            }

            Futures.transform(
                driver.registerHandoff(publishedSegmentsAndMetadata),
                new Function<SegmentsAndMetadata, Void>()
                {
                  @Nullable
                  @Override
                  public Void apply(@Nullable SegmentsAndMetadata handoffSegmentsAndMetadata)
                  {
                    if (handoffSegmentsAndMetadata == null) {
                      log.warn(
                          "Failed to handoff segments[%s]",
                          publishedSegmentsAndMetadata.getSegments()
                                                      .stream()
                                                      .map(DataSegment::getIdentifier)
                                                      .collect(Collectors.toList())
                      );
                    }
                    handoffFuture.set(handoffSegmentsAndMetadata);
                    return null;
                  }
                }
            );
          }

          @Override
          public void onFailure(Throwable t)
          {
            log.error(t, "Error while publishing segments for sequence[%s]", sequenceMetadata);
            handoffFuture.setException(t);
          }
        }
    );
  }

  private static File getSequencesPersistFile(TaskToolbox toolbox)
  {
    return new File(toolbox.getPersistDir(), "sequences.json");
  }

  private boolean restoreSequences() throws IOException
  {
    final File sequencesPersistFile = getSequencesPersistFile(toolbox);
    if (sequencesPersistFile.exists()) {
      sequences = new CopyOnWriteArrayList<>(
          toolbox.getObjectMapper().<List<SequenceMetadata>>readValue(
              sequencesPersistFile,
              new TypeReference<List<SequenceMetadata>>()
              {
              }
          )
      );
      return true;
    } else {
      return false;
    }
  }

  private synchronized void persistSequences() throws IOException
  {
    log.info("Persisting Sequences Metadata [%s]", sequences);
    toolbox.getObjectMapper().writerWithType(
        new TypeReference<List<SequenceMetadata>>()
        {
        }
    ).writeValue(getSequencesPersistFile(toolbox), sequences);
  }
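
  // Illustrative shape of sequences.json, per the @JsonProperty annotations on SequenceMetadata below (all values
  // here are hypothetical):
  //
  //   [{"sequenceId": 0,
  //     "sequenceName": "index_kafka_mytopic_0",
  //     "startOffsets": {"0": 100, "1": 200},
  //     "endOffsets": {"0": 150, "1": 260},
  //     "checkpointed": true,
  //     "sentinel": false}]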
  private Map<String, TaskReport> getTaskCompletionReports(@Nullable String errorMsg)
  {
    return TaskReport.buildTaskReports(
        new IngestionStatsAndErrorsTaskReport(
            task.getId(),
            new IngestionStatsAndErrorsTaskReportData(
                ingestionState,
                getTaskCompletionUnparseableEvents(),
                getTaskCompletionRowStats(),
                errorMsg
            )
        )
    );
  }

  private Map<String, Object> getTaskCompletionUnparseableEvents()
  {
    Map<String, Object> unparseableEventsMap = Maps.newHashMap();
    List<String> buildSegmentsParseExceptionMessages =
        IndexTaskUtils.getMessagesFromSavedParseExceptions(savedParseExceptions);
    if (buildSegmentsParseExceptionMessages != null) {
      unparseableEventsMap.put(RowIngestionMeters.BUILD_SEGMENTS, buildSegmentsParseExceptionMessages);
    }
    return unparseableEventsMap;
  }

  private Map<String, Object> getTaskCompletionRowStats()
  {
    Map<String, Object> metrics = Maps.newHashMap();
    metrics.put(RowIngestionMeters.BUILD_SEGMENTS, rowIngestionMeters.getTotals());
    return metrics;
  }

  private void maybePersistAndPublishSequences(Supplier<Committer> committerSupplier) throws InterruptedException
  {
    for (SequenceMetadata sequenceMetadata : sequences) {
      sequenceMetadata.updateAssignments(nextOffsets);
      if (!sequenceMetadata.isOpen() && !publishingSequences.contains(sequenceMetadata.getSequenceName())) {
        publishingSequences.add(sequenceMetadata.getSequenceName());
        try {
          Object result = driver.persist(committerSupplier.get());
          log.info(
              "Persist completed with results: [%s], adding sequence [%s] to publish queue",
              result,
              sequenceMetadata
          );
          publishAndRegisterHandoff(sequenceMetadata);
        }
        catch (InterruptedException e) {
          log.warn("Interrupted while persisting sequence [%s]", sequenceMetadata);
          throw e;
        }
      }
    }
  }

  private Set<Integer> assignPartitionsAndSeekToNext(KafkaConsumer consumer, String topic)
  {
    // Initialize consumer assignment.
    final Set<Integer> assignment = Sets.newHashSet();
    for (Map.Entry<Integer, Long> entry : nextOffsets.entrySet()) {
      final long endOffset = endOffsets.get(entry.getKey());
      if (entry.getValue() < endOffset) {
        assignment.add(entry.getKey());
      } else if (entry.getValue() == endOffset) {
        log.info("Finished reading partition[%d].", entry.getKey());
      } else {
        throw new ISE("WTF?! Cannot start from offset[%,d] > endOffset[%,d]", entry.getValue(), endOffset);
      }
    }

    KafkaIndexTask.assignPartitions(consumer, topic, assignment);

    // Seek to starting offsets.
    for (final int partition : assignment) {
      final long offset = nextOffsets.get(partition);
      log.info("Seeking partition[%d] to offset[%,d].", partition, offset);
      consumer.seek(new TopicPartition(topic, partition), offset);
    }

    return assignment;
  }

  /**
   * Checks if the pauseRequested flag was set, and if so blocks until pauseRequested is cleared.
   * <p/>
   * Sets [status] to PAUSED and signals [hasPaused] so callers can be notified when the pause command has been
   * accepted.
   *
   * @return true if a pause request was handled, false otherwise
   */
  private boolean possiblyPause() throws InterruptedException
  {
    pauseLock.lockInterruptibly();
    try {
      if (pauseRequested) {
        status = Status.PAUSED;
        hasPaused.signalAll();

        while (pauseRequested) {
          log.info("Pausing ingestion until resumed");
          shouldResume.await();
        }

        status = Status.READING;
        shouldResume.signalAll();
        log.info("Ingestion loop resumed");
        return true;
      }
    }
    finally {
      pauseLock.unlock();
    }

    return false;
  }
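
  /**
   * Handles an OffsetOutOfRangeException from the consumer, summarizing the logic below: if
   * resetOffsetAutomatically is enabled and the requested offset has already been dropped from Kafka (the least
   * available offset is past it), submit a ResetDataSourceMetadataAction so the supervisor can reset the
   * datasource metadata; otherwise wait for pollRetryMs and try again, since the offset may simply belong to data
   * that has not been written yet.
   */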
  private void possiblyResetOffsetsOrWait(
      Map<TopicPartition, Long> outOfRangePartitions,
      KafkaConsumer<byte[], byte[]> consumer,
      TaskToolbox taskToolbox
  ) throws InterruptedException, IOException
  {
    final Map<TopicPartition, Long> resetPartitions = Maps.newHashMap();
    boolean doReset = false;
    if (tuningConfig.isResetOffsetAutomatically()) {
      for (Map.Entry<TopicPartition, Long> outOfRangePartition : outOfRangePartitions.entrySet()) {
        final TopicPartition topicPartition = outOfRangePartition.getKey();
        final long nextOffset = outOfRangePartition.getValue();
        // seek to the beginning to get the least available offset
        consumer.seekToBeginning(Collections.singletonList(topicPartition));
        final long leastAvailableOffset = consumer.position(topicPartition);
        // reset the seek
        consumer.seek(topicPartition, nextOffset);
        // Reset consumer offset if resetOffsetAutomatically is set to true
        // and the current message offset in the kafka partition is more than the
        // next message offset that we are trying to fetch
        if (leastAvailableOffset > nextOffset) {
          doReset = true;
          resetPartitions.put(topicPartition, nextOffset);
        }
      }
    }

    if (doReset) {
      sendResetRequestAndWait(resetPartitions, taskToolbox);
    } else {
      log.warn("Retrying in %dms", task.getPollRetryMs());
      pollRetryLock.lockInterruptibly();
      try {
        long nanos = TimeUnit.MILLISECONDS.toNanos(task.getPollRetryMs());
        while (nanos > 0L && !pauseRequested && !stopRequested.get()) {
          nanos = isAwaitingRetry.awaitNanos(nanos);
        }
      }
      finally {
        pollRetryLock.unlock();
      }
    }
  }

  private void handleParseException(ParseException pe, ConsumerRecord<byte[], byte[]> record)
  {
    if (pe.isFromPartiallyValidRow()) {
      rowIngestionMeters.incrementProcessedWithError();
    } else {
      rowIngestionMeters.incrementUnparseable();
    }

    if (tuningConfig.isLogParseExceptions()) {
      log.error(
          pe,
          "Encountered parse exception on row from partition[%d] offset[%d]",
          record.partition(),
          record.offset()
      );
    }

    if (savedParseExceptions != null) {
      savedParseExceptions.add(pe);
    }

    if (rowIngestionMeters.getUnparseable() + rowIngestionMeters.getProcessedWithError()
        > tuningConfig.getMaxParseExceptions()) {
      log.error("Max parse exceptions exceeded, terminating task...");
      throw new RuntimeException("Max parse exceptions exceeded, terminating task...");
    }
  }

  private boolean isPaused()
  {
    return status == Status.PAUSED;
  }

  private void requestPause()
  {
    pauseRequested = true;
  }

  private void sendResetRequestAndWait(Map<TopicPartition, Long> outOfRangePartitions, TaskToolbox taskToolbox)
      throws IOException
  {
    Map<Integer, Long> partitionOffsetMap = Maps.newHashMap();
    for (Map.Entry<TopicPartition, Long> outOfRangePartition : outOfRangePartitions.entrySet()) {
      partitionOffsetMap.put(outOfRangePartition.getKey().partition(), outOfRangePartition.getValue());
    }
    boolean result = taskToolbox
        .getTaskActionClient()
        .submit(
            new ResetDataSourceMetadataAction(
                task.getDataSource(),
                new KafkaDataSourceMetadata(
                    new KafkaPartitions(ioConfig.getStartPartitions().getTopic(), partitionOffsetMap)
                )
            )
        );

    if (result) {
      log.makeAlert("Resetting Kafka offsets for datasource [%s]", task.getDataSource())
         .addData("partitions", partitionOffsetMap.keySet())
         .emit();
      // wait for being killed by supervisor
      requestPause();
    } else {
      log.makeAlert("Failed to send reset request for partitions [%s]", partitionOffsetMap.keySet()).emit();
    }
  }

  /**
   * Authorizes the action to be performed on this task's datasource.
   *
   * @return authorization result
   */
  private Access authorizationCheck(final HttpServletRequest req, Action action)
  {
    return IndexTaskUtils.datasourceAuthorizationCheck(req, action, task.getDataSource(), authorizerMapper);
  }

  @Override
  public Appenderator getAppenderator()
  {
    return appenderator;
  }

  @Override
  public RowIngestionMeters getRowIngestionMeters()
  {
    return rowIngestionMeters;
  }

  @Override
  public void stopGracefully()
  {
    log.info("Stopping gracefully (status: [%s])", status);
    stopRequested.set(true);

    synchronized (statusLock) {
      if (status == Status.PUBLISHING) {
        runThread.interrupt();
        return;
      }
    }

    try {
      if (pauseLock.tryLock(KafkaIndexTask.LOCK_ACQUIRE_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
        try {
          if (pauseRequested) {
            pauseRequested = false;
            shouldResume.signalAll();
          }
        }
        finally {
          pauseLock.unlock();
        }
      } else {
        log.warn("While stopping: failed to acquire pauseLock before timeout, interrupting run thread");
        runThread.interrupt();
        return;
      }

      if (pollRetryLock.tryLock(KafkaIndexTask.LOCK_ACQUIRE_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
        try {
          isAwaitingRetry.signalAll();
        }
        finally {
          pollRetryLock.unlock();
        }
      } else {
        log.warn("While stopping: failed to acquire pollRetryLock before timeout, interrupting run thread");
        runThread.interrupt();
      }
    }
    catch (Exception e) {
      throw Throwables.propagate(e);
    }
  }

  @POST
  @Path("/stop")
  public Response stop(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.WRITE);
    stopGracefully();
    return Response.status(Response.Status.OK).build();
  }

  @GET
  @Path("/status")
  @Produces(MediaType.APPLICATION_JSON)
  public Status getStatusHTTP(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    return status;
  }

  @Override
  public Status getStatus()
  {
    return status;
  }

  @GET
  @Path("/offsets/current")
  @Produces(MediaType.APPLICATION_JSON)
  public Map<Integer, Long> getCurrentOffsets(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    return getCurrentOffsets();
  }

  @Override
  public Map<Integer, Long> getCurrentOffsets()
  {
    return nextOffsets;
  }

  @GET
  @Path("/offsets/end")
  @Produces(MediaType.APPLICATION_JSON)
  public Map<Integer, Long> getEndOffsetsHTTP(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    return getEndOffsets();
  }

  @Override
  public Map<Integer, Long> getEndOffsets()
  {
    return endOffsets;
  }

  @POST
  @Path("/offsets/end")
  @Consumes(MediaType.APPLICATION_JSON)
  @Produces(MediaType.APPLICATION_JSON)
  public Response setEndOffsetsHTTP(
      Map<Integer, Long> offsets,
      // this field is only for internal purposes and shouldn't usually be set by users
      @QueryParam("finish") @DefaultValue("true") final boolean finish,
      @Context final HttpServletRequest req
  ) throws InterruptedException
  {
    authorizationCheck(req, Action.WRITE);
    return setEndOffsets(offsets, finish);
  }

  @GET
  @Path("/rowStats")
  @Produces(MediaType.APPLICATION_JSON)
  public Response getRowStats(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    Map<String, Object> returnMap = Maps.newHashMap();
    Map<String, Object> totalsMap = Maps.newHashMap();
    Map<String, Object> averagesMap = Maps.newHashMap();

    totalsMap.put(RowIngestionMeters.BUILD_SEGMENTS, rowIngestionMeters.getTotals());
    averagesMap.put(RowIngestionMeters.BUILD_SEGMENTS, rowIngestionMeters.getMovingAverages());

    returnMap.put("movingAverages", averagesMap);
    returnMap.put("totals", totalsMap);
    return Response.ok(returnMap).build();
  }

  @GET
  @Path("/unparseableEvents")
  @Produces(MediaType.APPLICATION_JSON)
  public Response getUnparseableEvents(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    List<String> events = IndexTaskUtils.getMessagesFromSavedParseExceptions(savedParseExceptions);
    return Response.ok(events).build();
  }
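
  // Illustrative request sequence for the endpoint below, as driven during an incremental checkpoint (paths are
  // the @Path annotations in this class; offsets are hypothetical):
  //
  //   POST /pause                    -> 200 OK, body: {"0": 150, "1": 260}  (current offsets)
  //   POST /offsets/end?finish=false -> 200 OK, body echoes the offsets; the current sequence is closed at those
  //                                     offsets and a new sequence is started from them
  //   POST /resume                   -> 200 OK
  //
  // With finish=true (the default) the given offsets instead become the task's final end offsets and no new
  // sequence is created.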
  @Override
  public Response setEndOffsets(
      Map<Integer, Long> offsets,
      final boolean finish // this field is only for internal purposes and shouldn't usually be set by users
  ) throws InterruptedException
  {
    if (offsets == null) {
      return Response.status(Response.Status.BAD_REQUEST)
                     .entity("Request body must contain a map of { partition:endOffset }")
                     .build();
    } else if (!endOffsets.keySet().containsAll(offsets.keySet())) {
      return Response.status(Response.Status.BAD_REQUEST)
                     .entity(
                         StringUtils.format(
                             "Request contains partitions not being handled by this task, my partitions: %s",
                             endOffsets.keySet()
                         )
                     )
                     .build();
    } else {
      try {
        pauseLock.lockInterruptibly();
        // Perform all sequence related checks before checking for isPaused()
        // and after acquiring pauseLock to correctly guard against duplicate requests
        Preconditions.checkState(sequences.size() > 0, "WTH?! No Sequences found to set end offsets");

        final SequenceMetadata latestSequence = sequences.get(sequences.size() - 1);
        if ((latestSequence.getStartOffsets().equals(offsets) && !finish)
            || (latestSequence.getEndOffsets().equals(offsets) && finish)) {
          log.warn("Ignoring duplicate request, end offsets already set for sequences [%s]", sequences);
          return Response.ok(offsets).build();
        } else if (latestSequence.isCheckpointed()) {
          return Response.status(Response.Status.BAD_REQUEST)
                         .entity(
                             StringUtils.format(
                                 "WTH?! Sequence [%s] has already endOffsets set, cannot set to [%s]",
                                 latestSequence,
                                 offsets
                             )
                         )
                         .build();
        } else if (!isPaused()) {
          return Response.status(Response.Status.BAD_REQUEST)
                         .entity("Task must be paused before changing the end offsets")
                         .build();
        }

        for (Map.Entry<Integer, Long> entry : offsets.entrySet()) {
          if (entry.getValue().compareTo(nextOffsets.get(entry.getKey())) < 0) {
            return Response.status(Response.Status.BAD_REQUEST)
                           .entity(
                               StringUtils.format(
                                   "End offset must be >= current offset for partition [%s] (current: %s)",
                                   entry.getKey(),
                                   nextOffsets.get(entry.getKey())
                               )
                           )
                           .build();
          }
        }

        resetNextCheckpointTime();
        latestSequence.setEndOffsets(offsets);

        if (finish) {
          log.info("Updating endOffsets from [%s] to [%s]", endOffsets, offsets);
          endOffsets.putAll(offsets);
        } else {
          // create new sequence
          final SequenceMetadata newSequence = new SequenceMetadata(
              latestSequence.getSequenceId() + 1,
              StringUtils.format("%s_%d", ioConfig.getBaseSequenceName(), latestSequence.getSequenceId() + 1),
              offsets,
              endOffsets,
              false
          );
          sequences.add(newSequence);
        }
        persistSequences();
      }
      catch (Exception e) {
        log.error(e, "Unable to set end offsets, dying");
        backgroundThreadException = e;
        // should resume to immediately finish the kafka index task as failed
        resume();
        return Response.status(Response.Status.INTERNAL_SERVER_ERROR)
                       .entity(Throwables.getStackTraceAsString(e))
                       .build();
      }
      finally {
        pauseLock.unlock();
      }
    }

    resume();

    return Response.ok(offsets).build();
  }

  private void resetNextCheckpointTime()
  {
    nextCheckpointTime = DateTimes.nowUtc().plus(tuningConfig.getIntermediateHandoffPeriod()).getMillis();
  }

  @GET
  @Path("/checkpoints")
  @Produces(MediaType.APPLICATION_JSON)
  public Map<Integer, Map<Integer, Long>> getCheckpointsHTTP(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.READ);
    return getCheckpoints();
  }

  private Map<Integer, Map<Integer, Long>> getCheckpoints()
  {
    TreeMap<Integer, Map<Integer, Long>> result = new TreeMap<>();
    result.putAll(
        sequences.stream().collect(
            Collectors.toMap(SequenceMetadata::getSequenceId, SequenceMetadata::getStartOffsets)
        )
    );
    return result;
  }

  /**
   * Signals the ingestion loop to pause.
   *
   * @return one of the following Responses: 400 Bad Request if the task has started publishing; 202 Accepted if the
   * method has timed out and returned before the task has paused; 200 OK with a map of the current partition offsets
   * in the response body if the task successfully paused
   */
  @POST
  @Path("/pause")
  @Produces(MediaType.APPLICATION_JSON)
  public Response pauseHTTP(@Context final HttpServletRequest req) throws InterruptedException
  {
    authorizationCheck(req, Action.WRITE);
    return pause();
  }

  @Override
  public Response pause() throws InterruptedException
  {
    if (!(status == Status.PAUSED || status == Status.READING)) {
      return Response.status(Response.Status.BAD_REQUEST)
                     .entity(StringUtils.format("Can't pause, task is not in a pausable state (state: [%s])", status))
                     .build();
    }

    pauseLock.lockInterruptibly();
    try {
      pauseRequested = true;

      pollRetryLock.lockInterruptibly();
      try {
        isAwaitingRetry.signalAll();
      }
      finally {
        pollRetryLock.unlock();
      }

      if (isPaused()) {
        shouldResume.signalAll(); // kick the monitor so it re-awaits with the new pauseMillis
      }

      long nanos = TimeUnit.SECONDS.toNanos(2);
      while (!isPaused()) {
        if (nanos <= 0L) {
          return Response.status(Response.Status.ACCEPTED)
                         .entity("Request accepted but task has not yet paused")
                         .build();
        }
        nanos = hasPaused.awaitNanos(nanos);
      }
    }
    finally {
      pauseLock.unlock();
    }

    try {
      return Response.ok().entity(toolbox.getObjectMapper().writeValueAsString(getCurrentOffsets())).build();
    }
    catch (JsonProcessingException e) {
      throw Throwables.propagate(e);
    }
  }

  @POST
  @Path("/resume")
  public Response resumeHTTP(@Context final HttpServletRequest req) throws InterruptedException
  {
    authorizationCheck(req, Action.WRITE);
    resume();
    return Response.status(Response.Status.OK).build();
  }

  @Override
  public void resume() throws InterruptedException
  {
    pauseLock.lockInterruptibly();
    try {
      pauseRequested = false;
      shouldResume.signalAll();

      long nanos = TimeUnit.SECONDS.toNanos(5);
      while (isPaused()) {
        if (nanos <= 0L) {
          throw new RuntimeException("Resume command was not accepted within 5 seconds");
        }
        nanos = shouldResume.awaitNanos(nanos);
      }
    }
    finally {
      pauseLock.unlock();
    }
  }

  @GET
  @Path("/time/start")
  @Produces(MediaType.APPLICATION_JSON)
  public DateTime getStartTime(@Context final HttpServletRequest req)
  {
    authorizationCheck(req, Action.WRITE);
    return startTime;
  }
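
  /**
   * Tracks a contiguous range of Kafka offsets, from {@link #startOffsets} (inclusive) to {@link #endOffsets}
   * (exclusive), that is published as one atomic unit. A new sequence is created each time the task is
   * checkpointed via setEndOffsets with finish=false, and the full list is persisted to sequences.json so that it
   * survives task restarts.
   */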
  private static class SequenceMetadata
  {
    /**
     * Lock for accessing {@link #endOffsets} and {@link #checkpointed}. This lock is required because
     * {@link #setEndOffsets} can be called by both the main thread and the HTTP thread.
     */
    private final ReentrantLock lock = new ReentrantLock();

    private final int sequenceId;
    private final String sequenceName;
    private final Map<Integer, Long> startOffsets;
    private final Map<Integer, Long> endOffsets;
    private final Set<Integer> assignments;
    private final boolean sentinel;
    private boolean checkpointed;

    @JsonCreator
    public SequenceMetadata(
        @JsonProperty("sequenceId") int sequenceId,
        @JsonProperty("sequenceName") String sequenceName,
        @JsonProperty("startOffsets") Map<Integer, Long> startOffsets,
        @JsonProperty("endOffsets") Map<Integer, Long> endOffsets,
        @JsonProperty("checkpointed") boolean checkpointed
    )
    {
      Preconditions.checkNotNull(sequenceName);
      Preconditions.checkNotNull(startOffsets);
      Preconditions.checkNotNull(endOffsets);
      this.sequenceId = sequenceId;
      this.sequenceName = sequenceName;
      this.startOffsets = ImmutableMap.copyOf(startOffsets);
      this.endOffsets = new HashMap<>(endOffsets);
      this.assignments = new HashSet<>(startOffsets.keySet());
      this.checkpointed = checkpointed;
      this.sentinel = false;
    }

    @JsonProperty
    public int getSequenceId()
    {
      return sequenceId;
    }

    @JsonProperty
    public boolean isCheckpointed()
    {
      lock.lock();
      try {
        return checkpointed;
      }
      finally {
        lock.unlock();
      }
    }

    @JsonProperty
    public String getSequenceName()
    {
      return sequenceName;
    }

    @JsonProperty
    public Map<Integer, Long> getStartOffsets()
    {
      return startOffsets;
    }

    @JsonProperty
    public Map<Integer, Long> getEndOffsets()
    {
      lock.lock();
      try {
        return endOffsets;
      }
      finally {
        lock.unlock();
      }
    }

    @JsonProperty
    public boolean isSentinel()
    {
      return sentinel;
    }

    void setEndOffsets(Map<Integer, Long> newEndOffsets)
    {
      lock.lock();
      try {
        endOffsets.putAll(newEndOffsets);
        checkpointed = true;
      }
      finally {
        lock.unlock();
      }
    }

    void updateAssignments(Map<Integer, Long> nextPartitionOffset)
    {
      lock.lock();
      try {
        assignments.clear();
        nextPartitionOffset.forEach((key, value) -> {
          if (Longs.compare(endOffsets.get(key), nextPartitionOffset.get(key)) > 0) {
            assignments.add(key);
          }
        });
      }
      finally {
        lock.unlock();
      }
    }

    boolean isOpen()
    {
      return !assignments.isEmpty();
    }

    boolean canHandle(ConsumerRecord<byte[], byte[]> record)
    {
      lock.lock();
      try {
        final Long partitionEndOffset = endOffsets.get(record.partition());
        return isOpen()
               && partitionEndOffset != null
               && record.offset() >= startOffsets.get(record.partition())
               && record.offset() < partitionEndOffset;
      }
      finally {
        lock.unlock();
      }
    }

    @Override
    public String toString()
    {
      lock.lock();
      try {
        return "SequenceMetadata{" +
               "sequenceName='" + sequenceName + '\'' +
               ", sequenceId=" + sequenceId +
               ", startOffsets=" + startOffsets +
               ", endOffsets=" + endOffsets +
               ", assignments=" + assignments +
               ", sentinel=" + sentinel +
               ", checkpointed=" + checkpointed +
               '}';
      }
      finally {
        lock.unlock();
      }
    }

    Supplier<Committer> getCommitterSupplier(String topic, Map<Integer, Long> lastPersistedOffsets)
    {
      // Set up committer.
      return () -> new Committer()
      {
        @Override
        public Object getMetadata()
        {
          lock.lock();
          try {
            Preconditions.checkState(
                assignments.isEmpty(),
                "This committer can be used only once all the records till offsets [%s] have been consumed, also make"
                + " sure to call updateAssignments before using this committer",
                endOffsets
            );

            // Merge endOffsets for this sequence with the global lastPersistedOffsets. This is done because this
            // committer will be persisting only the subset of segments corresponding to the current sequence.
            // Generally, lastPersistedOffsets should already cover endOffsets, but just to be sure take the max of
            // the offsets and persist that.
            for (Map.Entry<Integer, Long> partitionOffset : endOffsets.entrySet()) {
              lastPersistedOffsets.put(
                  partitionOffset.getKey(),
                  Math.max(
                      partitionOffset.getValue(),
                      lastPersistedOffsets.getOrDefault(partitionOffset.getKey(), 0L)
                  )
              );
            }

            // Publish metadata can be different from persist metadata as we are going to publish only
            // a subset of segments.
            return ImmutableMap.of(
                METADATA_NEXT_PARTITIONS, new KafkaPartitions(topic, lastPersistedOffsets),
                METADATA_PUBLISH_PARTITIONS, new KafkaPartitions(topic, endOffsets)
            );
          }
          finally {
            lock.unlock();
          }
        }

        @Override
        public void run()
        {
          // Do nothing.
        }
      };
    }

    TransactionalSegmentPublisher createPublisher(TaskToolbox toolbox, boolean useTransaction)
    {
      return (segments, commitMetadata) -> {
        final KafkaPartitions finalPartitions = toolbox.getObjectMapper().convertValue(
            ((Map) Preconditions.checkNotNull(commitMetadata, "commitMetadata")).get(METADATA_PUBLISH_PARTITIONS),
            KafkaPartitions.class
        );

        // Sanity check, we should only be publishing things that match our desired end state.
        if (!getEndOffsets().equals(finalPartitions.getPartitionOffsetMap())) {
          throw new ISE(
              "WTF?! Driver for sequence [%s], attempted to publish invalid metadata[%s].",
              toString(),
              commitMetadata
          );
        }

        final SegmentTransactionalInsertAction action;

        if (useTransaction) {
          action = new SegmentTransactionalInsertAction(
              segments,
              new KafkaDataSourceMetadata(new KafkaPartitions(finalPartitions.getTopic(), getStartOffsets())),
              new KafkaDataSourceMetadata(finalPartitions)
          );
        } else {
          action = new SegmentTransactionalInsertAction(segments, null, null);
        }

        log.info("Publishing with isTransaction[%s].", useTransaction);

        return toolbox.getTaskActionClient().submit(action);
      };
    }
  }

  @Nullable
  private static TreeMap<Integer, Map<Integer, Long>> getCheckPointsFromContext(
      TaskToolbox toolbox,
      KafkaIndexTask task
  ) throws IOException
  {
    final String checkpointsString = task.getContextValue("checkpoints");
    if (checkpointsString != null) {
      log.info("Checkpoints [%s]", checkpointsString);
      return toolbox.getObjectMapper().readValue(
          checkpointsString,
          new TypeReference<TreeMap<Integer, Map<Integer, Long>>>()
          {
          }
      );
    } else {
      return null;
    }
  }
}