ai.grakn.engine.tasks.manager.multiqueue.MultiQueueTaskRunner.java Source code

Introduction

Here is the source code for ai.grakn.engine.tasks.manager.multiqueue.MultiQueueTaskRunner.java
Source

/*
 * Grakn - A Distributed Semantic Database
 * Copyright (C) 2016  Grakn Labs Limited
 *
 * Grakn is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Grakn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Grakn. If not, see <http://www.gnu.org/licenses/gpl.txt>.
 *
 */

package ai.grakn.engine.tasks.manager.multiqueue;

import ai.grakn.engine.tasks.BackgroundTask;
import ai.grakn.engine.TaskId;
import ai.grakn.engine.tasks.ExternalOffsetStorage;
import ai.grakn.engine.tasks.TaskCheckpoint;
import ai.grakn.engine.tasks.TaskState;
import ai.grakn.engine.tasks.TaskStateStorage;
import ai.grakn.engine.tasks.manager.ZookeeperConnection;
import ai.grakn.engine.GraknEngineConfig;
import ai.grakn.engine.util.EngineID;
import ai.grakn.exception.EngineStorageException;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.WakeupException;
import org.apache.zookeeper.CreateMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import static ai.grakn.engine.TaskStatus.SCHEDULED;
import static ai.grakn.engine.tasks.config.ConfigHelper.kafkaConsumer;
import static ai.grakn.engine.tasks.config.KafkaTerms.TASK_RUNNER_GROUP;
import static ai.grakn.engine.tasks.config.KafkaTerms.WORK_QUEUE_TOPIC;
import static ai.grakn.engine.tasks.config.ZookeeperPaths.SINGLE_ENGINE_WATCH_PATH;
import static ai.grakn.engine.tasks.manager.ExternalStorageRebalancer.rebalanceListener;
import static ai.grakn.engine.GraknEngineConfig.TASKRUNNER_POLLING_FREQ;
import static ai.grakn.engine.util.ExceptionWrapper.noThrow;
import static java.lang.String.format;
import static java.util.Collections.singletonList;
import static java.util.concurrent.Executors.newFixedThreadPool;
import static org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace;

/**
 * <p>
 *      Picks up tasks from the work queue, runs them and marks them as completed or failed.
 * </p>
 *
 * <p>
 *     Runs tasks in a pool. The size of this pool is configurable in the properties file.
 *     Controls marking the state of running TaskRunner in Zookeeper.
 * </p>
 *
 * @author Denis Lobanov, alexandraorth
 */
public class MultiQueueTaskRunner implements Runnable, AutoCloseable {
    private static final Logger LOG = LoggerFactory.getLogger(MultiQueueTaskRunner.class);
    private static final GraknEngineConfig properties = GraknEngineConfig.getInstance();

    // Timeout handed to every consumer.poll() call in the main loop
    private static final int POLLING_FREQUENCY = properties.getPropertyAsInt(TASKRUNNER_POLLING_FREQ);
    private final EngineID engineId;

    // Ids of the tasks currently executing; only touched through the synchronized accessors below
    private final Set<TaskId> runningTasks = new HashSet<>();
    private final TaskStateStorage storage;
    private final ZookeeperConnection connection;

    // Counted down when run() exits so that close() can wait for the consumer loop to finish
    private final CountDownLatch shutdownLatch;

    private final ExecutorService executor;

    // Maximum number of accepted (queued + running) tasks before records are pushed back to Kafka
    private final int executorSize;

    // Tasks handed to the executor that have not finished yet
    private final AtomicInteger acceptedTasks = new AtomicInteger(0);
    private final Consumer<TaskId, TaskState> consumer;

    /**
     * Create a runner that is subscribed to the work queue and registered in Zookeeper.
     * The polling loop does not start until {@link #run()} is invoked.
     *
     * @param engineId Identifier of the engine this runner belongs to
     * @param storage Storage from which task states are read and to which they are written
     * @param connection Zookeeper connection used for registration and external offset storage
     */
    public MultiQueueTaskRunner(EngineID engineId, TaskStateStorage storage, ZookeeperConnection connection) {
        this.engineId = engineId;
        this.storage = storage;
        this.connection = connection;

        // Create the consumer
        consumer = kafkaConsumer(TASK_RUNNER_GROUP);

        // Configure callback for a Kafka rebalance
        consumer.subscribe(singletonList(WORK_QUEUE_TOPIC),
                rebalanceListener(consumer, new ExternalOffsetStorage(connection)));

        // Create initial entries in ZK for TaskFailover to watch.
        registerAsRunning();

        // Instantiate the executor where tasks will run
        // executorSize is the maximum executor queue size
        int numberAvailableThreads = properties.getAvailableThreads();
        ThreadFactory namedThreadFactory = new ThreadFactoryBuilder().setNameFormat("task-runner-pool-%d").build();
        executor = newFixedThreadPool(numberAvailableThreads, namedThreadFactory);
        executorSize = numberAvailableThreads * 4;

        shutdownLatch = new CountDownLatch(1);

        LOG.info("TaskRunner started");
    }

    /**
     * Start the main loop, this will block until a call to close() that wakes up the consumer.
     *
     * The only way to exit this loop without throwing an exception is by calling consumer.wakeup()
     *
     * We do not want to catch any exceptions here. The caller of this TaskRunner should handle the case
     * where an exception is thrown. It is recommended to register the TaskRunner thread with a UncaughtExceptionHandler.
     * The catch(Throwable t) here is merely for logging purposes. You will notice that the exception is re-thrown.
     */
    @Override
    public void run() {
        try {
            while (true) {
                ConsumerRecords<TaskId, TaskState> records = consumer.poll(POLLING_FREQUENCY);

                long startTime = System.currentTimeMillis();

                // Partitions that were rewound in this batch because the runner was at capacity.
                // Once a partition is rewound, its remaining records of this batch must be left
                // untouched: seeking again for a later record would move the position past the
                // rewound record and that task would never be picked up.
                Set<TopicPartition> rewoundPartitions = new HashSet<>();

                for (ConsumerRecord<TaskId, TaskState> record : records) {
                    TopicPartition partition = new TopicPartition(record.topic(), record.partition());

                    // Already rewound: this record will be delivered again by a later poll
                    if (rewoundPartitions.contains(partition)) {
                        continue;
                    }

                    // If TaskRunner capacity full, rewind the partition to the current record
                    // so it is re-read later, otherwise accept the record
                    if (acceptedTasks.get() >= executorSize) {
                        acknowledgeRecordSeen(record);
                        rewoundPartitions.add(partition);
                    } else {
                        processAndAcknowledgeProcessed(record);
                    }
                }

                LOG.debug(format("Took [%s] ms to process [%s] records in taskrunner",
                        System.currentTimeMillis() - startTime, records.count()));
            }
        } catch (WakeupException e) {
            LOG.debug("TaskRunner exiting, woken up.");
        } catch (Throwable throwable) {
            // Pass the throwable itself so that the stack trace ends up in the log
            LOG.error("Error in TaskRunner poll " + throwable.getMessage(), throwable);

            // re-throw the exception
            throw throwable;
        } finally {
            noThrow(consumer::close, "Exception while closing consumer in TaskRunner");
            noThrow(shutdownLatch::countDown, "Exception while counting down close latch in TaskRunner");

            LOG.debug("TaskRunner run() end");
        }
    }

    /**
     * Stop the main loop, causing run() to exit.
     *
     * noThrow() functions used here so that if an error occurs during execution of a
     * certain step, the subsequent stops continue to execute.
     */
    @Override
    public void close() {
        // Stop execution of kafka consumer
        noThrow(consumer::wakeup, "Could not wake up task runner thread.");

        // Wait for the shutdown latch to complete
        noThrow(shutdownLatch::await, "Error waiting for TaskRunner consumer to exit");

        // Stop accepting new tasks and wait up to one minute for the submitted ones to finish.
        // NOTE(review): shutdown() does not interrupt running tasks; use shutdownNow() if
        // interruption (and re-allocation to another Engine) is what is intended - confirm.
        noThrow(executor::shutdown, "Could not shutdown TaskRunner executor.");
        noThrow(() -> executor.awaitTermination(1, TimeUnit.MINUTES),
                "Error waiting for TaskRunner executor to shutdown.");

        LOG.debug("TaskRunner stopped");
    }

    /**
     * Add a single record to the threadpool if it has been marked as SCHEDULED. At the end of
     * the method acknowledge the record has been read to the consumer.
     *
     * @param record The record to execute.
     */
    private void processAndAcknowledgeProcessed(ConsumerRecord<TaskId, TaskState> record) {
        try {
            LOG.debug(format("Received [%s], currently running: %s has: %s allowed: %s", record.key(),
                    getRunningTasksCount(), acceptedTasks.get(), executorSize));

            // Get up-to-date state from the storage
            TaskState state = storage.getState(record.key());

            // If the task is scheduled, run it
            if (state.status() == SCHEDULED) {

                // Mark as RUNNING and update task & runner states.
                storage.updateState(state.markRunning(engineId));
                acceptedTasks.incrementAndGet();

                // Submit to executor
                executor.execute(() -> executeTask(state));
            } else {
                LOG.debug(format("Will not run [%s] because status: [%s]", record.key(), state.status()));
            }
        } catch (EngineStorageException e) {
            LOG.error(format("Cant run [%s] because state was not found in storage", record.key()));
        } finally {
            // Acknowledge that the TaskRunner has processed this record
            acknowledgeRecordProcessed(record);
        }
    }

    /**
     * Instantiate a BackgroundTask object and run it, catching any thrown Exceptions.
     * The final state (completed or failed) is written to storage, and the capacity slot
     * taken by the task is always released, even if that final write fails.
     *
     * @param state TaskState for task @id.
     */
    private void executeTask(TaskState state) {
        LOG.debug("Executing task " + state.getId());

        try {
            // Should add running task here, so it always gets removed in the finally
            addRunningTask(state.getId());

            // Instantiate task.
            BackgroundTask task = state.taskClass().newInstance();

            // Resume task from the checkpoint, if it exists. Otherwise run from the beginning.
            if (state.checkpoint() != null) {
                task.resume(saveCheckpoint(state), state.checkpoint());
            } else {
                task.start(saveCheckpoint(state), state.configuration());
            }

            // remove the configuration and mark as COMPLETED
            state.markCompleted();
        } catch (Throwable throwable) {
            state.markFailed(throwable);
            LOG.error("Failed task - " + state.getId() + ": " + getFullStackTrace(throwable));
        } finally {
            try {
                storage.updateState(state);
            } finally {
                // Must run even when the storage update throws, otherwise the slot is never
                // given back and the runner eventually stops accepting tasks
                removeRunningTask(state.getId());
                acceptedTasks.decrementAndGet();
                LOG.debug("Finished executing task - " + state.getId());
            }
        }
    }

    /**
     * Build the callback a Background Task uses to persist its checkpoint in the task state storage.
     *
     * @param taskState task to update in storage
     * @return A function that can be called by the background task on demand to save its checkpoint.
     */
    private java.util.function.Consumer<TaskCheckpoint> saveCheckpoint(TaskState taskState) {
        return checkpoint -> storage.updateState(taskState.checkpoint(checkpoint));
    }

    /**
     * Create the ephemeral Zookeeper node for this engine if it does not exist yet.
     *
     * @throws RuntimeException if the node could not be checked or created; a checked failure is
     *                          wrapped and kept as the cause
     */
    private void registerAsRunning() {
        String watchPath = format(SINGLE_ENGINE_WATCH_PATH, engineId.value());

        try {
            if (connection.connection().checkExists().forPath(watchPath) == null) {
                connection.connection().create().creatingParentContainersIfNeeded().withMode(CreateMode.EPHEMERAL)
                        .forPath(watchPath);
            }
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception exception) {
            // Keep the original failure as the cause so it is not lost to the caller
            throw new RuntimeException("Could not create Zookeeper paths in TaskRunner", exception);
        }

        LOG.debug("Registered TaskRunner");
    }

    private synchronized int getRunningTasksCount() {
        return runningTasks.size();
    }

    private synchronized void addRunningTask(TaskId id) {
        runningTasks.add(id);
    }

    private synchronized void removeRunningTask(TaskId id) {
        runningTasks.remove(id);
    }

    /**
     * Instruct kafka to read from the current record
     * @param record The record to read from
     */
    private void acknowledgeRecordSeen(ConsumerRecord<?, ?> record) {
        commitOffset(record, record.offset());
    }

    /**
     * Instruct kafka to read from the next record
     * @param record The record to read from
     */
    private void acknowledgeRecordProcessed(ConsumerRecord<?, ?> record) {
        commitOffset(record, record.offset() + 1);
    }

    /**
     * Commit the given offset for the partition & topic the given record belongs to
     * @param record Record from which to extract partition and topic
     * @param offset Offset to commit
     */
    private void commitOffset(ConsumerRecord<?, ?> record, long offset) {
        consumer.seek(new TopicPartition(record.topic(), record.partition()), offset);
        consumer.commitSync();
    }
}