ai.grakn.engine.backgroundtasks.distributed.TaskRunner.java Source code

Introduction

Here is the source code for ai.grakn.engine.backgroundtasks.distributed.TaskRunner.java. TaskRunner consumes scheduled task IDs from the Kafka work queue, claims each task with a ZooKeeper inter-process lock, runs the corresponding BackgroundTask on a bounded thread pool, and records state transitions in both ZooKeeper and the Grakn graph. A short usage sketch follows the listing.

Source

/*
 * Grakn - A Distributed Semantic Database
 * Copyright (C) 2016  Grakn Labs Ltd
 *
 * Grakn is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Grakn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Grakn. If not, see <http://www.gnu.org/licenses/gpl.txt>.
 */

package ai.grakn.engine.backgroundtasks.distributed;

import static ai.grakn.engine.backgroundtasks.TaskStatus.COMPLETED;
import static ai.grakn.engine.backgroundtasks.TaskStatus.FAILED;
import static ai.grakn.engine.backgroundtasks.TaskStatus.RUNNING;
import static ai.grakn.engine.backgroundtasks.TaskStatus.SCHEDULED;
import static ai.grakn.engine.backgroundtasks.config.ConfigHelper.kafkaConsumer;
import static ai.grakn.engine.backgroundtasks.config.KafkaTerms.TASK_RUNNER_GROUP;
import static ai.grakn.engine.backgroundtasks.config.KafkaTerms.WORK_QUEUE_TOPIC;
import static ai.grakn.engine.backgroundtasks.config.ZookeeperPaths.RUNNERS_STATE;
import static ai.grakn.engine.backgroundtasks.config.ZookeeperPaths.RUNNERS_WATCH;
import static ai.grakn.engine.backgroundtasks.config.ZookeeperPaths.TASKS_PATH_PREFIX;
import static ai.grakn.engine.backgroundtasks.config.ZookeeperPaths.TASK_LOCK_SUFFIX;
import static ai.grakn.engine.util.ConfigProperties.TASKRUNNER_POLLING_FREQ;
import static ai.grakn.engine.util.ExceptionWrapper.noThrow;
import static java.util.Collections.singletonList;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.stream.Collectors.toSet;
import static org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;

import org.apache.curator.framework.recipes.locks.InterProcessMutex;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.errors.WakeupException;
import org.apache.zookeeper.CreateMode;
import org.json.JSONArray;
import org.json.JSONObject;

import ai.grakn.engine.backgroundtasks.BackgroundTask;
import ai.grakn.engine.backgroundtasks.StateStorage;
import ai.grakn.engine.backgroundtasks.TaskState;
import ai.grakn.engine.backgroundtasks.TaskStatus;
import ai.grakn.engine.backgroundtasks.taskstorage.GraknStateStorage;
import ai.grakn.engine.backgroundtasks.taskstorage.SynchronizedState;
import ai.grakn.engine.backgroundtasks.taskstorage.SynchronizedStateStorage;
import ai.grakn.engine.util.ConfigProperties;
import ai.grakn.engine.util.EngineID;

public class TaskRunner implements Runnable, AutoCloseable {
    private final KafkaLogger LOG = KafkaLogger.getInstance();
    private static final ConfigProperties properties = ConfigProperties.getInstance();

    private ExecutorService executor;
    private final Integer allowableRunningTasks;
    private final Set<String> runningTasks = new HashSet<>();
    private final String engineID = EngineID.getInstance().id();
    //private final CountDownLatch startupLatch;
    private final AtomicBoolean OPENED = new AtomicBoolean(false);

    private StateStorage graknStorage;
    private SynchronizedStateStorage zkStorage;
    private KafkaConsumer<String, String> consumer;
    private volatile boolean running;
    private CountDownLatch waitToClose;
    private boolean initialised = false;

    TaskRunner(/*CountDownLatch startupLatch*/) {
        allowableRunningTasks = properties.getAvailableThreads();
        //this.startupLatch = startupLatch;
        running = false;
    }

    /**
     * Start the main loop; this blocks until close() is called.
     */
    public void run() {
        running = true;
        try {
            while (running) {
                printInitialization();
                LOG.debug("TaskRunner polling, size of new tasks "
                        + consumer.endOffsets(consumer.partitionsFor(WORK_QUEUE_TOPIC).stream()
                                .map(i -> new TopicPartition(WORK_QUEUE_TOPIC, i.partition())).collect(toSet())));
                // Poll for new tasks only when we know we have space to accept them.
                if (getRunningTasksCount() < allowableRunningTasks) {
                    ConsumerRecords<String, String> records = consumer
                            .poll(properties.getPropertyAsInt(TASKRUNNER_POLLING_FREQ));
                    processRecords(records);
                } else {
                    Thread.sleep(500);
                }
            }

        } catch (WakeupException | InterruptedException e) {
            if (running)
                LOG.error("TaskRunner interrupted unexpectedly (without clearing 'running' flag first", e);
            else
                LOG.debug("TaskRunner exiting gracefully.");
        } finally {
            consumer.commitSync();
            consumer.close();
            waitToClose.countDown();
        }
    }

    public TaskRunner open() throws Exception {
        if (OPENED.compareAndSet(false, true)) {
            graknStorage = new GraknStateStorage();

            consumer = kafkaConsumer(TASK_RUNNER_GROUP);
            consumer.subscribe(singletonList(WORK_QUEUE_TOPIC), new RebalanceListener(consumer));

            zkStorage = SynchronizedStateStorage.getInstance();

            // Create initial entries in ZK for TaskFailover to watch.
            registerAsRunning();
            updateOwnState();
            executor = Executors.newFixedThreadPool(properties.getAvailableThreads());

            waitToClose = new CountDownLatch(1);
            //            startupLatch.countDown();
            LOG.info("TaskRunner opened.");
        } else {
            LOG.error("TaskRunner already opened!");
        }

        return this;
    }

    /**
     * Stop the main loop, causing run() to exit.
     */
    @Override
    public void close() {
        if (OPENED.compareAndSet(true, false)) {
            running = false;
            noThrow(consumer::wakeup, "Could not call wakeup on Kafka Consumer.");

            // Wait for thread calling run() to wakeup and close consumer.
            try {
                waitToClose.await(5 * properties.getPropertyAsLong(TASKRUNNER_POLLING_FREQ), MILLISECONDS);
            } catch (Throwable t) {
                LOG.error(
                        "Exception whilst waiting for scheduler run() thread to finish - " + getFullStackTrace(t));
            }

            // Interrupt all currently running threads - these will be re-allocated to another Engine.
            noThrow(executor::shutdownNow, "Could not shut down executor pool.");

            graknStorage = null;

            // Closed by ClusterManager
            zkStorage = null;

            LOG.debug("TaskRunner stopped");
        } else {
            LOG.error("TaskRunner close() called before open()!");
        }
    }

    /**
     * Process a batch of work-queue records: for each record, check capacity, take the task's
     * ZooKeeper lock, confirm the task is still SCHEDULED, mark it RUNNING, submit it to the
     * executor and advance the committed offset; otherwise rewind so the record is retried later.
     */
    private void processRecords(ConsumerRecords<String, String> records) {
        for (ConsumerRecord<String, String> record : records) {
            LOG.debug("Got a record\n\t\tkey: " + record.key() + "\n\t\toffset " + record.offset() + "\n\t\tvalue "
                    + record.value());
            LOG.debug(
                    "Runner currently has tasks: " + getRunningTasksCount() + " allowed: " + allowableRunningTasks);
            if (getRunningTasksCount() >= allowableRunningTasks) {
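                // At capacity: rewind to this record's offset so it is redelivered on a later poll,
                // then stop processing the rest of this batch.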
                seekAndCommit(new TopicPartition(record.topic(), record.partition()), record.offset());
                break;
            }

            String id = record.key();
            InterProcessMutex mutex = acquireMutex(id);
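            // acquireMutex() returns null when the ZooKeeper lock could not be obtained
            // (e.g. another TaskRunner holds it); in that case rewind so the record is retried later.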
            if (mutex == null) {
                seekAndCommit(new TopicPartition(record.topic(), record.partition()), record.offset());
                break;
            }

            // Check if it is marked as SCHEDULED.
            TaskStatus status = getStatus(id);
            if (status == null) {
                seekAndCommit(new TopicPartition(record.topic(), record.partition()), record.offset());
                releaseMutex(mutex, id);
                break;
            } else if (status != SCHEDULED) {
                LOG.debug("Cant schedule this task - " + id + " because\n\t\tstatus: " + status);
                releaseMutex(mutex, id);
                continue;
            }

            // Mark as RUNNING and update task & runner states.
            addRunningTask(id);
            updateTaskState(id, RUNNING, this.getClass().getName(), engineID, null, null);

            releaseMutex(mutex, id);

            // Submit to executor
            try {
                JSONObject configuration = new JSONObject(record.value());
                executor.submit(() -> executeTask(id, configuration));
            } catch (RejectedExecutionException | NullPointerException e) {
                removeRunningTask(id);
                LOG.error(getFullStackTrace(e));
            }

            // Advance offset
            LOG.debug("Runner next read from " + record.key() + " OFFSET " + (record.offset() + 1) + " topic "
                    + record.topic());
            seekAndCommit(new TopicPartition(record.topic(), record.partition()), record.offset() + 1);
        }
    }

    /**
     * Look up the current status of a task in ZooKeeper storage.
     * @param id String id of the task.
     * @return The task's TaskStatus, or null if ZooKeeper has no state for this id.
     */
    private TaskStatus getStatus(String id) {
        SynchronizedState state = zkStorage.getState(id);
        if (state == null) {
            LOG.error("Cant run task - " + id + " - because zkStorage returned null");
            return null;
        }

        return state.status();
    }

    /**
     * Instantiate a BackgroundTask object and run it, catching any thrown Exceptions.
     * @param id String ID of task as used *both* in ZooKeeper and GraknGraph. This must be the ID generated by Grakn Graph.
     * @param configuration JSON configuration for the task with the given id, taken from the work-queue record.
     */
    private void executeTask(String id, JSONObject configuration) {
        try {
            LOG.debug("Executing task " + id);

            // Get full task state.
            TaskState state = graknStorage.getState(id);

            LOG.debug("Got state of " + id + " from storage");

            // Instantiate task.
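            // (Class.newInstance() requires the task class to expose a public no-argument constructor.)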
            Class<?> c = Class.forName(state.taskClassName());
            BackgroundTask task = (BackgroundTask) c.newInstance();

            // Run task.
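            // start() receives a checkpoint-saving callback and the JSON configuration taken from the Kafka record.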
            task.start(saveCheckpoint(id), configuration);

            LOG.debug("Task - " + id + " completed successfully, updating state in graph");
            updateTaskState(id, COMPLETED, this.getClass().getName(), null, null, null);
        } catch (Throwable t) {
            LOG.debug("Failed task - " + id + ": " + getFullStackTrace(t));
            updateTaskState(id, FAILED, this.getClass().getName(), null, t, null);
            LOG.debug("Updated state " + id);
        } finally {
            removeRunningTask(id);
            LOG.debug("Finished executing task - " + id);
        }
    }

    /**
     * Create an InterProcessMutex for the given task (creating its lock ZNode if needed) and try to acquire it.
     * @param id String id of the task that this lock is associated with.
     * @return The acquired InterProcessMutex, or null if the lock could not be acquired within the timeout.
     */
    private InterProcessMutex acquireMutex(String id) {
        InterProcessMutex mutex = null;
        try {
            if (zkStorage.connection().checkExists()
                    .forPath(TASKS_PATH_PREFIX + "/" + id + TASK_LOCK_SUFFIX) == null)
                zkStorage.connection().create().creatingParentContainersIfNeeded()
                        .forPath(TASKS_PATH_PREFIX + "/" + id + TASK_LOCK_SUFFIX);

            mutex = new InterProcessMutex(zkStorage.connection(), TASKS_PATH_PREFIX + "/" + id + TASK_LOCK_SUFFIX);

            if (!mutex.acquire(5000, MILLISECONDS)) {
                LOG.debug("Could not acquire mutex");
                mutex = null;
            }
        } catch (Exception e) {
            LOG.debug("Exception whilst trying to get mutex for task - " + id + " - " + getFullStackTrace(e));
        }

        LOG.debug("<<<<<<<<<<<< Got mutex for - " + id);
        return mutex;
    }

    private void releaseMutex(InterProcessMutex mutex, String id) {
        try {
            mutex.release();
            LOG.debug(">>>>>>>>>>>> released mutex for - " + id);
        } catch (Exception e) {
            LOG.error("********************************\nCOULD NOT RELEASE MUTEX FOR TASK - " + id + "\n"
                    + getFullStackTrace(e) + "\n********************************");
        }
    }

    /**
     * Persists a Background Task's checkpoint to ZK and graph.
     * @param id ID of task
     * @return A {@code Consumer<String>} function that can be called by the background task on demand to save its checkpoint.
     */
    private Consumer<String> saveCheckpoint(String id) {
        return checkpoint -> {
            LOG.debug("Writing checkpoint");
            updateTaskState(id, null, null, null, null, checkpoint);
        };
    }

    private void updateTaskState(String id, TaskStatus status, String statusChangeBy, String engineID,
            Throwable failure, String checkpoint) {
        LOG.debug("Updating state of task " + id);
        zkStorage.updateState(id, status, engineID, checkpoint);
        try {
            graknStorage.updateState(id, status, statusChangeBy, engineID, failure, checkpoint, null);
        } catch (Exception ignored) {
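            // Failures writing to graph storage are intentionally swallowed; the ZooKeeper state was already updated above.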
        }
    }

    private void updateOwnState() {
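        // Publish this runner's current set of task IDs under its RUNNERS_STATE node so that
        // other components (e.g. the TaskFailover watcher mentioned in open()) can see what it owns.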
        JSONArray out = new JSONArray();
        out.put(runningTasks);

        try {
            zkStorage.connection().setData().forPath(RUNNERS_STATE + "/" + engineID, out.toString().getBytes());
        } catch (Exception e) {
            LOG.error("Could not update TaskRunner taskstorage in ZooKeeper! " + e);
        }
    }

    private void registerAsRunning() throws Exception {
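        // The RUNNERS_WATCH node is EPHEMERAL: it disappears if this engine's ZooKeeper session ends,
        // which is presumably what TaskFailover watches to detect a dead runner.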
        if (zkStorage.connection().checkExists().forPath(RUNNERS_WATCH + "/" + engineID) == null) {
            zkStorage.connection().create().creatingParentContainersIfNeeded().withMode(CreateMode.EPHEMERAL)
                    .forPath(RUNNERS_WATCH + "/" + engineID);
        }

        if (zkStorage.connection().checkExists().forPath(RUNNERS_STATE + "/" + engineID) == null) {
            zkStorage.connection().create().creatingParentContainersIfNeeded()
                    .forPath(RUNNERS_STATE + "/" + engineID);
        }

        LOG.debug("Registered TaskRunner");
    }

    private synchronized int getRunningTasksCount() {
        return runningTasks.size();
    }

    private synchronized void addRunningTask(String id) {
        runningTasks.add(id);
        updateOwnState();
    }

    private synchronized void removeRunningTask(String id) {
        runningTasks.remove(id);
        updateOwnState();
    }

    private void seekAndCommit(TopicPartition partition, long offset) {
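        // Reposition the consumer to the given offset and commit it synchronously so the new position is durable.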
        consumer.seek(partition, offset);
        consumer.commitSync();
    }

    private void printInitialization() {
        if (!initialised) {
            initialised = true;
            LOG.info("TaskRunner initialised");
        }
    }
}
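
The class above is only a fragment of the Grakn engine, but as a rough illustration of its lifecycle, here is a minimal usage sketch. It is not part of the original codebase: the bootstrap class name and the shutdown-hook wiring are illustrative assumptions, it must live in the same package because the TaskRunner constructor is package-private, and it presumes the Kafka and ZooKeeper infrastructure that open() relies on is already running.

package ai.grakn.engine.backgroundtasks.distributed;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

// Hypothetical bootstrap class, for illustration only.
public class TaskRunnerBootstrap {
    public static void main(String[] args) throws Exception {
        // open() creates the Kafka consumer, registers the runner in ZooKeeper
        // and builds the worker thread pool.
        TaskRunner runner = new TaskRunner().open();

        // run() blocks while polling the work queue, so give it its own thread.
        ExecutorService loop = Executors.newSingleThreadExecutor();
        loop.submit(runner);

        // close() clears the running flag and wakes the Kafka consumer so run() exits,
        // then shuts down the in-flight task executor.
        Runtime.getRuntime().addShutdownHook(new Thread(() -> {
            runner.close();
            loop.shutdown();
        }));
    }
}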