// TaskQueue — Druid indexing coordinator's producer/consumer task queue.
/*
 * Druid - a distributed column store.
 * Copyright (C) 2012 Metamarkets Group Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

package com.metamx.druid.indexing.coordinator;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.metamx.common.lifecycle.LifecycleStart;
import com.metamx.common.lifecycle.LifecycleStop;
import com.metamx.druid.indexing.common.TaskLock;
import com.metamx.druid.indexing.common.TaskStatus;
import com.metamx.druid.indexing.common.task.Task;
import com.metamx.emitter.EmittingLogger;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

/**
 * Interface between task producers and task consumers.
 * <p/>
 * The queue accepts tasks from producers using {@link #add} and delivers tasks to consumers using either
 * {@link #take} or {@link #poll}. Ordering is mostly-FIFO, with deviations when the natural next task would conflict
 * with a currently-running task. In that case, tasks are skipped until a runnable one is found.
 * <p/>
 * To manage locking, the queue keeps track of currently-running tasks as
 * {@link com.metamx.druid.indexing.common.TaskLock} objects. The idea is that only one TaskLock can be running on a
 * particular dataSource + interval, and that TaskLock has a single version string that all tasks in the group must
 * use to publish segments. Tasks in the same TaskLock may run concurrently.
 * <p/>
 * For persistence, the queue saves new tasks from {@link #add} and task status updates from {@link #notify} using a
 * {@link TaskStorage} object.
 * <p/>
 * To support leader election of our containing system, the queue can be stopped (in which case it will not accept
 * any new tasks, or hand out any more tasks, until started again).
 * <p/>
 * Thread-safety: all mutable state is guarded by the single "giant" lock; every public method acquires it.
 */
public class TaskQueue
{
  // Pending (not yet handed out) tasks, in mostly-FIFO order. Guarded by "giant".
  private final List<Task> queue = Lists.newLinkedList();
  private final TaskStorage taskStorage;
  private final TaskLockbox taskLockbox;

  // Single coarse lock protecting queue, lockbox interactions, and the active flag.
  private final ReentrantLock giant = new ReentrantLock();
  // Signalled when new work may be pollable (task added, queue started, or a lock released).
  private final Condition workMayBeAvailable = giant.newCondition();

  // Volatile so stop() is promptly visible; writes happen under "giant".
  private volatile boolean active = false;

  private static final EmittingLogger log = new EmittingLogger(TaskQueue.class);

  /**
   * @param taskStorage persistence facility for tasks, statuses and locks; must not be null
   * @param taskLockbox lock manager for dataSource + interval locks; must not be null
   */
  public TaskQueue(TaskStorage taskStorage, TaskLockbox taskLockbox)
  {
    this.taskStorage = Preconditions.checkNotNull(taskStorage, "taskStorage");
    this.taskLockbox = Preconditions.checkNotNull(taskLockbox, "taskLockbox");
  }

  /**
   * Bootstraps this task queue and associated task lockbox. Clears the lockbox before running. Should be called
   * while the queue is stopped. It is not a good idea to start the queue if this method fails.
   *
   * @throws IllegalStateException if the queue is currently active
   */
  public void bootstrap()
  {
    // NOTE: Bootstraps can resurrect bogus stuff caused by leader races or whatevs.
    // We may want to periodically fixup the database to refer to what we think is happening, to prevent
    // this from occurring and also so that bogus stuff is detected by clients in a timely manner.
    giant.lock();

    try {
      Preconditions.checkState(!active, "queue must be stopped");

      log.info("Bootstrapping queue (and associated lockbox)");

      queue.clear();
      taskLockbox.clear();

      // Get all running tasks and their locks
      final Multimap<TaskLock, Task> tasksByLock = ArrayListMultimap.create();

      for (final Task task : taskStorage.getRunningTasks()) {
        try {
          final List<TaskLock> taskLocks = taskStorage.getLocks(task.getId());

          queue.add(task);

          for (final TaskLock taskLock : taskLocks) {
            tasksByLock.put(taskLock, task);
          }
        }
        catch (Exception e) {
          log.makeAlert("Failed to bootstrap task").addData("task", task.getId()).emit();
          throw Throwables.propagate(e);
        }
      }

      // Sort locks by version, so that locks are reacquired in the same order they were originally granted.
      final Ordering<Map.Entry<TaskLock, Task>> byVersionOrdering = new Ordering<Map.Entry<TaskLock, Task>>()
      {
        @Override
        public int compare(Map.Entry<TaskLock, Task> left, Map.Entry<TaskLock, Task> right)
        {
          return left.getKey().getVersion().compareTo(right.getKey().getVersion());
        }
      };

      // Acquire as many locks as possible, in version order
      for (final Map.Entry<TaskLock, Task> taskAndLock : byVersionOrdering.sortedCopy(tasksByLock.entries())) {
        final Task task = taskAndLock.getValue();
        final TaskLock savedTaskLock = taskAndLock.getKey();

        final Optional<TaskLock> acquiredTaskLock = taskLockbox.tryLock(
            task,
            savedTaskLock.getInterval(),
            Optional.of(savedTaskLock.getVersion())
        );

        if (acquiredTaskLock.isPresent() && savedTaskLock.getVersion().equals(acquiredTaskLock.get().getVersion())) {
          log.info(
              "Reacquired lock on interval[%s] version[%s] for task: %s",
              savedTaskLock.getInterval(),
              savedTaskLock.getVersion(),
              task.getId()
          );
        } else if (acquiredTaskLock.isPresent()) {
          log.info(
              "Could not reacquire lock on interval[%s] version[%s] (got version[%s] instead) for task: %s",
              savedTaskLock.getInterval(),
              savedTaskLock.getVersion(),
              acquiredTaskLock.get().getVersion(),
              task.getId()
          );
        } else {
          log.info(
              "Could not reacquire lock on interval[%s] version[%s] for task: %s",
              savedTaskLock.getInterval(),
              savedTaskLock.getVersion(),
              task.getId()
          );
        }
      }

      log.info("Bootstrapped %,d tasks with %,d locks. Ready to go!", queue.size(), tasksByLock.keySet().size());
    }
    finally {
      giant.unlock();
    }
  }

  /**
   * Returns an immutable snapshot of the current status of this queue.
   *
   * @return immutable copy of the pending task list
   */
  public List<Task> snapshot()
  {
    giant.lock();

    try {
      return ImmutableList.copyOf(queue);
    }
    finally {
      giant.unlock();
    }
  }

  /**
   * Starts this task queue. Allows {@link #add(Task)} to accept new tasks. This should not be called on
   * an already-started queue.
   *
   * @throws IllegalStateException if the queue is already active
   */
  @LifecycleStart
  public void start()
  {
    giant.lock();

    try {
      Preconditions.checkState(!active, "queue must be stopped");

      active = true;

      // Wake any consumers blocked in take() now that the queue accepts work.
      workMayBeAvailable.signalAll();
    }
    finally {
      giant.unlock();
    }
  }

  /**
   * Shuts down the queue, for now. This may safely be called on an already-stopped queue. The queue may be restarted
   * if desired.
   */
  @LifecycleStop
  public void stop()
  {
    giant.lock();

    try {
      log.info("Naptime! Shutting down until we are started again.");

      queue.clear();
      taskLockbox.clear();
      active = false;
    }
    finally {
      giant.unlock();
    }
  }

  /**
   * Adds some work to the queue and the underlying task storage facility with a generic "running" status.
   *
   * @param task task to add
   *
   * @return true
   *
   * @throws IllegalStateException if the queue is not active
   * @throws NullPointerException  if task is null
   */
  public boolean add(final Task task)
  {
    giant.lock();

    try {
      Preconditions.checkState(active, "Queue is not active!");
      Preconditions.checkNotNull(task, "task");

      // If this throws with any sort of exception, including TaskExistsException, we don't want to
      // insert the task into our queue.
      try {
        taskStorage.insert(task, TaskStatus.running(task.getId()));
      }
      catch (TaskExistsException e) {
        log.warn("Attempt to add task twice: %s", task.getId());
        throw Throwables.propagate(e);
      }

      queue.add(task);
      workMayBeAvailable.signalAll();

      // Attempt to add this task to a running task group. Silently continue if this is not possible.
      // The main reason this is here is so when subtasks are added, they end up in the same task group
      // as their parent whenever possible.
      if (task.getImplicitLockInterval().isPresent()) {
        taskLockbox.tryLock(task, task.getImplicitLockInterval().get());
      }

      return true;
    }
    finally {
      giant.unlock();
    }
  }

  /**
   * Locks and returns next doable work from the queue. Blocks if there is no doable work.
   *
   * @return runnable task
   *
   * @throws InterruptedException if interrupted while waiting for work
   */
  public Task take() throws InterruptedException
  {
    giant.lock();

    try {
      Task task;

      log.info("Waiting for work...");

      while ((task = poll()) == null) {
        // awaitNanos because work may become available without this condition signalling,
        // due to other folks messing with the taskLockbox
        workMayBeAvailable.awaitNanos(1000000000L /* 1 second */);
      }

      return task;
    }
    finally {
      giant.unlock();
    }
  }

  /**
   * Locks and removes next doable work from the queue. Returns null if there is no doable work.
   *
   * @return runnable task or null
   */
  public Task poll()
  {
    giant.lock();

    try {
      // Walk the queue in order; remove via the iterator so removal is safe during iteration and
      // avoids a second O(n) scan (the original code called queue.remove(task) from inside a for-each).
      final Iterator<Task> it = queue.iterator();
      while (it.hasNext()) {
        final Task task = it.next();

        if (task.getImplicitLockInterval().isPresent()) {
          // If this task has a fixed interval, attempt to lock it right now.
          final Optional<TaskLock> maybeLock = taskLockbox.tryLock(task, task.getImplicitLockInterval().get());
          if (maybeLock.isPresent()) {
            log.info("Task claimed with fixed interval lock: %s", task.getId());
            it.remove();
            return task;
          }
        } else {
          // No fixed interval. Let's just run this and see what happens.
          log.info("Task claimed with no fixed interval lock: %s", task.getId());
          it.remove();
          return task;
        }
      }

      return null;
    }
    finally {
      giant.unlock();
    }
  }

  /**
   * Notify this queue that some task has an updated status. If this update is valid, the status will be persisted in
   * the task storage facility. If the status is a completed status, the task will be unlocked and no further
   * updates will be accepted.
   *
   * @param task       task to update
   * @param taskStatus new task status
   *
   * @throws NullPointerException     if task or status is null
   * @throws IllegalArgumentException if the task ID does not match the status ID
   * @throws IllegalStateException    if this queue is currently shut down
   */
  public void notify(final Task task, final TaskStatus taskStatus)
  {
    giant.lock();

    try {
      Preconditions.checkNotNull(task, "task");
      Preconditions.checkNotNull(taskStatus, "status");
      Preconditions.checkState(active, "Queue is not active!");
      Preconditions.checkArgument(
          task.getId().equals(taskStatus.getId()),
          "Mismatching task ids[%s/%s]",
          task.getId(),
          taskStatus.getId()
      );

      // Save status to DB
      boolean didPersistStatus = false;
      try {
        final Optional<TaskStatus> previousStatus = taskStorage.getStatus(task.getId());
        if (!previousStatus.isPresent() || !previousStatus.get().isRunnable()) {
          // Task is unknown or already terminal; drop this notification on the floor.
          log.makeAlert("Ignoring notification for dead task").addData("task", task.getId()).emit();
          return;
        } else {
          taskStorage.setStatus(taskStatus);
          didPersistStatus = true;
        }
      }
      catch (Exception e) {
        // Best-effort: alert but do not propagate; the reinsertion path below handles completed tasks.
        log.makeAlert(e, "Failed to persist status for task")
           .addData("task", task.getId())
           .addData("statusCode", taskStatus.getStatusCode())
           .emit();
      }

      if (taskStatus.isComplete()) {
        if (didPersistStatus) {
          log.info("Task done: %s", task);
          taskLockbox.unlock(task);
          workMayBeAvailable.signalAll();
        } else {
          // TODO: This could be a task-status-submission retry queue instead of retrying the entire task,
          // TODO: which is heavy and probably not necessary.
          // Consumers blocked in take() will pick this up within the 1-second awaitNanos poll.
          log.warn("Status could not be persisted! Reinserting task: %s", task.getId());
          queue.add(task);
        }
      }
    }
    finally {
      giant.unlock();
    }
  }
}