org.apache.aurora.scheduler.state.TaskStateMachine.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.aurora.scheduler.state.TaskStateMachine.java

Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.aurora.scheduler.state;

import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;

import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

import org.apache.aurora.common.base.Command;
import org.apache.aurora.common.base.Consumers;
import org.apache.aurora.common.base.MorePreconditions;
import org.apache.aurora.common.stats.Stats;
import org.apache.aurora.common.util.StateMachine;
import org.apache.aurora.common.util.StateMachine.Rule;
import org.apache.aurora.common.util.StateMachine.Transition;
import org.apache.aurora.gen.ScheduleStatus;
import org.apache.aurora.scheduler.base.Tasks;
import org.apache.aurora.scheduler.storage.entities.IScheduledTask;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static java.util.Objects.requireNonNull;

import static org.apache.aurora.scheduler.state.SideEffect.Action;
import static org.apache.aurora.scheduler.state.SideEffect.Action.DELETE;
import static org.apache.aurora.scheduler.state.SideEffect.Action.INCREMENT_FAILURES;
import static org.apache.aurora.scheduler.state.SideEffect.Action.KILL;
import static org.apache.aurora.scheduler.state.SideEffect.Action.RESCHEDULE;
import static org.apache.aurora.scheduler.state.SideEffect.Action.SAVE_STATE;
import static org.apache.aurora.scheduler.state.StateChangeResult.ILLEGAL;
import static org.apache.aurora.scheduler.state.StateChangeResult.ILLEGAL_WITH_SIDE_EFFECTS;
import static org.apache.aurora.scheduler.state.StateChangeResult.NOOP;
import static org.apache.aurora.scheduler.state.StateChangeResult.SUCCESS;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.ASSIGNED;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.DELETED;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.DRAINING;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.FAILED;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.FINISHED;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.INIT;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.KILLED;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.KILLING;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.LOST;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.PENDING;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.PREEMPTING;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.RESTARTING;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.RUNNING;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.STARTING;
import static org.apache.aurora.scheduler.state.TaskStateMachine.TaskState.THROTTLED;

/**
 * State machine for a task.
 * <p>
 * This enforces the lifecycle of a task, and triggers the actions that should be taken in response
 * to different state transitions.  These responses are communicated to the caller as the set of
 * {@link SideEffect}s carried by the {@link TransitionResult} returned from
 * {@link #updateState(Optional)}.
 * <p>
 * TODO(wfarner): Augment this class to force the one-time-use nature.  This is probably best done
 * by hiding the constructor and exposing only a static function to transition a task and get the
 * resulting actions.
 */
class TaskStateMachine {
    private static final Logger LOG = LoggerFactory.getLogger(TaskStateMachine.class);

    // Counts every transition attempt that the state machine rejected as illegal.
    private static final AtomicLong ILLEGAL_TRANSITIONS = Stats
            .exportLong("scheduler_illegal_task_state_transitions");

    private final StateMachine<TaskState> stateMachine;

    // State the machine was in before its most recent valid transition, if any.
    private Optional<TaskState> previousState = Optional.absent();

    // Follow-up actions accumulated by the transition callbacks during a single updateState() call.
    private final Set<SideEffect> sideEffects = Sets.newHashSet();

    // Maps a ScheduleStatus to the TaskState constant of the same name.
    private static final Function<ScheduleStatus, TaskState> STATUS_TO_TASK_STATE = input -> TaskState
            .valueOf(input.name());

    private static final Function<IScheduledTask, TaskState> SCHEDULED_TO_TASK_STATE = Functions
            .compose(STATUS_TO_TASK_STATE, IScheduledTask::getStatus);

    /**
     * ScheduleStatus enum extension to account for cases where no direct state mapping exists.
     * TODO:(maxim): Consider making this private.
     */
    @VisibleForTesting
    enum TaskState {
        INIT(Optional.of(ScheduleStatus.INIT)),
        THROTTLED(Optional.of(ScheduleStatus.THROTTLED)),
        PENDING(Optional.of(ScheduleStatus.PENDING)),
        ASSIGNED(Optional.of(ScheduleStatus.ASSIGNED)),
        STARTING(Optional.of(ScheduleStatus.STARTING)),
        RUNNING(Optional.of(ScheduleStatus.RUNNING)),
        FINISHED(Optional.of(ScheduleStatus.FINISHED)),
        PREEMPTING(Optional.of(ScheduleStatus.PREEMPTING)),
        RESTARTING(Optional.of(ScheduleStatus.RESTARTING)),
        DRAINING(Optional.of(ScheduleStatus.DRAINING)),
        FAILED(Optional.of(ScheduleStatus.FAILED)),
        KILLED(Optional.of(ScheduleStatus.KILLED)),
        KILLING(Optional.of(ScheduleStatus.KILLING)),
        LOST(Optional.of(ScheduleStatus.LOST)),
        /**
         * The task does not have an associated state as it has been deleted from the store.
         */
        DELETED(Optional.<ScheduleStatus>absent());

        private final Optional<ScheduleStatus> status;

        TaskState(Optional<ScheduleStatus> status) {
            this.status = status;
        }

        /**
         * Gets the schedule status this state corresponds to.
         *
         * @return The matching status, or absent for {@link #DELETED}.
         */
        Optional<ScheduleStatus> getStatus() {
            return status;
        }
    }

    /**
     * Creates a new task state machine representing a non-existent task.  This allows for consistent
     * state-reconciliation actions when the external system disagrees with the scheduler.
     *
     * @param name Name of the state machine, for logging.
     */
    TaskStateMachine(String name) {
        this(name, Optional.absent());
    }

    /**
     * Creates a new task state machine representing an existent task.  The state machine will be
     * named with the task's ID.
     *
     * @param task Read-only task that this state machine manages.
     */
    TaskStateMachine(IScheduledTask task) {
        this(Tasks.id(task), Optional.of(task));
    }

    /**
     * Builds the transition rules and their follow-up callbacks.
     *
     * @param name Non-blank name of the state machine, for logging.
     * @param task The managed task, or absent when the machine represents a non-existent task, in
     *     which case it starts in {@code DELETED}.
     */
    private TaskStateMachine(final String name, final Optional<IScheduledTask> task) {
        MorePreconditions.checkNotBlank(name);
        requireNonNull(task);

        final TaskState initialState = task.transform(SCHEDULED_TO_TASK_STATE).or(DELETED);
        if (task.isPresent()) {
            Preconditions.checkState(initialState != DELETED, "A task that exists may not be in DELETED state.");
        } else {
            Preconditions.checkState(initialState == DELETED,
                    "A task that does not exist must start in DELETED state.");
        }

        Consumer<Transition<TaskState>> manageTerminatedTasks = Consumers
                .combine(ImmutableList.<Consumer<Transition<TaskState>>>builder()
                        // Kill a task that we believe to be terminated when an attempt is made to revive.
                        .add(Consumers.filter(Transition.to(ASSIGNED, STARTING, RUNNING), addFollowupClosure(KILL)))
                        // Remove a terminated task that is requested to be deleted.
                        .add(Consumers.filter(Transition.to(DELETED), addFollowupClosure(DELETE))).build());

        // Shared by PREEMPTING, RESTARTING and DRAINING.
        final Consumer<Transition<TaskState>> manageRestartingTask = transition -> {
            switch (transition.getTo()) {
            case ASSIGNED:
            case STARTING:
            case RUNNING:
                addFollowup(KILL);
                break;

            case LOST:
                addFollowup(KILL);
                addFollowup(RESCHEDULE);
                break;

            case FINISHED:
            case FAILED:
            case KILLED:
                addFollowup(RESCHEDULE);
                break;

            default:
                // No-op.
            }
        };

        // To be called on a task transitioning into the FINISHED state.
        final Command rescheduleIfService = () -> {
            if (task.get().getAssignedTask().getTask().isIsService()) {
                addFollowup(RESCHEDULE);
            }
        };

        // To be called on a task transitioning into the FAILED state.
        final Command incrementFailuresMaybeReschedule = () -> {
            addFollowup(INCREMENT_FAILURES);

            // Max failures is ignored for service task.
            boolean isService = task.get().getAssignedTask().getTask().isIsService();

            // Max failures is ignored when set to -1.
            int maxFailures = task.get().getAssignedTask().getTask().getMaxTaskFailures();
            // INCREMENT_FAILURES is only queued above, so the stored count presumably does not yet
            // include this failure; hence the comparison against (maxFailures - 1).
            boolean belowMaxFailures = maxFailures == -1 || task.get().getFailureCount() < (maxFailures - 1);
            if (isService || belowMaxFailures) {
                addFollowup(RESCHEDULE);
            } else {
                LOG.info("Task {} reached failure limit, not rescheduling", name);
            }
        };

        final Consumer<Transition<TaskState>> deleteIfKilling = Consumers.filter(Transition.to(KILLING),
                addFollowupClosure(DELETE));

        // Shared by STARTING and RUNNING, whose transitions require identical follow-ups.
        final Consumer<Transition<TaskState>> manageActiveTask = transition -> {
            switch (transition.getTo()) {
            case FINISHED:
                rescheduleIfService.execute();
                break;

            case FAILED:
                incrementFailuresMaybeReschedule.execute();
                break;

            case PREEMPTING:
            case RESTARTING:
            case DRAINING:
            case KILLING:
                addFollowup(KILL);
                break;

            case KILLED:
            case LOST:
                addFollowup(RESCHEDULE);
                break;

            default:
                // No-op.
            }
        };

        // ASSIGNED behaves like STARTING/RUNNING, except that a LOST task is also killed
        // (after the RESCHEDULE follow-up has been queued).
        final Consumer<Transition<TaskState>> manageAssignedTask = transition -> {
            manageActiveTask.accept(transition);
            if (transition.getTo() == LOST) {
                addFollowup(KILL);
            }
        };

        stateMachine = StateMachine.<TaskState>builder(name).logTransitions().initialState(initialState)
                .addState(Rule.from(INIT).to(PENDING, THROTTLED))
                .addState(Rule.from(PENDING).to(ASSIGNED, KILLING).withCallback(deleteIfKilling))
                .addState(Rule.from(THROTTLED).to(PENDING, KILLING).withCallback(deleteIfKilling))
                .addState(Rule.from(ASSIGNED).to(STARTING, RUNNING, FINISHED, FAILED, RESTARTING, DRAINING, KILLED,
                        KILLING, LOST, PREEMPTING).withCallback(manageAssignedTask))
                .addState(Rule.from(STARTING)
                        .to(RUNNING, FINISHED, FAILED, RESTARTING, DRAINING, KILLING, KILLED, LOST, PREEMPTING)
                        .withCallback(manageActiveTask))
                .addState(Rule.from(RUNNING)
                        .to(FINISHED, RESTARTING, DRAINING, FAILED, KILLING, KILLED, LOST, PREEMPTING)
                        .withCallback(manageActiveTask))
                .addState(Rule.from(FINISHED).to(DELETED).withCallback(manageTerminatedTasks))
                .addState(Rule.from(PREEMPTING).to(FINISHED, FAILED, KILLING, KILLED, LOST)
                        .withCallback(manageRestartingTask))
                .addState(Rule.from(RESTARTING).to(FINISHED, FAILED, KILLING, KILLED, LOST)
                        .withCallback(manageRestartingTask))
                .addState(Rule.from(DRAINING).to(FINISHED, FAILED, KILLING, KILLED, LOST)
                        .withCallback(manageRestartingTask))
                .addState(Rule.from(FAILED).to(DELETED).withCallback(manageTerminatedTasks))
                .addState(Rule.from(KILLED).to(DELETED).withCallback(manageTerminatedTasks))
                // TODO(maxim): Re-evaluate if *DELETED states are valid transitions here.
                .addState(Rule.from(KILLING).to(FINISHED, FAILED, KILLED, LOST, DELETED)
                        .withCallback(manageTerminatedTasks))
                .addState(Rule.from(LOST).to(DELETED).withCallback(manageTerminatedTasks))
                .addState(Rule.from(DELETED).noTransitions().withCallback(manageTerminatedTasks))
                // Since we want this action to be performed last in the transition sequence, the callback
                // must be the last chained transition callback.
                .onAnyTransition(new Consumer<Transition<TaskState>>() {
                    @Override
                    public void accept(final Transition<TaskState> transition) {
                        if (transition.isValidStateChange()) {
                            TaskState from = transition.getFrom();
                            TaskState to = transition.getTo();

                            // TODO(wfarner): Clean up this hack.  This is here to suppress unnecessary work
                            // (save followed by delete), but it shows a wart with this catch-all behavior.
                            // Strongly consider pushing the SAVE_STATE behavior to each transition handler.
                            boolean killingUnscheduledTask = (from == PENDING || from == THROTTLED)
                                    && to == KILLING;

                            // Don't bother saving state of a task that is being removed.
                            if (to != DELETED && !killingUnscheduledTask) {
                                addFollowup(SAVE_STATE);
                            }
                            previousState = Optional.of(from);
                        } else {
                            LOG.error("Illegal state transition attempted: {}", transition);
                            ILLEGAL_TRANSITIONS.incrementAndGet();
                        }
                    }
                })
                // TODO(wfarner): Consider alternatives to allow exceptions to surface.  This would allow
                // the state machine to surface illegal state transitions and propagate better information
                // to the caller.  As it stands, the caller must implement logic that really belongs in
                // the state machine.  For example, preventing RESTARTING->UPDATING transitions
                // (or for that matter, almost any user-initiated state transition) is awkward.
                .throwOnBadTransition(false).build();
    }

    /**
     * Queues a follow-up action that carries no next state.
     *
     * @param action Action to queue.
     */
    private void addFollowup(Action action) {
        addFollowup(new SideEffect(action, Optional.absent()));
    }

    /**
     * Queues a side effect to be returned by the in-progress {@link #updateState(Optional)} call.
     *
     * @param sideEffect Side effect to queue.
     */
    private void addFollowup(SideEffect sideEffect) {
        LOG.debug("Adding work command {} for {}", sideEffect, this);
        sideEffects.add(sideEffect);
    }

    /**
     * Wraps an action in a transition callback that queues it, ignoring the transition itself.
     *
     * @param action Action to queue whenever the returned callback is invoked.
     * @return A callback that queues {@code action}.
     */
    private Consumer<Transition<TaskState>> addFollowupClosure(final Action action) {
        return item -> addFollowup(action);
    }

    /**
     * Attempt to transition the state machine to the provided state.
     * At the time this method returns, any side effects required to satisfy the state transition
     * are included in the returned result.
     *
     * TODO(maxim): The current StateManager/TaskStateMachine interaction makes it hard to expose
     * a dedicated task deletion method without leaking out the state machine implementation details.
     * Consider refactoring here to allow for an unambiguous task deletion without resorting to
     * Optional.absent().
     *
     * @param status Status to apply to the task or absent if a task deletion is required.
     * @return The outcome of the attempt together with the side effects it produced:
     *     {@code NOOP} (with no side effects) when the machine is already in the requested state,
     *     {@code SUCCESS} when the transition was allowed, and {@code ILLEGAL} or
     *     {@code ILLEGAL_WITH_SIDE_EFFECTS} when it was rejected, depending on whether any side
     *     effects were produced.
     * @throws IllegalStateException if side effects from an earlier call are still pending.
     */
    public synchronized TransitionResult updateState(final Optional<ScheduleStatus> status) {
        requireNonNull(status);
        Preconditions.checkState(sideEffects.isEmpty());

        // Don't bother applying noop state changes.  If we end up modifying task state without a
        // state transition (e.g. storing resource consumption of a running task), we need to find
        // a different way to suppress noop transitions.
        TaskState taskState = status.transform(STATUS_TO_TASK_STATE).or(DELETED);
        if (stateMachine.getState() == taskState) {
            return new TransitionResult(NOOP, ImmutableSet.of());
        }

        boolean success = stateMachine.transition(taskState);
        // Snapshot and reset the accumulator so that the next call starts from an empty set.
        ImmutableSet<SideEffect> transitionEffects = ImmutableSet.copyOf(sideEffects);
        sideEffects.clear();
        if (success) {
            return new TransitionResult(SUCCESS, transitionEffects);
        }
        return new TransitionResult(transitionEffects.isEmpty() ? ILLEGAL : ILLEGAL_WITH_SIDE_EFFECTS,
                transitionEffects);
    }

    /**
     * Gets the previous state of this state machine.
     *
     * @return The state machine's previous state, or {@code null} if the state machine has not
     *     transitioned since being created (or the previous state has no associated status).
     */
    @Nullable
    ScheduleStatus getPreviousState() {
        // Guava's Optional.transform() throws NullPointerException when the function returns null,
        // so unwrap explicitly instead of mapping to a possibly-null status.
        return previousState.isPresent() ? previousState.get().getStatus().orNull() : null;
    }

    /**
     * Returns the name this state machine was created with.
     */
    @Override
    public String toString() {
        return stateMachine.getName();
    }
}