org.apache.aurora.scheduler.thrift.SchedulerThriftInterface.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.aurora.scheduler.thrift.SchedulerThriftInterface.java

Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.aurora.scheduler.thrift;

import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import javax.annotation.Nullable;
import javax.inject.Inject;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ContiguousSet;
import com.google.common.collect.DiscreteDomain;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Range;
import com.google.common.collect.Sets;

import org.apache.aurora.common.stats.StatsProvider;
import org.apache.aurora.gen.ConfigRewrite;
import org.apache.aurora.gen.DrainHostsResult;
import org.apache.aurora.gen.EndMaintenanceResult;
import org.apache.aurora.gen.ExplicitReconciliationSettings;
import org.apache.aurora.gen.Hosts;
import org.apache.aurora.gen.InstanceKey;
import org.apache.aurora.gen.InstanceTaskConfig;
import org.apache.aurora.gen.JobConfiguration;
import org.apache.aurora.gen.JobKey;
import org.apache.aurora.gen.JobUpdate;
import org.apache.aurora.gen.JobUpdateInstructions;
import org.apache.aurora.gen.JobUpdateKey;
import org.apache.aurora.gen.JobUpdatePulseStatus;
import org.apache.aurora.gen.JobUpdateQuery;
import org.apache.aurora.gen.JobUpdateRequest;
import org.apache.aurora.gen.JobUpdateSettings;
import org.apache.aurora.gen.JobUpdateSummary;
import org.apache.aurora.gen.ListBackupsResult;
import org.apache.aurora.gen.LockKey;
import org.apache.aurora.gen.MaintenanceStatusResult;
import org.apache.aurora.gen.PulseJobUpdateResult;
import org.apache.aurora.gen.QueryRecoveryResult;
import org.apache.aurora.gen.ReadOnlyScheduler;
import org.apache.aurora.gen.ResourceAggregate;
import org.apache.aurora.gen.Response;
import org.apache.aurora.gen.Result;
import org.apache.aurora.gen.RewriteConfigsRequest;
import org.apache.aurora.gen.ScheduleStatus;
import org.apache.aurora.gen.StartJobUpdateResult;
import org.apache.aurora.gen.StartMaintenanceResult;
import org.apache.aurora.gen.TaskQuery;
import org.apache.aurora.scheduler.TaskIdGenerator;
import org.apache.aurora.scheduler.base.JobKeys;
import org.apache.aurora.scheduler.base.Numbers;
import org.apache.aurora.scheduler.base.Query;
import org.apache.aurora.scheduler.base.Tasks;
import org.apache.aurora.scheduler.configuration.ConfigurationManager;
import org.apache.aurora.scheduler.configuration.ConfigurationManager.TaskDescriptionException;
import org.apache.aurora.scheduler.configuration.SanitizedConfiguration;
import org.apache.aurora.scheduler.cron.CronException;
import org.apache.aurora.scheduler.cron.CronJobManager;
import org.apache.aurora.scheduler.cron.SanitizedCronJob;
import org.apache.aurora.scheduler.quota.QuotaCheckResult;
import org.apache.aurora.scheduler.quota.QuotaManager;
import org.apache.aurora.scheduler.quota.QuotaManager.QuotaException;
import org.apache.aurora.scheduler.reconciliation.TaskReconciler;
import org.apache.aurora.scheduler.state.LockManager;
import org.apache.aurora.scheduler.state.LockManager.LockException;
import org.apache.aurora.scheduler.state.MaintenanceController;
import org.apache.aurora.scheduler.state.StateChangeResult;
import org.apache.aurora.scheduler.state.StateManager;
import org.apache.aurora.scheduler.state.UUIDGenerator;
import org.apache.aurora.scheduler.storage.CronJobStore;
import org.apache.aurora.scheduler.storage.Storage.MutableStoreProvider;
import org.apache.aurora.scheduler.storage.Storage.MutateWork.NoResult;
import org.apache.aurora.scheduler.storage.Storage.NonVolatileStorage;
import org.apache.aurora.scheduler.storage.Storage.StoreProvider;
import org.apache.aurora.scheduler.storage.backup.Recovery;
import org.apache.aurora.scheduler.storage.backup.StorageBackup;
import org.apache.aurora.scheduler.storage.entities.IAssignedTask;
import org.apache.aurora.scheduler.storage.entities.IConfigRewrite;
import org.apache.aurora.scheduler.storage.entities.IHostStatus;
import org.apache.aurora.scheduler.storage.entities.IInstanceConfigRewrite;
import org.apache.aurora.scheduler.storage.entities.IInstanceKey;
import org.apache.aurora.scheduler.storage.entities.IJobConfigRewrite;
import org.apache.aurora.scheduler.storage.entities.IJobConfiguration;
import org.apache.aurora.scheduler.storage.entities.IJobKey;
import org.apache.aurora.scheduler.storage.entities.IJobUpdate;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateKey;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateRequest;
import org.apache.aurora.scheduler.storage.entities.IJobUpdateSettings;
import org.apache.aurora.scheduler.storage.entities.ILockKey;
import org.apache.aurora.scheduler.storage.entities.IMetadata;
import org.apache.aurora.scheduler.storage.entities.IRange;
import org.apache.aurora.scheduler.storage.entities.IScheduledTask;
import org.apache.aurora.scheduler.storage.entities.ITaskConfig;
import org.apache.aurora.scheduler.storage.log.ThriftBackfill;
import org.apache.aurora.scheduler.thrift.aop.AnnotatedAuroraAdmin;
import org.apache.aurora.scheduler.thrift.aop.ThriftWorkload;
import org.apache.aurora.scheduler.thrift.auth.DecoratedThrift;
import org.apache.aurora.scheduler.updater.JobDiff;
import org.apache.aurora.scheduler.updater.JobUpdateController;
import org.apache.aurora.scheduler.updater.JobUpdateController.AuditData;
import org.apache.aurora.scheduler.updater.UpdateInProgressException;
import org.apache.aurora.scheduler.updater.UpdateStateException;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static java.util.Objects.requireNonNull;

import static org.apache.aurora.common.base.MorePreconditions.checkNotBlank;
import static org.apache.aurora.gen.ResponseCode.INVALID_REQUEST;
import static org.apache.aurora.gen.ResponseCode.LOCK_ERROR;
import static org.apache.aurora.gen.ResponseCode.OK;
import static org.apache.aurora.gen.ResponseCode.WARNING;
import static org.apache.aurora.scheduler.base.Numbers.convertRanges;
import static org.apache.aurora.scheduler.base.Numbers.toRanges;
import static org.apache.aurora.scheduler.base.Tasks.ACTIVE_STATES;
import static org.apache.aurora.scheduler.base.Tasks.TERMINAL_STATES;
import static org.apache.aurora.scheduler.quota.QuotaCheckResult.Result.INSUFFICIENT_QUOTA;
import static org.apache.aurora.scheduler.thrift.Responses.addMessage;
import static org.apache.aurora.scheduler.thrift.Responses.empty;
import static org.apache.aurora.scheduler.thrift.Responses.error;
import static org.apache.aurora.scheduler.thrift.Responses.invalidRequest;
import static org.apache.aurora.scheduler.thrift.Responses.ok;

/**
 * Aurora scheduler thrift server implementation.
 * <p/>
 * Interfaces between users and the scheduler to access/modify jobs and perform cluster
 * administration tasks.
 */
@DecoratedThrift
class SchedulerThriftInterface implements AnnotatedAuroraAdmin {

    // This number is derived from the maximum file name length limit on most UNIX systems, less
    // the number of characters we've observed being added by mesos for the executor ID, prefix, and
    // delimiters.
    @VisibleForTesting
    static final int MAX_TASK_ID_LENGTH = 255 - 90;
    @VisibleForTesting
    static final String STAT_PREFIX = "thrift_workload_";
    @VisibleForTesting
    static final String CREATE_JOB = STAT_PREFIX + "createJob";
    @VisibleForTesting
    static final String CREATE_OR_UPDATE_CRON = STAT_PREFIX + "createOrUpdateCronTemplate";
    @VisibleForTesting
    static final String KILL_TASKS = STAT_PREFIX + "killTasks";
    @VisibleForTesting
    static final String RESTART_SHARDS = STAT_PREFIX + "restartShards";
    @VisibleForTesting
    static final String START_MAINTENANCE = STAT_PREFIX + "startMaintenance";
    @VisibleForTesting
    static final String DRAIN_HOSTS = STAT_PREFIX + "drainHosts";
    @VisibleForTesting
    static final String MAINTENANCE_STATUS = STAT_PREFIX + "maintenanceStatus";
    @VisibleForTesting
    static final String END_MAINTENANCE = STAT_PREFIX + "endMaintenance";
    @VisibleForTesting
    static final String REWRITE_CONFIGS = STAT_PREFIX + "rewriteConfigs";
    @VisibleForTesting
    static final String ADD_INSTANCES = STAT_PREFIX + "addInstances";
    @VisibleForTesting
    static final String START_JOB_UPDATE = STAT_PREFIX + "startJobUpdate";

    private static final Logger LOG = LoggerFactory.getLogger(SchedulerThriftInterface.class);

    private final ConfigurationManager configurationManager;
    private final Thresholds thresholds;
    private final NonVolatileStorage storage;
    private final LockManager lockManager;
    private final StorageBackup backup;
    private final Recovery recovery;
    private final MaintenanceController maintenance;
    private final CronJobManager cronJobManager;
    private final QuotaManager quotaManager;
    private final StateManager stateManager;
    private final TaskIdGenerator taskIdGenerator;
    private final UUIDGenerator uuidGenerator;
    private final JobUpdateController jobUpdateController;
    private final ReadOnlyScheduler.Iface readOnlyScheduler;
    private final AuditMessages auditMessages;
    private final TaskReconciler taskReconciler;

    private final AtomicLong createJobCounter;
    private final AtomicLong createOrUpdateCronCounter;
    private final AtomicLong killTasksCounter;
    private final AtomicLong restartShardsCounter;
    private final AtomicLong startMaintenanceCounter;
    private final AtomicLong drainHostsCounter;
    private final AtomicLong maintenanceStatusCounter;
    private final AtomicLong endMaintenanceCounter;
    private final AtomicLong rewriteConfigsCounter;
    private final AtomicLong addInstancesCounter;
    private final AtomicLong startJobUpdateCounter;

    @Inject
    SchedulerThriftInterface(ConfigurationManager configurationManager, Thresholds thresholds,
            NonVolatileStorage storage, LockManager lockManager, StorageBackup backup, Recovery recovery,
            CronJobManager cronJobManager, MaintenanceController maintenance, QuotaManager quotaManager,
            StateManager stateManager, TaskIdGenerator taskIdGenerator, UUIDGenerator uuidGenerator,
            JobUpdateController jobUpdateController, ReadOnlyScheduler.Iface readOnlyScheduler,
            AuditMessages auditMessages, TaskReconciler taskReconciler, StatsProvider statsProvider) {

        this.configurationManager = requireNonNull(configurationManager);
        this.thresholds = requireNonNull(thresholds);
        this.storage = requireNonNull(storage);
        this.lockManager = requireNonNull(lockManager);
        this.backup = requireNonNull(backup);
        this.recovery = requireNonNull(recovery);
        this.maintenance = requireNonNull(maintenance);
        this.cronJobManager = requireNonNull(cronJobManager);
        this.quotaManager = requireNonNull(quotaManager);
        this.stateManager = requireNonNull(stateManager);
        this.taskIdGenerator = requireNonNull(taskIdGenerator);
        this.uuidGenerator = requireNonNull(uuidGenerator);
        this.jobUpdateController = requireNonNull(jobUpdateController);
        this.readOnlyScheduler = requireNonNull(readOnlyScheduler);
        this.auditMessages = requireNonNull(auditMessages);
        this.taskReconciler = requireNonNull(taskReconciler);

        this.createJobCounter = statsProvider.makeCounter(CREATE_JOB);
        this.createOrUpdateCronCounter = statsProvider.makeCounter(CREATE_OR_UPDATE_CRON);
        this.killTasksCounter = statsProvider.makeCounter(KILL_TASKS);
        this.restartShardsCounter = statsProvider.makeCounter(RESTART_SHARDS);
        this.startMaintenanceCounter = statsProvider.makeCounter(START_MAINTENANCE);
        this.drainHostsCounter = statsProvider.makeCounter(DRAIN_HOSTS);
        this.maintenanceStatusCounter = statsProvider.makeCounter(MAINTENANCE_STATUS);
        this.endMaintenanceCounter = statsProvider.makeCounter(END_MAINTENANCE);
        this.rewriteConfigsCounter = statsProvider.makeCounter(REWRITE_CONFIGS);
        this.addInstancesCounter = statsProvider.makeCounter(ADD_INSTANCES);
        this.startJobUpdateCounter = statsProvider.makeCounter(START_JOB_UPDATE);
    }

    @Override
    public Response createJob(JobConfiguration mutableJob) {
        SanitizedConfiguration sanitized;
        try {
            sanitized = SanitizedConfiguration.fromUnsanitized(configurationManager,
                    IJobConfiguration.build(mutableJob));
        } catch (TaskDescriptionException e) {
            return error(INVALID_REQUEST, e);
        }

        if (sanitized.isCron()) {
            return invalidRequest(NO_CRON);
        }

        return storage.write(storeProvider -> {
            IJobConfiguration job = sanitized.getJobConfig();

            try {
                lockManager.assertNotLocked(ILockKey.build(LockKey.job(job.getKey().newBuilder())));

                checkJobExists(storeProvider, job.getKey());

                ITaskConfig template = sanitized.getJobConfig().getTaskConfig();
                int count = sanitized.getJobConfig().getInstanceCount();

                validateTaskLimits(template, count,
                        quotaManager.checkInstanceAddition(template, count, storeProvider));

                LOG.info("Launching " + count + " tasks.");
                stateManager.insertPendingTasks(storeProvider, template, sanitized.getInstanceIds());
                createJobCounter.addAndGet(sanitized.getInstanceIds().size());

                return ok();
            } catch (LockException e) {
                return error(LOCK_ERROR, e);
            } catch (JobExistsException | TaskValidationException e) {
                return error(INVALID_REQUEST, e);
            }
        });
    }

    private static class JobExistsException extends Exception {
        JobExistsException(String message) {
            super(message);
        }
    }

    private void checkJobExists(StoreProvider store, IJobKey jobKey) throws JobExistsException {
        if (!Iterables.isEmpty(store.getTaskStore().fetchTasks(Query.jobScoped(jobKey).active()))
                || getCronJob(store, jobKey).isPresent()) {

            throw new JobExistsException(jobAlreadyExistsMessage(jobKey));
        }
    }

    private Response createOrUpdateCronTemplate(JobConfiguration mutableJob, boolean updateOnly) {

        IJobConfiguration job = IJobConfiguration.build(mutableJob);
        IJobKey jobKey = JobKeys.assertValid(job.getKey());

        SanitizedConfiguration sanitized;
        try {
            sanitized = SanitizedConfiguration.fromUnsanitized(configurationManager, job);
        } catch (TaskDescriptionException e) {
            return error(INVALID_REQUEST, e);
        }

        if (!sanitized.isCron()) {
            return invalidRequest(noCronScheduleMessage(jobKey));
        }

        return storage.write(storeProvider -> {
            try {
                lockManager.assertNotLocked(ILockKey.build(LockKey.job(jobKey.newBuilder())));

                ITaskConfig template = sanitized.getJobConfig().getTaskConfig();
                int count = sanitized.getJobConfig().getInstanceCount();

                validateTaskLimits(template, count,
                        quotaManager.checkCronUpdate(sanitized.getJobConfig(), storeProvider));

                // TODO(mchucarroll): Merge CronJobManager.createJob/updateJob
                if (updateOnly || getCronJob(storeProvider, jobKey).isPresent()) {
                    // The job already has a schedule: so update it.
                    cronJobManager.updateJob(SanitizedCronJob.from(sanitized));
                } else {
                    checkJobExists(storeProvider, jobKey);
                    cronJobManager.createJob(SanitizedCronJob.from(sanitized));
                }
                createOrUpdateCronCounter.addAndGet(count);

                return ok();
            } catch (LockException e) {
                return error(LOCK_ERROR, e);
            } catch (JobExistsException | TaskValidationException | CronException e) {
                return error(INVALID_REQUEST, e);
            }
        });
    }

    @Override
    public Response scheduleCronJob(JobConfiguration mutableJob) {
        return createOrUpdateCronTemplate(mutableJob, false);
    }

    @Override
    public Response replaceCronTemplate(JobConfiguration mutableJob) {
        return createOrUpdateCronTemplate(mutableJob, true);
    }

    @Override
    public Response descheduleCronJob(JobKey mutableJobKey) {
        try {
            IJobKey jobKey = JobKeys.assertValid(IJobKey.build(mutableJobKey));
            lockManager.assertNotLocked(ILockKey.build(LockKey.job(jobKey.newBuilder())));

            if (cronJobManager.deleteJob(jobKey)) {
                return ok();
            } else {
                return addMessage(empty(), OK, notScheduledCronMessage(jobKey));
            }
        } catch (LockException e) {
            return error(LOCK_ERROR, e);
        }
    }

    @Override
    public Response populateJobConfig(JobConfiguration description) throws TException {
        return readOnlyScheduler.populateJobConfig(description);
    }

    @Override
    public Response startCronJob(JobKey mutableJobKey) {
        IJobKey jobKey = JobKeys.assertValid(IJobKey.build(mutableJobKey));

        try {
            cronJobManager.startJobNow(jobKey);
            return ok();
        } catch (CronException e) {
            return invalidRequest("Failed to start cron job - " + e.getMessage());
        }
    }

    // TODO(William Farner): Provide status information about cron jobs here.
    @ThriftWorkload
    @Override
    public Response getTasksStatus(TaskQuery query) throws TException {
        return readOnlyScheduler.getTasksStatus(query);
    }

    @ThriftWorkload
    @Override
    public Response getTasksWithoutConfigs(TaskQuery query) throws TException {
        return readOnlyScheduler.getTasksWithoutConfigs(query);
    }

    @ThriftWorkload
    @Override
    public Response getPendingReason(TaskQuery query) throws TException {
        return readOnlyScheduler.getPendingReason(query);
    }

    @ThriftWorkload
    @Override
    public Response getConfigSummary(JobKey job) throws TException {
        return readOnlyScheduler.getConfigSummary(job);
    }

    @ThriftWorkload
    @Override
    public Response getRoleSummary() throws TException {
        return readOnlyScheduler.getRoleSummary();
    }

    @ThriftWorkload
    @Override
    public Response getJobSummary(@Nullable String maybeNullRole) throws TException {
        return readOnlyScheduler.getJobSummary(maybeNullRole);
    }

    @ThriftWorkload
    @Override
    public Response getJobs(@Nullable String maybeNullRole) throws TException {
        return readOnlyScheduler.getJobs(maybeNullRole);
    }

    @Override
    public Response getJobUpdateDiff(JobUpdateRequest request) throws TException {
        return readOnlyScheduler.getJobUpdateDiff(request);
    }

    @Override
    public Response getTierConfigs() throws TException {
        return readOnlyScheduler.getTierConfigs();
    }

    private void validateLockForTasks(Iterable<IScheduledTask> tasks) throws LockException {
        ImmutableSet<IJobKey> uniqueKeys = FluentIterable.from(tasks).transform(Tasks::getJob).toSet();

        // Validate lock against every unique job key derived from the tasks.
        for (IJobKey key : uniqueKeys) {
            lockManager.assertNotLocked(ILockKey.build(LockKey.job(key.newBuilder())));
        }
    }

    private static Query.Builder implicitKillQuery(Query.Builder query) {
        // Unless statuses were specifically supplied, only attempt to kill active tasks.
        return query.get().getStatuses().isEmpty() ? query.byStatus(ACTIVE_STATES) : query;
    }

    @Override
    public Response killTasks(@Nullable JobKey mutableJob, @Nullable Set<Integer> instances,
            @Nullable String message) {

        Response response = empty();
        IJobKey jobKey = JobKeys.assertValid(IJobKey.build(mutableJob));
        Query.Builder query;
        if (instances == null || Iterables.isEmpty(instances)) {
            query = implicitKillQuery(Query.jobScoped(jobKey));
        } else {
            query = implicitKillQuery(Query.instanceScoped(jobKey, instances));
        }

        return storage.write(storeProvider -> {
            Iterable<IScheduledTask> tasks = storeProvider.getTaskStore().fetchTasks(query);
            try {
                validateLockForTasks(tasks);
            } catch (LockException e) {
                return error(LOCK_ERROR, e);
            }

            LOG.info("Killing tasks matching " + query);

            int tasksKilled = 0;
            for (String taskId : Tasks.ids(tasks)) {
                if (StateChangeResult.SUCCESS == stateManager.changeState(storeProvider, taskId, Optional.absent(),
                        ScheduleStatus.KILLING, auditMessages.killedByRemoteUser(Optional.fromNullable(message)))) {
                    ++tasksKilled;
                }
            }
            killTasksCounter.addAndGet(tasksKilled);

            return tasksKilled > 0 ? response.setResponseCode(OK)
                    : addMessage(response, OK, NO_TASKS_TO_KILL_MESSAGE);
        });
    }

    @Override
    public Response restartShards(JobKey mutableJobKey, Set<Integer> shardIds) {
        IJobKey jobKey = JobKeys.assertValid(IJobKey.build(mutableJobKey));
        checkNotBlank(shardIds);

        return storage.write(storeProvider -> {
            try {
                lockManager.assertNotLocked(ILockKey.build(LockKey.job(jobKey.newBuilder())));
            } catch (LockException e) {
                return error(LOCK_ERROR, e);
            }

            Query.Builder query = Query.instanceScoped(jobKey, shardIds).active();
            Iterable<IScheduledTask> matchingTasks = storeProvider.getTaskStore().fetchTasks(query);
            if (Iterables.size(matchingTasks) != shardIds.size()) {
                return invalidRequest("Not all requested shards are active.");
            }

            LOG.info("Restarting shards matching " + query);
            for (String taskId : Tasks.ids(matchingTasks)) {
                stateManager.changeState(storeProvider, taskId, Optional.absent(), ScheduleStatus.RESTARTING,
                        auditMessages.restartedByRemoteUser());
            }
            restartShardsCounter.addAndGet(shardIds.size());

            return ok();
        });
    }

    @Override
    public Response getQuota(String ownerRole) throws TException {
        return readOnlyScheduler.getQuota(ownerRole);
    }

    @Override
    public Response setQuota(String ownerRole, ResourceAggregate resourceAggregate) {
        checkNotBlank(ownerRole);
        requireNonNull(resourceAggregate);

        try {
            storage.write((NoResult<QuotaException>) store -> quotaManager.saveQuota(ownerRole,
                    ThriftBackfill.backfillResourceAggregate(resourceAggregate), store));
            return ok();
        } catch (QuotaException e) {
            return error(INVALID_REQUEST, e);
        }
    }

    @Override
    public Response startMaintenance(Hosts hosts) {
        startMaintenanceCounter.addAndGet(hosts.getHostNamesSize());
        return ok(Result.startMaintenanceResult(new StartMaintenanceResult()
                .setStatuses(IHostStatus.toBuildersSet(maintenance.startMaintenance(hosts.getHostNames())))));
    }

    @Override
    public Response drainHosts(Hosts hosts) {
        drainHostsCounter.addAndGet(hosts.getHostNamesSize());
        return ok(Result.drainHostsResult(new DrainHostsResult()
                .setStatuses(IHostStatus.toBuildersSet(maintenance.drain(hosts.getHostNames())))));
    }

    @Override
    public Response maintenanceStatus(Hosts hosts) {
        maintenanceStatusCounter.addAndGet(hosts.getHostNamesSize());
        return ok(Result.maintenanceStatusResult(new MaintenanceStatusResult()
                .setStatuses(IHostStatus.toBuildersSet(maintenance.getStatus(hosts.getHostNames())))));
    }

    @Override
    public Response endMaintenance(Hosts hosts) {
        endMaintenanceCounter.addAndGet(hosts.getHostNamesSize());
        return ok(Result.endMaintenanceResult(new EndMaintenanceResult()
                .setStatuses(IHostStatus.toBuildersSet(maintenance.endMaintenance(hosts.getHostNames())))));
    }

    @Override
    public Response forceTaskState(String taskId, ScheduleStatus status) {
        checkNotBlank(taskId);
        requireNonNull(status);

        storage.write(storeProvider -> stateManager.changeState(storeProvider, taskId, Optional.absent(), status,
                auditMessages.transitionedBy()));

        return ok();
    }

    @Override
    public Response performBackup() {
        backup.backupNow();
        return ok();
    }

    @Override
    public Response listBackups() {
        return ok(Result.listBackupsResult(new ListBackupsResult().setBackups(recovery.listBackups())));
    }

    @Override
    public Response stageRecovery(String backupId) {
        recovery.stage(backupId);
        return ok();
    }

    @Override
    public Response queryRecovery(TaskQuery query) {
        return ok(Result.queryRecoveryResult(new QueryRecoveryResult()
                .setTasks(IScheduledTask.toBuildersSet(recovery.query(Query.arbitrary(query))))));
    }

    @Override
    public Response deleteRecoveryTasks(TaskQuery query) {
        recovery.deleteTasks(Query.arbitrary(query));
        return ok();
    }

    @Override
    public Response commitRecovery() {
        recovery.commit();
        return ok();
    }

    @Override
    public Response unloadRecovery() {
        recovery.unload();
        return ok();
    }

    @Override
    public Response snapshot() {
        storage.snapshot();
        return ok();
    }

    @Override
    public Response rewriteConfigs(RewriteConfigsRequest request) {
        if (request.getRewriteCommandsSize() == 0) {
            return addMessage(empty(), INVALID_REQUEST, "No rewrite commands provided.");
        }

        return storage.write(storeProvider -> {
            List<String> errors = Lists.newArrayList();

            for (ConfigRewrite command : request.getRewriteCommands()) {
                Optional<String> error = rewriteConfig(IConfigRewrite.build(command), storeProvider);
                if (error.isPresent()) {
                    errors.add(error.get());
                } else {
                    rewriteConfigsCounter.incrementAndGet();
                }
            }

            Response resp = empty();
            if (errors.isEmpty()) {
                resp.setResponseCode(OK);
            } else {
                for (String error : errors) {
                    addMessage(resp, WARNING, error);
                }
            }
            return resp;
        });
    }

    @Override
    public Response triggerExplicitTaskReconciliation(ExplicitReconciliationSettings settings) throws TException {
        try {
            requireNonNull(settings);
            Preconditions.checkArgument(!settings.isSetBatchSize() || settings.getBatchSize() > 0,
                    "Batch size must be greater than zero.");

            Optional<Integer> batchSize = settings.isSetBatchSize() ? Optional.of(settings.getBatchSize())
                    : Optional.absent();

            taskReconciler.triggerExplicitReconciliation(batchSize);
            return ok();
        } catch (IllegalArgumentException e) {
            return error(INVALID_REQUEST, e);
        }
    }

    @Override
    public Response triggerImplicitTaskReconciliation() throws TException {
        taskReconciler.triggerImplicitReconciliation();
        return ok();
    }

    private Optional<String> rewriteJob(IJobConfigRewrite jobRewrite, CronJobStore.Mutable jobStore) {
        IJobConfiguration existingJob = jobRewrite.getOldJob();
        IJobConfiguration rewrittenJob;
        Optional<String> error = Optional.absent();
        try {
            rewrittenJob = configurationManager.validateAndPopulate(jobRewrite.getRewrittenJob());
        } catch (TaskDescriptionException e) {
            // We could add an error here, but this is probably a hint of something wrong in
            // the client that's causing a bad configuration to be applied.
            throw new RuntimeException(e);
        }

        if (existingJob.getKey().equals(rewrittenJob.getKey())) {
            Optional<IJobConfiguration> job = jobStore.fetchJob(existingJob.getKey());
            if (job.isPresent()) {
                IJobConfiguration storedJob = job.get();
                if (storedJob.equals(existingJob)) {
                    jobStore.saveAcceptedJob(rewrittenJob);
                } else {
                    error = Optional.of("CAS compare failed for " + JobKeys.canonicalString(storedJob.getKey()));
                }
            } else {
                error = Optional.of("No jobs found for key " + JobKeys.canonicalString(existingJob.getKey()));
            }
        } else {
            error = Optional.of("Disallowing rewrite attempting to change job key.");
        }

        return error;
    }

    private Optional<String> rewriteInstance(IInstanceConfigRewrite instanceRewrite,
            MutableStoreProvider storeProvider) {

        IInstanceKey instanceKey = instanceRewrite.getInstanceKey();
        Optional<String> error = Optional.absent();
        Iterable<IScheduledTask> tasks = storeProvider.getTaskStore()
                .fetchTasks(Query.instanceScoped(instanceKey.getJobKey(), instanceKey.getInstanceId()).active());
        Optional<IAssignedTask> task = Optional.fromNullable(Iterables.getOnlyElement(tasks, null))
                .transform(IScheduledTask::getAssignedTask);

        if (task.isPresent()) {
            if (task.get().getTask().equals(instanceRewrite.getOldTask())) {
                ITaskConfig newConfiguration = instanceRewrite.getRewrittenTask();
                boolean changed = storeProvider.getUnsafeTaskStore().unsafeModifyInPlace(task.get().getTaskId(),
                        newConfiguration);
                if (!changed) {
                    error = Optional.of("Did not change " + task.get().getTaskId());
                }
            } else {
                error = Optional.of("CAS compare failed for " + instanceKey);
            }
        } else {
            error = Optional.of("No active task found for " + instanceKey);
        }

        return error;
    }

    private Optional<String> rewriteConfig(IConfigRewrite command, MutableStoreProvider storeProvider) {

        Optional<String> error;
        switch (command.getSetField()) {
        case JOB_REWRITE:
            error = rewriteJob(command.getJobRewrite(), storeProvider.getCronJobStore());
            break;

        case INSTANCE_REWRITE:
            error = rewriteInstance(command.getInstanceRewrite(), storeProvider);
            break;

        default:
            throw new IllegalArgumentException("Unhandled command type " + command.getSetField());
        }

        return error;
    }

    @Override
    public Response addInstances(InstanceKey key, int count) {
        IJobKey jobKey = JobKeys.assertValid(IJobKey.build(key.getJobKey()));

        Response response = empty();
        return storage.write(storeProvider -> {
            try {
                if (getCronJob(storeProvider, jobKey).isPresent()) {
                    return invalidRequest("Instances may not be added to cron jobs.");
                }

                lockManager.assertNotLocked(ILockKey.build(LockKey.job(jobKey.newBuilder())));

                FluentIterable<IScheduledTask> currentTasks = FluentIterable
                        .from(storeProvider.getTaskStore().fetchTasks(Query.jobScoped(jobKey).active()));

                if (count <= 0) {
                    return invalidRequest(INVALID_INSTANCE_COUNT);
                }

                Optional<IScheduledTask> templateTask = Iterables.tryFind(currentTasks,
                        e -> e.getAssignedTask().getInstanceId() == key.getInstanceId());
                if (!templateTask.isPresent()) {
                    return invalidRequest(INVALID_INSTANCE_ID);
                }

                int lastId = currentTasks.transform(e -> e.getAssignedTask().getInstanceId()).toList().stream()
                        .max(Comparator.naturalOrder()).get();

                Set<Integer> instanceIds = ContiguousSet.create(Range.openClosed(lastId, lastId + count),
                        DiscreteDomain.integers());

                ITaskConfig task = templateTask.get().getAssignedTask().getTask();
                validateTaskLimits(task, Iterables.size(currentTasks) + instanceIds.size(),
                        quotaManager.checkInstanceAddition(task, instanceIds.size(), storeProvider));

                stateManager.insertPendingTasks(storeProvider, task, instanceIds);
                addInstancesCounter.addAndGet(instanceIds.size());

                return response.setResponseCode(OK);
            } catch (LockException e) {
                return error(LOCK_ERROR, e);
            } catch (TaskValidationException | IllegalArgumentException e) {
                return error(INVALID_REQUEST, e);
            }
        });
    }

    private Optional<IJobConfiguration> getCronJob(StoreProvider storeProvider, IJobKey jobKey) {
        requireNonNull(jobKey);
        return storeProvider.getCronJobStore().fetchJob(jobKey);
    }

    private static class TaskValidationException extends Exception {
        TaskValidationException(String message) {
            super(message);
        }
    }

    private void validateTaskLimits(ITaskConfig task, int totalInstances, QuotaCheckResult quotaCheck)
            throws TaskValidationException {

        if (totalInstances <= 0 || totalInstances > thresholds.getMaxTasksPerJob()) {
            throw new TaskValidationException(String.format("Instance count must be between 1 and %d inclusive.",
                    thresholds.getMaxTasksPerJob()));
        }

        // TODO(maximk): This is a short-term hack to stop the bleeding from
        //               https://issues.apache.org/jira/browse/MESOS-691
        if (taskIdGenerator.generate(task, totalInstances).length() > MAX_TASK_ID_LENGTH) {
            throw new TaskValidationException("Task ID is too long, please shorten your role or job name.");
        }

        if (quotaCheck.getResult() == INSUFFICIENT_QUOTA) {
            throw new TaskValidationException("Insufficient resource quota: " + quotaCheck.getDetails().or(""));
        }
    }

    private static Set<InstanceTaskConfig> buildInitialState(Map<Integer, ITaskConfig> tasks) {
        // Translate tasks into instance IDs.
        Multimap<ITaskConfig, Integer> instancesByConfig = HashMultimap.create();
        Multimaps.invertFrom(Multimaps.forMap(tasks), instancesByConfig);

        // Reduce instance IDs into contiguous ranges.
        Map<ITaskConfig, Set<Range<Integer>>> rangesByConfig = Maps.transformValues(instancesByConfig.asMap(),
                Numbers::toRanges);

        ImmutableSet.Builder<InstanceTaskConfig> builder = ImmutableSet.builder();
        for (Map.Entry<ITaskConfig, Set<Range<Integer>>> entry : rangesByConfig.entrySet()) {
            builder.add(new InstanceTaskConfig().setTask(entry.getKey().newBuilder())
                    .setInstances(IRange.toBuildersSet(convertRanges(entry.getValue()))));
        }

        return builder.build();
    }

    @Override
    public Response startJobUpdate(JobUpdateRequest mutableRequest, @Nullable String message) {
        requireNonNull(mutableRequest);

        if (!mutableRequest.getTaskConfig().isIsService()) {
            return invalidRequest(NON_SERVICE_TASK);
        }

        JobUpdateSettings settings = requireNonNull(mutableRequest.getSettings());
        if (settings.getUpdateGroupSize() <= 0) {
            return invalidRequest(INVALID_GROUP_SIZE);
        }

        if (settings.getMaxPerInstanceFailures() < 0) {
            return invalidRequest(INVALID_MAX_INSTANCE_FAILURES);
        }

        if (settings.getMaxFailedInstances() < 0) {
            return invalidRequest(INVALID_MAX_FAILED_INSTANCES);
        }

        if (settings.getMaxPerInstanceFailures() * mutableRequest.getInstanceCount() > thresholds
                .getMaxUpdateInstanceFailures()) {
            return invalidRequest(TOO_MANY_POTENTIAL_FAILED_INSTANCES);
        }

        if (settings.getMinWaitInInstanceRunningMs() < 0) {
            return invalidRequest(INVALID_MIN_WAIT_TO_RUNNING);
        }

        if (settings.getBlockIfNoPulsesAfterMs() < 0) {
            return invalidRequest(INVALID_PULSE_TIMEOUT);
        }

        IJobUpdateRequest request;
        try {
            request = IJobUpdateRequest
                    .build(new JobUpdateRequest(mutableRequest).setTaskConfig(configurationManager
                            .validateAndPopulate(ITaskConfig.build(mutableRequest.getTaskConfig())).newBuilder()));
        } catch (TaskDescriptionException e) {
            return error(INVALID_REQUEST, e);
        }

        return storage.write(storeProvider -> {
            IJobKey job = request.getTaskConfig().getJob();
            if (getCronJob(storeProvider, job).isPresent()) {
                return invalidRequest(NO_CRON);
            }

            String updateId = uuidGenerator.createNew().toString();
            IJobUpdateSettings settings1 = request.getSettings();

            JobDiff diff = JobDiff.compute(storeProvider.getTaskStore(), job,
                    JobDiff.asMap(request.getTaskConfig(), request.getInstanceCount()),
                    settings1.getUpdateOnlyTheseInstances());

            Set<Integer> invalidScope = diff
                    .getOutOfScopeInstances(Numbers.rangesToInstanceIds(settings1.getUpdateOnlyTheseInstances()));
            if (!invalidScope.isEmpty()) {
                return invalidRequest("The update request attempted to update specific instances,"
                        + " but some are irrelevant to the update and current job state: " + invalidScope);
            }

            if (diff.isNoop()) {
                return addMessage(empty(), OK, NOOP_JOB_UPDATE_MESSAGE);
            }

            JobUpdateInstructions instructions = new JobUpdateInstructions().setSettings(settings1.newBuilder())
                    .setInitialState(buildInitialState(diff.getReplacedInstances()));

            Set<Integer> replacements = diff.getReplacementInstances();
            if (!replacements.isEmpty()) {
                instructions.setDesiredState(new InstanceTaskConfig().setTask(request.getTaskConfig().newBuilder())
                        .setInstances(IRange.toBuildersSet(convertRanges(toRanges(replacements)))));
            }

            String remoteUserName = auditMessages.getRemoteUserName();
            IJobUpdate update = IJobUpdate.build(new JobUpdate()
                    .setSummary(new JobUpdateSummary().setKey(new JobUpdateKey(job.newBuilder(), updateId))
                            .setUser(remoteUserName).setMetadata(IMetadata.toBuildersSet(request.getMetadata())))
                    .setInstructions(instructions));

            Response response = empty();
            try {
                validateTaskLimits(request.getTaskConfig(), request.getInstanceCount(),
                        quotaManager.checkJobUpdate(update, storeProvider));

                jobUpdateController.start(update, new AuditData(remoteUserName, Optional.fromNullable(message)));
                startJobUpdateCounter.addAndGet(request.getInstanceCount());
                return response.setResponseCode(OK)
                        .setResult(Result.startJobUpdateResult(
                                new StartJobUpdateResult(update.getSummary().getKey().newBuilder())
                                        .setUpdateSummary(update.getSummary().newBuilder())));
            } catch (UpdateInProgressException e) {
                return error(INVALID_REQUEST, e).setResult(Result.startJobUpdateResult(
                        new StartJobUpdateResult(e.getInProgressUpdateSummary().getKey().newBuilder())
                                .setUpdateSummary(e.getInProgressUpdateSummary().newBuilder())));
            } catch (UpdateStateException | TaskValidationException e) {
                return error(INVALID_REQUEST, e);
            }
        });
    }

    private Response changeJobUpdateState(JobUpdateKey mutableKey, JobUpdateStateChange change,
            Optional<String> message) {

        IJobUpdateKey key = IJobUpdateKey.build(mutableKey);
        JobKeys.assertValid(key.getJob());
        return storage.write(storeProvider -> {
            try {
                change.modifyUpdate(jobUpdateController, key,
                        new AuditData(auditMessages.getRemoteUserName(), message));
                return ok();
            } catch (UpdateStateException e) {
                return error(INVALID_REQUEST, e);
            }
        });
    }

    private interface JobUpdateStateChange {
        void modifyUpdate(JobUpdateController controller, IJobUpdateKey key, AuditData auditData)
                throws UpdateStateException;
    }

    @Override
    public Response pauseJobUpdate(JobUpdateKey mutableKey, @Nullable String message) {
        return changeJobUpdateState(mutableKey, JobUpdateController::pause, Optional.fromNullable(message));
    }

    @Override
    public Response resumeJobUpdate(JobUpdateKey mutableKey, @Nullable String message) {
        return changeJobUpdateState(mutableKey, JobUpdateController::resume, Optional.fromNullable(message));
    }

    @Override
    public Response abortJobUpdate(JobUpdateKey mutableKey, @Nullable String message) {
        return changeJobUpdateState(mutableKey, JobUpdateController::abort, Optional.fromNullable(message));
    }

    @Override
    public Response rollbackJobUpdate(JobUpdateKey mutableKey, @Nullable String message) {
        return changeJobUpdateState(mutableKey, JobUpdateController::rollback, Optional.fromNullable(message));
    }

    @Override
    public Response pulseJobUpdate(JobUpdateKey mutableUpdateKey) {
        IJobUpdateKey updateKey = validateJobUpdateKey(mutableUpdateKey);
        try {
            JobUpdatePulseStatus result = jobUpdateController.pulse(updateKey);
            return ok(Result.pulseJobUpdateResult(new PulseJobUpdateResult(result)));
        } catch (UpdateStateException e) {
            return error(INVALID_REQUEST, e);
        }
    }

    @Override
    public Response pruneTasks(TaskQuery query) throws TException {
        if (query.isSetStatuses() && query.getStatuses().stream().anyMatch(ACTIVE_STATES::contains)) {
            return error("Tasks in non-terminal state cannot be pruned.");
        } else if (!query.isSetStatuses()) {
            query.setStatuses(TERMINAL_STATES);
        }

        Iterable<IScheduledTask> tasks = storage
                .read(storeProvider -> storeProvider.getTaskStore().fetchTasks(Query.arbitrary(query)));
        // For some reason fetchTasks ignores the offset/limit options of a TaskQuery. So we have to
        // manually apply the limit here. To be fixed in AURORA-1892.
        if (query.isSetLimit()) {
            tasks = Iterables.limit(tasks, query.getLimit());
        }

        Iterable<String> taskIds = Iterables.transform(tasks, task -> task.getAssignedTask().getTaskId());

        return storage.write(storeProvider -> {
            stateManager.deleteTasks(storeProvider, Sets.newHashSet(taskIds));
            return ok();
        });
    }

    @ThriftWorkload
    @Override
    public Response getJobUpdateSummaries(JobUpdateQuery mutableQuery) throws TException {
        return readOnlyScheduler.getJobUpdateSummaries(mutableQuery);
    }

    @ThriftWorkload
    @Override
    public Response getJobUpdateDetails(JobUpdateKey key, JobUpdateQuery query) throws TException {
        return readOnlyScheduler.getJobUpdateDetails(key, query);
    }

    private static IJobUpdateKey validateJobUpdateKey(JobUpdateKey mutableKey) {
        IJobUpdateKey key = IJobUpdateKey.build(mutableKey);
        JobKeys.assertValid(key.getJob());
        checkNotBlank(key.getId());
        return key;
    }

    @VisibleForTesting
    static String noCronScheduleMessage(IJobKey jobKey) {
        return String.format("Job %s has no cron schedule", JobKeys.canonicalString(jobKey));
    }

    @VisibleForTesting
    static String notScheduledCronMessage(IJobKey jobKey) {
        return String.format("Job %s is not scheduled with cron", JobKeys.canonicalString(jobKey));
    }

    @VisibleForTesting
    static String jobAlreadyExistsMessage(IJobKey jobKey) {
        return String.format("Job %s already exists", JobKeys.canonicalString(jobKey));
    }

    @VisibleForTesting
    static final String NO_TASKS_TO_KILL_MESSAGE = "No tasks to kill.";

    @VisibleForTesting
    static final String NOOP_JOB_UPDATE_MESSAGE = "Job is unchanged by proposed update.";

    @VisibleForTesting
    static final String NO_CRON = "Cron jobs may only be created/updated by calling scheduleCronJob.";

    @VisibleForTesting
    static final String NON_SERVICE_TASK = "Updates are not supported for non-service tasks.";

    @VisibleForTesting
    static final String INVALID_GROUP_SIZE = "updateGroupSize must be positive.";

    @VisibleForTesting
    static final String INVALID_MAX_FAILED_INSTANCES = "maxFailedInstances must be non-negative.";

    @VisibleForTesting
    static final String TOO_MANY_POTENTIAL_FAILED_INSTANCES = "Your update allows too many failures "
            + "to occur, consider decreasing the per-instance failures or maxFailedInstances.";

    @VisibleForTesting
    static final String INVALID_MAX_INSTANCE_FAILURES = "maxPerInstanceFailures must be non-negative.";

    @VisibleForTesting
    static final String INVALID_MIN_WAIT_TO_RUNNING = "minWaitInInstanceRunningMs must be non-negative.";

    @VisibleForTesting
    static final String INVALID_PULSE_TIMEOUT = "blockIfNoPulsesAfterMs must be positive.";

    @VisibleForTesting
    static final String INVALID_INSTANCE_ID = "No active task found for a given instance ID.";

    @VisibleForTesting
    static final String INVALID_INSTANCE_COUNT = "Instance count must be positive.";
}