org.apache.aurora.scheduler.async.preemptor.PreemptorImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.aurora.scheduler.async.preemptor.PreemptorImpl.java

Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.aurora.scheduler.async.preemptor;

import java.lang.annotation.Retention;
import java.lang.annotation.Target;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import javax.inject.Inject;
import javax.inject.Qualifier;

import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.stats.Stats;
import com.twitter.common.stats.StatsProvider;
import com.twitter.common.util.Clock;

import org.apache.aurora.gen.ScheduleStatus;
import org.apache.aurora.scheduler.HostOffer;
import org.apache.aurora.scheduler.ResourceSlot;
import org.apache.aurora.scheduler.async.OfferQueue;
import org.apache.aurora.scheduler.base.Query;
import org.apache.aurora.scheduler.base.Tasks;
import org.apache.aurora.scheduler.filter.AttributeAggregate;
import org.apache.aurora.scheduler.filter.SchedulingFilter;
import org.apache.aurora.scheduler.filter.SchedulingFilter.ResourceRequest;
import org.apache.aurora.scheduler.filter.SchedulingFilter.UnusedResource;
import org.apache.aurora.scheduler.state.StateManager;
import org.apache.aurora.scheduler.storage.Storage;
import org.apache.aurora.scheduler.storage.entities.IAssignedTask;
import org.apache.aurora.scheduler.storage.entities.IHostAttributes;
import org.apache.aurora.scheduler.storage.entities.IScheduledTask;
import org.apache.aurora.scheduler.storage.entities.ITaskConfig;

import static java.lang.annotation.ElementType.FIELD;
import static java.lang.annotation.ElementType.METHOD;
import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;
import static java.util.Objects.requireNonNull;

import static org.apache.aurora.gen.ScheduleStatus.PENDING;
import static org.apache.aurora.gen.ScheduleStatus.PREEMPTING;
import static org.apache.aurora.scheduler.base.Tasks.SCHEDULED_TO_ASSIGNED;

/**
 * A task preemptor that tries to find tasks that are waiting to be scheduled, which are of higher
 * priority than tasks that are currently running.
 *
 * To avoid excessive churn, the preemptor requires that a task is PENDING for a duration
 * (dictated by {@link #preemptionCandidacyDelay}) before it becomes eligible to preempt other
 * tasks.
 */
class PreemptorImpl implements Preemptor {

    /**
     * Binding annotation for the time interval after which a pending task becomes eligible to
     * preempt other tasks.
     *
     * <p>It is a {@link Qualifier}, retained at runtime, and may be placed on fields, parameters
     * and methods. Within this class it marks the {@code preemptionCandidacyDelay} constructor
     * parameter.
     */
    @Qualifier
    @Target({ FIELD, PARAMETER, METHOD })
    @Retention(RUNTIME)
    @interface PreemptionDelay {
    }

    // Incremented once for every task this preemptor moves to PREEMPTING.
    private final AtomicLong tasksPreempted = Stats.exportLong("preemptor_tasks_preempted");
    // Incremented every time the preemptor is invoked for an eligible pending task while the
    // cluster state reports at least one active task that could be considered for preemption.
    private final AtomicLong attemptedPreemptions = Stats.exportLong("preemptor_attempts");
    // Incremented every time we fail to find tasks to preempt for a pending task.
    private final AtomicLong noSlotsFound = Stats.exportLong("preemptor_no_slots_found");

    /**
     * Matches tasks whose latest task event is at least {@code preemptionCandidacyDelay} old, as
     * measured by {@code clock}.
     */
    private final Predicate<IScheduledTask> isIdleTask = new Predicate<IScheduledTask>() {
        @Override
        public boolean apply(IScheduledTask candidate) {
            // Read the clock first, then the event timestamp, and compare the elapsed time
            // against the configured delay in milliseconds.
            long nowMillis = clock.nowMillis();
            long lastEventMillis = Tasks.getLatestEvent(candidate).getTimestamp();
            long requiredIdleMillis = preemptionCandidacyDelay.as(Time.MILLISECONDS);
            return (nowMillis - lastEventMillis) >= requiredIdleMillis;
        }
    };

    // Backing store: read for pending tasks and host attributes, written when preempting.
    private final Storage storage;
    // Performs the transition of victim tasks to PREEMPTING.
    private final StateManager stateManager;
    // Source of the currently available offers, which are grouped by slave id when searching.
    private final OfferQueue offerQueue;
    // Decides whether a pending task fits a given set of resources and host attributes.
    private final SchedulingFilter schedulingFilter;
    // Minimum age of a task's latest event before the task may preempt others.
    private final Amount<Long, Time> preemptionCandidacyDelay;
    // Time source used to measure how long a task has been idle.
    private final Clock clock;
    // Counter ("preemptor_missing_attributes") bumped when a host has no stored attributes.
    private final AtomicLong missingAttributes;
    // Supplies the mapping from slaves to the active tasks that are candidate victims.
    private final ClusterState clusterState;

    /**
     * Creates a new preemptor.
     *
     * @param storage Backing store for tasks.
     * @param stateManager Scheduler state controller to instruct when preempting tasks.
     * @param offerQueue Queue that contains available Mesos resource offers.
     * @param schedulingFilter Filter to identify whether tasks may reside on given slaves.
     * @param preemptionCandidacyDelay Time a task must be PENDING before it may preempt other
     *                                 tasks.
     * @param clock Clock to check current time.
     * @param statsProvider Provider used to create the {@code preemptor_missing_attributes}
     *                      counter.
     * @param clusterState Source of the active tasks, grouped by slave, that may be preempted.
     * @throws NullPointerException if any argument is {@code null}.
     */
    @Inject
    public PreemptorImpl(Storage storage, StateManager stateManager, OfferQueue offerQueue,
            SchedulingFilter schedulingFilter, @PreemptionDelay Amount<Long, Time> preemptionCandidacyDelay,
            Clock clock, StatsProvider statsProvider, ClusterState clusterState) {

        this.storage = requireNonNull(storage);
        this.stateManager = requireNonNull(stateManager);
        this.offerQueue = requireNonNull(offerQueue);
        this.schedulingFilter = requireNonNull(schedulingFilter);
        this.preemptionCandidacyDelay = requireNonNull(preemptionCandidacyDelay);
        this.clock = requireNonNull(clock);
        // Validate the provider explicitly, like every other dependency, before using it.
        this.missingAttributes = requireNonNull(statsProvider).makeCounter("preemptor_missing_attributes");
        this.clusterState = requireNonNull(clusterState);
    }

    /** Converts a host offer into the {@link ResourceSlot} built from its underlying offer. */
    private static final Function<HostOffer, ResourceSlot> OFFER_TO_RESOURCE_SLOT =
            new Function<HostOffer, ResourceSlot>() {
                @Override
                public ResourceSlot apply(HostOffer hostOffer) {
                    return ResourceSlot.from(hostOffer.getOffer());
                }
            };

    /** Extracts the hostname recorded on a host offer's underlying offer. */
    private static final Function<HostOffer, String> OFFER_TO_HOST =
            new Function<HostOffer, String>() {
                @Override
                public String apply(HostOffer hostOffer) {
                    return hostOffer.getOffer().getHostname();
                }
            };

    /** Extracts the host attributes carried by a host offer. */
    private static final Function<HostOffer, IHostAttributes> OFFER_TO_ATTRIBUTES =
            new Function<HostOffer, IHostAttributes>() {
                @Override
                public IHostAttributes apply(HostOffer hostOffer) {
                    return hostOffer.getAttributes();
                }
            };

    /** Extracts the resources held by a potential preemption victim. */
    private static final Function<PreemptionVictim, ResourceSlot> VICTIM_TO_RESOURCES =
            new Function<PreemptionVictim, ResourceSlot>() {
                @Override
                public ResourceSlot apply(PreemptionVictim candidate) {
                    return candidate.getResources();
                }
            };

    /** Extracts the task ID of a potential preemption victim. */
    private static final Function<PreemptionVictim, String> VICTIM_TO_TASK_ID =
            new Function<PreemptionVictim, String>() {
                @Override
                public String apply(PreemptionVictim candidate) {
                    return candidate.getTaskId();
                }
            };

    // Ordering applied to candidate victims before they are accumulated for preemption: victims
    // are compared by their resources using the reverse of ResourceSlot.ORDER.
    // NOTE(review): presumably this puts the largest victims first -- confirm against
    // ResourceSlot.ORDER.
    // TODO(zmanji) Consider using Dominant Resource Fairness for ordering instead of the vector
    // ordering
    private static final Ordering<PreemptionVictim> RESOURCE_ORDER = ResourceSlot.ORDER
            .onResultOf(VICTIM_TO_RESOURCES).reverse();

    /** Extracts the slave host on which a potential preemption victim resides. */
    private static final Function<PreemptionVictim, String> VICTIM_TO_HOST =
            new Function<PreemptionVictim, String>() {
                @Override
                public String apply(PreemptionVictim candidate) {
                    return candidate.getSlaveHost();
                }
            };

    /**
     * Determines which tasks, if any, need to be preempted on a single host so that
     * {@code pendingTask} passes the scheduling filter there.
     *
     * <p>Optional.absent indicates that this slave does not have enough resources to satisfy the
     * task. The empty set indicates the offers (slack) are enough.
     * A set with elements indicates those tasks and the offers are enough.
     *
     * <p>Absent is also returned when more than one offer exists for the host, when none of the
     * possible victims may be preempted by the pending task, and when no attributes are stored
     * for the host (in which case the missing-attributes counter is incremented once).
     *
     * @param possibleVictims Active tasks that could be preempted; all must be on the same host.
     * @param offers Outstanding offers; all must be for the same host as the victims.
     * @param pendingTask The task to make room for.
     * @param jobState Attribute aggregate handed to the scheduling filter with the request.
     * @return The IDs of the tasks to preempt, as described above.
     * @throws java.util.NoSuchElementException if both {@code possibleVictims} and {@code offers}
     *     are empty, since no host can be determined.
     * @throws IllegalArgumentException if the victims and offers span more than one host.
     */
    private Optional<Set<String>> getTasksToPreempt(Iterable<PreemptionVictim> possibleVictims,
            Iterable<HostOffer> offers, IAssignedTask pendingTask, AttributeAggregate jobState) {

        // This enforces the precondition that all of the resources are from the same host. We need to
        // get the host for the schedulingFilter.
        Set<String> hosts = ImmutableSet.<String>builder()
                .addAll(Iterables.transform(possibleVictims, VICTIM_TO_HOST))
                .addAll(Iterables.transform(offers, OFFER_TO_HOST)).build();

        String host = Iterables.getOnlyElement(hosts);

        ResourceSlot slackResources = ResourceSlot.sum(Iterables.transform(offers, OFFER_TO_RESOURCE_SLOT));

        if (!Iterables.isEmpty(offers)) {
            if (Iterables.size(offers) > 1) {
                // There are multiple offers for the same host. Since both have maintenance information
                // we don't preempt with this information and wait for mesos to merge the two offers for
                // us.
                return Optional.absent();
            }
            // Exactly one offer remains, so its attributes can be read directly.
            IHostAttributes offerAttributes = Iterables.getOnlyElement(offers).getAttributes();

            if (passesSchedulingFilter(slackResources, offerAttributes, pendingTask, jobState)) {
                // The slack alone is sufficient; nothing has to be preempted.
                return Optional.<Set<String>>of(ImmutableSet.<String>of());
            }
        }

        FluentIterable<PreemptionVictim> preemptableTasks = FluentIterable.from(possibleVictims)
                .filter(preemptionFilter(pendingTask.getTask()));

        if (preemptableTasks.isEmpty()) {
            return Optional.absent();
        }

        // The host is the same for every victim, so look its attributes up once instead of once
        // per loop iteration. Without attributes the filter cannot be evaluated for any victim.
        Optional<IHostAttributes> attributes = getHostAttributes(host);
        if (!attributes.isPresent()) {
            missingAttributes.incrementAndGet();
            return Optional.absent();
        }

        List<PreemptionVictim> toPreemptTasks = Lists.newArrayList();

        // Accumulate victims in RESOURCE_ORDER until the freed resources plus the slack are enough.
        for (PreemptionVictim victim : RESOURCE_ORDER.immutableSortedCopy(preemptableTasks)) {
            toPreemptTasks.add(victim);

            ResourceSlot totalResource = ResourceSlot.sum(
                    ResourceSlot.sum(Iterables.transform(toPreemptTasks, VICTIM_TO_RESOURCES)), slackResources);

            if (passesSchedulingFilter(totalResource, attributes.get(), pendingTask, jobState)) {
                Set<String> taskIds = FluentIterable.from(toPreemptTasks).transform(VICTIM_TO_TASK_ID).toSet();
                return Optional.of(taskIds);
            }
        }
        return Optional.absent();
    }

    /**
     * Returns whether the scheduling filter raises no vetoes for placing {@code pendingTask} on
     * the given resources with the given host attributes.
     */
    private boolean passesSchedulingFilter(ResourceSlot resources, IHostAttributes attributes,
            IAssignedTask pendingTask, AttributeAggregate jobState) {

        Set<SchedulingFilter.Veto> vetoes = schedulingFilter.filter(
                new UnusedResource(resources, attributes),
                new ResourceRequest(pendingTask.getTask(), pendingTask.getTaskId(), jobState));
        return vetoes.isEmpty();
    }

    /**
     * Reads the attributes stored for a host from the attribute store.
     *
     * @param host Hostname to look up.
     * @return Whatever the attribute store returns for {@code host}.
     */
    private Optional<IHostAttributes> getHostAttributes(final String host) {
        Storage.Work.Quiet<Optional<IHostAttributes>> lookup = new Storage.Work.Quiet<Optional<IHostAttributes>>() {
            @Override
            public Optional<IHostAttributes> apply(Storage.StoreProvider provider) {
                return provider.getAttributeStore().getHostAttributes(host);
            }
        };
        return storage.read(lookup);
    }

    /** Extracts the slave ID value recorded on a host offer's underlying offer. */
    private static final Function<HostOffer, String> OFFER_TO_SLAVE_ID =
            new Function<HostOffer, String>() {
                @Override
                public String apply(HostOffer hostOffer) {
                    return hostOffer.getOffer().getSlaveId().getValue();
                }
            };

    /**
     * Looks up the task with the given ID, provided it is PENDING and passes {@code isIdleTask}.
     *
     * @param taskId ID of the task to fetch.
     * @return The assigned task, or absent if no stored task matches.
     */
    private Optional<IAssignedTask> fetchIdlePendingTask(String taskId) {
        Query.Builder pendingWithId = Query.taskScoped(taskId).byStatus(PENDING);
        FluentIterable<IAssignedTask> idleMatches = FluentIterable
                .from(Storage.Util.fetchTasks(storage, pendingWithId))
                .filter(isIdleTask)
                .transform(SCHEDULED_TO_ASSIGNED);
        // Yields null when nothing matched, which maps to an absent Optional.
        IAssignedTask onlyMatch = Iterables.getOnlyElement(idleMatches, null);
        return Optional.fromNullable(onlyMatch);
    }

    /**
     * Searches for a slave on which the given pending task could run, preempting active tasks
     * there if required.
     *
     * <p>Slaves are examined one at a time. On the first slave for which
     * {@code getTasksToPreempt} yields a result, the returned tasks (possibly none) are moved to
     * PREEMPTING and the search stops.
     *
     * @param taskId ID of the pending task to find a slot for.
     * @param attributeAggregate Attribute aggregate passed through to the scheduling filter.
     * @return The ID of the chosen slave, or absent if the task is not an idle PENDING task, there
     *     are no active tasks in the cluster state, or no slave qualifies.
     */
    @Override
    public synchronized Optional<String> findPreemptionSlotFor(final String taskId,
            AttributeAggregate attributeAggregate) {

        Optional<IAssignedTask> candidate = fetchIdlePendingTask(taskId);
        if (!candidate.isPresent()) {
            // No task with this ID is PENDING and idle, so there is nothing to preempt for.
            return Optional.absent();
        }
        IAssignedTask pendingTask = candidate.get();

        Multimap<String, PreemptionVictim> victimsBySlave = clusterState.getSlavesToActiveTasks();
        if (victimsBySlave.isEmpty()) {
            return Optional.absent();
        }

        attemptedPreemptions.incrementAndGet();

        // Index offers by slave id so each slave's offers line up with its active tasks.
        Multimap<String, HostOffer> offersBySlave = Multimaps.index(offerQueue.getOffers(), OFFER_TO_SLAVE_ID);

        Set<String> slaveIds = ImmutableSet.<String>builder()
                .addAll(offersBySlave.keySet())
                .addAll(victimsBySlave.keySet())
                .build();

        for (String slaveId : slaveIds) {
            Optional<Set<String>> victimIds = getTasksToPreempt(victimsBySlave.get(slaveId),
                    offersBySlave.get(slaveId), pendingTask, attributeAggregate);
            if (!victimIds.isPresent()) {
                continue;
            }

            preemptInFavorOf(victimIds.get(), taskId);
            return Optional.of(slaveId);
        }

        noSlotsFound.incrementAndGet();
        return Optional.absent();
    }

    /**
     * Moves every listed task to PREEMPTING within a single storage write, counting each one.
     */
    private void preemptInFavorOf(final Set<String> victimTaskIds, final String pendingTaskId) {
        storage.write(new Storage.MutateWork.NoResult.Quiet() {
            @Override
            protected void execute(Storage.MutableStoreProvider storeProvider) {
                for (String victimTaskId : victimTaskIds) {
                    stateManager.changeState(storeProvider, victimTaskId, Optional.<ScheduleStatus>absent(),
                            PREEMPTING, Optional.of("Preempting in favor of " + pendingTaskId));
                    tasksPreempted.incrementAndGet();
                }
            }
        });
    }

    /**
     * Creates a filter that identifies the victims a pending task is allowed to preempt.
     * A victim may be preempted if either of the following holds:
     * - {@code pendingTask} is production and the victim is not.
     * - Both have the same production flag, the victim's role equals the role of
     *     {@code pendingTask}'s job, and {@code pendingTask} has a strictly higher priority.
     *
     * @param pendingTask Configuration of the task that is waiting to be scheduled.
     * @return A filter that compares production flag, role and priority of potential victims
     *     with those of {@code pendingTask}.
     */
    private static Predicate<PreemptionVictim> preemptionFilter(final ITaskConfig pendingTask) {
        return new Predicate<PreemptionVictim>() {
            @Override
            public boolean apply(PreemptionVictim possibleVictim) {
                boolean pendingIsProduction = pendingTask.isProduction();
                boolean victimIsProduction = possibleVictim.isProduction();

                if (pendingIsProduction != victimIsProduction) {
                    // Flags differ: only a production task may preempt a non-production one.
                    return pendingIsProduction;
                }

                // Flags are equal: preemption is based on priority within the same role.
                boolean sameRole = pendingTask.getJob().getRole().equals(possibleVictim.getRole());
                return sameRole && pendingTask.getPriority() > possibleVictim.getPriority();
            }
        };
    }
}