import org.apache.commons.lang.mutable.MutableInt;
import org.apache.commons.math3.random.RandomDataGenerator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.async.impl.AMRMClientAsyncImpl;
import org.apache.hadoop.yarn.client.api.impl.AMRMClientImpl;
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
import org.apache.hadoop.yarn.util.RackResolver;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.apache.tez.common.ContainerSignatureMatcher;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.serviceplugins.api.DagInfo;
import org.apache.tez.serviceplugins.api.TaskAttemptEndReason;
import org.apache.tez.serviceplugins.api.TaskScheduler;
import org.apache.tez.serviceplugins.api.TaskSchedulerContext;
import org.apache.tez.serviceplugins.api.TaskSchedulerContext.AMState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * A YARN task scheduler that is aware of the dependencies between vertices
 * in the DAG and takes them into account when deciding how to schedule
 * and preempt tasks.
 * <p>
 * This scheduler assumes that vertex IDs start at 0 and are densely
 * allocated (i.e., there are no "gaps" in the vertex ID space).
 */
public class DagAwareYarnTaskScheduler extends TaskScheduler implements AMRMClientAsync.CallbackHandler {
    private static final Logger LOG = LoggerFactory.getLogger(DagAwareYarnTaskScheduler.class);
    // presumably the ordering used when picking containers to preempt; confirm against PreemptOrderComparator
    private static final Comparator<HeldContainer> PREEMPT_ORDER_COMPARATOR = new PreemptOrderComparator();

    private final RandomDataGenerator random = new RandomDataGenerator();
    // RM client wrapper; assigned in initialize(AMRMClientAsyncWrapper)
    private AMRMClientAsyncWrapper client;
    // created in start() only when container reuse is enabled, otherwise left null
    private ScheduledExecutorService reuseExecutor;
    // chosen in start() depending on whether the RM reports CPU as a scheduler resource type
    private ResourceCalculator resourceCalculator;
    private int numHeartbeats = 0;
    // stays at zero memory until getProgress() copies the first available-resources report into it
    private Resource totalResources = Resource.newInstance(0, 0);
    // running total: increased as new containers are held, decreased in containerCompleted()
    private Resource allocatedResources = Resource.newInstance(0, 0);
    // concurrent set backed by a ConcurrentHashMap
    private final Set<NodeId> blacklistedNodes = Collections
            .newSetFromMap(new ConcurrentHashMap<NodeId, Boolean>());
    private final ContainerSignatureMatcher signatureMatcher;
    private final RequestTracker requestTracker = new RequestTracker();
    // containers currently held by this scheduler, keyed by container ID
    private final Map<ContainerId, HeldContainer> heldContainers = new HashMap<>();
    private final IdleContainerTracker idleTracker = new IdleContainerTracker();
    // maps a task object to the container it is currently assigned to
    private final Map<Object, HeldContainer> taskAssignments = new HashMap<>();

    /** A mapping from the vertex ID to the set of containers assigned to tasks for that vertex */
    private final Map<Integer, Set<HeldContainer>> vertexAssignments = new HashMap<>();

    /** If vertex N has at least one task assigned to a container then the corresponding bit at index N is set */
    private final BitSet assignedVertices = new BitSet();

    /**
     * Tracks assigned tasks for released containers so the app can be notified properly when the
     * container completion event finally arrives.
     */
    private final Map<ContainerId, Object> releasedContainers = new HashMap<>();

    // containers that handleReuseContainerWhenIdle() retains instead of expiring;
    // presumably filled by computeSessionContainers() - confirm
    private final Set<HeldContainer> sessionContainers = new HashSet<>();

    /**
     * Tracks the set of descendant vertices in the DAG for each vertex.  The BitSet holding the descendants
     * of vertex N is at array index N.  If a bit is set at index X in that BitSet then vertex X is a
     * descendant of vertex N in the DAG.
     */
    // null until ensureVertexDescendants() builds it for the current DAG
    private ArrayList<BitSet> vertexDescendants = null;

    // lifecycle flags; volatile because they are read without holding the scheduler lock
    private volatile boolean stopRequested = false;
    private volatile boolean shouldUnregister = false;
    private volatile boolean hasUnregistered = false;

    // cached configuration parameters (read and validated in initialize)
    private boolean shouldReuseContainers;
    private boolean reuseRackLocal;
    private boolean reuseNonLocal;
    private boolean reuseNewContainers;
    private long localitySchedulingDelay;
    private long idleContainerTimeoutMin;
    private long idleContainerTimeoutMax;
    private int sessionNumMinHeldContainers;
    private int preemptionPercentage;
    private int numHeartbeatsBetweenPreemptions;
    // value of numHeartbeats recorded by getProgress() the last time containers were preempted
    private int lastPreemptionHeartbeat = 0;
    private long preemptionMaxWaitTime;

    /**
     * Creates the scheduler and caches the container signature matcher obtained from the
     * supplied context.
     *
     * @param taskSchedulerContext context that supplies the container signature matcher
     */
    public DagAwareYarnTaskScheduler(TaskSchedulerContext taskSchedulerContext) {
        signatureMatcher = taskSchedulerContext.getContainerSignatureMatcher();

    /**
     * Initializes the scheduler with a freshly constructed {@link AMRMClientAsyncWrapper} that
     * wraps a new {@link AMRMClientImpl} and uses this scheduler as its callback handler.
     *
     * @throws Exception if the delegated initialization fails
     */
    public void initialize() throws Exception {
        // NOTE(review): 1000 is presumably the heartbeat interval in milliseconds - confirm against the wrapper
        initialize(new AMRMClientAsyncWrapper(new AMRMClientImpl<TaskRequest>(), 1000, this));

    /**
     * Stores the given RM client, builds a {@link Configuration} from the initial user payload
     * of the scheduler context, and reads and validates the scheduler settings, caching them in
     * fields. Validation is done with Preconditions.checkArgument.
     *
     * @param client the RM client wrapper this scheduler will use
     * @throws Exception declared by the signature; presumably raised while decoding the payload - confirm
     */
    void initialize(AMRMClientAsyncWrapper client) throws Exception {
        this.client = client;
        Configuration conf = TezUtils.createConfFromUserPayload(getContext().getInitialUserPayload());

        int heartbeatIntervalMax = conf.getInt(TezConfiguration.TEZ_AM_RM_HEARTBEAT_INTERVAL_MS_MAX,

        // container reuse switches
        shouldReuseContainers = conf.getBoolean(TezConfiguration.TEZ_AM_CONTAINER_REUSE_ENABLED,
        reuseRackLocal = conf.getBoolean(TezConfiguration.TEZ_AM_CONTAINER_REUSE_RACK_FALLBACK_ENABLED,
        reuseNonLocal = conf.getBoolean(TezConfiguration.TEZ_AM_CONTAINER_REUSE_NON_LOCAL_FALLBACK_ENABLED,
        // the only rejected combination is non-local reuse enabled while rack-local reuse is disabled
        Preconditions.checkArgument(((!reuseRackLocal && !reuseNonLocal) || (reuseRackLocal)),
                "Re-use Rack-Local cannot be disabled if Re-use Non-Local has been" + " enabled");

        // reuse of brand new containers can only be on when reuse itself is on
        reuseNewContainers = shouldReuseContainers
                && conf.getBoolean(TezConfiguration.TEZ_AM_CONTAINER_REUSE_NEW_CONTAINERS_ENABLED,

        localitySchedulingDelay = conf.getLong(
        Preconditions.checkArgument(localitySchedulingDelay >= 0, "Locality Scheduling delay should be >=0");

        // idle release timeouts: the minimum may be -1 or non-negative
        idleContainerTimeoutMin = conf.getLong(TezConfiguration.TEZ_AM_CONTAINER_IDLE_RELEASE_TIMEOUT_MIN_MILLIS,
        Preconditions.checkArgument(idleContainerTimeoutMin >= 0 || idleContainerTimeoutMin == -1,
                "Idle container release min timeout should be either -1 or >=0");

        // the maximum must be non-negative and not below the minimum
        idleContainerTimeoutMax = conf.getLong(TezConfiguration.TEZ_AM_CONTAINER_IDLE_RELEASE_TIMEOUT_MAX_MILLIS,
                idleContainerTimeoutMax >= 0 && idleContainerTimeoutMax >= idleContainerTimeoutMin,
                "Idle container release max timeout should be >=0 and >= "
                        + TezConfiguration.TEZ_AM_CONTAINER_IDLE_RELEASE_TIMEOUT_MIN_MILLIS);

        sessionNumMinHeldContainers = conf.getInt(TezConfiguration.TEZ_AM_SESSION_MIN_HELD_CONTAINERS,
        Preconditions.checkArgument(sessionNumMinHeldContainers >= 0,
                "Session minimum held containers should be >=0");

        // preemption settings
        preemptionPercentage = conf.getInt(TezConfiguration.TEZ_AM_PREEMPTION_PERCENTAGE,
        Preconditions.checkArgument(preemptionPercentage >= 0 && preemptionPercentage <= 100,
                "Preemption percentage should be between 0-100");

        numHeartbeatsBetweenPreemptions = conf.getInt(
        Preconditions.checkArgument(numHeartbeatsBetweenPreemptions >= 1,
                "Heartbeats between preemptions should be >=1");

        // read with getInt and widened into the long field
        preemptionMaxWaitTime = conf.getInt(TezConfiguration.TEZ_AM_PREEMPTION_MAX_WAIT_TIME_MS,
        Preconditions.checkArgument(preemptionMaxWaitTime >= 0, "Preemption max wait time must be >=0");"scheduler initialized with maxRMHeartbeatInterval:" + heartbeatIntervalMax + " reuseEnabled:"
                + shouldReuseContainers + " reuseRack:" + reuseRackLocal + " reuseAny:" + reuseNonLocal
                + " localityDelay:" + localitySchedulingDelay + " preemptPercentage:" + preemptionPercentage
                + " preemptMaxWaitTime:" + preemptionMaxWaitTime + " numHeartbeatsBetweenPreemptions:"
                + numHeartbeatsBetweenPreemptions + " idleContainerMinTimeout:" + idleContainerTimeoutMin
                + " idleContainerMaxTimeout:" + idleContainerTimeoutMax + " sessionMinHeldContainers:"
                + sessionNumMinHeldContainers);

    /**
     * Starts the scheduler: creates the reuse executor when container reuse is enabled,
     * registers the application master with the RM, hands the registration data back to the
     * scheduler context, and selects the resource calculator.
     *
     * @throws Exception declared by the signature; presumably raised by the registration call - confirm
     */
    public void start() throws Exception {
        if (shouldReuseContainers) {
            reuseExecutor = createExecutor();
        TaskSchedulerContext ctx = getContext();
        RegisterApplicationMasterResponse response = client.registerApplicationMaster(ctx.getAppHostName(),
                ctx.getAppClientPort(), ctx.getAppTrackingUrl());
        ctx.setApplicationRegistrationData(response.getMaximumResourceCapability(), response.getApplicationACLs(),
                response.getClientToAMTokenMasterKey(), response.getQueue());
        // use the memory+CPU calculator only when the RM reports CPU among its scheduler resource types
        if (response.getSchedulerResourceTypes().contains(SchedulerResourceTypes.CPU)) {
            resourceCalculator = new MemCpuResourceCalculator();
        } else {
            resourceCalculator = new MemResourceCalculator();

    /**
     * Creates the executor used for container reuse. Called from start() when reuse is enabled.
     * Protected, presumably so that tests can substitute another executor - confirm.
     *
     * @return a new ReuseContainerExecutor
     */
    protected ScheduledExecutorService createExecutor() {
        return new ReuseContainerExecutor();

    /**
     * Returns the current time as reported by {@link Time#monotonicNow()}.
     *
     * @return the current monotonic time value
     */
    protected long now() {
        return Time.monotonicNow();

    /**
     * Begins stopping the scheduler. Sets the stop flag, then under the scheduler lock calls
     * releaseContainer on every held container and walks the tasks still known to the request
     * tracker. The collected container IDs are processed after the lock is dropped.
     */
    public void initiateStop() {
        LOG.debug("Initiating stop of task scheduler");
        stopRequested = true;
        List<ContainerId> releasedLaunchedContainers;
        synchronized (this) {
            releasedLaunchedContainers = new ArrayList<>(heldContainers.size());
            // iterate over a copy of the values rather than the live map view
            List<HeldContainer> heldList = new ArrayList<>(heldContainers.values());
            for (HeldContainer hc : heldList) {
                if (releaseContainer(hc)) {

            List<Object> tasks = requestTracker.getTasks();
            for (Object task : tasks) {

        // perform app callback outside of locks
        for (ContainerId id : releasedLaunchedContainers) {

    /**
     * Shuts the scheduler down. Waits up to two seconds for the reuse executor (if one was
     * created) to terminate, then, under the scheduler lock, unregisters from the RM with the
     * final application status if unregistration was requested and has not happened yet.
     *
     * @throws Exception declared by the signature; presumably from the wait or the unregister call - confirm
     */
    public void shutdown() throws Exception {
        if (reuseExecutor != null) {
            reuseExecutor.awaitTermination(2, TimeUnit.SECONDS);
        synchronized (this) {
            // the hasUnregistered flag keeps the unregister call from being repeated
            if (shouldUnregister && !hasUnregistered) {
                TaskSchedulerContext.AppFinalStatus status = getContext().getFinalAppStatus();
      "Unregistering from RM, exitStatus={} exitMessage={} trackingURL={}", status.exitStatus,
                        status.exitMessage, status.postCompletionTrackingUrl);
                client.unregisterApplicationMaster(status.exitStatus, status.exitMessage,
                hasUnregistered = true;

    /**
     * Callback for newly allocated containers. When a stop was requested or the AM state is
     * COMPLETED the allocations are ignored and each container is visited individually;
     * otherwise the containers are passed to assignNewContainers.
     *
     * @param containers the containers that were allocated
     */
    public void onContainersAllocated(List<Container> containers) {
        AMState appState = getContext().getAMState();
        if (stopRequested || appState == AMState.COMPLETED) {
  "Ignoring {} allocations since app is terminating", containers.size());
            for (Container c : containers) {
        List<Assignment> assignments = assignNewContainers(containers, getContext().getAMState(),

    /**
     * Records newly allocated containers as held and tries to match each one to a pending
     * request, widening the locality in three passes: host, then rack, then ANY. Containers
     * still unmatched after the third pass are offered to the reuse path when reuse of new
     * containers is enabled.
     *
     * @param newContainers containers just allocated
     * @param appState the AM state passed on to the reuse path
     * @param isSession session flag passed on to the reuse path
     * @return the assignments that were made
     */
    private synchronized List<Assignment> assignNewContainers(List<Container> newContainers, AMState appState,
            boolean isSession) {
        // try to assign the containers as node-local
        List<Assignment> assignments = new ArrayList<>(newContainers.size());
        List<HeldContainer> unassigned = new ArrayList<>(newContainers.size());
        for (Container c : newContainers) {
            HeldContainer hc = new HeldContainer(c);
            // every new container is tracked and counted before any matching is attempted
            heldContainers.put(hc.getId(), hc);
            Resources.addTo(allocatedResources, c.getResource());
            tryAssignNewContainer(hc, hc.getHost(), assignments, unassigned);

        // try to assign the remaining containers as rack-local
        List<HeldContainer> containers = unassigned;
        unassigned = new ArrayList<>(containers.size());
        for (HeldContainer hc : containers) {
            tryAssignNewContainer(hc, hc.getRack(), assignments, unassigned);

        // try to assign the remaining containers without locality
        containers = unassigned;
        unassigned = new ArrayList<>(containers.size());
        for (HeldContainer hc : containers) {
            tryAssignNewContainer(hc, ResourceRequest.ANY, assignments, unassigned);

        // whatever is left matched no request at the container priority
        for (HeldContainer hc : unassigned) {
            if (reuseNewContainers) {
                TaskRequest assigned = tryAssignReuseContainer(hc, appState, isSession);
                if (assigned != null) {
                    assignments.add(new Assignment(assigned, hc.getContainer()));
            } else {

        return assignments;

    /**
     * Try to assign a newly acquired container to a task of the same priority.
     *
     * @param hc the container to assign
     * @param location the locality to consider for assignment
     * @param assignments list to update if container is assigned
     * @param unassigned list to update if container is not assigned
     */
    private void tryAssignNewContainer(HeldContainer hc, String location, List<Assignment> assignments,
            List<HeldContainer> unassigned) {
        // ask the RM client for outstanding requests at the container priority and the given location
        List<? extends Collection<TaskRequest>> results = client.getMatchingRequests(hc.getPriority(), location,
        if (!results.isEmpty()) {
            for (Collection<TaskRequest> requests : results) {
                if (!requests.isEmpty()) {
                    // take the first request of the first non-empty group
                    TaskRequest request = requests.iterator().next();
                    assignContainer(request, hc, location);
                    assignments.add(new Assignment(request, hc.getContainer()));


    /**
     * Decides what to do with a container that is available for reuse, depending on the AM
     * state. Only the RUNNING_APP path with pending requests attempts an assignment.
     *
     * @param hc the container available for reuse
     * @param appState the current AM state
     * @param isSession session flag passed to the idle handling
     * @return the request assigned to the container, or null when none was assigned
     *         (always null once a stop has been requested)
     * @throws IllegalStateException for an AM state this method does not expect
     */
    private TaskRequest tryAssignReuseContainer(HeldContainer hc, AMState appState, boolean isSession) {
        if (stopRequested) {
            return null;

        TaskRequest assignedRequest = null;
        switch (appState) {
        case IDLE:
            handleReuseContainerWhenIdle(hc, isSession);
        case RUNNING_APP:
            if (requestTracker.isEmpty()) {
                // treat no requests as if app is idle
                handleReuseContainerWhenIdle(hc, isSession);
            } else {
                assignedRequest = tryAssignReuseContainerAppRunning(hc);
                if (assignedRequest == null) {
                    // nothing matched: the outcome depends on whether the container is at its max match level
                    if (hc.atMaxMatchLevel()) {
              "Releasing idle container {} due to pending requests", hc.getId());
                    } else {
        case COMPLETED:
  "Releasing container {} because app has completed", hc.getId());
            throw new IllegalStateException("Unexpected app state " + appState);

        return assignedRequest;

    /**
     * Handles a reusable container while there is no work for it. Session containers are
     * retained; any other container is handled according to its idle expiration timestamp,
     * being rescheduled for another reuse attempt if it has not expired yet.
     *
     * @param hc the idle container
     * @param isSession whether the AM is running as a session
     */
    private void handleReuseContainerWhenIdle(HeldContainer hc, boolean isSession) {
        // in session mode with a configured minimum and no session containers chosen yet
        if (isSession && sessionContainers.isEmpty() && sessionNumMinHeldContainers > 0) {

        if (sessionContainers.contains(hc)) {
            // NOTE(review): the message below has a placeholder but no argument is supplied for it
  "Retaining container {} since it is a session container");
        } else {
            long now = now();
            long expiration = hc.getIdleExpirationTimestamp(now);
            if (now >= expiration) {
      "Releasing expired idle container {}", hc.getId());
            } else {
                // not expired yet: try again after the remaining idle time
                hc.scheduleForReuse(expiration - now);

    /**
     * Tries to find a pending request for a reusable container while the app is running.
     * Affinity requests for the container are tried first, then the request priorities
     * reported by the request tracker.
     *
     * @param hc the container available for reuse
     * @return the request assigned to the container, or null if the container is not
     *         assignable or nothing matched
     */
    private TaskRequest tryAssignReuseContainerAppRunning(HeldContainer hc) {
        if (!hc.isAssignable()) {
            LOG.debug("Skipping scheduling of container {} because it state is {}", hc.getId(), hc.getState());
            return null;

        TaskRequest assignedRequest = tryAssignReuseContainerForAffinity(hc);
        if (assignedRequest != null) {
            return assignedRequest;

        for (Entry<Priority, RequestPriorityStats> entry : requestTracker.getStatsEntries()) {
            Priority priority = entry.getKey();
            RequestPriorityStats stats = entry.getValue();
            // no vertex requesting at this priority is among the allowed vertices
            if (!stats.allowedVertices.intersects(stats.vertices)) {
                        "Skipping requests at priority {} because all requesting vertices are blocked by higher priority requests",

            String matchLocation = hc.getMatchingLocation();
            // with no locality requests at this priority there is nothing to gain from a narrower match
            if (stats.localityCount <= 0) {
                        "Overriding locality match of container {} to ANY since there are no locality requests at priority {}",
                        hc.getId(), priority);
                matchLocation = ResourceRequest.ANY;
            assignedRequest = tryAssignReuseContainerForPriority(hc, matchLocation, priority,
            if (assignedRequest != null) {
        return assignedRequest;

    /**
     * Tries to assign the container to one of the requests that declared an affinity for it.
     * The first affinity request that the request tracker does not report as blocked is
     * assigned, using the container ID as the match value.
     *
     * @param hc the container available for reuse
     * @return the affinity request that was assigned, or null if there was none
     */
    private TaskRequest tryAssignReuseContainerForAffinity(HeldContainer hc) {
        Collection<TaskRequest> affinities = hc.getAffinities();
        if (affinities != null) {
            for (TaskRequest request : affinities) {
                if (requestTracker.isRequestBlocked(request)) {
                            "Cannot assign task {} to container {} since vertex {} is a descendant of pending tasks",
                            request.getTask(), hc.getId(), request.getVertexIndex());
                } else {
                    assignContainer(request, hc, hc.getId());
                    return request;
        return null;

    /**
     * Tries to assign the container to a request at one priority and location. A request
     * qualifies when its vertex is allowed and the container either has no signature yet or
     * has a signature that the matcher reports as a superset of the request signature.
     *
     * @param hc the container available for reuse
     * @param matchLocation the location used to look up matching requests
     * @param priority the request priority to search
     * @param allowedVertices bits set for the vertices that may be scheduled
     * @return the request that was assigned, or null if nothing qualified
     */
    private TaskRequest tryAssignReuseContainerForPriority(HeldContainer hc, String matchLocation,
            Priority priority, BitSet allowedVertices) {
        List<? extends Collection<TaskRequest>> results = client.getMatchingRequests(priority, matchLocation,
        if (results.isEmpty()) {
            return null;

        for (Collection<TaskRequest> requests : results) {
            for (TaskRequest request : requests) {
                final int vertexIndex = request.getVertexIndex();
                if (!allowedVertices.get(vertexIndex)) {
                    LOG.debug("Not assigning task {} since it is a descendant of a pending vertex",

                Object signature = hc.getSignature();
                // a null signature is accepted without consulting the matcher
                if (signature == null || signatureMatcher.isSuperSet(signature, request.getContainerSignature())) {
                    assignContainer(request, hc, matchLocation);
                    return request;
        return null;

    /**
     * Informs the app about each assignment in the list by delegating to
     * informAppAboutAssignment for every entry.
     *
     * @param assignments the assignments to report; may be empty
     */
    private void informAppAboutAssignments(List<Assignment> assignments) {
        if (!assignments.isEmpty()) {
            for (Assignment a : assignments) {
                informAppAboutAssignment(a.request, a.container);

    /**
     * Inform the app about a task assignment.  This should not be called with
     * any locks held.
     *
     * @param request the corresponding task request
     * @param container the container assigned to the task
     */
    private void informAppAboutAssignment(TaskRequest request, Container container) {
        // a container on a blacklisted node is not handed to the app; the task is requested again instead
        if (blacklistedNodes.contains(container.getNodeId())) {
            Object task = request.getTask();
            // NOTE(review): the placeholders read container, task, node but the arguments are
            // passed as container, node, task - the last two look swapped
  "Container {} allocated for task {} on blacklisted node {}", container.getId(),
                    container.getNodeId(), task);
            // it is ok to submit the same request again because the RM will not give us
            // the bad/unhealthy nodes again. The nodes may become healthy/unblacklisted
            // and so it is better to give the RM the full information.
            allocateTask(task, request.getCapability(),
                    (request.getNodes() == null ? null
                            : request.getNodes().toArray(new String[request.getNodes().size()])),
                    (request.getRacks() == null ? null
                            : request.getRacks().toArray(new String[request.getRacks().size()])),
                    request.getPriority(), request.getContainerSignature(), request.getCookie());
        } else {
            getContext().taskAllocated(request.getTask(), request.getCookie(), container);

    /**
     * Chooses which held containers to treat as session containers, bounded by
     * sessionNumMinHeldContainers. Held containers are first grouped by rack and by host, a
     * per-rack budget is worked out one container at a time across the racks, and containers
     * are then taken one at a time across the nodes while respecting the rack budget.
     */
    private void computeSessionContainers() {
        Map<String, MutableInt> rackHeldNumber = new HashMap<>();
        Map<String, List<HeldContainer>> nodeHeldContainers = new HashMap<>();
        for (HeldContainer heldContainer : heldContainers.values()) {
            if (heldContainer.getSignature() == null) {
                // skip containers that have not been launched as there is no process to reuse
            MutableInt count = rackHeldNumber.get(heldContainer.getRack());
            if (count == null) {
                count = new MutableInt(0);
                rackHeldNumber.put(heldContainer.getRack(), count);
            String host = heldContainer.getHost();
            List<HeldContainer> nodeContainers = nodeHeldContainers.get(host);
            if (nodeContainers == null) {
                nodeContainers = new LinkedList<>();
                nodeHeldContainers.put(host, nodeContainers);

        // per-rack budget of containers to hold, starting at zero for every known rack
        Map<String, MutableInt> rackToHoldNumber = new HashMap<>();
        for (String rack : rackHeldNumber.keySet()) {
            rackToHoldNumber.put(rack, new MutableInt(0));

        // distribute evenly across racks
        // the loop assigns 1 container per rack over all racks
        int containerCount = 0;
        while (containerCount < sessionNumMinHeldContainers && !rackHeldNumber.isEmpty()) {
            Iterator<Entry<String, MutableInt>> iter = rackHeldNumber.entrySet().iterator();
            while (containerCount < sessionNumMinHeldContainers && iter.hasNext()) {
                Entry<String, MutableInt> entry =;
                MutableInt rackCount = entry.getValue();
                if (rackCount.intValue() >= 0) {
                } else {

        // distribute containers evenly across nodes while not exceeding rack limit
        // the loop assigns 1 container per node over all nodes
        containerCount = 0;
        while (containerCount < sessionNumMinHeldContainers && !nodeHeldContainers.isEmpty()) {
            Iterator<Entry<String, List<HeldContainer>>> iter = nodeHeldContainers.entrySet().iterator();
            while (containerCount < sessionNumMinHeldContainers && iter.hasNext()) {
                List<HeldContainer> nodeContainers =;
                if (nodeContainers.isEmpty()) {
                    // node is empty. remove it.
                // take the container most recently added to the list for this node
                HeldContainer heldContainer = nodeContainers.remove(nodeContainers.size() - 1);
                MutableInt holdCount = rackToHoldNumber.get(heldContainer.getRack());
                if (holdCount.intValue() >= 0) {
                    // rack can hold a container
                } else {
                    // rack limit reached. remove node.
        }"Identified {} session containers out of {} total containers", sessionContainers.size(),

    /**
     * Visits every session container and acts on those that report themselves as assignable.
     * Does nothing when there are no session containers.
     */
    private void activateSessionContainers() {
        if (!sessionContainers.isEmpty()) {
            for (HeldContainer hc : sessionContainers) {
                if (hc.isAssignable()) {

    /**
     * Callback for completed containers. Under the scheduler lock, resolves the task tied to
     * each completed container, looking first in releasedContainers and then in
     * heldContainers, and afterwards reports each task and its status to the context.
     *
     * @param statuses completion statuses reported by the RM
     */
    public void onContainersCompleted(List<ContainerStatus> statuses) {
        if (stopRequested) {

        List<TaskStatus> taskStatusList = new ArrayList<>(statuses.size());
        synchronized (this) {
            for (ContainerStatus status : statuses) {
                ContainerId cid = status.getContainerId();
      "Container {} completed with status {}", cid, status);
                // a container this scheduler released earlier still has its task recorded here
                Object task = releasedContainers.remove(cid);
                if (task == null) {
                    HeldContainer hc = heldContainers.get(cid);
                    if (hc != null) {
                        task = containerCompleted(hc);
                if (task != null) {
                    taskStatusList.add(new TaskStatus(task, status));

        // perform app callback outside of locks
        for (TaskStatus taskStatus : taskStatusList) {
            getContext().containerCompleted(taskStatus.task, taskStatus.status);

    /**
     * Callback for node updates. The updates are only acted on while no stop has been requested.
     *
     * @param updatedNodes node reports delivered by the RM
     */
    public void onNodesUpdated(List<NodeReport> updatedNodes) {
        if (!stopRequested) {

    /**
     * Progress callback. Besides reporting progress, this also initialises totalResources from
     * the first available-resources report and runs the preemption check under the scheduler lock.
     *
     * @return 1 once a stop has been requested, otherwise the progress reported by the context
     */
    public float getProgress() {
        if (stopRequested) {
            return 1;

        Collection<ContainerId> preemptedContainers;
        synchronized (this) {
            Resource freeResources = getAvailableResources();
            if (totalResources.getMemory() == 0) {
                // assume this is the first allocate callback. nothing is allocated.
                // available resource = totalResource
                // TODO this will not handle dynamic changes in resources
                totalResources = Resources.clone(freeResources);
      "App total resource memory: {} cpu: {} activeAssignments: {}", totalResources.getMemory(),
                        totalResources.getVirtualCores(), taskAssignments.size());

            // true whenever debug logging is on, otherwise only on every 50th heartbeat
            if (LOG.isDebugEnabled() || numHeartbeats % 50 == 1) {

            preemptedContainers = maybePreempt(freeResources);
            if (preemptedContainers != null && !preemptedContainers.isEmpty()) {
                // remember the heartbeat on which containers were last preempted
                lastPreemptionHeartbeat = numHeartbeats;

        // perform app callback outside of locks
        if (preemptedContainers != null && !preemptedContainers.isEmpty()) {
            for (ContainerId cid : preemptedContainers) {
      "Preempting container {} currently allocated to a task", cid);

        return getContext().getProgress();

    /**
     * Callback for a shutdown request. The request is only acted on while no stop has been requested.
     */
    public void onShutdownRequest() {
        if (!stopRequested) {

    /**
     * Callback for an error raised by the RM client. The error is always logged; further
     * handling, which uses the stringified exception, happens only while no stop has been requested.
     *
     * @param e the error that was reported
     */
    public void onError(Throwable e) {
        // NOTE(review): the message says ARMRMClient; presumably AMRMClient was intended
        LOG.error("Error from ARMRMClient", e);
        if (!stopRequested) {
                    StringUtils.stringifyException(e), null);

    /**
     * Returns the available resources as reported by the RM client.
     *
     * @return the value of client.getAvailableResources()
     */
    public Resource getAvailableResources() {
        return client.getAvailableResources();

    /**
     * Returns the total resources recorded by this scheduler. The field starts as a zero
     * resource and is set by getProgress() from the first available-resources report.
     *
     * @return the totalResources field itself, not a copy
     */
    public Resource getTotalResources() {
        return totalResources;

    /**
     * Returns the cluster node count as reported by the RM client.
     *
     * @return the value of client.getClusterNodeCount()
     */
    public int getClusterNodeCount() {
        return client.getClusterNodeCount();

    /**
     * Blacklists a node by asking the RM client to add the node host to its blacklist.
     *
     * @param nodeId the node to blacklist
     */
    public synchronized void blacklistNode(NodeId nodeId) {"Blacklisting node: {}", nodeId);
        // first argument is the addition list, second the removal list (none here)
        client.updateBlacklist(Collections.singletonList(nodeId.getHost()), null);

    /**
     * Removes a node from the blacklist. The RM client is only updated when the node was
     * actually present in blacklistedNodes.
     *
     * @param nodeId the node to remove from the blacklist
     */
    public synchronized void unblacklistNode(NodeId nodeId) {
        if (blacklistedNodes.remove(nodeId)) {
  "Removing blacklist for node: {}", nodeId);
            // first argument is the addition list (none here), second the removal list
            client.updateBlacklist(null, Collections.singletonList(nodeId.getHost()));

    /**
     * Builds a task request for the given task with optional host and rack locality. The
     * vertex index of the task is looked up through the scheduler context.
     *
     * @param task the task to allocate a container for
     * @param capability the resources the task needs
     * @param hosts preferred hosts, may be null
     * @param racks preferred racks, may be null
     * @param priority the request priority
     * @param containerSignature the container signature the task requires
     * @param clientCookie opaque value stored with the request
     */
    public void allocateTask(Object task, Resource capability, String[] hosts, String[] racks, Priority priority,
            Object containerSignature, Object clientCookie) {
        int vertexIndex = getContext().getVertexIndexForTask(task);
        TaskRequest request = new TaskRequest(task, vertexIndex, capability, hosts, racks, priority,
                containerSignature, clientCookie);

    /**
     * Builds a task request that prefers a specific container. When the container is held and
     * can fit the capability, its host becomes the only preferred host; when it is unknown or
     * too small, the container preference is dropped by nulling containerId.
     *
     * @param task the task to allocate a container for
     * @param capability the resources the task needs
     * @param containerId the container the task would like to run in
     * @param priority the request priority
     * @param containerSignature the container signature the task requires
     * @param clientCookie opaque value stored with the request
     */
    public void allocateTask(Object task, Resource capability, ContainerId containerId, Priority priority,
            Object containerSignature, Object clientCookie) {
        String[] hosts = null;
        synchronized (this) {
            HeldContainer held = heldContainers.get(containerId);
            if (held != null) {
                if (held.canFit(capability)) {
                    hosts = new String[] { held.getHost() };
                } else {
                    LOG.warn("Match request to container {} but {} does not fit in {}", containerId, capability,
                    containerId = null;
            } else {
      "Ignoring match request to unknown container {}", containerId);
                containerId = null;
        int vertexIndex = getContext().getVertexIndexForTask(task);
        // racks are always null for this variant; containerId may have been nulled above
        TaskRequest request = new TaskRequest(task, vertexIndex, capability, hosts, null, priority,
                containerSignature, clientCookie, containerId);

    /**
     * Deallocates a task. A task that still had a pending request is simply removed. Otherwise
     * the container assigned to the task is either offered for reuse (when the task succeeded
     * and reuse is enabled) or released.
     *
     * @param task the task being deallocated
     * @param taskSucceeded whether the task completed successfully
     * @param endReason not referenced in the lines visible here
     * @param diagnostics not referenced in the lines visible here
     * @return false when the task was deallocated before it was allocated, true when the
     *         container was reassigned or released, otherwise whether a container was found
     */
    public boolean deallocateTask(Object task, boolean taskSucceeded, TaskAttemptEndReason endReason,
            String diagnostics) {
        ContainerId releasedLaunchedContainer = null;
        // read the context state before taking the scheduler lock
        AMState appState = getContext().getAMState();
        boolean isSession = getContext().isSession();
        TaskRequest newAssignment = null;
        HeldContainer hc;
        synchronized (this) {
            TaskRequest request = removeTaskRequest(task);
            if (request != null) {
                LOG.debug("Deallocating task {} before it was allocated", task);
                return false;

            hc = removeTaskAssignment(task);
            if (hc != null) {
                if (taskSucceeded && shouldReuseContainers) {
                    newAssignment = tryAssignReuseContainer(hc, appState, isSession);
                    // the reuse attempt may itself have released the container
                    if (newAssignment == null && hc.isReleasedAndUsed()) {
                        releasedLaunchedContainer = hc.getId();
                } else {
                    if (releaseContainer(hc)) {
                        releasedLaunchedContainer = hc.getId();

        // perform app callback outside of locks
        if (newAssignment != null) {
            informAppAboutAssignment(newAssignment, hc.getContainer());
            return true;
        if (releasedLaunchedContainer != null) {
            return true;
        return hc != null;

    /**
     * Stops tracking a container and releases it. Unknown containers are ignored.
     *
     * @param containerId the container to deallocate
     * @return the task that was assigned to the container, or null if there was none or the
     *         container is unknown
     */
    public Object deallocateContainer(ContainerId containerId) {
        Object task = null;
        ContainerId releasedLaunchedContainer = null;
        synchronized (this) {
            // the container leaves heldContainers before anything else happens
            HeldContainer hc = heldContainers.remove(containerId);
            if (hc != null) {
                task = hc.getAssignedTask();
                if (task != null) {
          "Deallocated container {} from task {}", containerId, task);
                if (releaseContainer(hc)) {
                    releasedLaunchedContainer = hc.getId();
            } else {
      "Ignoring deallocation of unknown container {}", containerId);

        // perform app callback outside of locks
        if (releasedLaunchedContainer != null) {
        return task;

    /**
     * Assigns a container to a task request and records the assignment through addTaskAssignment.
     *
     * @param request the request being satisfied
     * @param hc the container being assigned
     * @param match what the container was matched on; callers pass a location string or a container ID
     */
    private void assignContainer(TaskRequest request, HeldContainer hc, Object match) {"Assigning container {} to task {} host={} priority={} capability={} match={} lastTask={}",
                hc.getId(), request.getTask(), hc.getHost(), hc.getPriority(), hc.getCapability(), match,
        addTaskAssignment(request, hc);

    /**
     * Releases a held container. When containerCompleted reports a task for the container,
     * that task is remembered in releasedContainers under the container ID.
     *
     * @param hc the container to release
     * @return true if a task was recorded for the container, false otherwise
     */
    private synchronized boolean releaseContainer(HeldContainer hc) {
        Object task = containerCompleted(hc);
        if (task != null) {
            releasedContainers.put(hc.getId(), task);
            return true;
        return false;

    /**
     * Records that the task of the request is assigned to the container, in taskAssignments
     * and in the per-vertex container sets. A task that was already assigned to another
     * container is logged as an error.
     *
     * @param request the request being satisfied
     * @param hc the container assigned to the task of the request
     */
    private void addTaskAssignment(TaskRequest request, HeldContainer hc) {
        // put returns the previous mapping, which should not exist
        HeldContainer oldContainer = taskAssignments.put(request.getTask(), hc);
        if (oldContainer != null) {
            LOG.error("Task {} being assigned to container {} but was already assigned to container {}",
                    request.getTask(), hc.getId(), oldContainer.getId());
        Integer vertexIndex = request.vertexIndex;
        Set<HeldContainer> cset = vertexAssignments.get(vertexIndex);
        if (cset == null) {
            // first container for this vertex
            cset = new HashSet<>();
            vertexAssignments.put(vertexIndex, cset);

    /**
     * Removes the assignment of a task, clearing the assignment on its container and removing
     * the container from the per-vertex set of the vertex the request belonged to.
     *
     * @param task the task whose assignment is removed
     * @return the container the task was assigned to, or null if it was not assigned
     */
    private HeldContainer removeTaskAssignment(Object task) {
        HeldContainer hc = taskAssignments.remove(task);
        if (hc != null) {
            TaskRequest request = hc.removeAssignment();
            if (request != null) {
                Integer vertexIndex = request.vertexIndex;
                Set<HeldContainer> cset = vertexAssignments.get(vertexIndex);
                // true only when this was the last container assigned for the vertex
                if (cset != null && cset.remove(hc) && cset.isEmpty()) {
            } else {
                LOG.error("Container {} had assigned task {} but no request?!?", hc.getId(), task);
        return hc;

    // Bookkeeping for a container that is finished: its capability is
    // subtracted from allocatedResources. Returns the last task that was
    // assigned to the container, or null if it never had one.
    private Object containerCompleted(HeldContainer hc) {
        Resources.subtractFrom(allocatedResources, hc.getCapability());
        return hc.getLastTask();

    // Lazily initializes vertexDescendants from the current DAG info; does
    // nothing when it is already set. Throws IllegalStateException when the
    // context reports no current DAG.
    private void ensureVertexDescendants() {
        if (vertexDescendants == null) {
            DagInfo info = getContext().getCurrentDagInfo();
            if (info == null) {
                throw new IllegalStateException("Scheduling tasks but no current DAG info?");
            int numVertices = info.getTotalVertices();
            ArrayList<BitSet> descendants = new ArrayList<>(numVertices);
            // One iteration per vertex index.
            // NOTE(review): the loop body that populates the list is not visible
            // in this chunk.
            for (int i = 0; i < numVertices; ++i) {
            vertexDescendants = descendants;

    // Accepts a new task request. Under the scheduler lock it first tries to
    // satisfy the request from an idle held container (only when container
    // reuse is enabled, no stop was requested and the AM is not COMPLETED);
    // otherwise the request is added to requestTracker. The application is
    // notified of an immediate assignment only after the lock is released.
    private void addTaskRequest(TaskRequest request) {
        Container assignedContainer = null;
        synchronized (this) {
            if (shouldReuseContainers && !stopRequested && getContext().getAMState() != AMState.COMPLETED) {
                HeldContainer hc = tryAssignTaskToIdleContainer(request);
                if (hc != null) {
                    assignedContainer = hc.getContainer();

            if (assignedContainer == null) {
                // requestTracker.add returns the request previously registered
                // for the same task, if any.
                // NOTE(review): handling of that old request is not visible here.
                TaskRequest old = requestTracker.add(request);
                if (old != null) {

                // Look up the container this request has affinity for, if it is held.
                // NOTE(review): the action taken when it is found is not visible here.
                HeldContainer hc = heldContainers.get(request.getAffinity());
                if (hc != null) {

        // perform app callback outside of locks
        if (assignedContainer != null) {
            informAppAboutAssignment(request, assignedContainer);

    // Removes the pending request for the given task from requestTracker and
    // returns it, or null when the task had no pending request.
    // NOTE(review): the follow-up work for a non-null request is not visible in
    // this chunk.
    private synchronized TaskRequest removeTaskRequest(Object task) {
        TaskRequest request = requestTracker.remove(task);
        if (request != null) {
        return request;

    // Looks up the held container that the request names as its affinity.
    // NOTE(review): the action taken when that container is found is not
    // visible in this chunk -- confirm against the full file.
    private void removeTaskRequestByRequest(TaskRequest request) {
        HeldContainer hc = heldContainers.get(request.getAffinity());
        if (hc != null) {

    // Tries to place the request on an idle held container right away.
    // Order of attempts: (1) give up if requestTracker reports the request as
    // blocked, (2) use the affinity container when it is held and assignable,
    // (3) otherwise search idle containers by the request's nodes, then its
    // racks, then ANY; requests without locality search ANY only.
    // Returns the container the task was assigned to, or null if none matched.
    private HeldContainer tryAssignTaskToIdleContainer(TaskRequest request) {
        if (requestTracker.isRequestBlocked(request)) {
            LOG.debug("Cannot assign task {} to an idle container since vertex {} is a descendant of pending tasks",
                    request.getTask(), request.getVertexIndex());
            return null;

        // check if container affinity can be satisfied immediately
        ContainerId affinity = request.getAffinity();
        if (affinity != null) {
            HeldContainer hc = heldContainers.get(affinity);
            if (hc != null && hc.isAssignable()) {
                assignContainer(request, hc, affinity);
                return hc;

        // try to match the task against idle containers in order from best locality to worst
        // NOTE(review): several of the calls below are truncated in this chunk
        // (their eligible-state arguments are not visible).
        HeldContainer hc;
        if (request.hasLocality()) {
            hc = tryAssignTaskToIdleContainer(request, request.getNodes(), HeldContainerState.MATCHES_LOCAL_STATES);
            if (hc == null) {
                hc = tryAssignTaskToIdleContainer(request, request.getRacks(),
                if (hc == null) {
                    hc = tryAssignTaskToIdleContainer(request, ResourceRequest.ANY,
        } else {
            hc = tryAssignTaskToIdleContainer(request, ResourceRequest.ANY,

        return hc;

    private HeldContainer tryAssignTaskToIdleContainer(TaskRequest request, List<String> locations,
            EnumSet<HeldContainerState> eligibleStates) {
        if (locations != null && !locations.isEmpty()) {
            for (String location : locations) {
                HeldContainer hc = tryAssignTaskToIdleContainer(request, location, eligibleStates);
                if (hc != null) {
                    return hc;
        return null;

    private HeldContainer tryAssignTaskToIdleContainer(TaskRequest request, String location,
            EnumSet<HeldContainerState> eligibleStates) {
        Set<HeldContainer> containers = idleTracker.getByLocation(location);
        HeldContainer bestMatch = null;
        if (containers != null && !containers.isEmpty()) {
            for (HeldContainer hc : containers) {
                if (eligibleStates.contains(hc.getState())) {
                    Object csig = hc.getSignature();
                    if (csig == null || signatureMatcher.isSuperSet(csig, request.getContainerSignature())) {
                        int numAffinities = hc.getNumAffinities();
                        if (numAffinities == 0) {
                            bestMatch = hc;
                        if (bestMatch == null || numAffinities < bestMatch.getNumAffinities()) {
                            bestMatch = hc;
                    } else {
                        LOG.debug("Unable to assign task {} to container {} due to signature mismatch",
                                request.getTask(), hc.getId());
        if (bestMatch != null) {
            assignContainer(request, bestMatch, location);
        return bestMatch;

    // Sets the shouldUnregister flag to true.
    public void setShouldUnregister() {
        shouldUnregister = true;

    // Returns the current value of the hasUnregistered flag.
    public boolean hasUnregistered() {
        return hasUnregistered;

    // Called when a DAG finishes. Visits every session container and discards
    // the cached vertexDescendants so it is rebuilt for the next DAG.
    // NOTE(review): the per-container work inside the loop is not visible in
    // this chunk.
    public synchronized void dagComplete() {
        for (HeldContainer hc : sessionContainers) {
        vertexDescendants = null;

    // Decides whether running containers should be preempted and, if so,
    // selects them. Returns null whenever preemption is skipped, otherwise the
    // list built in preemptedContainers.
    // Preemption is skipped when it is disabled (percentage 0), when too few
    // heartbeats have passed since the last preemption, when the highest
    // priority request still fits in the free resources and its deadline has
    // not expired, when idle containers exist, when no blocked vertex has
    // running assignments, or when the amount to preempt is empty.
    // NOTE(review): several statements in this method are truncated in this
    // chunk (log calls, loop bodies); comments describe only what is visible.
    private Collection<ContainerId> maybePreempt(Resource freeResources) {
        if (preemptionPercentage == 0
                || numHeartbeats - lastPreemptionHeartbeat < numHeartbeatsBetweenPreemptions) {
            return null;
        if (!requestTracker.isPreemptionDeadlineExpired()
                && requestTracker.fitsHighestPriorityRequest(freeResources)) {
            // Rate-limit the message to one heartbeat in every 50.
            if (numHeartbeats % 50 == 1) {
      "Highest priority request fits in free resources {}", freeResources);
            return null;

        int numIdleContainers = idleTracker.getNumContainers();
        if (numIdleContainers > 0) {
            if (numHeartbeats % 50 == 1) {
      "Avoiding preemption since there are {} idle containers", numIdleContainers);
            return null;

        BitSet blocked = requestTracker.createVertexBlockedSet();
        if (!blocked.intersects(assignedVertices)) {
            if (numHeartbeats % 50 == 1) {
                        "Avoiding preemption since there are no descendants of the highest priority requests running");
            return null;

        Resource preemptLeft = requestTracker.getAmountToPreempt(preemptionPercentage);
        if (!resourceCalculator.anyAvailable(preemptLeft)) {
            if (numHeartbeats % 50 == 1) {
      "Avoiding preemption since amount to preempt is {}", preemptLeft);
            return null;

        // Candidates are polled in PREEMPT_ORDER_COMPARATOR order.
        PriorityQueue<HeldContainer> candidates = new PriorityQueue<>(11, PREEMPT_ORDER_COMPARATOR);
        for (int i = blocked.nextSetBit(0); i >= 0; i = blocked.nextSetBit(i + 1)) {
            Collection<HeldContainer> containers = vertexAssignments.get(i);
            // NOTE(review): the branch that adds containers to the candidate
            // queue is not visible in this chunk.
            if (containers != null) {
            } else {
                LOG.error("Vertex {} in assignedVertices but no assignments?", i);

        ArrayList<ContainerId> preemptedContainers = new ArrayList<>();
        HeldContainer hc;
        while ((hc = candidates.poll()) != null) {
  "Preempting container {} currently allocated to task {}", hc.getId(), hc.getAssignedTask());
            // Reduce the outstanding amount by this container's capability.
            resourceCalculator.deductFrom(preemptLeft, hc.getCapability());
            // NOTE(review): the action once nothing is left to preempt is not
            // visible in this chunk.
            if (!resourceCalculator.anyAvailable(preemptLeft)) {

        return preemptedContainers;

    private String constructPeriodicLog(Resource freeResource) {
        Priority highestPriority = requestTracker.getHighestPriority();
        return "Allocated: " + allocatedResources + " Free: " + freeResource + " pendingRequests: "
                + requestTracker.getNumRequests() + " heldContainers: " + heldContainers.size() + " heartbeats: "
                + numHeartbeats + " lastPreemptionHeartbeat: " + lastPreemptionHeartbeat
                + ((highestPriority != null)
                        ? (" highestWaitingRequestWaitStartTime: "
                                + requestTracker.getHighestPriorityWaitTimestamp()
                                + " highestWaitingRequestPriority: " + highestPriority)
                        : "");

    // Returns the number of entries currently in blacklistedNodes.
    int getNumBlacklistedNodes() {
        return blacklistedNodes.size();

    // Returns the sessionContainers collection itself (not a copy), so callers
    // see, and can affect, the scheduler's own state.
    Collection<HeldContainer> getSessionContainers() {
        return sessionContainers;

    // Wrapper class to work around lack of blacklisting APIs in async client.
    // This can be removed once Tez requires YARN >= 2.7.0
    static class AMRMClientAsyncWrapper extends AMRMClientAsyncImpl<TaskRequest> {
        // Passes the synchronous client, heartbeat interval (ms) and callback
        // handler straight to the AMRMClientAsyncImpl constructor.
        AMRMClientAsyncWrapper(AMRMClient<TaskRequest> syncClient, int intervalMs, CallbackHandler handler) {
            super(syncClient, intervalMs, handler);

        // Forwards blacklist additions and removals to the wrapped client.
        public void updateBlacklist(List<String> additions, List<String> removals) {
            client.updateBlacklist(additions, removals);

    /** A utility class to track a task allocation. */
    static class TaskRequest extends AMRMClient.ContainerRequest {
        // The task this request is for; used as the key when tracking requests.
        final Object task;
        // Index of the DAG vertex the task belongs to.
        final int vertexIndex;
        // Container signature the task needs; compared against a container's
        // signature when matching for reuse.
        final Object signature;
        // Opaque value stored with the request and returned by getCookie().
        final Object cookie;
        // Container this request has affinity for, or null when it has none.
        final ContainerId affinityContainerId;

        // Creates a request with no container affinity.
        TaskRequest(Object task, int vertexIndex, Resource capability, String[] hosts, String[] racks,
                Priority priority, Object signature, Object cookie) {
            this(task, vertexIndex, capability, hosts, racks, priority, signature, cookie, null);

        // Creates a request; capability, hosts, racks and priority are handed to
        // the ContainerRequest superclass, the rest is stored on this object.
        TaskRequest(Object task, int vertexIndex, Resource capability, String[] hosts, String[] racks,
                Priority priority, Object signature, Object cookie, ContainerId affinityContainerId) {
            super(capability, hosts, racks, priority);
            this.task = task;
            this.vertexIndex = vertexIndex;
            this.signature = signature;
            this.cookie = cookie;
            this.affinityContainerId = affinityContainerId;

        Object getTask() {
            return task;

        int getVertexIndex() {
            return vertexIndex;

        Object getContainerSignature() {
            return signature;

        Object getCookie() {
            return cookie;

        ContainerId getAffinity() {
            return affinityContainerId;

        // True when the request names at least one node or at least one rack.
        boolean hasLocality() {
            List<String> nodes = getNodes();
            List<String> racks = getRacks();
            return (nodes != null && !nodes.isEmpty()) || (racks != null && !racks.isEmpty());

    // States of a held container. The three MATCHING_* states are assignable;
    // ASSIGNED and RELEASED are not (see the boolean passed to each constant).
    private enum HeldContainerState {
        MATCHING_LOCAL(true), MATCHING_RACK(true), MATCHING_ANY(true), ASSIGNED(false), RELEASED(false);

        // State sets passed as the eligible states when searching idle containers.
        // NOTE(review): the MATCHES_LOCAL_STATES and MATCHES_ANY_STATES
        // initializers are truncated in this chunk.
        private static final EnumSet<HeldContainerState> MATCHES_LOCAL_STATES = EnumSet.of(
                HeldContainerState.MATCHING_LOCAL, HeldContainerState.MATCHING_RACK,
        private static final EnumSet<HeldContainerState> MATCHES_RACK_STATES = EnumSet
                .of(HeldContainerState.MATCHING_RACK, HeldContainerState.MATCHING_ANY);
        private static final EnumSet<HeldContainerState> MATCHES_ANY_STATES = EnumSet

        // Whether a container in this state may be given a task.
        private final boolean assignable;

        HeldContainerState(boolean assignable) {
            this.assignable = assignable;

        boolean isAssignable() {
            return assignable;

    /** Tracking for an allocated container. */
    // Inner (non-static) class: it reads fields of the enclosing scheduler such
    // as reuseExecutor, stopRequested, reuseRackLocal and reuseNonLocal.
    // It is Callable so that it can be submitted to reuseExecutor (see
    // scheduleForReuse); call() then retries matching the container to a task.
    class HeldContainer implements Callable<Void> {
        final Container container;
        // Network location resolved once from the container's host.
        final String rack;
        HeldContainerState state = HeldContainerState.MATCHING_LOCAL;

        /** The Future received when scheduling an idle container for re-allocation at a later time. */
        Future<Void> future = null;

        /** The collection of task requests that have specified this container as a scheduling affinity. */
        Collection<TaskRequest> affinities = null;

         * The task request corresponding to the currently assigned task to this container.
         * This field is null when the container is not currently assigned.
        TaskRequest assignedRequest = null;

        /** The task request corresponding to the last task that was assigned to this container. */
        TaskRequest lastRequest = null;

        /** The timestamp when the idle container will expire. 0 if the container is not idle. */
        long idleExpirationTimestamp = 0;

        /** The timestamp when this container was assigned. 0 if the container is not assigned. */
        long assignmentTimestamp = 0;

        HeldContainer(Container container) {
            this.container = container;
            this.rack = RackResolver.resolve(container.getNodeId().getHost()).getNetworkLocation();

        HeldContainerState getState() {
            return state;

        boolean isAssignable() {
            return state.isAssignable();

        // True when the container is RELEASED and has run at least one task.
        boolean isReleasedAndUsed() {
            return state == HeldContainerState.RELEASED && getLastTask() != null;

        Container getContainer() {
            return container;

        ContainerId getId() {
            return container.getId();

        String getHost() {
            return container.getNodeId().getHost();

        String getRack() {
            return rack;

        Priority getPriority() {
            return container.getPriority();

        Resource getCapability() {
            return container.getResource();

        // Task of the currently assigned request, or null when unassigned.
        Object getAssignedTask() {
            return assignedRequest != null ? assignedRequest.getTask() : null;

        // Marks this container as running the given request: records it as both
        // the assigned and the last request, moves to ASSIGNED, clears the idle
        // expiration and stamps the assignment time.
        // NOTE(review): the error log call and the handling of a pending future
        // are truncated in this chunk.
        void assignTask(TaskRequest request) {
            assert state != HeldContainerState.ASSIGNED && state != HeldContainerState.RELEASED;
            if (assignedRequest != null) {
                LOG.error("Container {} assigned task {} but already running task {}", getId(), request.getTask(),
            assignedRequest = request;
            lastRequest = request;
            state = HeldContainerState.ASSIGNED;
            idleExpirationTimestamp = 0;
            assignmentTimestamp = now();
            if (future != null) {
                future = null;

        // Detaches the current request and returns it; the container goes back
        // to MATCHING_LOCAL. lastRequest is intentionally left untouched so the
        // signature and last task remain available.
        TaskRequest removeAssignment() {
            assert state == HeldContainerState.ASSIGNED;
            TaskRequest result = assignedRequest;
            assignedRequest = null;
            assignmentTimestamp = 0;
            state = HeldContainerState.MATCHING_LOCAL;
            return result;

        // Lazily creates the affinities set.
        // NOTE(review): no statement adding the request to the set is visible in
        // this chunk -- confirm against the full file.
        void addAffinity(TaskRequest request) {
            if (affinities == null) {
                affinities = new HashSet<>();

        // Removes the request and drops the set once it becomes empty.
        void removeAffinity(TaskRequest request) {
            if (affinities != null && affinities.remove(request) && affinities.isEmpty()) {
                affinities = null;

        int getNumAffinities() {
            return affinities != null ? affinities.size() : 0;

        // May return null when no affinities are recorded.
        Collection<TaskRequest> getAffinities() {
            return affinities;

        // Submits this container to reuseExecutor to run call() after the delay.
        // A rejected submission is logged only when no stop has been requested.
        // NOTE(review): the handling of an already pending future is not visible
        // in this chunk.
        void scheduleForReuse(long delayMillis) {
            assert state != HeldContainerState.ASSIGNED && state != HeldContainerState.RELEASED;
            try {
                if (future != null) {
                future = reuseExecutor.schedule(this, delayMillis, TimeUnit.MILLISECONDS);
            } catch (RejectedExecutionException e) {
                if (!stopRequested) {
                    LOG.error("Container {} could not be scheduled for reuse!", getId(), e);

        // Signature of the last request that ran here, or null if none has.
        Object getSignature() {
            return lastRequest != null ? lastRequest.getContainerSignature() : null;

        // Task of the last request that ran here, or null if none has.
        Object getLastTask() {
            return lastRequest != null ? lastRequest.getTask() : null;

        // Location key matching the current matching state: host, rack or ANY.
        // NOTE(review): the label guarding the throw is not visible in this chunk.
        String getMatchingLocation() {
            switch (state) {
            case MATCHING_LOCAL:
                return getHost();
            case MATCHING_RACK:
                return getRack();
            case MATCHING_ANY:
                return ResourceRequest.ANY;
                throw new IllegalStateException("Container " + getId() + " trying to match in state " + state);

        // Widens the matching level (local -> rack -> any), subject to the
        // reuseRackLocal / reuseNonLocal settings.
        // NOTE(review): break/default statements are not visible in this chunk,
        // so the exact fall-through behavior cannot be confirmed from here.
        void moveToNextMatchingLevel() {
            switch (state) {
            case MATCHING_LOCAL:
                if (reuseRackLocal) {
                    state = HeldContainerState.MATCHING_RACK;
            case MATCHING_RACK:
                if (reuseNonLocal) {
                    state = HeldContainerState.MATCHING_ANY;
            case MATCHING_ANY:
                throw new IllegalStateException("Container " + getId() + " trying to match in state " + state);

        // True when the matching level cannot be widened any further under the
        // current reuse settings.
        boolean atMaxMatchLevel() {
            switch (state) {
            case MATCHING_LOCAL:
                return !reuseRackLocal;
            case MATCHING_RACK:
                return !reuseNonLocal;
            case MATCHING_ANY:
                return true;
                throw new IllegalStateException("Container " + getId() + " trying to match in state " + state);

        // Returns an assignable container to MATCHING_LOCAL; no-op otherwise.
        void resetMatchingLevel() {
            if (isAssignable()) {
                state = HeldContainerState.MATCHING_LOCAL;

        // Computes the idle expiration on first use and caches it: a random
        // point between the min and max idle timeouts after 'now', or
        // Long.MAX_VALUE when the min timeout is not positive.
        // NOTE(review): if 'random' is a commons-math RandomDataGenerator,
        // nextLong(lower, upper) rejects lower >= upper -- confirm that
        // idleContainerTimeoutMin < idleContainerTimeoutMax is enforced where
        // these values are configured.
        long getIdleExpirationTimestamp(long now) {
            if (idleExpirationTimestamp == 0) {
                if (idleContainerTimeoutMin > 0) {
                    idleExpirationTimestamp = now
                            + random.nextLong(idleContainerTimeoutMin, idleContainerTimeoutMax);
                } else {
                    idleExpirationTimestamp = Long.MAX_VALUE;
            return idleExpirationTimestamp;

        long getAssignmentTimestamp() {
            return assignmentTimestamp;

        // True when both memory and virtual cores of this container are at
        // least those of the requested capability.
        boolean canFit(Resource capability) {
            Resource cr = container.getResource();
            return cr.getMemory() >= capability.getMemory() && cr.getVirtualCores() >= capability.getVirtualCores();

        // Body executed when a submission made by scheduleForReuse runs. AM
        // state and session flag are read before taking the scheduler lock;
        // under the lock the pending future is cleared and, if the container is
        // still assignable, a reuse assignment is attempted. The application
        // callback for an assignment is made after the lock is released.
        // NOTE(review): the action taken for a released container is not visible
        // in this chunk.
        public Void call() throws Exception {
            AMState appState = getContext().getAMState();
            boolean isSession = getContext().isSession();
            TaskRequest assigned = null;
            ContainerId released = null;
            synchronized (DagAwareYarnTaskScheduler.this) {
                future = null;
                if (isAssignable()) {
                    assigned = tryAssignReuseContainer(this, appState, isSession);
                    if (assigned == null && isReleasedAndUsed()) {
                        released = getId();
            if (assigned != null) {
                informAppAboutAssignment(assigned, container);
            if (released != null) {
            return null;

        // Moves the container to RELEASED and forgets any pending future.
        // NOTE(review): what is done with a non-null future before it is cleared
        // is not visible in this chunk.
        void released() {
            assert state != HeldContainerState.RELEASED;
            state = HeldContainerState.RELEASED;
            if (future != null) {
            future = null;

     * Utility comparator to order containers by assignment timestamp from
     * most recent to least recent.
    private static class PreemptOrderComparator implements Comparator<HeldContainer> {
        public int compare(HeldContainer o1, HeldContainer o2) {
            long timestamp1 = o1.getAssignmentTimestamp();
            if (timestamp1 == 0) {
                timestamp1 = Long.MAX_VALUE;
            long timestamp2 = o2.getAssignmentTimestamp();
            if (timestamp2 == 0) {
                timestamp2 = Long.MAX_VALUE;
            return, timestamp1);

    /** Utility class for a request, container pair. */
    private static class Assignment {
        // The task request that was satisfied.
        final TaskRequest request;
        // The container the request was paired with.
        final Container container;

        Assignment(TaskRequest request, Container container) {
            this.request = request;
            this.container = container;

    /** Utility class for a task, container exit status pair. */
    private static class TaskStatus {
        // The task the status refers to.
        final Object task;
        // The container status paired with that task.
        final ContainerStatus status;

        TaskStatus(Object task, ContainerStatus status) {
            this.task = task;
            this.status = status;

    /**
     * The task allocation request tracker tracks task allocations
     * and keeps statistics on which priorities have requests and which vertices
     * should be blocked from container reuse due to DAG topology.
     */
    // Inner (non-static) class: it reads vertexDescendants, client and
    // preemptionMaxWaitTime from the enclosing scheduler.
    // NOTE(review): many statements in this class are truncated in this chunk
    // (counter updates, loop bodies, some call arguments). Comments describe
    // only what the visible lines establish.
    private class RequestTracker {
        // Pending requests keyed by task.
        private final Map<Object, TaskRequest> requests = new HashMap<>();
        /** request map ordered by priority with highest priority first */
        private final NavigableMap<Priority, RequestPriorityStats> priorityStats = new TreeMap<>(
        private Priority highestPriority = null;
        // Time at which the current highest priority became the highest; 0 when
        // there is none.
        private long highestPriorityWaitTimestamp = 0;

        // Registers a request, creating the per-priority stats on first use, and
        // returns the request previously registered for the same task (or null).
        TaskRequest add(TaskRequest request) {
            TaskRequest oldRequest = requests.put(request.getTask(), request);
            Priority priority = request.getPriority();
            RequestPriorityStats stats = priorityStats.get(priority);
            if (stats == null) {
                stats = addStatsForPriority(priority);
            if (request.hasLocality()) {
            incrVertexTaskCount(priority, stats, request.getVertexIndex());

            if (oldRequest != null) {
            return oldRequest;

        // Removes and returns the request registered for the task, or null.
        TaskRequest remove(Object task) {
            TaskRequest request = requests.remove(task);
            if (request != null) {
                return request;
            return null;

        // Creates the stats entry for a priority that had none. When no entry
        // precedes it in the map, this priority becomes the highest, its wait
        // timestamp is started and all vertices are initially allowed.
        private RequestPriorityStats addStatsForPriority(Priority priority) {
            BitSet allowedVerts = new BitSet(vertexDescendants.size());
            Entry<Priority, RequestPriorityStats> lowerEntry = priorityStats.lowerEntry(priority);
            if (lowerEntry != null) {
                // initialize the allowed vertices BitSet using the information derived
                // from the next higher priority entry
                RequestPriorityStats priorStats = lowerEntry.getValue();
            } else {
                // no higher priority entry so this priority is currently the highest
                highestPriority = priority;
                highestPriorityWaitTimestamp = now();
                allowedVerts.set(0, vertexDescendants.size());
            RequestPriorityStats stats = new RequestPriorityStats(vertexDescendants.size(), allowedVerts);
            priorityStats.put(priority, stats);
            return stats;

        // Updates the per-priority stats after a request is removed and, when
        // that priority has no requests left and was the highest, re-derives the
        // highest priority and restarts (or clears) its wait timestamp.
        private void updateStatsForRemoval(TaskRequest request) {
            Priority priority = request.getPriority();
            RequestPriorityStats stats = priorityStats.get(priority);
            decrVertexTaskCount(priority, stats, request.getVertexIndex());
            if (request.hasLocality()) {
            if (stats.requestCount == 0) {
                if (highestPriority.equals(priority)) {
                    if (priorityStats.isEmpty()) {
                        highestPriority = null;
                        highestPriorityWaitTimestamp = 0;
                    } else {
                        highestPriority = priorityStats.firstKey();
                        highestPriorityWaitTimestamp = now();

        boolean isEmpty() {
            return requests.isEmpty();

        int getNumRequests() {
            return requests.size();

        // Returns a fresh list holding the tasks of all pending requests.
        List<Object> getTasks() {
            return new ArrayList<>(requests.keySet());

        // Returns the entry-set view of the per-priority stats map.
        Collection<Entry<Priority, RequestPriorityStats>> getStatsEntries() {
            return priorityStats.entrySet();

        // First key of the stats map, or null when no priority has stats.
        Priority getHighestPriority() {
            if (priorityStats.isEmpty()) {
                return null;
            return priorityStats.firstKey();

        long getHighestPriorityWaitTimestamp() {
            return highestPriorityWaitTimestamp;

        // A request is blocked when, in the stats entry found by floorEntry for
        // its priority, its vertex is not allowed or is a descendant of a vertex
        // with pending requests there. Not blocked when no such entry exists.
        boolean isRequestBlocked(TaskRequest request) {
            Entry<Priority, RequestPriorityStats> entry = priorityStats.floorEntry(request.getPriority());
            if (entry != null) {
                RequestPriorityStats stats = entry.getValue();
                int vertexIndex = request.getVertexIndex();
                return !stats.allowedVertices.get(vertexIndex) || stats.descendants.get(vertexIndex);
            return false;

        // Counts one more pending task for the vertex at this priority; the
        // first task of a vertex goes through addVertexToRequestStats.
        private void incrVertexTaskCount(Priority priority, RequestPriorityStats stats, int vertexIndex) {
            Integer vertexIndexInt = vertexIndex;
            MutableInt taskCount = stats.vertexTaskCount.get(vertexIndexInt);
            if (taskCount != null) {
            } else {
                addVertexToRequestStats(priority, stats, vertexIndexInt);

        // Removes the vertex from the stats once its task count is not positive.
        // NOTE(review): taskCount is dereferenced without a null check and no
        // decrement is visible in this chunk -- confirm against the full file.
        private void decrVertexTaskCount(Priority priority, RequestPriorityStats stats, int vertexIndex) {
            Integer vertexIndexInt = vertexIndex;
            MutableInt taskCount = stats.vertexTaskCount.get(vertexIndexInt);
            if (taskCount.intValue() <= 0) {
                removeVertexFromRequestStats(priority, stats, vertexIndexInt);

         * Add a new vertex to a RequestPriorityStats.
         * Adding a vertex to the request stats requires updating the stats descendants bitmask to include the descendants
         * of the new vertex and also updating the allowedVertices bitmask for all lower priority requests to prevent any
         * task request from a descendant vertex in the DAG from being allocated. This avoids assigning allocations to
         * lower priority requests when a higher priority request of an ancestor is still pending, but it allows lower
         * priority requests to be satisfied if higher priority requests are not ancestors. This is particularly useful
         * for DAGs that have independent trees of vertices or significant, parallel branches within a tree.
         * Requests are blocked by taking the specified vertex's full descendant vertex bitmask in vertexDescendants and
         * clearing those bits for all lower priority requests. For the following example DAG where each vertex index
         * corresponds to its letter position (i.e.: A=0, B=1, C=2, etc.)
         *       A
         *       |
         *   C---B----E
         *   |        |
         *   D        F
         *            |
         *          G---H
         * Vertices F, G, and H are descendants of E but all other vertices are not. The vertexDescendants bitmask for
         * vertex E is therefore 11100000b or 0xE0. When the first vertex E task request arrives we need to disallow
         * requests for all descendants of E. That is accomplished by iterating through the request stats for all lower
         * priority requests and clearing the allowedVertex bits corresponding to the descendants,
         * i.e: allowedVertices = allowedVertices & ~descendants
        private void addVertexToRequestStats(Priority priority, RequestPriorityStats stats,
                Integer vertexIndexInt) {
            // Creating a new vertex entry for this priority, so the allowed vertices for all
            // lower priorities need to be updated based on the descendants of the new vertex.
            stats.vertexTaskCount.put(vertexIndexInt, new MutableInt(1));
            int vertexIndex = vertexIndexInt;
            BitSet d = vertexDescendants.get(vertexIndex);
            for (RequestPriorityStats lowerStat : priorityStats.tailMap(priority, false).values()) {

         * Removes a vertex from a RequestPriorityStats.
         * Removing a vertex is more expensive than adding a vertex. The stats contain bitmasks which only store on/off
         * values rather than reference counts. Therefore we must rebuild the descendants bitmasks from the remaining
         * vertices in the request stats. Once the new descendants mask is computed we then need to rebuild the
         * allowedVertices BitSet for all lower priority request stats in case the removal of this vertex unblocks lower
         * priority requests of a descendant vertex.
         * Rebuilding allowedVertices for the lower priorities involves starting with the allowedVertices mask at the
         * current priority then masking off the descendants at each priority level encountered, accumulating the results.
         * Any descendants of a level will be blocked at all lower levels. See the addVertexToRequestStats documentation
         * for details on how vertices map to the descendants and allowedVertices bit masks.
        private void removeVertexFromRequestStats(Priority priority, RequestPriorityStats stats,
                Integer vertexIndexInt) {
            int vertexIndex = vertexIndexInt;

            // Rebuild the descendants BitSet for the remaining vertices at this priority.
            for (Integer vIndex : stats.vertexTaskCount.keySet()) {

            // The allowedVertices for all lower priorities need to be recalculated where the vertex descendants at each
            // level are removed from the list of allowed vertices at all subsequent levels.
            Collection<RequestPriorityStats> tailStats = priorityStats.tailMap(priority, false).values();
            if (!tailStats.isEmpty()) {
                BitSet cumulativeAllowed = new BitSet(vertexDescendants.size());
                for (RequestPriorityStats s : tailStats) {

        // True when a highest priority request has been waiting longer than
        // preemptionMaxWaitTime.
        boolean isPreemptionDeadlineExpired() {
            return highestPriorityWaitTimestamp != 0
                    && now() - highestPriorityWaitTimestamp > preemptionMaxWaitTime;

        // Asks the client for requests at the highest priority matching ANY
        // location; trivially true when nothing is pending.
        // NOTE(review): the capability argument of the call is truncated in this
        // chunk, and the declaration uses the raw Collection type.
        boolean fitsHighestPriorityRequest(Resource freeResources) {
            if (priorityStats.isEmpty()) {
                return true;
            Priority priority = priorityStats.firstKey();
            List<? extends Collection> requestsList = client.getMatchingRequests(priority, ResourceRequest.ANY,
            return !requestsList.isEmpty();

        // Sums the capabilities of the first ceil(count * percentage / 100)
        // requests at the highest priority, where count is the number of
        // requests the client returns for that priority with no capability
        // bound. Returns an empty resource when nothing is pending.
        Resource getAmountToPreempt(int preemptionPercentage) {
            if (priorityStats.isEmpty()) {
                return Resources.none();
            Priority priority = priorityStats.firstKey();
            List<? extends Collection<TaskRequest>> requestsList = client.getMatchingRequests(priority,
                    ResourceRequest.ANY, Resources.unbounded());
            int numRequests = 0;
            for (Collection<TaskRequest> requests : requestsList) {
                numRequests += requests.size();
            numRequests = (int) Math.ceil(numRequests * (preemptionPercentage / 100.f));
            Resource toPreempt = Resource.newInstance(0, 0);
            if (numRequests != 0) {
                // Labeled break leaves both loops as soon as enough requests
                // have been counted.
                outer_loop: for (Collection<TaskRequest> requests : requestsList) {
                    for (TaskRequest request : requests) {
                        Resources.addTo(toPreempt, request.getCapability());
                        if (--numRequests == 0) {
                            break outer_loop;
            return toPreempt;

        // Create a new BitSet that represents all of the vertices that should not be
        // scheduled due to outstanding requests from higher priority predecessor vertices.
        // NOTE(review): as visible here the flip runs over blocked.length() of a
        // freshly created BitSet and 'stats' is unused; statements appear to be
        // missing from this chunk -- confirm against the full file.
        BitSet createVertexBlockedSet() {
            BitSet blocked = new BitSet();
            Entry<Priority, RequestPriorityStats> entry = priorityStats.lastEntry();
            if (entry != null) {
                RequestPriorityStats stats = entry.getValue();
                blocked.flip(0, blocked.length());
            return blocked;

    /** Tracks statistics on vertices that are requesting tasks at a particular priority. */
    private static class RequestPriorityStats {
        /** Map from vertex ID to number of task requests for that vertex */
        final Map<Integer, MutableInt> vertexTaskCount = new HashMap<>();
        /** BitSet of vertices that have outstanding requests at this priority */
        final BitSet vertices;
        /** BitSet of vertices that are descendants of this vertex */
        final BitSet descendants;
         * BitSet of vertices that are allowed to be scheduled at this priority
         * (i.e.: no oustanding predecessors requesting at higher priorities)
        final BitSet allowedVertices;
        // Number of requests at this priority; read when deciding whether the
        // priority has become empty.
        int requestCount = 0;
        // NOTE(review): presumably the number of requests with locality at this
        // priority -- no update is visible in this chunk; confirm.
        int localityCount = 0;

        // Starts with empty vertices/descendants sets sized for the DAG and
        // keeps a reference to the supplied allowedVertices BitSet (no copy).
        RequestPriorityStats(int numTotalVertices, BitSet allowedVertices) {
            this.vertices = new BitSet(numTotalVertices);
            this.descendants = new BitSet(numTotalVertices);
            this.allowedVertices = allowedVertices;

     * Tracks idle containers and facilitates faster matching of task requests
     * against those containers given a desired location.
    private static class IdleContainerTracker {
         * Map of location ID (e.g.: a specific host, rack, or ANY) to set of
         * idle containers matching that location
        final Map<String, Set<HeldContainer>> containersByLocation = new HashMap<>();
        int numContainers = 0;

        void add(HeldContainer hc) {
            add(hc, hc.getHost());
            add(hc, hc.getRack());
            add(hc, ResourceRequest.ANY);

        void remove(HeldContainer hc) {
            remove(hc, hc.getHost());
            remove(hc, hc.getRack());
            remove(hc, ResourceRequest.ANY);

        int getNumContainers() {
            return numContainers;

        private void add(HeldContainer hc, String location) {
            Set<HeldContainer> containers = containersByLocation.get(location);
            if (containers == null) {
                containers = new HashSet<>();
                containersByLocation.put(location, containers);

        private void remove(HeldContainer hc, String location) {
            Set<HeldContainer> containers = containersByLocation.get(location);
            if (containers != null) {
                if (containers.remove(hc) && containers.isEmpty()) {

        Set<HeldContainer> getByLocation(String location) {
            return containersByLocation.get(location);

    private interface ResourceCalculator {
        boolean anyAvailable(Resource rsrc);

        void deductFrom(Resource total, Resource toSubtract);

    /**
     * ResourceCalculator for memory-only allocation
     */
    private static class MemResourceCalculator implements ResourceCalculator {

        public boolean anyAvailable(Resource rsrc) {
            return rsrc.getMemory() > 0;

        public void deductFrom(Resource total, Resource toSubtract) {
            total.setMemory(total.getMemory() - toSubtract.getMemory());

    /**
     * ResourceCalculator for memory and vcore allocation
     */
    private static class MemCpuResourceCalculator extends MemResourceCalculator {

        public boolean anyAvailable(Resource rsrc) {
            return super.anyAvailable(rsrc) || rsrc.getVirtualCores() > 0;

        public void deductFrom(Resource total, Resource toSubtract) {
            super.deductFrom(total, toSubtract);
            total.setVirtualCores(total.getVirtualCores() - toSubtract.getVirtualCores());

    /**
     * Scheduled thread pool executor that logs any errors that escape the worker thread.
     * This can be replaced with HadoopThreadPoolExecutor once Tez requires Hadoop 2.8 or later.
     */
    static class ReuseContainerExecutor extends ScheduledThreadPoolExecutor {
        ReuseContainerExecutor() {
            super(1, new ThreadFactoryBuilder().setNameFormat("ReuseContainerExecutor #%d").build());

        protected void afterExecute(Runnable r, Throwable t) {
            super.afterExecute(r, t);

            if (t == null && r instanceof Future<?>) {
                try {
                    ((Future<?>) r).get();
                } catch (ExecutionException ee) {
                    LOG.warn("Execution exception when running task in {}", Thread.currentThread().getName());
                    t = ee.getCause();
                } catch (InterruptedException ie) {
                    LOG.warn("Thread ({}) interrupted: ", Thread.currentThread(), ie);
                } catch (Throwable throwable) {
                    t = throwable;

            if (t != null) {
                LOG.warn("Caught exception in thread {}", Thread.currentThread().getName(), t);