Java tutorial: the Apache Hadoop YARN FairScheduler (FairScheduler.java)
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate; import org.apache.hadoop.classification.InterfaceStability.Unstable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.AccessControlList; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.NMToken; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.QueueACL; import org.apache.hadoop.yarn.api.records.QueueInfo; import org.apache.hadoop.yarn.api.records.QueueUserACLInfo; import org.apache.hadoop.yarn.api.records.ReservationId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.ResourceOption; import org.apache.hadoop.yarn.api.records.ResourceRequest; import org.apache.hadoop.yarn.api.records.SchedulingRequest; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.SchedulerInvalidResoureRequestException; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes; import org.apache.hadoop.yarn.security.AccessType; import org.apache.hadoop.yarn.security.Permission; import org.apache.hadoop.yarn.security.PrivilegedEntity; import org.apache.hadoop.yarn.security.PrivilegedEntity.EntityType; import org.apache.hadoop.yarn.security.YarnAuthorizationProvider; import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMCriticalThreadUncaughtExceptionHandler; import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState; import org.apache.hadoop.yarn.server.resourcemanager.reservation.ReservationConstants; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppState; import 
org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState; import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerUpdates; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils.MaxResourceValidationResult; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.QueueEntitlement; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppAttemptRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.AppRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ContainerExpiredSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeAddedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeResourceUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ReleaseContainerEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent; import org.apache.hadoop.yarn.server.resourcemanager.security.RMContainerTokenSecretManager; import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator; import org.apache.hadoop.yarn.util.resource.DominantResourceCalculator; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.ResourceUtils; import org.apache.hadoop.yarn.util.resource.Resources; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.EnumSet; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; /** * A scheduler that schedules resources between a set of queues. The scheduler * keeps track of the resources used by each queue, and attempts to maintain * fairness by scheduling tasks at queues whose allocations are farthest below * an ideal fair distribution. 
 *
 * The fair scheduler supports hierarchical queues. All queues descend from a
 * queue named "root". Available resources are distributed among the children
 * of the root queue in the typical fair scheduling fashion. Then, the children
 * distribute the resources assigned to them to their children in the same
 * fashion. Applications may only be scheduled on leaf queues. Queues can be
 * specified as children of other queues by placing them as sub-elements of
 * their parents in the fair scheduler configuration file.
 *
 * A queue's name starts with the names of its parents, with periods as
 * separators. So a queue named "queue1" under the root queue would be
 * referred to as "root.queue1", and a queue named "queue2" under a queue
 * named "parent1" would be referred to as "root.parent1.queue2".
 */
@LimitedPrivate("yarn")
@Unstable
@SuppressWarnings("unchecked")
public class FairScheduler extends
    AbstractYarnScheduler<FSAppAttempt, FSSchedulerNode> {
  private FairSchedulerConfiguration conf;

  private FSContext context;
  private YarnAuthorizationProvider authorizer;
  private Resource incrAllocation;
  private QueueManager queueMgr;
  private boolean usePortForNodeName;
  private static final Log LOG = LogFactory.getLog(FairScheduler.class);
  private static final Log STATE_DUMP_LOG =
      LogFactory.getLog(FairScheduler.class.getName() + ".statedump");

  private static final ResourceCalculator RESOURCE_CALCULATOR =
      new DefaultResourceCalculator();
  private static final ResourceCalculator DOMINANT_RESOURCE_CALCULATOR =
      new DominantResourceCalculator();

  // Value that container assignment methods return when a container is
  // reserved
  public static final Resource CONTAINER_RESERVED =
      Resources.createResource(-1);

  private final int UPDATE_DEBUG_FREQUENCY = 25;
  private int updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY;

  @Deprecated
  @VisibleForTesting
  Thread schedulingThread;

  Thread preemptionThread;

  // Aggregate metrics
  FSQueueMetrics rootMetrics;
  FSOpDurations fsOpDurations;

  private float reservableNodesRatio; // percentage of available nodes
                                      // an app can be reserved on

  protected boolean sizeBasedWeight; // Give larger weights to larger jobs

  // Continuous Scheduling enabled or not
  @Deprecated
  protected boolean continuousSchedulingEnabled;

  // Sleep time for each pass in continuous scheduling
  @Deprecated
  protected volatile int continuousSchedulingSleepMs;

  // Node available resource comparator
  private Comparator<FSSchedulerNode> nodeAvailableResourceComparator =
      new NodeAvailableResourceComparator();

  protected double nodeLocalityThreshold; // Cluster threshold for node locality
  protected double rackLocalityThreshold; // Cluster threshold for rack locality

  @Deprecated
  protected long nodeLocalityDelayMs; // Delay for node locality
  @Deprecated
  protected long rackLocalityDelayMs; // Delay for rack locality

  protected boolean assignMultiple; // Allocate multiple containers per
                                    // heartbeat

  @VisibleForTesting
  boolean maxAssignDynamic;
  protected int maxAssign; // Max containers to assign per heartbeat

  @VisibleForTesting
  final MaxRunningAppsEnforcer maxRunningEnforcer;

  private AllocationFileLoaderService allocsLoader;
  @VisibleForTesting
  volatile AllocationConfiguration allocConf;

  // Container size threshold for making a reservation.
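  // The threshold is derived from the increment allocation (see
  // updateReservationThreshold() below); isAtLeastReservationThreshold()
  // checks whether a given resource is at least this threshold.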
@VisibleForTesting Resource reservationThreshold; public FairScheduler() { super(FairScheduler.class.getName()); context = new FSContext(this); allocsLoader = new AllocationFileLoaderService(); queueMgr = new QueueManager(this); maxRunningEnforcer = new MaxRunningAppsEnforcer(this); } public FSContext getContext() { return context; } public boolean isAtLeastReservationThreshold(ResourceCalculator resourceCalculator, Resource resource) { return Resources.greaterThanOrEqual(resourceCalculator, getClusterResource(), resource, reservationThreshold); } private void validateConf(FairSchedulerConfiguration config) { // validate scheduler memory allocation setting int minMem = config.getInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); int maxMem = config.getInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_MB); if (minMem < 0 || minMem > maxMem) { throw new YarnRuntimeException("Invalid resource scheduler memory" + " allocation configuration: " + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB + "=" + minMem + ", " + YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB + "=" + maxMem + ". Both values must be greater than or equal to 0" + "and the maximum allocation value must be greater than or equal to" + "the minimum allocation value."); } long incrementMem = config.getIncrementAllocation().getMemorySize(); if (incrementMem <= 0) { throw new YarnRuntimeException("Invalid resource scheduler memory" + " allocation configuration: " + FairSchedulerConfiguration.RM_SCHEDULER_INCREMENT_ALLOCATION_MB + "=" + incrementMem + ". Values must be greater than 0."); } // validate scheduler vcores allocation setting int minVcores = config.getInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, YarnConfiguration.DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); int maxVcores = config.getInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES, YarnConfiguration.DEFAULT_RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES); if (minVcores < 0 || minVcores > maxVcores) { throw new YarnRuntimeException("Invalid resource scheduler vcores" + " allocation configuration: " + YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES + "=" + minVcores + ", " + YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_VCORES + "=" + maxVcores + ". Both values must be greater than or equal to 0" + "and the maximum allocation value must be greater than or equal to" + "the minimum allocation value."); } int incrementVcore = config.getIncrementAllocation().getVirtualCores(); if (incrementVcore <= 0) { throw new YarnRuntimeException("Invalid resource scheduler vcores" + " allocation configuration: " + FairSchedulerConfiguration.RM_SCHEDULER_INCREMENT_ALLOCATION_VCORES + "=" + incrementVcore + ". Values must be greater than 0."); } } public FairSchedulerConfiguration getConf() { return conf; } public int getNumNodesInRack(String rackName) { return nodeTracker.nodeCount(rackName); } public QueueManager getQueueManager() { return queueMgr; } /** * Thread which attempts scheduling resources continuously, * asynchronous to the node heartbeats. */ @Deprecated private class ContinuousSchedulingThread extends Thread { @Override public void run() { while (!Thread.currentThread().isInterrupted()) { try { continuousSchedulingAttempt(); Thread.sleep(getContinuousSchedulingSleepMs()); } catch (InterruptedException e) { LOG.warn("Continuous scheduling thread interrupted. 
Exiting.", e); return; } } } } /** * Dump scheduler state including states of all queues. */ private void dumpSchedulerState() { FSQueue rootQueue = queueMgr.getRootQueue(); Resource clusterResource = getClusterResource(); STATE_DUMP_LOG.debug("FairScheduler state: Cluster Capacity: " + clusterResource + " Allocations: " + rootMetrics.getAllocatedResources() + " Availability: " + Resource.newInstance(rootMetrics.getAvailableMB(), rootMetrics.getAvailableVirtualCores()) + " Demand: " + rootQueue.getDemand()); STATE_DUMP_LOG.debug(rootQueue.dumpState()); } /** * Recompute the internal variables used by the scheduler - per-job weights, * fair shares, deficits, minimum slot allocations, and amount of used and * required resources per job. */ @VisibleForTesting @Override public void update() { // Storing start time for fsOpDurations long start = getClock().getTime(); FSQueue rootQueue = queueMgr.getRootQueue(); // Update demands and fairshares writeLock.lock(); try { // Recursively update demands for all queues rootQueue.updateDemand(); rootQueue.update(getClusterResource()); // Update metrics updateRootQueueMetrics(); } finally { writeLock.unlock(); } readLock.lock(); try { // Update starvation stats and identify starved applications if (shouldAttemptPreemption()) { for (FSLeafQueue queue : queueMgr.getLeafQueues()) { queue.updateStarvedApps(); } } // Log debug information if (STATE_DUMP_LOG.isDebugEnabled()) { if (--updatesToSkipForDebug < 0) { updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY; dumpSchedulerState(); } } } finally { readLock.unlock(); } fsOpDurations.addUpdateThreadRunDuration(getClock().getTime() - start); } public RMContainerTokenSecretManager getContainerTokenSecretManager() { return rmContext.getContainerTokenSecretManager(); } public boolean isSizeBasedWeight() { return sizeBasedWeight; } public Resource getIncrementResourceCapability() { return incrAllocation; } private FSSchedulerNode getFSSchedulerNode(NodeId nodeId) { return nodeTracker.getNode(nodeId); } public double getNodeLocalityThreshold() { return nodeLocalityThreshold; } public double getRackLocalityThreshold() { return rackLocalityThreshold; } /** * Delay in milliseconds for locality fallback node to rack. * @deprecated linked to {@link #isContinuousSchedulingEnabled} deprecation * @return delay in ms */ @Deprecated public long getNodeLocalityDelayMs() { return nodeLocalityDelayMs; } /** * Delay in milliseconds for locality fallback rack to other. * @deprecated linked to {@link #isContinuousSchedulingEnabled} deprecation * @return delay in ms */ @Deprecated public long getRackLocalityDelayMs() { return rackLocalityDelayMs; } /** * Whether continuous scheduling is turned on. * @deprecated Continuous scheduling should not be turned ON. It is * deprecated because it can cause scheduler slowness due to locking issues. * Schedulers should use assignmultiple as a replacement. * @return whether continuous scheduling is enabled */ @Deprecated public boolean isContinuousSchedulingEnabled() { return continuousSchedulingEnabled; } /** * The sleep time of the continuous scheduler thread. * @deprecated linked to {@link #isContinuousSchedulingEnabled} deprecation * @return sleep time in ms */ @Deprecated public int getContinuousSchedulingSleepMs() { return continuousSchedulingSleepMs; } /** * Add a new application to the scheduler, with a given id, queue name, and * user. This will accept a new app even if the user or queue is above * configured limits, but the app will not be marked as runnable. 
*/ protected void addApplication(ApplicationId applicationId, String queueName, String user, boolean isAppRecovering) { if (queueName == null || queueName.isEmpty()) { String message = "Reject application " + applicationId + " submitted by user " + user + " with an empty queue name."; rejectApplicationWithMessage(applicationId, message); return; } if (queueName.startsWith(".") || queueName.endsWith(".")) { String message = "Reject application " + applicationId + " submitted by user " + user + " with an illegal queue name " + queueName + ". " + "The queue name cannot start/end with period."; rejectApplicationWithMessage(applicationId, message); return; } writeLock.lock(); try { RMApp rmApp = rmContext.getRMApps().get(applicationId); FSLeafQueue queue = assignToQueue(rmApp, queueName, user, applicationId); if (queue == null) { return; } if (rmApp != null && rmApp.getAMResourceRequests() != null) { // Resources.fitsIn would always return false when queueMaxShare is 0 // for any resource, but only using Resources.fitsIn is not enough // is it would return false for such cases when the requested // resource is smaller than the max resource but that max resource is // not zero, e.g. requested vCores = 2, max vCores = 1. // With this check, we only reject those applications where resource // requested is greater than 0 and we have 0 // of that resource on the queue. List<MaxResourceValidationResult> invalidAMResourceRequests = validateResourceRequests( rmApp.getAMResourceRequests(), queue); if (!invalidAMResourceRequests.isEmpty()) { String msg = String.format("Cannot submit application %s to queue %s because " + "it has zero amount of resource for a requested " + "resource! Invalid requested AM resources: %s, " + "maximum queue resources: %s", applicationId, queue.getName(), invalidAMResourceRequests, queue.getMaxShare()); rejectApplicationWithMessage(applicationId, msg); queue.removeAssignedApp(applicationId); return; } } // Enforce ACLs UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(user); if (!queue.hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi) && !queue.hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) { String msg = "User " + userUgi.getUserName() + " cannot submit applications to queue " + queue.getName() + "(requested queuename is " + queueName + ")"; rejectApplicationWithMessage(applicationId, msg); queue.removeAssignedApp(applicationId); return; } SchedulerApplication<FSAppAttempt> application = new SchedulerApplication<FSAppAttempt>(queue, user); applications.put(applicationId, application); queue.getMetrics().submitApp(user); LOG.info("Accepted application " + applicationId + " from user: " + user + ", in queue: " + queue.getName() + ", currently num of applications: " + applications.size()); if (isAppRecovering) { if (LOG.isDebugEnabled()) { LOG.debug(applicationId + " is recovering. Skip notifying APP_ACCEPTED"); } } else { // During tests we do not always have an application object, handle // it here but we probably should fix the tests if (rmApp != null && rmApp.getApplicationSubmissionContext() != null) { // Before we send out the event that the app is accepted is // to set the queue in the submissionContext (needed on restore etc) rmApp.getApplicationSubmissionContext().setQueue(queue.getName()); } rmContext.getDispatcher().getEventHandler() .handle(new RMAppEvent(applicationId, RMAppEventType.APP_ACCEPTED)); } } finally { writeLock.unlock(); } } /** * Add a new application attempt to the scheduler. 
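 * The new attempt reuses the application's user and queue, optionally
 * transfers state from the previous attempt, and is added to the queue as
 * runnable or non-runnable based on the MaxRunningAppsEnforcer decision.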
*/ protected void addApplicationAttempt(ApplicationAttemptId applicationAttemptId, boolean transferStateFromPreviousAttempt, boolean isAttemptRecovering) { writeLock.lock(); try { SchedulerApplication<FSAppAttempt> application = applications .get(applicationAttemptId.getApplicationId()); String user = application.getUser(); FSLeafQueue queue = (FSLeafQueue) application.getQueue(); FSAppAttempt attempt = new FSAppAttempt(this, applicationAttemptId, user, queue, new ActiveUsersManager(getRootQueueMetrics()), rmContext); if (transferStateFromPreviousAttempt) { attempt.transferStateFromPreviousAttempt(application.getCurrentAppAttempt()); } application.setCurrentAppAttempt(attempt); boolean runnable = maxRunningEnforcer.canAppBeRunnable(queue, attempt); queue.addApp(attempt, runnable); if (runnable) { maxRunningEnforcer.trackRunnableApp(attempt); } else { maxRunningEnforcer.trackNonRunnableApp(attempt); } queue.getMetrics().submitAppAttempt(user); LOG.info("Added Application Attempt " + applicationAttemptId + " to scheduler from user: " + user); if (isAttemptRecovering) { if (LOG.isDebugEnabled()) { LOG.debug(applicationAttemptId + " is recovering. Skipping notifying ATTEMPT_ADDED"); } } else { rmContext.getDispatcher().getEventHandler() .handle(new RMAppAttemptEvent(applicationAttemptId, RMAppAttemptEventType.ATTEMPT_ADDED)); } } finally { writeLock.unlock(); } } /** * Helper method for the tests to assign the app to a queue. */ @VisibleForTesting FSLeafQueue assignToQueue(RMApp rmApp, String queueName, String user) { return assignToQueue(rmApp, queueName, user, null); } /** * Helper method that attempts to assign the app to a queue. The method is * responsible to call the appropriate event-handler if the app is rejected. */ private FSLeafQueue assignToQueue(RMApp rmApp, String queueName, String user, ApplicationId applicationId) { FSLeafQueue queue = null; String appRejectMsg = null; try { QueuePlacementPolicy placementPolicy = allocConf.getPlacementPolicy(); queueName = placementPolicy.assignAppToQueue(queueName, user); if (queueName == null) { appRejectMsg = "Application rejected by queue placement policy"; } else { queue = queueMgr.getLeafQueue(queueName, true, applicationId); if (queue == null) { appRejectMsg = queueName + " is not a leaf queue"; } } } catch (IllegalStateException se) { appRejectMsg = "Unable to match app " + rmApp.getApplicationId() + " to a queue placement policy, and no valid terminal queue " + " placement rule is configured. 
Please contact an administrator " + " to confirm that the fair scheduler configuration contains a " + " valid terminal queue placement rule."; } catch (InvalidQueueNameException qne) { appRejectMsg = qne.getMessage(); } catch (IOException ioe) { // IOException should only happen for a user without groups appRejectMsg = "Error assigning app to a queue: " + ioe.getMessage(); } if (appRejectMsg != null && rmApp != null) { rejectApplicationWithMessage(rmApp.getApplicationId(), appRejectMsg); return null; } if (rmApp != null) { rmApp.setQueue(queue.getName()); } else { LOG.error("Couldn't find RM app to set queue name on"); } return queue; } private void removeApplication(ApplicationId applicationId, RMAppState finalState) { SchedulerApplication<FSAppAttempt> application = applications.remove(applicationId); if (application == null) { LOG.warn("Couldn't find application " + applicationId); } else { application.stop(finalState); } } private void removeApplicationAttempt(ApplicationAttemptId applicationAttemptId, RMAppAttemptState rmAppAttemptFinalState, boolean keepContainers) { writeLock.lock(); try { LOG.info("Application " + applicationAttemptId + " is done. finalState=" + rmAppAttemptFinalState); FSAppAttempt attempt = getApplicationAttempt(applicationAttemptId); if (attempt == null) { LOG.info("Unknown application " + applicationAttemptId + " has completed!"); return; } // Check if the attempt is already stopped and don't stop it twice. if (attempt.isStopped()) { LOG.info("Application " + applicationAttemptId + " has already been " + "stopped!"); return; } // Release all the running containers for (RMContainer rmContainer : attempt.getLiveContainers()) { if (keepContainers && rmContainer.getState().equals(RMContainerState.RUNNING)) { // do not kill the running container in the case of work-preserving AM // restart. LOG.info("Skip killing " + rmContainer.getContainerId()); continue; } super.completedContainer(rmContainer, SchedulerUtils.createAbnormalContainerStatus(rmContainer.getContainerId(), SchedulerUtils.COMPLETED_APPLICATION), RMContainerEventType.KILL); } // Release all reserved containers for (RMContainer rmContainer : attempt.getReservedContainers()) { super.completedContainer(rmContainer, SchedulerUtils.createAbnormalContainerStatus( rmContainer.getContainerId(), "Application Complete"), RMContainerEventType.KILL); } // Clean up pending requests, metrics etc. attempt.stop(rmAppAttemptFinalState); // Inform the queue FSLeafQueue queue = queueMgr.getLeafQueue(attempt.getQueue().getQueueName(), false); boolean wasRunnable = queue.removeApp(attempt); if (wasRunnable) { maxRunningEnforcer.untrackRunnableApp(attempt); maxRunningEnforcer.updateRunnabilityOnAppRemoval(attempt, attempt.getQueue()); } else { maxRunningEnforcer.untrackNonRunnableApp(attempt); } } finally { writeLock.unlock(); } } /** * Clean up a completed container. 
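 * A container in the RESERVED state is unreserved from its node; a live
 * container is marked completed on the application, released from the node,
 * and the root queue metrics are refreshed. If the node was removed by
 * another thread in the meantime, the node-side steps are skipped.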
*/ @Override protected void completedContainerInternal(RMContainer rmContainer, ContainerStatus containerStatus, RMContainerEventType event) { writeLock.lock(); try { Container container = rmContainer.getContainer(); // Get the application for the finished container FSAppAttempt application = getCurrentAttemptForContainer(container.getId()); ApplicationId appId = container.getId().getApplicationAttemptId().getApplicationId(); if (application == null) { LOG.info("Container " + container + " of finished application " + appId + " completed with event " + event); return; } // Get the node on which the container was allocated NodeId nodeID = container.getNodeId(); FSSchedulerNode node = getFSSchedulerNode(nodeID); // node could be null if the thread was waiting for the lock and the node // was removed in another thread if (rmContainer.getState() == RMContainerState.RESERVED) { if (node != null) { application.unreserve(rmContainer.getReservedSchedulerKey(), node); } else if (LOG.isDebugEnabled()) { LOG.debug("Skipping unreserve on removed node: " + nodeID); } } else { application.containerCompleted(rmContainer, containerStatus, event); if (node != null) { node.releaseContainer(rmContainer.getContainerId(), false); } else if (LOG.isDebugEnabled()) { LOG.debug("Skipping container release on removed node: " + nodeID); } updateRootQueueMetrics(); } if (LOG.isDebugEnabled()) { LOG.debug("Application attempt " + application.getApplicationAttemptId() + " released container " + container.getId() + " on node: " + (node == null ? nodeID : node) + " with event: " + event); } } finally { writeLock.unlock(); } } private void addNode(List<NMContainerStatus> containerReports, RMNode node) { writeLock.lock(); try { FSSchedulerNode schedulerNode = new FSSchedulerNode(node, usePortForNodeName); nodeTracker.addNode(schedulerNode); triggerUpdate(); Resource clusterResource = getClusterResource(); queueMgr.getRootQueue().setSteadyFairShare(clusterResource); queueMgr.getRootQueue().recomputeSteadyShares(); LOG.info("Added node " + node.getNodeAddress() + " cluster capacity: " + clusterResource); recoverContainersOnNode(containerReports, node); updateRootQueueMetrics(); } finally { writeLock.unlock(); } } private void removeNode(RMNode rmNode) { writeLock.lock(); try { NodeId nodeId = rmNode.getNodeID(); FSSchedulerNode node = nodeTracker.getNode(nodeId); if (node == null) { LOG.error("Attempting to remove non-existent node " + nodeId); return; } // Remove running containers List<RMContainer> runningContainers = node.getCopiedListOfRunningContainers(); for (RMContainer container : runningContainers) { super.completedContainer(container, SchedulerUtils.createAbnormalContainerStatus( container.getContainerId(), SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); } // Remove reservations, if any RMContainer reservedContainer = node.getReservedContainer(); if (reservedContainer != null) { super.completedContainer(reservedContainer, SchedulerUtils.createAbnormalContainerStatus(reservedContainer.getContainerId(), SchedulerUtils.LOST_CONTAINER), RMContainerEventType.KILL); } nodeTracker.removeNode(nodeId); Resource clusterResource = getClusterResource(); queueMgr.getRootQueue().setSteadyFairShare(clusterResource); queueMgr.getRootQueue().recomputeSteadyShares(); updateRootQueueMetrics(); triggerUpdate(); LOG.info("Removed node " + rmNode.getNodeAddress() + " cluster capacity: " + clusterResource); } finally { writeLock.unlock(); } } @Override public Resource getNormalizedResource(Resource requestedResource, Resource 
maxResourceCapability) { return SchedulerUtils.getNormalizedResource(requestedResource, DOMINANT_RESOURCE_CALCULATOR, minimumAllocation, maxResourceCapability, incrAllocation); } @Override public Resource getMaximumResourceCapability(String queueName) { if (queueName == null || queueName.isEmpty()) { return getMaximumResourceCapability(); } FSQueue queue = queueMgr.getQueue(queueName); Resource schedulerLevelMaxResourceCapability = getMaximumResourceCapability(); if (queue == null) { return schedulerLevelMaxResourceCapability; } Resource queueMaxResourceCapability = queue.getMaximumContainerAllocation(); if (queueMaxResourceCapability.equals(Resources.unbounded())) { return schedulerLevelMaxResourceCapability; } else { return Resources.componentwiseMin(schedulerLevelMaxResourceCapability, queueMaxResourceCapability); } } @VisibleForTesting @Override public void killContainer(RMContainer container) { ContainerStatus status = SchedulerUtils.createKilledContainerStatus(container.getContainerId(), "Killed by RM to simulate an AM container failure"); LOG.info("Killing container " + container); completedContainer(container, status, RMContainerEventType.KILL); } @Override public Allocation allocate(ApplicationAttemptId appAttemptId, List<ResourceRequest> ask, List<SchedulingRequest> schedulingRequests, List<ContainerId> release, List<String> blacklistAdditions, List<String> blacklistRemovals, ContainerUpdates updateRequests) { // Make sure this application exists FSAppAttempt application = getSchedulerApp(appAttemptId); if (application == null) { LOG.error("Calling allocate on removed or non existent application " + appAttemptId.getApplicationId()); return EMPTY_ALLOCATION; } // The allocate may be the leftover from previous attempt, and it will // impact current attempt, such as confuse the request and allocation for // current attempt's AM container. // Note outside precondition check for the attempt id may be // outdated here, so double check it here is necessary. if (!application.getApplicationAttemptId().equals(appAttemptId)) { LOG.error("Calling allocate on previous or removed " + "or non existent application attempt " + appAttemptId); return EMPTY_ALLOCATION; } ApplicationId applicationId = application.getApplicationId(); FSLeafQueue queue = application.getQueue(); List<MaxResourceValidationResult> invalidAsks = validateResourceRequests(ask, queue); // We need to be fail-fast here if any invalid ask is detected. // If we would have thrown exception later, this could be problematic as // tokens and promoted / demoted containers would have been lost because // scheduler would clear them right away and AM // would not get this information. if (!invalidAsks.isEmpty()) { throw new SchedulerInvalidResoureRequestException(String.format( "Resource request is invalid for application %s because queue %s " + "has 0 amount of resource for a resource type! 
" + "Validation result: %s", applicationId, queue.getName(), invalidAsks)); } // Handle promotions and demotions handleContainerUpdates(application, updateRequests); // Sanity check normalizeResourceRequests(ask, queue.getName()); // TODO, normalize SchedulingRequest // Record container allocation start time application.recordContainerRequestTime(getClock().getTime()); // Release containers releaseContainers(release, application); ReentrantReadWriteLock.WriteLock lock = application.getWriteLock(); lock.lock(); try { if (!ask.isEmpty()) { if (LOG.isDebugEnabled()) { LOG.debug("allocate: pre-update" + " applicationAttemptId=" + appAttemptId + " application=" + application.getApplicationId()); } application.showRequests(); // Update application requests application.updateResourceRequests(ask); // TODO, handle SchedulingRequest application.showRequests(); } } finally { lock.unlock(); } Set<ContainerId> preemptionContainerIds = application.getPreemptionContainerIds(); if (LOG.isDebugEnabled()) { LOG.debug("allocate: post-update" + " applicationAttemptId=" + appAttemptId + " #ask=" + ask.size() + " reservation= " + application.getCurrentReservation()); LOG.debug("Preempting " + preemptionContainerIds.size() + " container(s)"); } application.updateBlacklist(blacklistAdditions, blacklistRemovals); List<Container> newlyAllocatedContainers = application.pullNewlyAllocatedContainers(); // Record container allocation time if (!(newlyAllocatedContainers.isEmpty())) { application.recordContainerAllocationTime(getClock().getTime()); } Resource headroom = application.getHeadroom(); application.setApplicationHeadroomForMetrics(headroom); List<Container> previousAttemptContainers = application.pullPreviousAttemptContainers(); List<NMToken> updatedNMTokens = application.pullUpdatedNMTokens(); return new Allocation(newlyAllocatedContainers, headroom, preemptionContainerIds, null, null, updatedNMTokens, null, null, application.pullNewlyPromotedContainers(), application.pullNewlyDemotedContainers(), previousAttemptContainers); } private List<MaxResourceValidationResult> validateResourceRequests(List<ResourceRequest> requests, FSLeafQueue queue) { List<MaxResourceValidationResult> validationResults = Lists.newArrayList(); for (ResourceRequest resourceRequest : requests) { if (LOG.isTraceEnabled()) { LOG.trace("Validating resource request: " + resourceRequest); } MaxResourceValidationResult validationResult = SchedulerUtils .validateResourceRequestsAgainstQueueMaxResource(resourceRequest, queue.getMaxShare()); if (!validationResult.isValid()) { validationResults.add(validationResult); LOG.warn(String.format( "Queue %s cannot handle resource request" + "because it has zero available amount of resource " + "for a requested resource type, " + "so the resource request is ignored!" 
+ " Requested resources: %s, " + "maximum queue resources: %s", queue.getName(), resourceRequest.getCapability(), queue.getMaxShare())); } } return validationResults; } @Override protected void nodeUpdate(RMNode nm) { writeLock.lock(); try { long start = getClock().getTime(); super.nodeUpdate(nm); FSSchedulerNode fsNode = getFSSchedulerNode(nm.getNodeID()); attemptScheduling(fsNode); long duration = getClock().getTime() - start; fsOpDurations.addNodeUpdateDuration(duration); } finally { writeLock.unlock(); } } @Deprecated void continuousSchedulingAttempt() throws InterruptedException { long start = getClock().getTime(); List<FSSchedulerNode> nodeIdList; // Hold a lock to prevent comparator order changes due to changes of node // unallocated resources synchronized (this) { nodeIdList = nodeTracker.sortedNodeList(nodeAvailableResourceComparator); } // iterate all nodes for (FSSchedulerNode node : nodeIdList) { try { if (Resources.fitsIn(minimumAllocation, node.getUnallocatedResource())) { attemptScheduling(node); } } catch (Throwable ex) { LOG.error("Error while attempting scheduling for node " + node + ": " + ex.toString(), ex); if ((ex instanceof YarnRuntimeException) && (ex.getCause() instanceof InterruptedException)) { // AsyncDispatcher translates InterruptedException to // YarnRuntimeException with cause InterruptedException. // Need to throw InterruptedException to stop schedulingThread. throw (InterruptedException) ex.getCause(); } } } long duration = getClock().getTime() - start; fsOpDurations.addContinuousSchedulingRunDuration(duration); } /** Sort nodes by available resource */ private class NodeAvailableResourceComparator implements Comparator<FSSchedulerNode> { @Override public int compare(FSSchedulerNode n1, FSSchedulerNode n2) { return RESOURCE_CALCULATOR.compare(getClusterResource(), n2.getUnallocatedResource(), n1.getUnallocatedResource()); } } private boolean shouldContinueAssigning(int containers, Resource maxResourcesToAssign, Resource assignedResource) { if (!assignMultiple) { return false; // assignMultiple is not enabled. Allocate one at a time. } if (maxAssignDynamic) { // Using fitsIn to check if the resources assigned so far are less than // or equal to max resources to assign (half of remaining resources). // The "equal to" part can lead to allocating one extra container. return Resources.fitsIn(assignedResource, maxResourcesToAssign); } else { return maxAssign <= 0 || containers < maxAssign; } } /** * Assign preempted containers to the applications that have reserved * resources for preempted containers. * @param node Node to check */ static void assignPreemptedContainers(FSSchedulerNode node) { for (Entry<FSAppAttempt, Resource> entry : node.getPreemptionList().entrySet()) { FSAppAttempt app = entry.getKey(); Resource preemptionPending = Resources.clone(entry.getValue()); while (!app.isStopped() && !Resources.isNone(preemptionPending)) { Resource assigned = app.assignContainer(node); if (Resources.isNone(assigned) || assigned.equals(FairScheduler.CONTAINER_RESERVED)) { // Fail to assign, let's not try further break; } Resources.subtractFromNonNegative(preemptionPending, assigned); } } } @VisibleForTesting void attemptScheduling(FSSchedulerNode node) { writeLock.lock(); try { if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext.isSchedulerReadyForAllocatingContainers()) { return; } final NodeId nodeID = (node != null ? 
node.getNodeID() : null); if (!nodeTracker.exists(nodeID)) { // The node might have just been removed while this thread was waiting // on the synchronized lock before it entered this synchronized method LOG.info("Skipping scheduling as the node " + nodeID + " has been removed"); return; } // Assign new containers... // 1. Ensure containers are assigned to the apps that preempted // 2. Check for reserved applications // 3. Schedule if there are no reservations // Apps may wait for preempted containers // We have to satisfy these first to avoid cases, when we preempt // a container for A from B and C gets the preempted containers, // when C does not qualify for preemption itself. assignPreemptedContainers(node); FSAppAttempt reservedAppSchedulable = node.getReservedAppSchedulable(); boolean validReservation = false; if (reservedAppSchedulable != null) { validReservation = reservedAppSchedulable.assignReservedContainer(node); } if (!validReservation) { // No reservation, schedule at queue which is farthest below fair share int assignedContainers = 0; Resource assignedResource = Resources.clone(Resources.none()); Resource maxResourcesToAssign = Resources.multiply(node.getUnallocatedResource(), 0.5f); while (node.getReservedContainer() == null) { Resource assignment = queueMgr.getRootQueue().assignContainer(node); if (assignment.equals(Resources.none())) { if (LOG.isDebugEnabled()) { LOG.debug("No container is allocated on node " + node); } break; } assignedContainers++; Resources.addTo(assignedResource, assignment); if (!shouldContinueAssigning(assignedContainers, maxResourcesToAssign, assignedResource)) { break; } } } updateRootQueueMetrics(); } finally { writeLock.unlock(); } } public FSAppAttempt getSchedulerApp(ApplicationAttemptId appAttemptId) { return super.getApplicationAttempt(appAttemptId); } @Override public ResourceCalculator getResourceCalculator() { return RESOURCE_CALCULATOR; } /** * Subqueue metrics might be a little out of date because fair shares are * recalculated at the update interval, but the root queue metrics needs to * be updated synchronously with allocations and completions so that cluster * metrics will be consistent. */ private void updateRootQueueMetrics() { rootMetrics.setAvailableResourcesToQueue( Resources.subtract(getClusterResource(), rootMetrics.getAllocatedResources())); } /** * Check if preemption is enabled and the utilization threshold for * preemption is met. * * @return true if preemption should be attempted, false otherwise. 
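 *
 * For example (hypothetical numbers): with 80 GB of a 100 GB cluster
 * allocated (0.8) and 20 of 100 vcores allocated (0.2), cluster utilization
 * is max(0.8, 0.2) = 0.8, so preemption is attempted whenever the configured
 * utilization threshold is below 0.8.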
*/ private boolean shouldAttemptPreemption() { if (context.isPreemptionEnabled()) { return (context.getPreemptionUtilizationThreshold() < Math.max( (float) rootMetrics.getAllocatedMB() / getClusterResource().getMemorySize(), (float) rootMetrics.getAllocatedVirtualCores() / getClusterResource().getVirtualCores())); } return false; } @Override public QueueMetrics getRootQueueMetrics() { return rootMetrics; } @Override public void handle(SchedulerEvent event) { switch (event.getType()) { case NODE_ADDED: if (!(event instanceof NodeAddedSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } NodeAddedSchedulerEvent nodeAddedEvent = (NodeAddedSchedulerEvent) event; addNode(nodeAddedEvent.getContainerReports(), nodeAddedEvent.getAddedRMNode()); break; case NODE_REMOVED: if (!(event instanceof NodeRemovedSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } NodeRemovedSchedulerEvent nodeRemovedEvent = (NodeRemovedSchedulerEvent) event; removeNode(nodeRemovedEvent.getRemovedRMNode()); break; case NODE_UPDATE: if (!(event instanceof NodeUpdateSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent) event; nodeUpdate(nodeUpdatedEvent.getRMNode()); break; case APP_ADDED: if (!(event instanceof AppAddedSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } AppAddedSchedulerEvent appAddedEvent = (AppAddedSchedulerEvent) event; String queueName = resolveReservationQueueName(appAddedEvent.getQueue(), appAddedEvent.getApplicationId(), appAddedEvent.getReservationID(), appAddedEvent.getIsAppRecovering()); if (queueName != null) { addApplication(appAddedEvent.getApplicationId(), queueName, appAddedEvent.getUser(), appAddedEvent.getIsAppRecovering()); } break; case APP_REMOVED: if (!(event instanceof AppRemovedSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } AppRemovedSchedulerEvent appRemovedEvent = (AppRemovedSchedulerEvent) event; removeApplication(appRemovedEvent.getApplicationID(), appRemovedEvent.getFinalState()); break; case NODE_RESOURCE_UPDATE: if (!(event instanceof NodeResourceUpdateSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } NodeResourceUpdateSchedulerEvent nodeResourceUpdatedEvent = (NodeResourceUpdateSchedulerEvent) event; updateNodeResource(nodeResourceUpdatedEvent.getRMNode(), nodeResourceUpdatedEvent.getResourceOption()); break; case APP_ATTEMPT_ADDED: if (!(event instanceof AppAttemptAddedSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } AppAttemptAddedSchedulerEvent appAttemptAddedEvent = (AppAttemptAddedSchedulerEvent) event; addApplicationAttempt(appAttemptAddedEvent.getApplicationAttemptId(), appAttemptAddedEvent.getTransferStateFromPreviousAttempt(), appAttemptAddedEvent.getIsAttemptRecovering()); break; case APP_ATTEMPT_REMOVED: if (!(event instanceof AppAttemptRemovedSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } AppAttemptRemovedSchedulerEvent appAttemptRemovedEvent = (AppAttemptRemovedSchedulerEvent) event; removeApplicationAttempt(appAttemptRemovedEvent.getApplicationAttemptID(), appAttemptRemovedEvent.getFinalAttemptState(), appAttemptRemovedEvent.getKeepContainersAcrossAppAttempts()); break; case RELEASE_CONTAINER: if (!(event instanceof ReleaseContainerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } RMContainer container = 
((ReleaseContainerEvent) event).getContainer(); completedContainer(container, SchedulerUtils.createAbnormalContainerStatus(container.getContainerId(), SchedulerUtils.RELEASED_CONTAINER), RMContainerEventType.RELEASED); break; case CONTAINER_EXPIRED: if (!(event instanceof ContainerExpiredSchedulerEvent)) { throw new RuntimeException("Unexpected event type: " + event); } ContainerExpiredSchedulerEvent containerExpiredEvent = (ContainerExpiredSchedulerEvent) event; ContainerId containerId = containerExpiredEvent.getContainerId(); super.completedContainer(getRMContainer(containerId), SchedulerUtils.createAbnormalContainerStatus(containerId, SchedulerUtils.EXPIRED_CONTAINER), RMContainerEventType.EXPIRE); break; default: LOG.error("Unknown event arrived at FairScheduler: " + event.toString()); } } private String resolveReservationQueueName(String queueName, ApplicationId applicationId, ReservationId reservationID, boolean isRecovering) { readLock.lock(); try { FSQueue queue = queueMgr.getQueue(queueName); if ((queue == null) || !allocConf.isReservable(queue.getQueueName())) { return queueName; } // Use fully specified name from now on (including root. prefix) queueName = queue.getQueueName(); if (reservationID != null) { String resQName = queueName + "." + reservationID.toString(); queue = queueMgr.getQueue(resQName); if (queue == null) { // reservation has terminated during failover if (isRecovering && allocConf.getMoveOnExpiry(queueName)) { // move to the default child queue of the plan return getDefaultQueueForPlanQueue(queueName); } String message = "Application " + applicationId + " submitted to a reservation which is not yet " + "currently active: " + resQName; rejectApplicationWithMessage(applicationId, message); return null; } if (!queue.getParent().getQueueName().equals(queueName)) { String message = "Application: " + applicationId + " submitted to a reservation " + resQName + " which does not belong to the specified queue: " + queueName; rejectApplicationWithMessage(applicationId, message); return null; } // use the reservation queue to run the app queueName = resQName; } else { // use the default child queue of the plan for unreserved apps queueName = getDefaultQueueForPlanQueue(queueName); } return queueName; } finally { readLock.unlock(); } } private void rejectApplicationWithMessage(ApplicationId applicationId, String msg) { LOG.info(msg); rmContext.getDispatcher().getEventHandler() .handle(new RMAppEvent(applicationId, RMAppEventType.APP_REJECTED, msg)); } private String getDefaultQueueForPlanQueue(String queueName) { String planName = queueName.substring(queueName.lastIndexOf(".") + 1); queueName = queueName + "." 
+ planName + ReservationConstants.DEFAULT_QUEUE_SUFFIX; return queueName; } @Override public void recover(RMState state) throws Exception { // NOT IMPLEMENTED } public void setRMContext(RMContext rmContext) { this.rmContext = rmContext; } @SuppressWarnings("deprecation") private void initScheduler(Configuration conf) throws IOException { writeLock.lock(); try { this.conf = new FairSchedulerConfiguration(conf); validateConf(this.conf); authorizer = YarnAuthorizationProvider.getInstance(conf); minimumAllocation = super.getMinimumAllocation(); initMaximumResourceCapability(super.getMaximumAllocation()); incrAllocation = this.conf.getIncrementAllocation(); updateReservationThreshold(); continuousSchedulingEnabled = this.conf.isContinuousSchedulingEnabled(); continuousSchedulingSleepMs = this.conf.getContinuousSchedulingSleepMs(); nodeLocalityThreshold = this.conf.getLocalityThresholdNode(); rackLocalityThreshold = this.conf.getLocalityThresholdRack(); nodeLocalityDelayMs = this.conf.getLocalityDelayNodeMs(); rackLocalityDelayMs = this.conf.getLocalityDelayRackMs(); assignMultiple = this.conf.getAssignMultiple(); maxAssignDynamic = this.conf.isMaxAssignDynamic(); maxAssign = this.conf.getMaxAssign(); sizeBasedWeight = this.conf.getSizeBasedWeight(); usePortForNodeName = this.conf.getUsePortForNodeName(); reservableNodesRatio = this.conf.getReservableNodes(); updateInterval = this.conf.getUpdateInterval(); if (updateInterval < 0) { updateInterval = FairSchedulerConfiguration.DEFAULT_UPDATE_INTERVAL_MS; LOG.warn(FairSchedulerConfiguration.UPDATE_INTERVAL_MS + " is invalid, so using default value " + FairSchedulerConfiguration.DEFAULT_UPDATE_INTERVAL_MS + " ms instead"); } rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf); fsOpDurations = FSOpDurations.getInstance(true); // This stores per-application scheduling information this.applications = new ConcurrentHashMap<>(); allocConf = new AllocationConfiguration(conf); try { queueMgr.initialize(conf); } catch (Exception e) { throw new IOException("Failed to start FairScheduler", e); } if (continuousSchedulingEnabled) { // Continuous scheduling is deprecated log it on startup LOG.warn("Continuous scheduling is turned ON. It is deprecated " + "because it can cause scheduler slowness due to locking issues. " + "Schedulers should use assignmultiple as a replacement."); // start continuous scheduling thread schedulingThread = new ContinuousSchedulingThread(); schedulingThread.setName("FairSchedulerContinuousScheduling"); schedulingThread .setUncaughtExceptionHandler(new RMCriticalThreadUncaughtExceptionHandler(rmContext)); schedulingThread.setDaemon(true); } if (this.conf.getPreemptionEnabled()) { createPreemptionThread(); } } finally { writeLock.unlock(); } allocsLoader.init(conf); allocsLoader.setReloadListener(new AllocationReloadListener()); // If we fail to load allocations file on initialize, we want to fail // immediately. After a successful load, exceptions on future reloads // will just result in leaving things as they are. 
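// In other words, a broken allocations file at startup aborts scheduler
// initialization with an IOException, while a failure during a later
// reinitialize() is only logged and the previously loaded configuration
// stays in effect.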
try { allocsLoader.reloadAllocations(); } catch (Exception e) { throw new IOException("Failed to initialize FairScheduler", e); } } @VisibleForTesting protected void createPreemptionThread() { preemptionThread = new FSPreemptionThread(this); preemptionThread.setUncaughtExceptionHandler(new RMCriticalThreadUncaughtExceptionHandler(rmContext)); } private void updateReservationThreshold() { Resource newThreshold = Resources.multiply(getIncrementResourceCapability(), this.conf.getReservationThresholdIncrementMultiple()); reservationThreshold = newThreshold; } private void startSchedulerThreads() { writeLock.lock(); try { Preconditions.checkNotNull(allocsLoader, "allocsLoader is null"); if (continuousSchedulingEnabled) { Preconditions.checkNotNull(schedulingThread, "schedulingThread is null"); schedulingThread.start(); } if (preemptionThread != null) { preemptionThread.start(); } allocsLoader.start(); } finally { writeLock.unlock(); } } @Override public void serviceInit(Configuration conf) throws Exception { initScheduler(conf); super.serviceInit(conf); // Initialize SchedulingMonitorManager schedulingMonitorManager.initialize(rmContext, conf); } @Override public void serviceStart() throws Exception { startSchedulerThreads(); super.serviceStart(); } @SuppressWarnings("deprecation") @Override public void serviceStop() throws Exception { writeLock.lock(); try { if (continuousSchedulingEnabled) { if (schedulingThread != null) { schedulingThread.interrupt(); schedulingThread.join(THREAD_JOIN_TIMEOUT_MS); } } if (preemptionThread != null) { preemptionThread.interrupt(); preemptionThread.join(THREAD_JOIN_TIMEOUT_MS); } if (allocsLoader != null) { allocsLoader.stop(); } } finally { writeLock.unlock(); } super.serviceStop(); } @Override public void reinitialize(Configuration conf, RMContext rmContext) throws IOException { try { allocsLoader.reloadAllocations(); super.reinitialize(conf, rmContext); } catch (Exception e) { LOG.error("Failed to reload allocations file", e); } try { refreshMaximumAllocation(ResourceUtils.fetchMaximumAllocationFromConfig(conf)); } catch (Exception e) { LOG.error("Failed to refresh maximum allocation", e); } } @Override public QueueInfo getQueueInfo(String queueName, boolean includeChildQueues, boolean recursive) throws IOException { if (!queueMgr.exists(queueName)) { throw new IOException("queue " + queueName + " does not exist"); } return queueMgr.getQueue(queueName).getQueueInfo(includeChildQueues, recursive); } @Override public List<QueueUserACLInfo> getQueueUserAclInfo() { UserGroupInformation user; try { user = UserGroupInformation.getCurrentUser(); } catch (IOException ioe) { return new ArrayList<QueueUserACLInfo>(); } return queueMgr.getRootQueue().getQueueUserAclInfo(user); } @Override public int getNumClusterNodes() { return nodeTracker.nodeCount(); } @Override public boolean checkAccess(UserGroupInformation callerUGI, QueueACL acl, String queueName) { readLock.lock(); try { FSQueue queue = getQueueManager().getQueue(queueName); if (queue == null) { if (LOG.isDebugEnabled()) { LOG.debug("ACL not found for queue access-type " + acl + " for queue " + queueName); } return false; } return queue.hasAccess(acl, callerUGI); } finally { readLock.unlock(); } } public AllocationConfiguration getAllocationConfiguration() { return allocConf; } private class AllocationReloadListener implements AllocationFileLoaderService.Listener { @Override public void onReload(AllocationConfiguration queueInfo) throws IOException { // Commit the reload; also create any queue defined in the 
alloc file // if it does not already exist, so it can be displayed on the web UI. Set<String> removedStaticQueues = getRemovedStaticQueues(queueInfo); writeLock.lock(); try { if (queueInfo == null) { authorizer.setPermission(allocsLoader.getDefaultPermissions(), UserGroupInformation.getCurrentUser()); } else { allocConf = queueInfo; setQueueAcls(allocConf.getQueueAcls()); allocConf.getDefaultSchedulingPolicy().initialize(getContext()); queueMgr.updateAllocationConfiguration(allocConf); queueMgr.setQueuesToDynamic(removedStaticQueues); applyChildDefaults(); maxRunningEnforcer.updateRunnabilityOnReload(); } } finally { writeLock.unlock(); } } private Set<String> getRemovedStaticQueues(AllocationConfiguration queueInfo) { if (queueInfo == null || allocConf == null) { return Collections.emptySet(); } Set<String> removedStaticQueues = new HashSet<>(); for (Set<String> queues : allocConf.getConfiguredQueues().values()) { removedStaticQueues.addAll(queues); } for (Set<String> queues : queueInfo.getConfiguredQueues().values()) { removedStaticQueues.removeAll(queues); } return removedStaticQueues; } @Override public void onCheck() { queueMgr.removeEmptyDynamicQueues(); queueMgr.removePendingIncompatibleQueues(); } } private void setQueueAcls(Map<String, Map<AccessType, AccessControlList>> queueAcls) throws IOException { authorizer.setPermission(allocsLoader.getDefaultPermissions(), UserGroupInformation.getCurrentUser()); List<Permission> permissions = new ArrayList<>(); for (Entry<String, Map<AccessType, AccessControlList>> queueAcl : queueAcls.entrySet()) { permissions.add( new Permission(new PrivilegedEntity(EntityType.QUEUE, queueAcl.getKey()), queueAcl.getValue())); } authorizer.setPermission(permissions, UserGroupInformation.getCurrentUser()); } /** * After reloading the allocation config, the max resource settings for any * ad hoc queues will be missing. This method goes through the queue manager's * queue list and adds back the max resources settings for any ad hoc queues. * Note that the new max resource settings will be based on the new config. * The old settings are lost. 
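 * For example (hypothetical queue name): an ad hoc queue "root.adhoc1" that
 * is not listed as a configured leaf or parent queue has its max share reset
 * from its parent's getMaxChildQueueResource() value in the newly loaded
 * configuration.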
*/ private void applyChildDefaults() { Collection<FSQueue> queues = queueMgr.getQueues(); Set<String> configuredLeafQueues = allocConf.getConfiguredQueues().get(FSQueueType.LEAF); Set<String> configuredParentQueues = allocConf.getConfiguredQueues().get(FSQueueType.PARENT); for (FSQueue queue : queues) { // If the queue is ad hoc and not root, apply the child defaults if ((queue.getParent() != null) && !configuredLeafQueues.contains(queue.getName()) && !configuredParentQueues.contains(queue.getName())) { ConfigurableResource max = queue.getParent().getMaxChildQueueResource(); if (max != null) { queue.setMaxShare(max); } } } } @Override public List<ApplicationAttemptId> getAppsInQueue(String queueName) { FSQueue queue = queueMgr.getQueue(queueName); if (queue == null) { return null; } List<ApplicationAttemptId> apps = new ArrayList<ApplicationAttemptId>(); queue.collectSchedulerApplications(apps); return apps; } @Override public String moveApplication(ApplicationId appId, String queueName) throws YarnException { writeLock.lock(); try { SchedulerApplication<FSAppAttempt> app = applications.get(appId); if (app == null) { throw new YarnException("App to be moved " + appId + " not found."); } FSAppAttempt attempt = (FSAppAttempt) app.getCurrentAppAttempt(); // To serialize with FairScheduler#allocate, synchronize on app attempt attempt.getWriteLock().lock(); try { FSLeafQueue oldQueue = (FSLeafQueue) app.getQueue(); // Check if the attempt is already stopped: don't move stopped app // attempt. The attempt has already been removed from all queues. if (attempt.isStopped()) { LOG.info("Application " + appId + " is stopped and can't be moved!"); throw new YarnException("Application " + appId + " is stopped and can't be moved!"); } String destQueueName = handleMoveToPlanQueue(queueName); FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false); if (targetQueue == null) { throw new YarnException("Target queue " + queueName + " not found or is not a leaf queue."); } if (targetQueue == oldQueue) { return oldQueue.getQueueName(); } if (oldQueue.isRunnableApp(attempt)) { verifyMoveDoesNotViolateConstraints(attempt, oldQueue, targetQueue); } executeMove(app, attempt, oldQueue, targetQueue); return targetQueue.getQueueName(); } finally { attempt.getWriteLock().unlock(); } } finally { writeLock.unlock(); } } @Override public void preValidateMoveApplication(ApplicationId appId, String newQueue) throws YarnException { writeLock.lock(); try { SchedulerApplication<FSAppAttempt> app = applications.get(appId); if (app == null) { throw new YarnException("App to be moved " + appId + " not found."); } FSAppAttempt attempt = app.getCurrentAppAttempt(); // To serialize with FairScheduler#allocate, synchronize on app attempt attempt.getWriteLock().lock(); try { FSLeafQueue oldQueue = (FSLeafQueue) app.getQueue(); String destQueueName = handleMoveToPlanQueue(newQueue); FSLeafQueue targetQueue = queueMgr.getLeafQueue(destQueueName, false); if (targetQueue == null) { throw new YarnException("Target queue " + newQueue + " not found or is not a leaf queue."); } if (oldQueue.isRunnableApp(attempt)) { verifyMoveDoesNotViolateConstraints(attempt, oldQueue, targetQueue); } } finally { attempt.getWriteLock().unlock(); } } finally { writeLock.unlock(); } } private void verifyMoveDoesNotViolateConstraints(FSAppAttempt app, FSLeafQueue oldQueue, FSLeafQueue targetQueue) throws YarnException { String queueName = targetQueue.getQueueName(); ApplicationAttemptId appAttId = app.getApplicationAttemptId(); // When checking 
  private void verifyMoveDoesNotViolateConstraints(FSAppAttempt app,
      FSLeafQueue oldQueue, FSLeafQueue targetQueue) throws YarnException {
    String queueName = targetQueue.getQueueName();
    ApplicationAttemptId appAttId = app.getApplicationAttemptId();
    // When checking maxResources and maxRunningApps, only need to consider
    // queues before the lowest common ancestor of the two queues because the
    // total running apps in queues above will not be changed.
    FSQueue lowestCommonAncestor =
        findLowestCommonAncestorQueue(oldQueue, targetQueue);
    Resource consumption = app.getCurrentConsumption();

    // Check whether the move would go over maxRunningApps or maxShare
    FSQueue cur = targetQueue;
    while (cur != lowestCommonAncestor) {
      // maxRunningApps
      if (cur.getNumRunnableApps() == cur.getMaxRunningApps()) {
        throw new YarnException("Moving app attempt " + appAttId + " to queue "
            + queueName + " would violate queue maxRunningApps constraints on"
            + " queue " + cur.getQueueName());
      }

      // maxShare
      if (!Resources.fitsIn(Resources.add(cur.getResourceUsage(), consumption),
          cur.getMaxShare())) {
        throw new YarnException("Moving app attempt " + appAttId + " to queue "
            + queueName + " would violate queue maxShare constraints on"
            + " queue " + cur.getQueueName());
      }

      cur = cur.getParent();
    }
  }

  /**
   * Helper for moveApplication, which has appropriate synchronization, so all
   * operations will be atomic.
   */
  private void executeMove(SchedulerApplication<FSAppAttempt> app,
      FSAppAttempt attempt, FSLeafQueue oldQueue, FSLeafQueue newQueue)
      throws YarnException {
    // Check current runs state. Do not remove the attempt from the queue until
    // after the check has been performed otherwise it could remove the app
    // from a queue without moving it to a new queue.
    boolean wasRunnable = oldQueue.isRunnableApp(attempt);

    // if app was not runnable before, it may be runnable now
    boolean nowRunnable = maxRunningEnforcer.canAppBeRunnable(newQueue,
        attempt);
    if (wasRunnable && !nowRunnable) {
      throw new YarnException("Should have already verified that app "
          + attempt.getApplicationId() + " would be runnable in new queue");
    }

    // Now it is safe to remove from the queue.
    oldQueue.removeApp(attempt);

    if (wasRunnable) {
      maxRunningEnforcer.untrackRunnableApp(attempt);
    } else if (nowRunnable) {
      // App has changed from non-runnable to runnable
      maxRunningEnforcer.untrackNonRunnableApp(attempt);
    }

    attempt.move(newQueue); // This updates all the metrics
    app.setQueue(newQueue);
    newQueue.addApp(attempt, nowRunnable);

    if (nowRunnable) {
      maxRunningEnforcer.trackRunnableApp(attempt);
    }
    if (wasRunnable) {
      maxRunningEnforcer.updateRunnabilityOnAppRemoval(attempt, oldQueue);
    }
  }

  @VisibleForTesting
  FSQueue findLowestCommonAncestorQueue(FSQueue queue1, FSQueue queue2) {
    // Because queue names include ancestors, separated by periods, we can find
    // the lowest common ancestors by going from the start of the names until
    // there's a character that doesn't match.
    String name1 = queue1.getName();
    String name2 = queue2.getName();
    // We keep track of the last period we encounter to avoid returning
    // root.apple when the queues are root.applepie and root.appletart
    int lastPeriodIndex = -1;
    for (int i = 0; i < Math.max(name1.length(), name2.length()); i++) {
      if (name1.length() <= i || name2.length() <= i
          || name1.charAt(i) != name2.charAt(i)) {
        return queueMgr.getQueue(name1.substring(0, lastPeriodIndex));
      } else if (name1.charAt(i) == '.') {
        lastPeriodIndex = i;
      }
    }
    return queue1; // names are identical
  }
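  // Worked example for findLowestCommonAncestorQueue, using the names from the
  // comment above: for "root.applepie" and "root.appletart" the names first
  // differ at index 10 ('p' vs 't'); the last '.' seen before that is at
  // index 4, so the method returns the queue named
  // name1.substring(0, 4) == "root" rather than the non-existent "root.apple".
  // Identical names return queue1 itself. A standalone, runnable sketch of
  // this computation appears after the end of the class.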
  /**
   * Process resource update on a node and update Queue.
   */
  @Override
  public void updateNodeResource(RMNode nm, ResourceOption resourceOption) {
    writeLock.lock();
    try {
      super.updateNodeResource(nm, resourceOption);
      updateRootQueueMetrics();
      queueMgr.getRootQueue().setSteadyFairShare(getClusterResource());
      queueMgr.getRootQueue().recomputeSteadyShares();
    } finally {
      writeLock.unlock();
    }
  }

  /** {@inheritDoc} */
  @Override
  public EnumSet<SchedulerResourceTypes> getSchedulingResourceTypes() {
    return EnumSet.of(SchedulerResourceTypes.MEMORY,
        SchedulerResourceTypes.CPU);
  }

  @Override
  public Set<String> getPlanQueues() throws YarnException {
    Set<String> planQueues = new HashSet<String>();
    for (FSQueue fsQueue : queueMgr.getQueues()) {
      String queueName = fsQueue.getName();
      if (allocConf.isReservable(queueName)) {
        planQueues.add(queueName);
      }
    }
    return planQueues;
  }

  @Override
  public void setEntitlement(String queueName, QueueEntitlement entitlement)
      throws YarnException {
    FSLeafQueue reservationQueue = queueMgr.getLeafQueue(queueName, false);
    if (reservationQueue == null) {
      throw new YarnException("Target queue " + queueName
          + " not found or is not a leaf queue.");
    }

    reservationQueue.setWeights(entitlement.getCapacity());

    // TODO Does MaxCapacity need to be set for fairScheduler ?
  }

  /**
   * Only supports removing empty leaf queues.
   * @param queueName name of the queue to remove
   * @throws YarnException if the queue to remove is either not a leaf queue or
   *         is not empty
   */
  @Override
  public void removeQueue(String queueName) throws YarnException {
    FSLeafQueue reservationQueue = queueMgr.getLeafQueue(queueName, false);
    if (reservationQueue != null) {
      if (!queueMgr.removeLeafQueue(queueName)) {
        throw new YarnException("Could not remove queue " + queueName + " as "
            + "it is either not a leaf queue or it is not empty");
      }
    }
  }

  private String handleMoveToPlanQueue(String targetQueueName) {
    FSQueue dest = queueMgr.getQueue(targetQueueName);
    if (dest != null && allocConf.isReservable(dest.getQueueName())) {
      // use the default child reservation queue of the plan
      targetQueueName = getDefaultQueueForPlanQueue(targetQueueName);
    }
    return targetQueueName;
  }

  public float getReservableNodesRatio() {
    return reservableNodesRatio;
  }

  long getNMHeartbeatInterval() {
    return nmHeartbeatInterval;
  }

  ReadLock getSchedulerReadLock() {
    return this.readLock;
  }

  @Override
  public long checkAndGetApplicationLifetime(String queueName, long lifetime) {
    // Lifetime is the application lifetime by default.
    return lifetime;
  }
}
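The name-prefix trick used by findLowestCommonAncestorQueue is easy to try out in isolation. Below is a minimal, self-contained sketch of the same idea on plain strings; LcaNameDemo, lowestCommonAncestorName and the sample queue names are hypothetical and are not part of the FairScheduler or YARN API.

public class LcaNameDemo {

  /**
   * Mirrors the character-by-character comparison in
   * FairScheduler#findLowestCommonAncestorQueue, but works on plain dotted
   * queue names. Assumes both names share a top-level segment (e.g. "root"),
   * as all Fair Scheduler queue names do.
   */
  static String lowestCommonAncestorName(String name1, String name2) {
    int lastPeriodIndex = -1;
    for (int i = 0; i < Math.max(name1.length(), name2.length()); i++) {
      if (name1.length() <= i || name2.length() <= i
          || name1.charAt(i) != name2.charAt(i)) {
        // The names diverge here; cut back to the last '.' so a shared
        // prefix like "root.apple" is not mistaken for a real queue name.
        return name1.substring(0, lastPeriodIndex);
      } else if (name1.charAt(i) == '.') {
        lastPeriodIndex = i;
      }
    }
    return name1; // names are identical
  }

  public static void main(String[] args) {
    // Prints "root": "root.apple" is a shared prefix but not a queue boundary.
    System.out.println(
        lowestCommonAncestorName("root.applepie", "root.appletart"));
    // Prints "root.dev": both queues live under the same parent.
    System.out.println(
        lowestCommonAncestorName("root.dev.alice", "root.dev.bob"));
  }
}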