Java tutorial: LlapTaskSchedulerService, the Hive LLAP task scheduler plugin for Tez (package org.apache.tez.dag.app.rm)
/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tez.dag.app.rm; import java.io.IOException; import java.util.Arrays; import java.util.Comparator; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.NavigableMap; import java.util.Random; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.DelayQueue; import java.util.concurrent.Delayed; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import com.google.common.collect.Sets; import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.Futures; import org.apache.commons.lang.mutable.MutableInt; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.llap.registry.ServiceInstance; import org.apache.hadoop.hive.llap.registry.ServiceInstanceSet; import org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.util.Clock; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.hadoop.yarn.util.SystemClock; import org.apache.tez.common.TezUtils; import org.apache.tez.dag.api.TezUncheckedException; import org.apache.tez.serviceplugins.api.TaskAttemptEndReason; import org.apache.tez.serviceplugins.api.TaskScheduler; import org.apache.tez.serviceplugins.api.TaskSchedulerContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class LlapTaskSchedulerService extends TaskScheduler { private static final Logger LOG = LoggerFactory.getLogger(LlapTaskSchedulerService.class); private final Configuration conf; // interface into the registry service private ServiceInstanceSet activeInstances; // Tracks all 
instances, including ones which have been disabled in the past. // LinkedHashMap to provide the same iteration order when selecting a random host. @VisibleForTesting final Map<ServiceInstance, NodeInfo> instanceToNodeMap = new LinkedHashMap<>(); // TODO Ideally, remove elements from this once it's known that no tasks are linked to the instance (all deallocated) // Tracks tasks which could not be allocated immediately. @VisibleForTesting final TreeMap<Priority, List<TaskInfo>> pendingTasks = new TreeMap<>(new Comparator<Priority>() { @Override public int compare(Priority o1, Priority o2) { return o1.getPriority() - o2.getPriority(); } }); // Tracks running and queued tasks. Cleared after a task completes. private final ConcurrentMap<Object, TaskInfo> knownTasks = new ConcurrentHashMap<>(); private final TreeMap<Integer, TreeSet<TaskInfo>> runningTasks = new TreeMap<>(); private static final TaskStartComparator TASK_INFO_COMPARATOR = new TaskStartComparator(); // Queue for disabled nodes. Nodes make it out of this queue when their expiration timeout is hit. @VisibleForTesting final DelayQueue<NodeInfo> disabledNodesQueue = new DelayQueue<>(); private final boolean forceLocation; private final ContainerFactory containerFactory; private final Random random = new Random(); private final Clock clock; private final ListeningExecutorService nodeEnabledExecutor; private final NodeEnablerCallable nodeEnablerCallable = new NodeEnablerCallable(); private final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); private final ReentrantReadWriteLock.ReadLock readLock = lock.readLock(); private final ReentrantReadWriteLock.WriteLock writeLock = lock.writeLock(); private final Lock scheduleLock = new ReentrantLock(); private final Condition scheduleCondition = scheduleLock.newCondition(); private final AtomicBoolean pendingScheduleInvodations = new AtomicBoolean(false); private final ListeningExecutorService schedulerExecutor; private final SchedulerCallable schedulerCallable = new SchedulerCallable(); private final AtomicBoolean isStopped = new AtomicBoolean(false); // Tracks total pending preemptions. private final AtomicInteger pendingPreemptions = new AtomicInteger(0); // Tracks pending preemptions per host, using the hostname || Always to be accessed inside a lock private final Map<String, MutableInt> pendingPreemptionsPerHost = new HashMap<>(); private final NodeBlacklistConf nodeBlacklistConf; // Per daemon private final int memoryPerInstance; private final int coresPerInstance; private final int executorsPerInstance; private final int numSchedulableTasksPerNode; // Per Executor Thread private final Resource resourcePerExecutor; private final LlapRegistryService registry = new LlapRegistryService(false); private volatile ListenableFuture<Void> nodeEnablerFuture; private volatile ListenableFuture<Void> schedulerFuture; @VisibleForTesting private final AtomicInteger dagCounter = new AtomicInteger(1); // Statistics to track allocations // All of stats variables are visible for testing. 
@VisibleForTesting StatsPerDag dagStats = new StatsPerDag(); public LlapTaskSchedulerService(TaskSchedulerContext taskSchedulerContext) { this(taskSchedulerContext, new SystemClock()); } @VisibleForTesting public LlapTaskSchedulerService(TaskSchedulerContext taskSchedulerContext, Clock clock) { super(taskSchedulerContext); this.clock = clock; try { this.conf = TezUtils.createConfFromUserPayload(taskSchedulerContext.getInitialUserPayload()); } catch (IOException e) { throw new TezUncheckedException( "Failed to parse user payload for " + LlapTaskSchedulerService.class.getSimpleName(), e); } this.containerFactory = new ContainerFactory(taskSchedulerContext.getApplicationAttemptId(), taskSchedulerContext.getCustomClusterIdentifier()); this.memoryPerInstance = HiveConf.getIntVar(conf, ConfVars.LLAP_DAEMON_MEMORY_PER_INSTANCE_MB); this.coresPerInstance = HiveConf.getIntVar(conf, ConfVars.LLAP_DAEMON_VCPUS_PER_INSTANCE); this.executorsPerInstance = HiveConf.getIntVar(conf, ConfVars.LLAP_DAEMON_NUM_EXECUTORS); this.nodeBlacklistConf = new NodeBlacklistConf( HiveConf.getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_REENABLE_MIN_TIMEOUT_MS, TimeUnit.MILLISECONDS), HiveConf.getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_REENABLE_MAX_TIMEOUT_MS, TimeUnit.MILLISECONDS), HiveConf.getFloatVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_DISABLE_BACK_OFF_FACTOR)); this.numSchedulableTasksPerNode = HiveConf.getIntVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NUM_SCHEDULABLE_TASKS_PER_NODE); long localityDelayMs = HiveConf.getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_LOCALITY_DELAY, TimeUnit.MILLISECONDS); if (localityDelayMs == -1) { this.forceLocation = true; } else { this.forceLocation = false; } int memoryPerExecutor = (int) (memoryPerInstance / (float) executorsPerInstance); int coresPerExecutor = (int) (coresPerInstance / (float) executorsPerInstance); this.resourcePerExecutor = Resource.newInstance(memoryPerExecutor, coresPerExecutor); String instanceId = HiveConf.getTrimmedVar(conf, ConfVars.LLAP_DAEMON_SERVICE_HOSTS); Preconditions.checkNotNull(instanceId, ConfVars.LLAP_DAEMON_SERVICE_HOSTS.varname + " must be defined"); ExecutorService executorServiceRaw = Executors.newFixedThreadPool(1, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapSchedulerNodeEnabler").build()); nodeEnabledExecutor = MoreExecutors.listeningDecorator(executorServiceRaw); ExecutorService schedulerExecutorServiceRaw = Executors.newFixedThreadPool(1, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapScheduler").build()); schedulerExecutor = MoreExecutors.listeningDecorator(schedulerExecutorServiceRaw); LOG.info("Running with configuration: " + "memoryPerInstance=" + memoryPerInstance + ", vCoresPerInstance=" + coresPerInstance + ", executorsPerInstance=" + executorsPerInstance + ", resourcePerInstanceInferred=" + resourcePerExecutor + ", nodeBlacklistConf=" + nodeBlacklistConf + ", forceLocation=" + forceLocation); } @Override public void initialize() { registry.init(conf); } @Override public void start() throws IOException { writeLock.lock(); try { nodeEnablerFuture = nodeEnabledExecutor.submit(nodeEnablerCallable); Futures.addCallback(nodeEnablerFuture, new FutureCallback<Void>() { @Override public void onSuccess(Void result) { LOG.info("NodeEnabledThread exited"); } @Override public void onFailure(Throwable t) { LOG.warn("NodeEnabledThread exited with error", t); } }); schedulerFuture = schedulerExecutor.submit(schedulerCallable); Futures.addCallback(schedulerFuture, new FutureCallback<Void>() { @Override 
public void onSuccess(Void result) { LOG.info("SchedulerThread exited"); } @Override public void onFailure(Throwable t) { LOG.warn("SchedulerThread exited with error", t); } }); registry.start(); activeInstances = registry.getInstances(); for (ServiceInstance inst : activeInstances.getAll().values()) { addNode(inst, new NodeInfo(inst, nodeBlacklistConf, clock, numSchedulableTasksPerNode)); } } finally { writeLock.unlock(); } } @Override public void shutdown() { writeLock.lock(); try { if (!this.isStopped.getAndSet(true)) { nodeEnablerCallable.shutdown(); if (nodeEnablerFuture != null) { nodeEnablerFuture.cancel(true); } nodeEnabledExecutor.shutdownNow(); schedulerCallable.shutdown(); if (schedulerFuture != null) { schedulerFuture.cancel(true); } schedulerExecutor.shutdownNow(); if (registry != null) { registry.stop(); } } } finally { writeLock.unlock(); } } @Override public Resource getTotalResources() { int memory = 0; int vcores = 0; readLock.lock(); try { for (ServiceInstance inst : activeInstances.getAll().values()) { if (inst.isAlive()) { Resource r = inst.getResource(); LOG.info("Found instance " + inst); memory += r.getMemory(); vcores += r.getVirtualCores(); } else { LOG.info("Ignoring dead instance " + inst); } } } finally { readLock.unlock(); } return Resource.newInstance(memory, vcores); } /** * The difference between this and getTotalResources() is that this only gives currently free * resource instances, while the other lists all the instances that may become available in a * while. */ @Override public Resource getAvailableResources() { // need a state store eventually for current state & measure backoffs int memory = 0; int vcores = 0; readLock.lock(); try { for (Entry<ServiceInstance, NodeInfo> entry : instanceToNodeMap.entrySet()) { if (entry.getKey().isAlive() && !entry.getValue().isDisabled()) { Resource r = entry.getKey().getResource(); memory += r.getMemory(); vcores += r.getVirtualCores(); } } } finally { readLock.unlock(); } return Resource.newInstance(memory, vcores); } @Override public int getClusterNodeCount() { readLock.lock(); try { int n = 0; for (ServiceInstance inst : activeInstances.getAll().values()) { if (inst.isAlive()) { n++; } } return n; } finally { readLock.unlock(); } } @Override public void dagComplete() { // This is effectively DAG completed, and can be used to reset statistics being tracked. LOG.info("DAG: " + dagCounter.get() + " completed. Scheduling stats: " + dagStats); dagCounter.incrementAndGet(); dagStats = new StatsPerDag(); } @Override public void blacklistNode(NodeId nodeId) { LOG.info("BlacklistNode not supported"); } @Override public void unblacklistNode(NodeId nodeId) { LOG.info("unBlacklistNode not supported"); } @Override public void allocateTask(Object task, Resource capability, String[] hosts, String[] racks, Priority priority, Object containerSignature, Object clientCookie) { TaskInfo taskInfo = new TaskInfo(task, clientCookie, priority, capability, hosts, racks, clock.getTime()); writeLock.lock(); try { dagStats.registerTaskRequest(hosts, racks); } finally { writeLock.unlock(); } addPendingTask(taskInfo); trySchedulingPendingTasks(); } @Override public void allocateTask(Object task, Resource capability, ContainerId containerId, Priority priority, Object containerSignature, Object clientCookie) { // Container affinity can be implemented as Host affinity for LLAP. Not required until // 1:1 edges are used in Hive. 
TaskInfo taskInfo = new TaskInfo(task, clientCookie, priority, capability, null, null, clock.getTime()); writeLock.lock(); try { dagStats.registerTaskRequest(null, null); } finally { writeLock.unlock(); } addPendingTask(taskInfo); trySchedulingPendingTasks(); } // This may be invoked before a container is ever assigned to a task. allocateTask... app decides // the task is no longer required, and asks for a de-allocation. @Override public boolean deallocateTask(Object task, boolean taskSucceeded, TaskAttemptEndReason endReason, String diagnostics) { writeLock.lock(); // Updating several local structures TaskInfo taskInfo; try { taskInfo = unregisterTask(task); if (taskInfo == null) { LOG.error("Could not determine ContainerId for task: " + task + " . Could have hit a race condition. Ignoring." + " The query may hang since this \"unknown\" container is now taking up a slot permanently"); return false; } if (taskInfo.containerId == null) { if (taskInfo.assigned) { LOG.error("Task: " + task + " assigned, but could not find the corresponding containerId." + " The query may hang since this \"unknown\" container is now taking up a slot permanently"); } else { LOG.info("Ignoring deallocate request for task " + task + " which hasn't been assigned to a container"); removePendingTask(taskInfo); } return false; } ServiceInstance assignedInstance = taskInfo.assignedInstance; assert assignedInstance != null; NodeInfo nodeInfo = instanceToNodeMap.get(assignedInstance); assert nodeInfo != null; // Re-enable the node if preempted if (taskInfo.preempted) { LOG.info("Processing deallocateTask for {} which was preempted, EndReason={}", task, endReason); unregisterPendingPreemption(taskInfo.assignedInstance.getHost()); nodeInfo.registerUnsuccessfulTaskEnd(true); if (nodeInfo.isDisabled()) { // Re-enable the node. If a task succeeded, a slot may have become available. // Also reset commFailures since a task was able to communicate back and indicate success. nodeInfo.enableNode(); // Re-insert into the queue to force the poll thread to remove the element. if (disabledNodesQueue.remove(nodeInfo)) { disabledNodesQueue.add(nodeInfo); } } // In case of success, trigger a scheduling run for pending tasks. trySchedulingPendingTasks(); } else { if (taskSucceeded) { // The node may have been blacklisted at this point - which means it may not be in the // activeNodeList. nodeInfo.registerTaskSuccess(); if (nodeInfo.isDisabled()) { // Re-enable the node. If a task succeeded, a slot may have become available. // Also reset commFailures since a task was able to communicate back and indicate success. nodeInfo.enableNode(); // Re-insert into the queue to force the poll thread to remove the element. if (disabledNodesQueue.remove(nodeInfo)) { disabledNodesQueue.add(nodeInfo); } } // In case of success, trigger a scheduling run for pending tasks. 
trySchedulingPendingTasks(); } else if (!taskSucceeded) { nodeInfo.registerUnsuccessfulTaskEnd(false); if (endReason != null && EnumSet .of(TaskAttemptEndReason.EXECUTOR_BUSY, TaskAttemptEndReason.COMMUNICATION_ERROR) .contains(endReason)) { if (endReason == TaskAttemptEndReason.COMMUNICATION_ERROR) { dagStats.registerCommFailure(taskInfo.assignedInstance.getHost()); } else if (endReason == TaskAttemptEndReason.EXECUTOR_BUSY) { dagStats.registerTaskRejected(taskInfo.assignedInstance.getHost()); } } boolean commFailure = endReason != null && endReason == TaskAttemptEndReason.COMMUNICATION_ERROR; disableInstance(assignedInstance, commFailure); } } } finally { writeLock.unlock(); } getContext().containerBeingReleased(taskInfo.containerId); return true; } @Override public Object deallocateContainer(ContainerId containerId) { LOG.debug("Ignoring deallocateContainer for containerId: " + containerId); // Containers are not being tracked for re-use. // This is safe to ignore since a deallocate task will come in. return null; } @Override public void setShouldUnregister() { } @Override public boolean hasUnregistered() { // Nothing to do. No registration involved. return true; } private ExecutorService createAppCallbackExecutorService() { return Executors.newSingleThreadExecutor( new ThreadFactoryBuilder().setNameFormat("TaskSchedulerAppCaller #%d").setDaemon(true).build()); } /** * @param request the list of preferred hosts. null implies any host * @return */ private SelectHostResult selectHost(TaskInfo request) { String[] requestedHosts = request.requestedHosts; readLock.lock(); // Read-lock. Not updating any stats at the moment. try { // Check if any hosts are active. if (getAvailableResources().getMemory() <= 0) { if (LOG.isDebugEnabled()) { LOG.debug("Refreshing instances since total memory is 0"); } refreshInstances(); } // If there's no memory available, fail if (getTotalResources().getMemory() <= 0) { return SELECT_HOST_RESULT_INADEQUATE_TOTAL_CAPACITY; } if (requestedHosts != null && requestedHosts.length > 0) { int prefHostCount = -1; boolean requestedHostExists = false; for (String host : requestedHosts) { prefHostCount++; // Pick the first host always. Weak attempt at cache affinity. Set<ServiceInstance> instances = activeInstances.getByHost(host); if (!instances.isEmpty()) { requestedHostExists = true; for (ServiceInstance inst : instances) { NodeInfo nodeInfo = instanceToNodeMap.get(inst); if (nodeInfo != null && nodeInfo.canAcceptTask()) { LOG.info("Assigning " + inst + " when looking for " + host + "." + " FirstRequestedHost=" + (prefHostCount == 0) + (requestedHosts.length > 1 ? "#prefLocations=" + requestedHosts.length : "")); return new SelectHostResult(inst, nodeInfo); } } } } // Check if forcing the location is required. 
if (forceLocation) { if (requestedHostExists) { if (LOG.isDebugEnabled()) { LOG.debug("Skipping non-local location allocation for [" + request.task + "] when trying to allocate on [" + Arrays.toString(requestedHosts) + "]"); } return SELECT_HOST_RESULT_DELAYED_LOCALITY; } else { if (LOG.isDebugEnabled()) { LOG.debug("Not skipping non-local location allocation for [" + request.task + "] when trying to allocate on [" + Arrays.toString(requestedHosts) + "] since none of these hosts are part of the known list"); } } } } /* fall through - miss in locality (random scheduling) */ Entry<ServiceInstance, NodeInfo>[] all = instanceToNodeMap.entrySet() .toArray(new Entry[instanceToNodeMap.size()]); // Check again if (all.length > 0) { int n = random.nextInt(all.length); // start at random offset and iterate whole list for (int i = 0; i < all.length; i++) { Entry<ServiceInstance, NodeInfo> inst = all[(i + n) % all.length]; if (inst.getValue().canAcceptTask()) { LOG.info("Assigning " + inst + " when looking for any host, from #hosts=" + all.length + ", requestedHosts=" + ((requestedHosts == null || requestedHosts.length == 0) ? "null" : Arrays.toString(requestedHosts))); return new SelectHostResult(inst.getKey(), inst.getValue()); } } } return SELECT_HOST_RESULT_DELAYED_RESOURCES; } finally { readLock.unlock(); } } // TODO Each refresh operation should addNodes if they don't already exist. // Even better would be to get notifications from the service impl when a node gets added or removed. // Instead of having to walk through the entire list. The computation of a node getting added or // removed already exists in the DynamicRegistry implementation. private void refreshInstances() { try { activeInstances.refresh(); // handles its own sync } catch (IOException ioe) { LOG.warn("Could not refresh list of active instances", ioe); } } private void scanForNodeChanges() { /* check again whether nodes are disabled or just missing */ writeLock.lock(); try { for (ServiceInstance inst : activeInstances.getAll().values()) { if (inst.isAlive() && instanceToNodeMap.containsKey(inst) == false) { /* that's a good node, not added to the allocations yet */ LOG.info("Found a new node: " + inst + "."); addNode(inst, new NodeInfo(inst, nodeBlacklistConf, clock, numSchedulableTasksPerNode)); } } } finally { writeLock.unlock(); } } private void addNode(ServiceInstance inst, NodeInfo node) { LOG.info("Adding node: " + inst); instanceToNodeMap.put(inst, node); // Trigger scheduling since a new node became available. trySchedulingPendingTasks(); } private void reenableDisabledNode(NodeInfo nodeInfo) { writeLock.lock(); try { if (nodeInfo.hadCommFailure()) { // If the node being re-enabled was not marked busy previously, then it was disabled due to // some other failure. Refresh the service list to see if it's been removed permanently. refreshInstances(); } LOG.info("Attempting to re-enable node: " + nodeInfo.getServiceInstance()); if (nodeInfo.getServiceInstance().isAlive()) { nodeInfo.enableNode(); } else { if (LOG.isInfoEnabled()) { LOG.info("Removing dead node " + nodeInfo); } } } finally { writeLock.unlock(); } } private void disableInstance(ServiceInstance instance, boolean isCommFailure) { writeLock.lock(); try { NodeInfo nodeInfo = instanceToNodeMap.get(instance); if (nodeInfo == null || nodeInfo.isDisabled()) { if (LOG.isDebugEnabled()) { LOG.debug("Node: " + instance + " already disabled, or invalid. 
Not doing anything."); } } else { nodeInfo.disableNode(isCommFailure); // TODO: handle task to container map events in case of hard failures disabledNodesQueue.add(nodeInfo); } } finally { writeLock.unlock(); } } private void addPendingTask(TaskInfo taskInfo) { writeLock.lock(); try { List<TaskInfo> tasksAtPriority = pendingTasks.get(taskInfo.priority); if (tasksAtPriority == null) { tasksAtPriority = new LinkedList<>(); pendingTasks.put(taskInfo.priority, tasksAtPriority); } tasksAtPriority.add(taskInfo); knownTasks.putIfAbsent(taskInfo.task, taskInfo); } finally { writeLock.unlock(); } } /* Remove a task from the pending list */ private void removePendingTask(TaskInfo taskInfo) { writeLock.lock(); try { Priority priority = taskInfo.priority; List<TaskInfo> taskInfoList = pendingTasks.get(priority); if (taskInfoList == null || taskInfoList.isEmpty() || !taskInfoList.remove(taskInfo)) { LOG.warn("Could not find task: " + taskInfo.task + " in pending list, at priority: " + priority); } } finally { writeLock.unlock(); } } /* Register a running task into the runningTasks structure */ private void registerRunningTask(TaskInfo taskInfo) { writeLock.lock(); try { int priority = taskInfo.priority.getPriority(); TreeSet<TaskInfo> tasksAtpriority = runningTasks.get(priority); if (tasksAtpriority == null) { tasksAtpriority = new TreeSet<>(TASK_INFO_COMPARATOR); runningTasks.put(priority, tasksAtpriority); } tasksAtpriority.add(taskInfo); } finally { writeLock.unlock(); } } /* Unregister a task from the known and running structures */ private TaskInfo unregisterTask(Object task) { writeLock.lock(); try { TaskInfo taskInfo = knownTasks.remove(task); if (taskInfo != null) { if (taskInfo.assigned) { // Remove from the running list. int priority = taskInfo.priority.getPriority(); Set<TaskInfo> tasksAtPriority = runningTasks.get(priority); Preconditions.checkState(tasksAtPriority != null, "runningTasks should contain an entry if the task was in running state. Caused by task: {}", task); tasksAtPriority.remove(taskInfo); if (tasksAtPriority.isEmpty()) { runningTasks.remove(priority); } } } else { LOG.warn("Could not find TaskInfo for task: {}. Not removing it from the running set", task); } return taskInfo; } finally { writeLock.unlock(); } } private enum ScheduleResult { // Successfully scheduled SCHEDULED, // Delayed to find a local match DELAYED_LOCALITY, // Delayed due to temporary resource availability DELAYED_RESOURCES, // Inadequate total resources - will never succeed / wait for new executors to become available INADEQUATE_TOTAL_RESOURCES, } @VisibleForTesting protected void schedulePendingTasks() { writeLock.lock(); try { Iterator<Entry<Priority, List<TaskInfo>>> pendingIterator = pendingTasks.entrySet().iterator(); while (pendingIterator.hasNext()) { Entry<Priority, List<TaskInfo>> entry = pendingIterator.next(); List<TaskInfo> taskListAtPriority = entry.getValue(); Iterator<TaskInfo> taskIter = taskListAtPriority.iterator(); boolean scheduledAllAtPriority = true; while (taskIter.hasNext()) { // TODO Optimization: Add a check to see if there's any capacity available. No point in // walking through all active nodes, if they don't have potential capacity. 
TaskInfo taskInfo = taskIter.next(); if (taskInfo.getNumPreviousAssignAttempts() == 1) { dagStats.registerDelayedAllocation(); } taskInfo.triedAssigningTask(); ScheduleResult scheduleResult = scheduleTask(taskInfo); if (scheduleResult == ScheduleResult.SCHEDULED) { taskIter.remove(); } else { // TODO Handle INADEQUATE_TOTAL_RESOURCES eventually - either by throwin an error immediately, // or waiting for some timeout for new executors and then throwing an error // Try pre-empting a task so that a higher priority task can take it's place. // Preempt only if there's no pending preemptions to avoid preempting twice for a task. String[] potentialHosts; if (scheduleResult == ScheduleResult.DELAYED_LOCALITY) { // preempt only on specific hosts, if no preemptions already exist on those. potentialHosts = taskInfo.requestedHosts; //Protect against a bad location being requested. if (potentialHosts == null || potentialHosts.length == 0) { potentialHosts = null; } } else { // preempt on any host. potentialHosts = null; } if (potentialHosts != null) { // Preempt on specific host boolean shouldPreempt = true; for (String host : potentialHosts) { // Preempt only if there are not pending preemptions on the same host // When the premption registers, the request at the highest priority will be given the slot, // even if the initial request was for some other task. // TODO Maybe register which task the preemption was for, to avoid a bad non-local allocation. MutableInt pendingHostPreemptions = pendingPreemptionsPerHost.get(host); if (pendingHostPreemptions != null && pendingHostPreemptions.intValue() > 0) { shouldPreempt = false; break; } } if (shouldPreempt) { LOG.info("Attempting to preempt for {}, pendingPreemptions={} on hosts={}", taskInfo.task, pendingPreemptions.get(), Arrays.toString(potentialHosts)); preemptTasks(entry.getKey().getPriority(), 1, potentialHosts); } } else { // Request for a preemption if there's none pending. If a single preemption is pending, // and this is the next task to be assigned, it will be assigned once that slot becomes available. if (pendingPreemptions.get() == 0) { LOG.info("Attempting to preempt for {}, pendingPreemptions={} on any host", taskInfo.task, pendingPreemptions.get()); preemptTasks(entry.getKey().getPriority(), 1, null); } } // Since there was an allocation failure - don't try assigning tasks at the next priority. scheduledAllAtPriority = false; // Don't break if this allocation failure was a result of a LOCALITY_DELAY. Others could still be allocated. if (scheduleResult != ScheduleResult.DELAYED_LOCALITY) { break; } } // end of else - i.e. 
could not allocate } // end of loop over pending tasks if (taskListAtPriority.isEmpty()) { // Remove the entry, if there's nothing left at the specific priority level pendingIterator.remove(); } if (!scheduledAllAtPriority) { // Don't attempt scheduling for additional priorities break; } } } finally { writeLock.unlock(); } } private ScheduleResult scheduleTask(TaskInfo taskInfo) { SelectHostResult selectHostResult = selectHost(taskInfo); if (selectHostResult.scheduleResult == ScheduleResult.SCHEDULED) { NodeServiceInstancePair nsPair = selectHostResult.nodeServiceInstancePair; Container container = containerFactory.createContainer(resourcePerExecutor, taskInfo.priority, nsPair.getServiceInstance().getHost(), nsPair.getServiceInstance().getRpcPort()); writeLock.lock(); // While updating local structures try { LOG.info("Assigned task {} to container {}", taskInfo, container.getId()); dagStats.registerTaskAllocated(taskInfo.requestedHosts, taskInfo.requestedRacks, nsPair.getServiceInstance().getHost()); taskInfo.setAssignmentInfo(nsPair.getServiceInstance(), container.getId(), clock.getTime()); registerRunningTask(taskInfo); nsPair.getNodeInfo().registerTaskScheduled(); } finally { writeLock.unlock(); } getContext().taskAllocated(taskInfo.task, taskInfo.clientCookie, container); } return selectHostResult.scheduleResult; } // Removes tasks from the runningList and sends out a preempt request to the system. // Subsequent tasks will be scheduled again once the de-allocate request for the preempted // task is processed. private void preemptTasks(int forPriority, int numTasksToPreempt, String[] potentialHosts) { Set<String> preemptHosts; if (potentialHosts == null) { preemptHosts = null; } else { preemptHosts = Sets.newHashSet(potentialHosts); } writeLock.lock(); List<TaskInfo> preemptedTaskList = null; try { NavigableMap<Integer, TreeSet<TaskInfo>> orderedMap = runningTasks.descendingMap(); Iterator<Entry<Integer, TreeSet<TaskInfo>>> iterator = orderedMap.entrySet().iterator(); int preemptedCount = 0; while (iterator.hasNext() && preemptedCount < numTasksToPreempt) { Entry<Integer, TreeSet<TaskInfo>> entryAtPriority = iterator.next(); if (entryAtPriority.getKey() > forPriority) { Iterator<TaskInfo> taskInfoIterator = entryAtPriority.getValue().iterator(); while (taskInfoIterator.hasNext() && preemptedCount < numTasksToPreempt) { TaskInfo taskInfo = taskInfoIterator.next(); if (preemptHosts == null || preemptHosts.contains(taskInfo.assignedInstance.getHost())) { // Candidate for preemption. preemptedCount++; LOG.info("preempting {} for task at priority {} with potentialHosts={}", taskInfo, forPriority, potentialHosts == null ? "" : Arrays.toString(potentialHosts)); taskInfo.setPreemptedInfo(clock.getTime()); if (preemptedTaskList == null) { preemptedTaskList = new LinkedList<>(); } dagStats.registerTaskPreempted(taskInfo.assignedInstance.getHost()); preemptedTaskList.add(taskInfo); registerPendingPreemption(taskInfo.assignedInstance.getHost()); // Remove from the runningTaskList taskInfoIterator.remove(); } } // Remove entire priority level if it's been emptied. if (entryAtPriority.getValue().isEmpty()) { iterator.remove(); } } else { // No tasks qualify as preemptable LOG.info("DBG: No tasks qualify as killable to schedule tasks at priority {}", forPriority); break; } } } finally { writeLock.unlock(); } // Send out the preempted request outside of the lock. 
    if (preemptedTaskList != null) {
      for (TaskInfo taskInfo : preemptedTaskList) {
        LOG.info("DBG: Preempting task {}", taskInfo);
        getContext().preemptContainer(taskInfo.containerId);
        // Preemption will finally be registered as a deallocateTask as a result of preemptContainer
        // That resets preemption info and allows additional tasks to be pre-empted if required.
      }
    }
    // The schedule loop will be triggered again when the deallocateTask request comes in for the
    // preempted task.
  }

  private void registerPendingPreemption(String host) {
    writeLock.lock();
    try {
      pendingPreemptions.incrementAndGet();
      MutableInt val = pendingPreemptionsPerHost.get(host);
      if (val == null) {
        // Start at 0 and increment below, so the first preemption on a host is counted once.
        val = new MutableInt(0);
        pendingPreemptionsPerHost.put(host, val);
      }
      val.increment();
    } finally {
      writeLock.unlock();
    }
  }

  private void unregisterPendingPreemption(String host) {
    writeLock.lock();
    try {
      pendingPreemptions.decrementAndGet();
      MutableInt val = pendingPreemptionsPerHost.get(host);
      Preconditions.checkNotNull(val);
      val.decrement();
      // Not bothering with removing the entry. There's a limited number of hosts, and a good
      // chance that the entry will make it back in when the AM is used for a long duration.
    } finally {
      writeLock.unlock();
    }
  }

  private class NodeEnablerCallable implements Callable<Void> {

    private AtomicBoolean isShutdown = new AtomicBoolean(false);
    private static final long REFRESH_INTERVAL = 10000L;
    long nextPollInterval = REFRESH_INTERVAL;
    long lastRefreshTime;

    @Override
    public Void call() {
      lastRefreshTime = System.currentTimeMillis();
      while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) {
        try {
          while (true) {
            NodeInfo nodeInfo = disabledNodesQueue.poll(nextPollInterval, TimeUnit.MILLISECONDS);
            if (nodeInfo != null) {
              long currentTime = System.currentTimeMillis();
              // A node became available. Enable the node and try scheduling.
              reenableDisabledNode(nodeInfo);
              trySchedulingPendingTasks();
              nextPollInterval -= (currentTime - lastRefreshTime);
            }
            if (nextPollInterval < 0 || nodeInfo == null) {
              // timeout expired. Reset the poll interval and refresh nodes.
              nextPollInterval = REFRESH_INTERVAL;
              lastRefreshTime = System.currentTimeMillis();
              // TODO Get rid of this polling once we have notifications from the registry sub-system
              if (LOG.isDebugEnabled()) {
                LOG.debug("Refreshing instances based on poll interval");
              }
              refreshInstances();
              scanForNodeChanges();
            }
          }
        } catch (InterruptedException e) {
          if (isShutdown.get()) {
            LOG.info("NodeEnabler thread interrupted after shutdown");
            break;
          } else {
            LOG.warn("NodeEnabler thread interrupted without being shutdown");
            throw new RuntimeException("NodeEnabler thread interrupted without being shutdown", e);
          }
        }
      }
      return null;
    }

    // Call this first, then send in an interrupt to the thread.
public void shutdown() { isShutdown.set(true); } } private void trySchedulingPendingTasks() { scheduleLock.lock(); try { pendingScheduleInvodations.set(true); scheduleCondition.signal(); } finally { scheduleLock.unlock(); } } private class SchedulerCallable implements Callable<Void> { private AtomicBoolean isShutdown = new AtomicBoolean(false); @Override public Void call() { while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) { scheduleLock.lock(); try { while (!pendingScheduleInvodations.get()) { scheduleCondition.await(); } } catch (InterruptedException e) { if (isShutdown.get()) { LOG.info("Scheduler thread interrupted after shutdown"); break; } else { LOG.warn("Scheduler thread interrupted without being shutdown"); throw new RuntimeException("Scheduler thread interrupted without being shutdown", e); } } finally { scheduleLock.unlock(); } // Set pending to false since scheduling is about to run. Any triggers up to this point // will be handled in the next run. // A new request may come in right after this is set to false, but before the actual scheduling. // This will be handled in this run, but will cause an immediate run after, which is harmless. pendingScheduleInvodations.set(false); // Schedule outside of the scheduleLock - which should only be used to wait on the condition. schedulePendingTasks(); } return null; } // Call this first, then send in an interrupt to the thread. public void shutdown() { isShutdown.set(true); } } @VisibleForTesting static class NodeInfo implements Delayed { private final NodeBlacklistConf blacklistConf; final ServiceInstance serviceInstance; private final Clock clock; long expireTimeMillis = -1; private long numSuccessfulTasks = 0; private long numSuccessfulTasksAtLastBlacklist = -1; float cumulativeBackoffFactor = 1.0f; // Indicates whether a node had a recent communication failure. private boolean hadCommFailure = false; // Indicates whether a node is disabled - for whatever reason - commFailure, busy, etc. private boolean disabled = false; private int numPreemptedTasks = 0; private int numScheduledTasks = 0; private final int numSchedulableTasks; /** * Create a NodeInfo bound to a service instance * * @param serviceInstance the associated serviceInstance * @param blacklistConf blacklist configuration * @param clock clock to use to obtain timing information * @param numSchedulableTasksConf number of schedulable tasks on the node. 
0 represents auto * detect based on the serviceInstance, -1 indicates indicates * unlimited capacity */ NodeInfo(ServiceInstance serviceInstance, NodeBlacklistConf blacklistConf, Clock clock, int numSchedulableTasksConf) { Preconditions.checkArgument(numSchedulableTasksConf >= -1, "NumSchedulableTasks must be >=-1"); this.serviceInstance = serviceInstance; this.blacklistConf = blacklistConf; this.clock = clock; if (numSchedulableTasksConf == 0) { int pendingQueueuCapacity = 0; String pendingQueueCapacityString = serviceInstance.getProperties() .get(ConfVars.LLAP_DAEMON_TASK_SCHEDULER_WAIT_QUEUE_SIZE.varname); if (LOG.isDebugEnabled()) { LOG.debug("Setting up node: " + serviceInstance + ", with available capacity=" + serviceInstance.getResource().getVirtualCores() + ", pendingQueueCapacity=" + pendingQueueCapacityString); } if (pendingQueueCapacityString != null) { pendingQueueuCapacity = Integer.parseInt(pendingQueueCapacityString); } this.numSchedulableTasks = serviceInstance.getResource().getVirtualCores() + pendingQueueuCapacity; } else { this.numSchedulableTasks = numSchedulableTasksConf; } LOG.info("Setting up node: " + serviceInstance + " with schedulableCapacity=" + this.numSchedulableTasks); } ServiceInstance getServiceInstance() { return serviceInstance; } void enableNode() { expireTimeMillis = -1; disabled = false; hadCommFailure = false; } void disableNode(boolean commFailure) { long duration = blacklistConf.minDelay; long currentTime = clock.getTime(); this.hadCommFailure = commFailure; disabled = true; if (numSuccessfulTasksAtLastBlacklist == numSuccessfulTasks) { // Relying on a task succeeding to reset the exponent. // There's no notifications on whether a task gets accepted or not. That would be ideal to // reset this. cumulativeBackoffFactor = cumulativeBackoffFactor * blacklistConf.backoffFactor; } else { // Was able to execute something before the last blacklist. Reset the exponent. cumulativeBackoffFactor = 1.0f; } long delayTime = (long) (duration * cumulativeBackoffFactor); if (delayTime > blacklistConf.maxDelay) { delayTime = blacklistConf.maxDelay; } if (LOG.isInfoEnabled()) { LOG.info("Disabling instance " + serviceInstance + " for " + delayTime + " milli-seconds"); } expireTimeMillis = currentTime + delayTime; numSuccessfulTasksAtLastBlacklist = numSuccessfulTasks; } void registerTaskScheduled() { numScheduledTasks++; } void registerTaskSuccess() { numSuccessfulTasks++; numScheduledTasks--; } void registerUnsuccessfulTaskEnd(boolean wasPreempted) { numScheduledTasks--; if (wasPreempted) { numPreemptedTasks++; } } public boolean isDisabled() { return disabled; } public boolean hadCommFailure() { return hadCommFailure; } /* Returning true does not guarantee that the task will run, considering other queries may be running in the system. 
Also depends upon the capacity usage configuration */ public boolean canAcceptTask() { boolean result = !hadCommFailure && !disabled && serviceInstance.isAlive() && (numSchedulableTasks == -1 || ((numSchedulableTasks - numScheduledTasks) > 0)); LOG.info( "canAcceptTask={}, numScheduledTasks={}, numSchedulableTasks={}, hadCommFailure={}, disabled={}, serviceInstance.isAlive={}", result, numScheduledTasks, numSchedulableTasks, hadCommFailure, disabled, serviceInstance.isAlive()); return result; } @Override public long getDelay(TimeUnit unit) { return unit.convert(expireTimeMillis - clock.getTime(), TimeUnit.MILLISECONDS); } @Override public int compareTo(Delayed o) { NodeInfo other = (NodeInfo) o; if (other.expireTimeMillis > this.expireTimeMillis) { return -1; } else if (other.expireTimeMillis < this.expireTimeMillis) { return 1; } else { return 0; } } @Override public String toString() { return "NodeInfo{" + "instance=" + serviceInstance + ", expireTimeMillis=" + expireTimeMillis + ", numSuccessfulTasks=" + numSuccessfulTasks + ", numSuccessfulTasksAtLastBlacklist=" + numSuccessfulTasksAtLastBlacklist + ", cumulativeBackoffFactor=" + cumulativeBackoffFactor + ", numSchedulableTasks=" + numSchedulableTasks + ", numScheduledTasks=" + numScheduledTasks + ", disabled=" + disabled + ", commFailures=" + hadCommFailure + '}'; } } @VisibleForTesting static class StatsPerDag { int numRequestedAllocations = 0; int numRequestsWithLocation = 0; int numRequestsWithoutLocation = 0; int numTotalAllocations = 0; int numLocalAllocations = 0; int numNonLocalAllocations = 0; int numAllocationsNoLocalityRequest = 0; int numRejectedTasks = 0; int numCommFailures = 0; int numDelayedAllocations = 0; int numPreemptedTasks = 0; Map<String, AtomicInteger> localityBasedNumAllocationsPerHost = new HashMap<>(); Map<String, AtomicInteger> numAllocationsPerHost = new HashMap<>(); @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("NumPreemptedTasks=").append(numPreemptedTasks).append(", "); sb.append("NumRequestedAllocations=").append(numRequestedAllocations).append(", "); sb.append("NumRequestsWithlocation=").append(numRequestsWithLocation).append(", "); sb.append("NumLocalAllocations=").append(numLocalAllocations).append(","); sb.append("NumNonLocalAllocations=").append(numNonLocalAllocations).append(","); sb.append("NumTotalAllocations=").append(numTotalAllocations).append(","); sb.append("NumRequestsWithoutLocation=").append(numRequestsWithoutLocation).append(", "); sb.append("NumRejectedTasks=").append(numRejectedTasks).append(", "); sb.append("NumCommFailures=").append(numCommFailures).append(", "); sb.append("NumDelayedAllocations=").append(numDelayedAllocations).append(", "); sb.append("LocalityBasedAllocationsPerHost=").append(localityBasedNumAllocationsPerHost).append(", "); sb.append("NumAllocationsPerHost=").append(numAllocationsPerHost); return sb.toString(); } void registerTaskRequest(String[] requestedHosts, String[] requestedRacks) { numRequestedAllocations++; // TODO Change after HIVE-9987. For now, there's no rack matching. if (requestedHosts != null && requestedHosts.length != 0) { numRequestsWithLocation++; } else { numRequestsWithoutLocation++; } } void registerTaskAllocated(String[] requestedHosts, String[] requestedRacks, String allocatedHost) { // TODO Change after HIVE-9987. For now, there's no rack matching. 
if (requestedHosts != null && requestedHosts.length != 0) { Set<String> requestedHostSet = new HashSet<>(Arrays.asList(requestedHosts)); if (requestedHostSet.contains(allocatedHost)) { numLocalAllocations++; _registerAllocationInHostMap(allocatedHost, localityBasedNumAllocationsPerHost); } else { numNonLocalAllocations++; } } else { numAllocationsNoLocalityRequest++; } numTotalAllocations++; _registerAllocationInHostMap(allocatedHost, numAllocationsPerHost); } void registerTaskPreempted(String host) { numPreemptedTasks++; } void registerCommFailure(String host) { numCommFailures++; } void registerTaskRejected(String host) { numRejectedTasks++; } void registerDelayedAllocation() { numDelayedAllocations++; } private void _registerAllocationInHostMap(String host, Map<String, AtomicInteger> hostMap) { AtomicInteger val = hostMap.get(host); if (val == null) { val = new AtomicInteger(0); hostMap.put(host, val); } val.incrementAndGet(); } } private static class TaskInfo { // IDs used to ensure two TaskInfos are different without using the underlying task instance. // Required for insertion into a TreeMap static final AtomicLong ID_GEN = new AtomicLong(0); final long uniqueId; final Object task; final Object clientCookie; final Priority priority; final Resource capability; final String[] requestedHosts; final String[] requestedRacks; final long requestTime; long startTime; long preemptTime; ContainerId containerId; ServiceInstance assignedInstance; private boolean assigned = false; private boolean preempted = false; private int numAssignAttempts = 0; // TaskInfo instances for two different tasks will not be the same. Only a single instance should // ever be created for a taskAttempt public TaskInfo(Object task, Object clientCookie, Priority priority, Resource capability, String[] hosts, String[] racks, long requestTime) { this.task = task; this.clientCookie = clientCookie; this.priority = priority; this.capability = capability; this.requestedHosts = hosts; this.requestedRacks = racks; this.requestTime = requestTime; this.uniqueId = ID_GEN.getAndIncrement(); } void setAssignmentInfo(ServiceInstance instance, ContainerId containerId, long startTime) { this.assignedInstance = instance; this.containerId = containerId; this.startTime = startTime; assigned = true; } void setPreemptedInfo(long preemptTime) { this.preempted = true; this.assigned = false; this.preemptTime = preemptTime; } void triedAssigningTask() { numAssignAttempts++; } int getNumPreviousAssignAttempts() { return numAssignAttempts; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } TaskInfo taskInfo = (TaskInfo) o; if (uniqueId != taskInfo.uniqueId) { return false; } return task.equals(taskInfo.task); } @Override public int hashCode() { int result = (int) (uniqueId ^ (uniqueId >>> 32)); result = 31 * result + task.hashCode(); return result; } @Override public String toString() { return "TaskInfo{" + "task=" + task + ", priority=" + priority + ", startTime=" + startTime + ", containerId=" + containerId + ", assignedInstance=" + assignedInstance + ", uniqueId=" + uniqueId + '}'; } } // Newer tasks first. 
private static class TaskStartComparator implements Comparator<TaskInfo> { @Override public int compare(TaskInfo o1, TaskInfo o2) { if (o1.startTime > o2.startTime) { return -1; } else if (o1.startTime < o2.startTime) { return 1; } else { // Comparing on time is not sufficient since two may be created at the same time, // in which case inserting into a TreeSet/Map would break if (o1.uniqueId > o2.uniqueId) { return -1; } else if (o1.uniqueId < o2.uniqueId) { return 1; } else { return 0; } } } } private static class SelectHostResult { final NodeServiceInstancePair nodeServiceInstancePair; final ScheduleResult scheduleResult; SelectHostResult(ServiceInstance serviceInstance, NodeInfo nodeInfo) { this.nodeServiceInstancePair = new NodeServiceInstancePair(serviceInstance, nodeInfo); this.scheduleResult = ScheduleResult.SCHEDULED; } SelectHostResult(ScheduleResult scheduleResult) { this.nodeServiceInstancePair = null; this.scheduleResult = scheduleResult; } } private static final SelectHostResult SELECT_HOST_RESULT_INADEQUATE_TOTAL_CAPACITY = new SelectHostResult( ScheduleResult.INADEQUATE_TOTAL_RESOURCES); private static final SelectHostResult SELECT_HOST_RESULT_DELAYED_LOCALITY = new SelectHostResult( ScheduleResult.DELAYED_LOCALITY); private static final SelectHostResult SELECT_HOST_RESULT_DELAYED_RESOURCES = new SelectHostResult( ScheduleResult.DELAYED_RESOURCES); private static class NodeServiceInstancePair { final NodeInfo nodeInfo; final ServiceInstance serviceInstance; private NodeServiceInstancePair(ServiceInstance serviceInstance, NodeInfo nodeInfo) { this.serviceInstance = serviceInstance; this.nodeInfo = nodeInfo; } public ServiceInstance getServiceInstance() { return serviceInstance; } public NodeInfo getNodeInfo() { return nodeInfo; } } private static final class NodeBlacklistConf { private final long minDelay; private final long maxDelay; private final float backoffFactor; public NodeBlacklistConf(long minDelay, long maxDelay, float backoffFactor) { this.minDelay = minDelay; this.maxDelay = maxDelay; this.backoffFactor = backoffFactor; } @Override public String toString() { return "NodeBlacklistConf{" + "minDelay=" + minDelay + ", maxDelay=" + maxDelay + ", backoffFactor=" + backoffFactor + '}'; } } }
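A few of the mechanisms in the listing are easier to see in isolation. First, the node blacklisting: NodeInfo.disableNode() derives the re-enable delay from NodeBlacklistConf by multiplying the minimum delay by the back-off factor on every consecutive disable that happens without an intervening task success, and capping the result at the maximum delay. Below is a minimal standalone sketch of that arithmetic; the class name and the configuration values are made up for illustration (the real values come from the LLAP_TASK_SCHEDULER_NODE_REENABLE_MIN/MAX_TIMEOUT_MS and LLAP_TASK_SCHEDULER_NODE_DISABLE_BACK_OFF_FACTOR settings read in the constructor).

// Sketch only (not part of Hive): the re-enable delay computed by NodeInfo.disableNode().
public class NodeBackoffSketch {
  public static void main(String[] args) {
    long minDelayMs = 200;       // hypothetical node re-enable minimum timeout
    long maxDelayMs = 10_000;    // hypothetical node re-enable maximum timeout
    float backoffFactor = 1.5f;  // hypothetical disable back-off factor

    float cumulativeBackoffFactor = 1.0f; // reset to 1.0f whenever a task succeeds on the node
    for (int disableCount = 1; disableCount <= 8; disableCount++) {
      long delayMs = Math.min((long) (minDelayMs * cumulativeBackoffFactor), maxDelayMs);
      System.out.println("disable #" + disableCount + " -> node re-enabled after " + delayMs + " ms");
      cumulativeBackoffFactor *= backoffFactor; // grows only while the node keeps getting disabled
    }
  }
}

With these made-up numbers the delay sequence would be 200, 300, 450, 675 ms and so on, until it reaches the 10 second cap.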
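Second, the disabled-node queue: disabledNodesQueue is a java.util.concurrent.DelayQueue and NodeInfo implements Delayed, with getDelay() counting down to expireTimeMillis, so NodeEnablerCallable only receives a node from the queue once its disable timeout has expired. A self-contained sketch of that pattern, using an illustrative DisabledNode class in place of the real NodeInfo:

import java.util.concurrent.DelayQueue;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;

// Sketch only: the DelayQueue pattern behind disabledNodesQueue / NodeInfo.
public class DelayQueueSketch {
  static class DisabledNode implements Delayed {
    final String host;
    final long expireTimeMillis;

    DisabledNode(String host, long delayMillis) {
      this.host = host;
      this.expireTimeMillis = System.currentTimeMillis() + delayMillis;
    }

    @Override
    public long getDelay(TimeUnit unit) {
      // Same shape as NodeInfo.getDelay(): remaining time until the node may be re-enabled.
      return unit.convert(expireTimeMillis - System.currentTimeMillis(), TimeUnit.MILLISECONDS);
    }

    @Override
    public int compareTo(Delayed o) {
      return Long.compare(this.expireTimeMillis, ((DisabledNode) o).expireTimeMillis);
    }
  }

  public static void main(String[] args) throws InterruptedException {
    DelayQueue<DisabledNode> disabledNodes = new DelayQueue<>();
    disabledNodes.add(new DisabledNode("host-2", 500));
    disabledNodes.add(new DisabledNode("host-1", 100));

    // take() only returns an element once its delay has expired, so nodes come back
    // in re-enable order, shortest timeout first.
    while (!disabledNodes.isEmpty()) {
      DisabledNode node = disabledNodes.take();
      System.out.println("re-enabling " + node.host);
    }
  }
}

In the real scheduler, a node that is re-enabled early (for example after a task success in deallocateTask) is removed and re-inserted into the queue so the polling thread observes the change.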
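Third, the scheduler wake-up: trySchedulingPendingTasks() and SchedulerCallable coordinate through a ReentrantLock, a Condition and an AtomicBoolean flag, so any number of triggers that arrive while a scheduling pass is in flight collapse into at most one extra pass. A stripped-down sketch of that hand-off, with illustrative names:

import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

// Sketch only: the coalescing wake-up used by trySchedulingPendingTasks()/SchedulerCallable.
public class ScheduleTriggerSketch {
  private final Lock scheduleLock = new ReentrantLock();
  private final Condition scheduleCondition = scheduleLock.newCondition();
  private final AtomicBoolean pendingInvocations = new AtomicBoolean(false);

  // Called from any thread whenever something changes (new task, node re-enabled, ...).
  void trigger() {
    scheduleLock.lock();
    try {
      pendingInvocations.set(true);
      scheduleCondition.signal();
    } finally {
      scheduleLock.unlock();
    }
  }

  // Body of the dedicated scheduler thread.
  void runLoop() throws InterruptedException {
    while (!Thread.currentThread().isInterrupted()) {
      scheduleLock.lock();
      try {
        while (!pendingInvocations.get()) {
          scheduleCondition.await();
        }
      } finally {
        scheduleLock.unlock();
      }
      // Clear before scheduling: triggers arriving after this point cause one more pass.
      pendingInvocations.set(false);
      schedulePass();
    }
  }

  void schedulePass() {
    System.out.println("scheduling pending tasks");
  }
}

Clearing the flag after the wait but before the pass is what makes duplicate triggers harmless: at worst the loop runs one extra, empty pass.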