org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.FifoIntraQueuePreemptionPlugin.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.FifoIntraQueuePreemptionPlugin.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;

import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.IntraQueueCandidatesSelector.TAPriorityComparator;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
import org.apache.hadoop.yarn.util.resource.Resources;

/**
 * FifoIntraQueuePreemptionPlugin will handle intra-queue preemption for
 * priority and user-limit.
 */
public class FifoIntraQueuePreemptionPlugin implements IntraQueuePreemptionComputePlugin {

    protected final CapacitySchedulerPreemptionContext context;
    protected final ResourceCalculator rc;

    private static final Log LOG = LogFactory.getLog(FifoIntraQueuePreemptionPlugin.class);

    /**
     * Creates the plugin with the collaborators it needs for its calculations.
     *
     * @param rc resource calculator passed to every {@code Resources} comparison made by this plugin
     * @param preemptionContext preemption context; used to look up a queue's per-partition snapshot
     */
    public FifoIntraQueuePreemptionPlugin(ResourceCalculator rc,
            CapacitySchedulerPreemptionContext preemptionContext) {
        this.rc = rc;
        this.context = preemptionContext;
    }

    /**
     * Adds up {@code getActuallyToBePreempted()} over all temp apps stored on the queue's snapshot for the
     * given partition.
     *
     * @param queueName name of the queue to look up in the preemption context
     * @param partition partition (node label) of interest
     * @return a new map holding a single entry: {@code partition} mapped to the summed resource
     */
    @Override
    public Map<String, Resource> getResourceDemandFromAppsPerQueue(String queueName, String partition) {
        TempQueuePerPartition tq = context.getQueueByPartition(queueName, partition);

        // Accumulate what every app of this queue is actually going to give up.
        Resource totalToPreempt = Resources.createResource(0, 0);
        for (TempAppPerPartition tmpApp : tq.getApps()) {
            Resources.addTo(totalToPreempt, tmpApp.getActuallyToBePreempted());
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Selected to preempt " + totalToPreempt + " resource from partition:" + partition);
        }

        Map<String, Resource> resToObtainByPartition = new HashMap<>();
        resToObtainByPartition.put(partition, totalToPreempt);
        return resToObtainByPartition;
    }

    /**
     * Computes the ideal allocation and the to-be-preempted resource for the applications of
     * {@code tq.leafQueue} within {@code tq.partition}, and stores the resulting temp apps on {@code tq}.
     *
     * <p>Returns without doing any per-app calculation when the leaf queue reports exactly one application.
     *
     * @param clusterResource cluster resource, used as the reference for resource comparisons
     * @param partitionBasedResource resource of the partition; forwarded to the user-limit lookup
     * @param tq per-partition queue snapshot; its {@code leafQueue} is expected to be non-null (validated
     *        by the caller according to the original author's note)
     * @param selectedCandidates containers already selected for preemption, keyed by application attempt
     * @param totalPreemptedResourceAllowed per-round preemption limit
     * @param queueReassignableResource resource that may be redistributed inside the queue; this object is
     *        modified in place (AM usage is subtracted and ideal assignments are deducted from it)
     * @param maxAllowablePreemptLimit factor applied to the queue's guaranteed resource to obtain the
     *        intra-queue preemption cap
     */
    @Override
    public void computeAppsIdealAllocation(Resource clusterResource, Resource partitionBasedResource,
            TempQueuePerPartition tq, Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates,
            Resource totalPreemptedResourceAllowed, Resource queueReassignableResource,
            float maxAllowablePreemptLimit) {

        // Step 1: AM containers are treated as frozen, so take their usage out of
        // what the queue can hand around. The per-user breakdown is kept for later.
        Map<String, Resource> amUsedByUser = new HashMap<>();
        Resources.subtractFrom(queueReassignableResource,
                calculateUsedAMResourcesPerQueue(tq.partition, tq.leafQueue, amUsedByUser));

        // Step 2: a queue with exactly one application has nobody to preempt from.
        Collection<FiCaSchedulerApp> allApps = tq.leafQueue.getAllApplications();
        if (allApps.size() == 1) {
            return;
        }

        // Step 3: snapshot every app into a temp app, ordered by the priority comparator.
        PriorityQueue<TempAppPerPartition> highToLow = createTempAppForResCalculation(tq.partition, allApps,
                new TAPriorityComparator());

        // Step 4: hand out idealAssigned per app against the reassignable resource.
        // The returned set uses the reversed comparator ordering.
        TreeSet<TempAppPerPartition> lowToHigh = calculateIdealAssignedResourcePerApp(clusterResource,
                partitionBasedResource, tq, selectedCandidates, queueReassignableResource, highToLow,
                amUsedByUser);

        // Step 5: configurable intra-queue cap = guaranteed * limit, reduced by what
        // the queue is already going to preempt; zero when nothing is left.
        Resource intraQueueCap = Resources.multiply(tq.getGuaranteed(), maxAllowablePreemptLimit);
        boolean capHasRoom = Resources.greaterThan(rc, clusterResource, intraQueueCap,
                tq.getActuallyToBePreempted());
        if (capHasRoom) {
            Resources.subtractFrom(intraQueueCap, tq.getActuallyToBePreempted());
        } else {
            intraQueueCap = Resource.newInstance(0, 0);
        }

        // Step 6: the effective budget is the smaller of the intra-queue cap and the per-round limit.
        Resource budget = Resources.min(rc, clusterResource, intraQueueCap, totalPreemptedResourceAllowed);

        // Step 7: walk the apps in the returned order and decide toBePreempted for each.
        calculateToBePreemptedResourcePerApp(clusterResource, lowToHigh, budget);

        // Keep the ordered apps on the temp queue for later stages.
        tq.addAllApps(lowToHigh);

        // Step 8: demand must not be satisfied by preempting from the same priority level.
        validateOutSameAppPriorityFromDemand(clusterResource, (TreeSet<TempAppPerPartition>) tq.getApps());

        if (LOG.isDebugEnabled()) {
            LOG.debug("Queue Name:" + tq.queueName + ", partition:" + tq.partition);
            for (TempAppPerPartition storedApp : tq.getApps()) {
                LOG.debug(storedApp);
            }
        }
    }

    /**
     * Sets {@code toBePreempted} on each app, in iteration order of {@code orderedApps}, spending from a
     * shared budget that starts at {@code preemptionLimit}.
     *
     * <p>For every app: {@code toBePreempted = min(max(used - idealAssigned - selected - amUsed, 0), budget)},
     * after which the budget is reduced by that amount. Apps are skipped (left unchanged) once the budget
     * is not above zero, or when the app's used resource is not above zero.
     *
     * @param clusterResource cluster resource, used as the reference for resource comparisons
     * @param orderedApps apps to process, in the order they should be charged
     * @param preemptionLimit total resource that may be preempted; the passed object is not modified
     */
    private void calculateToBePreemptedResourcePerApp(Resource clusterResource,
            TreeSet<TempAppPerPartition> orderedApps, Resource preemptionLimit) {

        Resource remainingBudget = preemptionLimit;

        for (TempAppPerPartition candidate : orderedApps) {
            boolean budgetExhausted = Resources.lessThanOrEqual(rc, clusterResource, remainingBudget,
                    Resources.none());
            if (budgetExhausted
                    || Resources.lessThanOrEqual(rc, clusterResource, candidate.getUsed(), Resources.none())) {
                continue;
            }

            // What the app holds beyond its ideal share, not counting containers
            // already selected elsewhere and its AM usage.
            Resource surplus = Resources.subtract(candidate.getUsed(), candidate.idealAssigned);
            Resources.subtractFrom(surplus, candidate.selected);
            Resources.subtractFrom(surplus, candidate.getAMUsed());

            // Clamp at zero from below and at the remaining budget from above.
            Resource nonNegativeSurplus = Resources.max(rc, clusterResource, surplus, Resources.none());
            candidate.toBePreempted = Resources.min(rc, clusterResource, nonNegativeSurplus, remainingBudget);

            remainingBudget = Resources.subtract(remainingBudget, candidate.toBePreempted);
        }
    }

    /**
     * Drains {@code orderedByPriority} and computes {@code idealAssigned} (and, where applicable,
     * {@code toBePreemptFromOther}) for each app against the queue's reassignable resource and the
     * per-user limit.
     *
     * <pre>
     * for each app taken from the head of orderedByPriority:
     *   add app to the result set
     *   if Q.reassignable &lt;= 0: skip the app
     *
     *   userLimit = leafQueue.getUserLimitPerUser(user) - AM used by user   (cached per user)
     *   app.selected = sum of already-selected containers of app in this partition
     *   wanted = app.usedDeductAM + app.pending - app.selected
     *
     *   if userAssigned[user] &gt;= userLimit: skip the app
     *
     *   wanted            = min(wanted, userLimit - userAssigned[user])
     *   app.idealAssigned = min(Q.reassignable, wanted)
     *   userAssigned[user] += app.idealAssigned
     *
     *   if app.idealAssigned &gt; app.usedDeductAM - app.selected:
     *     app.toBePreemptFromOther = app.idealAssigned - (app.usedDeductAM - app.selected)
     *
     *   Q.reassignable -= app.idealAssigned
     * </pre>
     *
     * @param clusterResource cluster resource, used as the reference for resource comparisons
     * @param partitionBasedResource resource of the partition; forwarded to the user-limit lookup
     * @param tq per-partition queue snapshot supplying the partition name and the leaf queue
     * @param selectedCandidates containers already selected for preemption, keyed by application attempt
     * @param queueReassignableResource resource still available for assignment; reduced in place
     * @param orderedByPriority temp apps to process; empty when this method returns
     * @param perUserAMUsed AM resource used per user; deducted from each user's limit
     * @return all processed temp apps in a set sorted by the reverse of {@code TAPriorityComparator}
     */
    private TreeSet<TempAppPerPartition> calculateIdealAssignedResourcePerApp(Resource clusterResource,
            Resource partitionBasedResource, TempQueuePerPartition tq,
            Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, Resource queueReassignableResource,
            PriorityQueue<TempAppPerPartition> orderedByPriority, Map<String, Resource> perUserAMUsed) {

        Comparator<TempAppPerPartition> reversedPriority = Collections.reverseOrder(new TAPriorityComparator());
        TreeSet<TempAppPerPartition> processedApps = new TreeSet<>(reversedPriority);

        String partition = tq.partition;
        Map<String, Resource> userLimitCache = new HashMap<>();
        Map<String, Resource> assignedPerUser = new HashMap<>();

        // poll() yields null only when the queue is empty, so this drains it head first.
        for (TempAppPerPartition current = orderedByPriority.poll(); current != null;
                current = orderedByPriority.poll()) {
            processedApps.add(current);

            // Nothing left to hand out: the app is still recorded, but gets no ideal share.
            if (Resources.lessThanOrEqual(rc, clusterResource, queueReassignableResource, Resources.none())) {
                continue;
            }

            String userName = current.app.getUser();

            // User limit (minus the user's AM usage) is looked up once per user.
            Resource userLimit = userLimitCache.get(userName);
            if (userLimit == null) {
                Resource rawLimit = Resources
                        .clone(tq.leafQueue.getUserLimitPerUser(userName, partitionBasedResource, partition));

                Resource amUsed = perUserAMUsed.get(userName);
                if (amUsed == null) {
                    amUsed = Resources.createResource(0, 0);
                }

                userLimit = Resources.subtract(rawLimit, amUsed);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Userlimit for user '" + userName + "' is :" + userLimit + ", and amUsed is:"
                            + amUsed);
                }
                userLimitCache.put(userName, userLimit);
            }

            Resource assignedToUser = assignedPerUser.get(userName);
            if (assignedToUser == null) {
                assignedToUser = Resources.createResource(0, 0);
                assignedPerUser.put(userName, assignedToUser);
            }

            // Refresh current.selected from the containers other policies already picked.
            getAlreadySelectedPreemptionCandidatesResource(selectedCandidates, current, partition);

            // Starting point for the app's share: used (without AM) + pending - already selected.
            Resource wanted = Resources.add(current.getUsedDeductAM(), current.getPending());
            Resources.subtractFrom(wanted, current.selected);

            // User already at (or over) the limit: this app gets nothing more.
            if (!Resources.lessThan(rc, clusterResource, assignedToUser, userLimit)) {
                continue;
            }

            Resource userHeadroom = Resources.subtract(userLimit, assignedToUser);
            wanted = Resources.min(rc, clusterResource, wanted, userHeadroom);
            current.idealAssigned = Resources
                    .clone(Resources.min(rc, clusterResource, queueReassignableResource, wanted));
            Resources.addTo(assignedToUser, current.idealAssigned);

            // If the ideal share exceeds what the app holds, the difference must come from other apps.
            Resource usedExcludingSelected = Resources.subtract(current.getUsedDeductAM(), current.selected);
            if (Resources.greaterThan(rc, clusterResource, current.idealAssigned, usedExcludingSelected)) {
                current.setToBePreemptFromOther(
                        Resources.subtract(current.idealAssigned, usedExcludingSelected));
            }

            Resources.subtractFrom(queueReassignableResource, current.idealAssigned);
        }

        return processedApps;
    }

    /*
     * Resets tmpApp.selected to zero and then adds the allocated resource of every
     * already-selected container of this app attempt whose node label expression
     * equals the given partition. Apps without an entry in selectedCandidates end
     * up with a zero selected resource.
     */
    private void getAlreadySelectedPreemptionCandidatesResource(
            Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, TempAppPerPartition tmpApp,
            String partition) {
        Resource selectedTotal = Resources.createResource(0, 0);
        tmpApp.selected = selectedTotal;

        Set<RMContainer> alreadySelected = selectedCandidates.get(tmpApp.app.getApplicationAttemptId());
        if (alreadySelected != null) {
            for (RMContainer container : alreadySelected) {
                // Only containers selected in this partition count.
                if (partition.equals(container.getNodeLabelExpression())) {
                    Resources.addTo(selectedTotal, container.getAllocatedResource());
                }
            }
        }
    }

    /**
     * Builds one {@link TempAppPerPartition} per application, holding copies of the app's used, AM-used,
     * reserved and pending resources for the given partition, and returns them in a priority queue
     * ordered by {@code taComparator}.
     *
     * <p>Any of the four resources that is {@code null} is replaced by a zero resource. The AM resource is
     * only read when {@code app.isWaitingForAMContainer()} is false; otherwise it is taken as zero. Each
     * temp app starts with a zero {@code idealAssigned}.
     *
     * @param partition partition (node label) whose usage is captured
     * @param apps applications to snapshot
     * @param taComparator ordering of the returned queue
     * @return priority queue containing one temp app per entry of {@code apps}
     */
    private PriorityQueue<TempAppPerPartition> createTempAppForResCalculation(String partition,
            Collection<FiCaSchedulerApp> apps, TAPriorityComparator taComparator) {
        // Size the queue for the apps at hand; PriorityQueue rejects an initial
        // capacity below 1, so clamp for an empty collection.
        PriorityQueue<TempAppPerPartition> orderedByPriority = new PriorityQueue<>(Math.max(1, apps.size()),
                taComparator);

        for (FiCaSchedulerApp app : apps) {
            Resource used = app.getAppAttemptResourceUsage().getUsed(partition);
            Resource amUsed = app.isWaitingForAMContainer() ? null : app.getAMResource(partition);
            Resource pending = app.getTotalPendingRequestsPerPartition().get(partition);
            Resource reserved = app.getAppAttemptResourceUsage().getReserved(partition);

            // The temp app works on private copies so later arithmetic cannot touch scheduler state.
            TempAppPerPartition tmpApp = new TempAppPerPartition(app, cloneOrZero(used), cloneOrZero(amUsed),
                    cloneOrZero(reserved), cloneOrZero(pending));

            // Ideal allocation starts at zero and is filled in by the ideal-assignment pass.
            tmpApp.idealAssigned = Resources.createResource(0, 0);

            orderedByPriority.add(tmpApp);
        }
        return orderedByPriority;
    }

    /** Returns a copy of {@code resource}, or a fresh zero resource when it is {@code null}. */
    private static Resource cloneOrZero(Resource resource) {
        return (resource == null) ? Resources.createResource(0, 0) : Resources.clone(resource);
    }

    /*
     * Fifo+Priority based preemption must not preempt resources between apps of
     * the same priority level. This pass pairs apps from both ends of the ordered
     * set (first element vs. last element) and only confirms preemption from the
     * front app (actuallyToBePreempted) to cover demand of the back app
     * (toBePreemptFromOther) while the front app's priority is strictly lower.
     *
     * The front index advances once the front app's toBePreempted is fully
     * confirmed; otherwise the back index retreats once the back app's outstanding
     * demand equals zero.
     *
     * NOTE(review): an iteration in which neither index moves would repeat with
     * unchanged state; the loop appears to rely on min() always satisfying one of
     * the two advance conditions - confirm for multi-dimensional resources.
     */
    public void validateOutSameAppPriorityFromDemand(Resource cluster,
            TreeSet<TempAppPerPartition> appsOrderedfromLowerPriority) {

        TempAppPerPartition[] apps = appsOrderedfromLowerPriority
                .toArray(new TempAppPerPartition[appsOrderedfromLowerPriority.size()]);
        if (apps.length == 0) {
            return;
        }

        int low = 0;
        int high = apps.length - 1;

        while (low < high && !apps[low].equals(apps[high])
                && apps[low].getPriority() < apps[high].getPriority()) {
            TempAppPerPartition victim = apps[low];
            TempAppPerPartition demander = apps[high];

            Resource outstandingDemand = demander.getToBePreemptFromOther();
            Resource confirmed = victim.getActuallyToBePreempted();
            Resource unconfirmed = Resources.subtract(victim.toBePreempted, confirmed);

            if (Resources.greaterThan(rc, cluster, unconfirmed, Resources.none())) {
                // Move as much as both sides allow from "demand" to "confirmed preemption".
                Resource granted = Resources.min(rc, cluster, outstandingDemand, unconfirmed);

                demander.setToBePreemptFromOther(Resources.subtract(outstandingDemand, granted));
                victim.setActuallyToBePreempted(Resources.add(confirmed, granted));
            }

            if (Resources.lessThanOrEqual(rc, cluster, victim.toBePreempted,
                    victim.getActuallyToBePreempted())) {
                // Front app has nothing more to give.
                low++;
            } else if (Resources.equals(demander.getToBePreemptFromOther(), Resources.none())) {
                // Back app's demand is fully covered.
                high--;
            }
        }
    }

    /**
     * Sums the AM resource of every app returned by {@code leafQueue.getApplications()} for the given
     * partition, both in total and per user.
     *
     * <p>Each app's AM resource is read once and that same value is added to both accumulators, so the
     * per-user entries always add up to the returned total.
     *
     * @param partition partition (node label) whose AM usage is summed
     * @param leafQueue queue whose applications are inspected
     * @param perUserAMUsed output map; an entry is created for each user encountered and the user's AM
     *        usage is added to it in place
     * @return total AM resource across the inspected applications
     */
    private Resource calculateUsedAMResourcesPerQueue(String partition, LeafQueue leafQueue,
            Map<String, Resource> perUserAMUsed) {
        Resource amUsed = Resources.createResource(0, 0);

        for (FiCaSchedulerApp app : leafQueue.getApplications()) {
            String user = app.getUser();
            // Single read per app keeps the per-user and queue-wide sums consistent.
            Resource appAMResource = app.getAMResource(partition);

            Resource userAMResource = perUserAMUsed.get(user);
            if (userAMResource == null) {
                userAMResource = Resources.createResource(0, 0);
                perUserAMUsed.put(user, userAMResource);
            }

            Resources.addTo(userAMResource, appAMResource);
            Resources.addTo(amUsed, appAMResource);
        }
        return amUsed;
    }
}