Java tutorial: Yunti3Scheduler, a fair-sharing TaskScheduler for Hadoop MapReduce (org.apache.hadoop.mapred)

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * A {@link TaskScheduler} that implements fair sharing.
 */
public class Yunti3Scheduler extends TaskScheduler {
  /** How often fair shares are re-calculated */
  public static long UPDATE_INTERVAL = 1000;

  public static final Log LOG = LogFactory.getLog(Yunti3Scheduler.class);

  protected YTPoolManager poolMgr;
  protected YTLoadManager loadMgr;
  protected YTTaskSelector taskSelector;
  protected YTWeightAdjuster weightAdjuster; // Can be null for no weight adjuster
  protected boolean initialized;      // Are we initialized?
  protected boolean running;          // Are we running?
  protected boolean assignMultiple;   // Simultaneously assign map and reduce?
  protected int mapAssignCap = -1;    // Max maps to launch per heartbeat,
                                      // -1 to use the TaskTracker's slot count
  protected int reduceAssignCap = -1; // Max reduces to launch per heartbeat,
                                      // -1 to use the TaskTracker's slot count
  protected long localityDelay = MAX_AUTOCOMPUTED_LOCALITY_DELAY;
                                      // Time to wait for node and rack locality
  protected boolean autoComputeLocalityDelay = false; // Compute locality delay
                                                      // from heartbeat interval
  // Maximum locality delay when auto-computing locality delays
  private static final long MAX_AUTOCOMPUTED_LOCALITY_DELAY = 15000;
  protected boolean sizeBasedWeight;  // Give larger weights to larger jobs
  protected boolean waitForMapsBeforeLaunchingReduces = true;
  private Clock clock;
  private int tickCount;
  private EagerTaskInitializationListener eagerInitListener;
  private JobListener jobListener;
  private boolean mockMode;           // Used for unit tests; disables background
                                      // updates and scheduler event log
  private int runnableMaps = 0;
  private int runnableReduces = 0;
  private int totalMapSlots = 0;
  private int totalReduceSlots = 0;
  protected long lastHeartbeatTime;   // Time we last ran assignTasks

  /**
   * A class for holding per-job scheduler variables. These always contain the
   * values of the variables at the last update(), and are used along with a
   * time delta to update the map and reduce deficits before a new update().
   */
  static class JobInfo {
    boolean runnable = false;        // Can the job run given user/pool limits?
    double mapWeight = 0;            // Weight of job in calculation of map share
    double reduceWeight = 0;         // Weight of job in calculation of reduce share
    int runningMaps = 0;             // Maps running at last update
    int runningReduces = 0;          // Reduces running at last update
    int neededMaps;                  // Maps needed at last update
    int neededReduces;               // Reduces needed at last update
    double mapFairShare = 0;         // Fair share of map slots at last update
    double reduceFairShare = 0;      // Fair share of reduce slots at last update
    int jobLevel = 0;                // Level of job in a queue
    LocalityLevel lastMapLocalityLevel = LocalityLevel.NODE;
                                     // Locality level of last map launched
    long timeWaitedForLocalMap = 0;  // Time waiting for local map since last map
    int assignedAtLastHeartbeat = 0; // Tasks assigned at previous assignTasks
    int skippedAtLastHeartbeat = 0;  // Was job skipped at previous assignTasks?
  }

  /**
   * A clock class - can be mocked out for testing.
   */
  static class Clock {
    long getTime() {
      return System.currentTimeMillis();
    }
  }

  /**
   * Top-level scheduling information to be set on the QueueManager.
   * added by liangly
   */
  private static class SchedulingInfo {
    private YTPool pool;
    private YTPoolManager poolMgr;
    private QueueManager queueMgr;

    public SchedulingInfo(YTPool pool, YTPoolManager poolMgr, QueueManager queueMgr) {
      this.pool = pool;
      this.poolMgr = poolMgr;
      this.queueMgr = queueMgr;
    }

    @Override
    public String toString() {
      int runningJobs = pool.getJobs().size();
      int maxJobs = poolMgr.getPoolMaxJobs(pool.getName());
      int runningMaps = pool.getTotalRunningMaps();
      int runningReduces = pool.getTotalRunningReduces();
      int minMaps = poolMgr.getAllocation(pool.getName(), YTTaskType.MAP);
      int minReduces = poolMgr.getAllocation(pool.getName(), YTTaskType.REDUCE);
      StringBuilder sb = new StringBuilder();
      sb.append(String.format("Running Jobs: %d/%d, ", runningJobs, maxJobs));
      sb.append(String.format("Running Maps: %d/%d, ", runningMaps, minMaps));
      sb.append(String.format("Running Reduces: %d/%d\n", runningReduces, minReduces));
      String info = sb.toString();
      float highlightThreshold = queueMgr.getHighlightThreshold();
      // Highlight the scheduling information in red when usage exceeds the threshold
      if ((minMaps > 0 && runningMaps * 1.f / minMaps > highlightThreshold)
          || (minReduces > 0 && runningReduces * 1.f / minReduces > highlightThreshold)) {
        info = "<font color = #FF0000>" + info + "</font>";
      }
      return info;
    }
  }

  public Yunti3Scheduler() {
    this(new Clock(), false);
  }

  /**
   * Constructor used for tests, which can change the clock and disable updates.
   */
  protected Yunti3Scheduler(Clock clock, boolean mockMode) {
    this.clock = clock;
    this.tickCount = 1;
    this.mockMode = mockMode;
    this.jobListener = new JobListener();
  }

  @Override
  public void start() {
    try {
      Configuration conf = getConf();
      if (!mockMode) {
        eagerInitListener = new EagerTaskInitializationListener();
        eagerInitListener.setTaskTrackerManager(taskTrackerManager);
        eagerInitListener.start();
        taskTrackerManager.addJobInProgressListener(eagerInitListener);
      }
      taskTrackerManager.addJobInProgressListener(jobListener);
      poolMgr = new YTPoolManager(conf);
      loadMgr = (YTLoadManager) ReflectionUtils.newInstance(
          conf.getClass("mapred.yunti3scheduler.loadmanager",
              YTCapBasedLoadManager.class, YTLoadManager.class), conf);
      loadMgr.setTaskTrackerManager(taskTrackerManager);
      loadMgr.start();
      taskSelector = (YTTaskSelector) ReflectionUtils.newInstance(
          conf.getClass("mapred.yunti3scheduler.taskselector",
              YTDefaultTaskSelector.class, YTTaskSelector.class), conf);
      taskSelector.setTaskTrackerManager(taskTrackerManager);
      taskSelector.start();
      Class<?> weightAdjClass = conf.getClass("mapred.yunti3scheduler.weightadjuster", null);
      if (weightAdjClass != null) {
        weightAdjuster = (YTWeightAdjuster) ReflectionUtils.newInstance(weightAdjClass, conf);
      }
      reloadConfiguration(conf);
      initialized = true;
      running = true;
      // Start a thread to update deficits every UPDATE_INTERVAL
      if (!mockMode)
        new UpdateThread().start();
      // Register servlet with JobTracker's Jetty server
      if (taskTrackerManager instanceof JobTracker) {
        JobTracker jobTracker = (JobTracker) taskTrackerManager;
        StatusHttpServer infoServer = jobTracker.infoServer;
        infoServer.setAttribute("scheduler", this);
        infoServer.addServlet("scheduler", "/scheduler", Yunti3SchedulerServlet.class);
      }
      // Set scheduling information on the queue manager. added by liangly
      setSchedulerInfo();
    } catch (Exception e) {
      // Can't load one of the managers - crash the JobTracker now while it is
      // starting up so that the user notices.
      throw new RuntimeException("Failed to start YuntiScheduler", e);
    }
    LOG.info("Successfully configured YuntiScheduler");
  }

  /**
   * Set scheduling information on the queue manager.
   *
   * @author liangly
   */
  public void setSchedulerInfo() {
    if (taskTrackerManager instanceof JobTracker) {
      QueueManager queueManager = taskTrackerManager.getQueueManager();
      for (String queueName : queueManager.getQueues()) {
        YTPool pool = poolMgr.getPool(queueName);
        SchedulingInfo schedulingInfo = new SchedulingInfo(pool, poolMgr, queueManager);
        queueManager.setSchedulerInfo(queueName, schedulingInfo);
      }
    }
  }

  @Override
  public void terminate() throws IOException {
    running = false;
    if (jobListener != null)
      taskTrackerManager.removeJobInProgressListener(jobListener);
    if (eagerInitListener != null)
      taskTrackerManager.removeJobInProgressListener(eagerInitListener);
  }

  /**
   * Used to listen for jobs added/removed by our {@link TaskTrackerManager}.
   */
  private class JobListener extends JobInProgressListener {
    @Override
    public void jobAdded(JobInProgress job) {
      synchronized (Yunti3Scheduler.this) {
        poolMgr.addJob(job);
      }
    }

    @Override
    public void jobRemoved(JobInProgress job) {
      synchronized (Yunti3Scheduler.this) {
        poolMgr.removeJob(job);
      }
    }

    @Override
    public void jobUpdated(JobChangeEvent event) {
    }
  }

  /**
   * A thread which calls {@link Yunti3Scheduler#update()} every
   * <code>UPDATE_INTERVAL</code> milliseconds.
   */
  private class UpdateThread extends Thread {
    private UpdateThread() {
      super("Yunti3Scheduler update thread");
    }

    public void run() {
      while (running) {
        try {
          Thread.sleep(UPDATE_INTERVAL);
          tickCount++;
          if (tickCount == 3600) {
            tickCount = 0;
          }
          update();
        } catch (Exception e) {
          LOG.error("Failed to update fair share calculations", e);
        }
      }
    }
  }

  private void updateUpdateInterval() {
    UPDATE_INTERVAL = JobTracker.getJobSlotsUpdateInterval();
  }

  @Override
  public synchronized List<Task> assignTasks(TaskTrackerStatus tracker) throws IOException {
    if (!initialized) // Don't try to assign tasks if we haven't yet started up
      return null;

    long currentTime = clock.getTime();

    // FIXME: avoid recalculating the running counts on every heartbeat
    // Compute total running maps and reduces
    int runningMaps = 0;
    int runningReduces = 0;
    for (YTPool pool : poolMgr.getPools()) {
      runningMaps += pool.getTotalRunningMaps();
      runningReduces += pool.getTotalRunningReduces();
    }

    int mapsAssigned = 0;    // loop counter for maps in the while loop below
    int reducesAssigned = 0; // loop counter for reduces in the while loop below
    int mapCapacity = maxTasksToAssign(YTTaskType.MAP, tracker);
    int reduceCapacity = maxTasksToAssign(YTTaskType.REDUCE, tracker);
    mapCapacity = loadMgr.canAssignMapNum(tracker, runnableMaps, totalMapSlots, mapCapacity);
    reduceCapacity = loadMgr.canAssignReduceNum(tracker, runnableReduces, totalReduceSlots,
        reduceCapacity);
    boolean mapRejected = false;    // flag used for ending the loop
    boolean reduceRejected = false; // flag used for ending the loop

    if (LOG.isDebugEnabled()) {
      LOG.debug("Task capacity for tracker " + tracker.getHost() + " is(m/r):"
          + mapCapacity + "/" + reduceCapacity);
    }

    // Scan to see whether any job needs to run a map, then a reduce
    ArrayList<Task> tasks = new ArrayList<Task>();
    // If a pool has no task to schedule, give the chance to another pool.
    while (true) {
      // Compute the ending conditions for the loop.
      // Reject a task type if one of the following conditions holds:
      //   1. the number of assigned tasks reaches the per-heartbeat limit
      //   2. the number of running tasks reaches the number of runnable tasks
      //   3. the task type is rejected by the LoadManager's canAssign* checks
      if (!mapRejected) {
        if (mapsAssigned >= mapCapacity || runningMaps >= runnableMaps) {
          mapRejected = true;
        }
      }
      if (!reduceRejected) {
        if (reducesAssigned >= reduceCapacity || runningReduces >= runnableReduces) {
          reduceRejected = true;
        }
      }
      // Exit the while (true) loop if neither maps nor reduces can be assigned
      if ((mapRejected && reduceRejected) || (!assignMultiple && tasks.size() > 0)) {
        break; // This is the only exit of the while (true) loop
      }

      // Determine which task type to assign this time.
      // First try choosing a task type which is not rejected.
      YTTaskType taskType;
      if (mapRejected) {
        taskType = YTTaskType.REDUCE;
      } else if (reduceRejected) {
        taskType = YTTaskType.MAP;
      } else {
        // If both types are available, choose the task type with fewer running
        // tasks on the task tracker to prevent that task type from starving
        if (tracker.countMapTasks() <= tracker.countReduceTasks()) {
          taskType = YTTaskType.MAP;
        } else {
          taskType = YTTaskType.REDUCE;
        }
      }

      // The pools are visited in turn; jobs within the chosen pool are scheduled in order.
      int accessedPools = 0;
      int totalPools = poolMgr.getPools().size();
      boolean foundTask = false;
      YTPool prePool = null;
      while (accessedPools < totalPools) {
        YTPool pool = poolMgr.getCurrentScheduledPool();
        if (pool == null) // no pool to schedule
          break;
        // pick another pool next time
        poolMgr.updateAccessedPoolIndex();
        if (pool == prePool) { // skip the same pool
          continue;
        }
        prePool = pool;
        accessedPools++;
        // Skip pools that have already reached their allocation for this task type
        if (taskType == YTTaskType.MAP) {
          if (pool.getTotalRunningMaps() >= poolMgr.getAllocation(pool.getName(), taskType)) {
            continue;
          }
        } else if (pool.getTotalRunningReduces() >= poolMgr.getAllocation(pool.getName(), taskType)) {
          continue;
        }
        if (LOG.isDebugEnabled()) {
          LOG.debug("The scheduler choose Pool: " + pool.getName()
              + "'s job to assign tasks.");
        }
        // Iterate over all jobs in the current pool to find a job that needs
        // this type of task
        for (JobInProgress job : pool.getRunnableJobs()) {
          JobInfo info = pool.getJobInfo(job);
          if (!needLaunchTask(taskType, info))
            continue;
          Task task = null;
          if (taskType == YTTaskType.MAP) {
            LocalityLevel localityLevel = getAllowedLocalityLevel(job, info, currentTime);
            task = taskSelector.obtainNewMapTask(tracker, job, localityLevel.toCacheLevelCap());
          } else {
            task = taskSelector.obtainNewReduceTask(tracker, job);
          }
          if (task != null) {
            // Update the JobInfo for this job so we account for the launched
            // tasks during this update interval and don't try to launch more
            // tasks than the job needed on future heartbeats
            foundTask = true;
            if (taskType == YTTaskType.MAP) {
              info.runningMaps++;
              info.neededMaps--;
              mapsAssigned++;
              runningMaps++;
              updateLastMapLocalityLevel(job, pool.getJobInfo(job), task, tracker);
              pool.setTotalRunningMaps(pool.getTotalRunningMaps() + 1);
              pool.getJobInfo(job).assignedAtLastHeartbeat++;
            } else {
              info.runningReduces++;
              info.neededReduces--;
              reducesAssigned++;
              runningReduces++;
              pool.setTotalRunningReduces(pool.getTotalRunningReduces() + 1);
            }
            tasks.add(task);
            break;
          } else {
            // Mark any jobs that were visited for map tasks but did not launch
            // a task as skipped on this heartbeat
            if (taskType == YTTaskType.MAP) {
              pool.getJobInfo(job).skippedAtLastHeartbeat++;
            }
          }
        }
        if (foundTask)
          break;
      }
      // Reject the task type if we cannot find a task
      if (!foundTask) {
        if (taskType == YTTaskType.MAP) {
          mapRejected = true;
        } else {
          reduceRejected = true;
        }
      }
    }

    if (LOG.isDebugEnabled()) {
      for (Task task : tasks) {
        LOG.debug("Assign task : " + task.getTaskID() + " to TaskTracker: "
            + tracker.getTrackerName());
      }
    }
    // If no tasks were found, return null
    return tasks.isEmpty() ? null : tasks;
  }

  private boolean needLaunchTask(YTTaskType taskType, JobInfo info) {
    if (taskType == YTTaskType.MAP) {
      return info.runningMaps < info.mapFairShare;
    } else {
      return info.runningReduces < info.reduceFairShare;
    }
  }

  /**
   * Get the maximum locality level at which a given job is allowed to launch
   * tasks, based on how long it has been waiting for local tasks. This is used
   * to implement the "delay scheduling" feature of the Fair Scheduler for
   * optimizing data locality. If the job has no locality information (e.g. it
   * does not use HDFS), this method returns LocalityLevel.ANY, allowing tasks
   * at any level. Otherwise, the job can only launch tasks at its current
   * locality level or lower, unless it has waited at least localityDelay
   * milliseconds (in which case it can go one level beyond) or
   * 2 * localityDelay millis (in which case it can go to any level).
   */
  protected LocalityLevel getAllowedLocalityLevel(JobInProgress job, JobInfo info,
      long currentTime) {
    if (info == null) { // Job not in infos (shouldn't happen)
      LOG.error("getAllowedLocalityLevel called on job " + job
          + ", which does not have a JobInfo in infos");
      return LocalityLevel.ANY;
    }
    if (job.nonLocalMaps.size() > 0) { // Job doesn't have locality information
      return LocalityLevel.ANY;
    }
    // In the common case, compute locality level based on time waited
    switch (info.lastMapLocalityLevel) {
      case NODE: // Last task launched was node-local
        if (info.timeWaitedForLocalMap >= 2 * localityDelay)
          return LocalityLevel.ANY;
        else if (info.timeWaitedForLocalMap >= localityDelay)
          return LocalityLevel.RACK;
        else
          return LocalityLevel.NODE;
      case RACK: // Last task launched was rack-local
        if (info.timeWaitedForLocalMap >= localityDelay)
          return LocalityLevel.ANY;
        else
          return LocalityLevel.RACK;
      default: // Last task was non-local; can launch anywhere
        return LocalityLevel.ANY;
    }
  }

  /**
   * Recompute the internal variables used by the scheduler - per-job weights,
   * fair shares, deficits, minimum slot allocations, and numbers of running
   * and needed tasks of each type.
   */
  protected void update() {
    // Recompute locality delay from JobTracker heartbeat interval if enabled.
    // This will also lock the JT, so do it outside of a fair scheduler lock.
    if (autoComputeLocalityDelay && tickCount % 600 == 0) {
      if (taskTrackerManager instanceof JobTracker) {
        JobTracker jobTracker = (JobTracker) taskTrackerManager;
        localityDelay = Math.min(MAX_AUTOCOMPUTED_LOCALITY_DELAY,
            (long) (1.5 * jobTracker.getNextHeartbeatInterval()));
      }
    }
    synchronized (this) {
      // Remove non-running jobs
      for (YTPool pool : poolMgr.getPools()) {
        List<JobInProgress> toRemove = new ArrayList<JobInProgress>();
        for (JobInProgress job : pool.getJobs()) {
          int runState = job.getStatus().getRunState();
          if (runState == JobStatus.SUCCEEDED || runState == JobStatus.FAILED
              || runState == JobStatus.KILLED) {
            toRemove.add(job);
          }
        }
        for (JobInProgress job : toRemove) {
          pool.removeJob(job);
        }
      }
      updateRunnability();
      updateTaskCounts();
      updateWeights();
      updateMinSlots();
      updateUpdateInterval();
      updateLocalityWaitTimes(clock.getTime());
      updateRunnableMapAndReduce();
      // Reload allocations file if it hasn't been loaded in a while
      poolMgr.reloadAllocsIfNecessary(taskTrackerManager);
      if (tickCount % 600 == 0) {
        reloadConfiguration(new Configuration(true));
      }
    }
  }

  private void updateRunnableMapAndReduce() {
    totalMapSlots = getTotalSlots(YTTaskType.MAP);
    totalReduceSlots = getTotalSlots(YTTaskType.REDUCE);
    runnableReduces = 0;
    runnableMaps = 0;
    for (YTPool pool : poolMgr.getPools()) {
      for (JobInProgress job : pool.getRunnableJobs()) {
        runnableMaps += pool.runnableTasks(job, YTTaskType.MAP);
        runnableReduces += pool.runnableTasks(job, YTTaskType.REDUCE);
      }
    }
  }

  private void updateRunnability() {
    // Start by marking everything as not runnable
    for (YTPool pool : poolMgr.getPools()) {
      for (JobInProgress job : pool.getRunnableJobs()) {
        JobInfo info = pool.getJobInfo(job);
        if (info != null) {
          info.runnable = false;
        }
      }
    }
    for (YTPool pool : poolMgr.getPools()) {
      Map<String, Integer> userJobs = new HashMap<String, Integer>();
      int cntPoolJobs = 0;
      int cntPoolMaxJobs = poolMgr.getPoolMaxJobs(pool.getName());
      List<JobInProgress> nextRunnableJobs = new ArrayList<JobInProgress>(cntPoolMaxJobs);
      for (JobInProgress job : pool.getOrderedJobs()) {
        if (job.getStatus().getRunState() == JobStatus.RUNNING) {
          String user = job.getJobConf().getUser();
          int userCount = userJobs.containsKey(user) ?
              userJobs.get(user) : 0;
          if (userCount < poolMgr.getUserMaxJobs(user) && cntPoolJobs < cntPoolMaxJobs) {
            pool.getJobInfo(job).runnable = true;
            userJobs.put(user, userCount + 1);
            // Make sure that maxRunningJobs jobs can still run maps when there
            // are more than maxRunningJobs jobs in the pool
            if (job.finishedMapTasks < job.numMapTasks) {
              cntPoolJobs++;
            }
            nextRunnableJobs.add(job);
          } else if (cntPoolJobs == cntPoolMaxJobs) {
            break;
          }
        }
      }
      pool.setRunnableJobs(nextRunnableJobs);
    }
  }

  private void updateTaskCounts() {
    for (YTPool pool : poolMgr.getPools()) {
      int totalRunningMaps = 0;
      int totalRunningReduces = 0;
      int maxPoolRunningJobs = poolMgr.getPoolMaxJobs(pool.getName());
      int needReduceJobs = 0;
      for (JobInProgress job : pool.getOrderedJobs()) {
        JobInfo info = pool.getJobInfo(job);
        if (job.getStatus().getRunState() != JobStatus.RUNNING)
          continue; // Job is still in PREP state and tasks aren't initialized
        if (info.runnable == false && info.runningMaps == 0 && info.runningReduces == 0)
          continue;
        int totalMaps = job.numMapTasks;
        int finishedMaps = 0;
        int runningMaps = 0;
        int runningMapsWithoutSpeculative = 0;
        // Count maps
        if (info.runnable || info.runningMaps != 0) {
          for (TaskInProgress tip : job.getMapTasks()) {
            if (tip.isComplete()) {
              finishedMaps += 1;
            } else if (tip.isRunning()) {
              runningMaps += tip.getActiveTasks().size();
              runningMapsWithoutSpeculative += 1;
            }
          }
          info.runningMaps = runningMaps;
          info.neededMaps = (totalMaps - runningMapsWithoutSpeculative - finishedMaps
              + taskSelector.neededSpeculativeMaps(job));
        }
        // Count reduces
        if (info.runnable || info.runningReduces != 0) {
          int totalReduces = job.numReduceTasks;
          int finishedReduces = 0;
          int runningReduces = 0;
          int runningReducesWithoutSpeculative = 0;
          for (TaskInProgress tip : job.getReduceTasks()) {
            if (tip.isComplete()) {
              finishedReduces += 1;
            } else if (tip.isRunning()) {
              runningReduces += tip.getActiveTasks().size();
              runningReducesWithoutSpeculative += 1;
            }
          }
          info.runningReduces = runningReduces;
          // Make sure that only maxRunningJobs jobs in the pool can run reduces
          if (needReduceJobs < maxPoolRunningJobs
              && enoughMapsFinishedToRunReduces(finishedMaps, totalMaps, pool.getName(),
                  info.jobLevel)) {
            info.neededReduces = (totalReduces - runningReducesWithoutSpeculative
                - finishedReduces + taskSelector.neededSpeculativeReduces(job));
            needReduceJobs += 1;
          } else {
            info.neededReduces = 0;
          }
        }
        if (!info.runnable) {
          info.neededMaps = 0;
          info.neededReduces = 0;
        }
        totalRunningMaps += info.runningMaps;
        totalRunningReduces += info.runningReduces;
      }
      pool.setTotalRunningMaps(totalRunningMaps);
      pool.setTotalRunningReduces(totalRunningReduces);
    }
  }

  /**
   * Has a job finished enough maps to allow launching its reduces?
   * HADOOP-4666
   */
  protected boolean enoughMapsFinishedToRunReduces(int finishedMaps, int totalMaps) {
    if (waitForMapsBeforeLaunchingReduces) {
      return finishedMaps >= Math.max(1, totalMaps * 0.1);
    } else {
      return true;
    }
  }

  /**
   * Has a job finished enough maps to allow launching its reduces
   * according to its job level and pool name?
   */
  protected boolean enoughMapsFinishedToRunReduces(int finishedMaps, int totalMaps,
      String poolname, int jobLevel) {
    if (!waitForMapsBeforeLaunchingReduces) {
      return true;
    }
    YTPoolManager.ReduceWatcher watcher = poolMgr.getReduceWatcher(poolname);
    if (watcher == null) {
      return enoughMapsFinishedToRunReduces(finishedMaps, totalMaps);
    }
    return watcher.shouldRunReduces(finishedMaps, totalMaps, jobLevel);
  }

  private void updateWeights() {
    for (YTPool pool : poolMgr.getPools()) {
      for (Map.Entry<JobInProgress, JobInfo> entry : pool.getJobInfos().entrySet()) {
        JobInProgress job = entry.getKey();
        JobInfo info = entry.getValue();
        info.mapWeight = calculateWeight(job, YTTaskType.MAP, info, pool);
        info.reduceWeight = calculateWeight(job, YTTaskType.REDUCE, info, pool);
      }
    }
  }

  private void updateMinSlotsByFIFO(YTPool pool) {
    for (final YTTaskType type : YTTaskType.values()) {
      int slotsLeft = poolMgr.getAllocation(pool.getName(), type);
      for (JobInProgress job : pool.getRunnableJobs()) {
        if (slotsLeft <= 0) {
          break;
        }
        slotsLeft = giveMinSlots(job, type, slotsLeft, slotsLeft, pool);
      }
    }
  }

  private void updateMinSlots() {
    // For each pool, distribute its task allocation among jobs in it that need
    // slots. This is a little tricky since some jobs in the pool might not be
    // able to use all the slots, e.g. they might have only a few tasks left.
    // To deal with this, we repeatedly split up the available task slots
    // between the jobs left, give each job min(its alloc, # of slots it needs),
    // and redistribute any slots that are left over between jobs that still
    // need slots on the next pass. If, in total, the jobs in our pool don't
    // need all its allocation, we leave the leftover slots for general use.
    for (YTPool pool : poolMgr.getPools()) {
      // Clear old minSlots
      for (JobInfo info : pool.getJobInfos().values()) {
        info.mapFairShare = 0;
        info.reduceFairShare = 0;
      }
      if (poolMgr.getPoolUseFIFO(pool.getName())) {
        updateMinSlotsByFIFO(pool);
        continue;
      }
      for (final YTTaskType type : YTTaskType.values()) {
        List<JobInProgress> jobs = new LinkedList<JobInProgress>(pool.getRunnableJobs());
        int slotsLeft = poolMgr.getAllocation(pool.getName(), type);
        // Keep assigning slots until none are left
        while (slotsLeft > 0) {
          // Figure out total weight of the highest joblevel jobs that still
          // need slots
          double totalWeight = 0;
          int topJobLevel = -1;
          ArrayList<JobInProgress> jobsOftopJobLevel = new ArrayList<JobInProgress>();
          for (Iterator<JobInProgress> it = jobs.iterator(); it.hasNext();) {
            JobInProgress job = it.next();
            JobInfo info = pool.getJobInfo(job);
            if (pool.isRunnable(job)
                && pool.runnableTasks(job, type) > pool.minTasks(job, type)) {
              if (info.jobLevel > topJobLevel) {
                topJobLevel = info.jobLevel;
                totalWeight = pool.weight(job, type);
                jobsOftopJobLevel.clear();
                jobsOftopJobLevel.add(job);
              } else if (info.jobLevel == topJobLevel) {
                totalWeight += pool.weight(job, type);
                jobsOftopJobLevel.add(job);
              }
            } else {
              it.remove();
            }
          }
          if (totalWeight == 0) // No jobs that can use more slots are left
            break;
          // Assign slots to jobs, using the floor of their weight divided by
          // total weight. This ensures that all jobs get some chance to take
          // a slot. Then, if no slots were assigned this way, we do another
          // pass where we use ceil, in case some slots were still left over.
          int oldSlots = slotsLeft; // Copy slotsLeft so we can modify it
          for (JobInProgress job : jobsOftopJobLevel) {
            double weight = pool.weight(job, type);
            int share = (int) Math.floor(oldSlots * weight / totalWeight);
            slotsLeft = giveMinSlots(job, type, slotsLeft, share, pool);
          }
          if (slotsLeft == oldSlots) {
            for (JobInProgress job : jobsOftopJobLevel) {
              double weight = pool.weight(job, type);
              int share = (int) Math.ceil(oldSlots * weight / totalWeight);
              slotsLeft = giveMinSlots(job, type, slotsLeft, share, pool);
            }
          }
        }
      }
    }
  }

  /**
   * Give up to <code>slotsToGive</code> min slots to a job (potentially fewer
   * if either the job needs fewer slots or there aren't enough slots left).
   * Returns the number of slots left over.
   */
  private int giveMinSlots(JobInProgress job, YTTaskType type, int slotsLeft,
      int slotsToGive, YTPool pool) {
    int runnable = pool.runnableTasks(job, type);
    int curMin = pool.minTasks(job, type);
    slotsToGive = Math.min(Math.min(slotsLeft, runnable - curMin), slotsToGive);
    slotsLeft -= slotsToGive;
    JobInfo info = pool.getJobInfo(job);
    if (type == YTTaskType.MAP)
      info.mapFairShare += slotsToGive;
    else
      info.reduceFairShare += slotsToGive;
    return slotsLeft;
  }

  private double calculateWeight(JobInProgress job, YTTaskType taskType, JobInfo jobInfo,
      YTPool pool) {
    if (!jobInfo.runnable) {
      return 0;
    } else {
      double weight = 1.0;
      if (sizeBasedWeight) {
        // Set weight based on runnable tasks
        weight = Math.log1p(pool.runnableTasks(job, taskType)) / Math.log(2);
      }
      weight *= getPriorityFactor(job.getPriority());
      if (weightAdjuster != null) {
        // Run weight through the user-supplied weightAdjuster
        weight = weightAdjuster.adjustWeight(job, taskType, weight);
      }
      return weight;
    }
  }

  public YTPoolManager getPoolManager() {
    return poolMgr;
  }

  public int getTotalSlots(YTTaskType type) {
    int slots = 0;
    for (TaskTrackerStatus tt : taskTrackerManager.taskTrackers()) {
      slots += (type == YTTaskType.MAP ? tt.getMaxMapTasks() : tt.getMaxReduceTasks());
    }
    return slots;
  }

  @Override
  public synchronized Collection<JobInProgress> getJobs(String queueName) {
    YTPool myJobPool = poolMgr.getPool(queueName);
    return myJobPool.getJobs();
  }

  private double getPriorityFactor(JobPriority priority) {
    switch (priority) {
      case VERY_HIGH:
        return 4.0;
      case HIGH:
        return 2.0;
      case NORMAL:
        return 1.0;
      case LOW:
        return 0.5;
      default:
        return 0.25; // priority == VERY_LOW
    }
  }

  public JobInfo getJobInfo(JobInProgress job) {
    String poolName = poolMgr.getPoolName(job);
    if (poolName != null) {
      YTPool pool = poolMgr.getPool(poolName);
      return pool.getJobInfo(job);
    }
    return null;
  }

  /**
   * Get the maximum number of tasks to assign on a TaskTracker on a heartbeat.
   * The scheduler may launch fewer than this many tasks if the LoadManager says
   * not to launch more, but it will never launch more than this number.
   */
  private int maxTasksToAssign(YTTaskType type, TaskTrackerStatus tts) {
    if (!assignMultiple)
      return 1;
    int cap = (type == YTTaskType.MAP) ? mapAssignCap : reduceAssignCap;
    int available = (type == YTTaskType.MAP)
        ? tts.getAvailableMapSlots() : tts.getAvailableReduceSlots();
    if (cap == -1) // Infinite cap; use the TaskTracker's slot count
      return available;
    else
      return Math.min(cap, available);
  }

  /**
   * Update locality wait times for jobs that were skipped at the last heartbeat.
   */
  private void updateLocalityWaitTimes(long currentTime) {
    long timeSinceLastHeartbeat = (lastHeartbeatTime == 0 ?
        0 : currentTime - lastHeartbeatTime);
    lastHeartbeatTime = currentTime;
    for (YTPool pool : poolMgr.getPools()) {
      for (JobInProgress job : pool.getRunnableJobs()) {
        JobInfo info = pool.getJobInfo(job);
        // Update the wait time only if the job was skipped and not assigned
        // even once between updates
        if (info.skippedAtLastHeartbeat > 0 && info.assignedAtLastHeartbeat == 0) {
          info.timeWaitedForLocalMap += timeSinceLastHeartbeat;
        }
        info.assignedAtLastHeartbeat = 0;
        info.skippedAtLastHeartbeat = 0;
      }
    }
  }

  /**
   * Update a job's locality level and locality wait variables given that it
   * has just launched a map task on a given task tracker.
   */
  private void updateLastMapLocalityLevel(JobInProgress job, JobInfo info,
      Task mapTaskLaunched, TaskTrackerStatus tracker) {
    LocalityLevel localityLevel = LocalityLevel.fromTask(job, mapTaskLaunched, tracker);
    info.lastMapLocalityLevel = localityLevel;
    info.timeWaitedForLocalMap = 0;
    // eventLog.log("ASSIGNED_LOC_LEVEL", job.getJobID(), localityLevel);
  }

  /**
   * Reload the configuration periodically so that the <code>JobTracker</code>
   * does not have to be restarted when the configuration changes.
   */
  private void reloadConfiguration(Configuration conf) {
    assignMultiple = conf.getBoolean("mapred.yunti3scheduler.assignmultiple", true);
    mapAssignCap = conf.getInt("mapred.yunti3scheduler.assignmultiple.maps", 1);
    reduceAssignCap = conf.getInt("mapred.yunti3scheduler.assignmultiple.reduces", 1);
    localityDelay = conf.getLong("mapred.yunti3scheduler.locality.delay", -1);
    if (localityDelay == -1)
      autoComputeLocalityDelay = true; // Compute from heartbeat interval
    sizeBasedWeight = conf.getBoolean("mapred.yunti3scheduler.sizebasedweight", false);
  }
}
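
The scheduler is driven entirely by Configuration keys read in start() and reloadConfiguration(). Below is a minimal sketch of setting those keys programmatically; the values are illustrative, and the mapred.jobtracker.taskScheduler key is the standard Hadoop 0.20/1.x way of selecting a TaskScheduler implementation rather than anything defined in this file, so treat that part as an assumption.

// Sketch: setting the mapred.yunti3scheduler.* keys read by start() and
// reloadConfiguration(). Class names and numeric values are examples only.
import org.apache.hadoop.conf.Configuration;

public class Yunti3SchedulerConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Assumption: standard Hadoop 0.20/1.x key for choosing the JobTracker's
    // TaskScheduler; not defined in Yunti3Scheduler itself.
    conf.set("mapred.jobtracker.taskScheduler",
        "org.apache.hadoop.mapred.Yunti3Scheduler");
    // Pluggable components (defaults in start(): YTCapBasedLoadManager,
    // YTDefaultTaskSelector).
    conf.set("mapred.yunti3scheduler.loadmanager",
        "org.apache.hadoop.mapred.YTCapBasedLoadManager");
    conf.set("mapred.yunti3scheduler.taskselector",
        "org.apache.hadoop.mapred.YTDefaultTaskSelector");
    // Assign both a map and a reduce per heartbeat, capped at 4 maps / 2 reduces.
    conf.setBoolean("mapred.yunti3scheduler.assignmultiple", true);
    conf.setInt("mapred.yunti3scheduler.assignmultiple.maps", 4);
    conf.setInt("mapred.yunti3scheduler.assignmultiple.reduces", 2);
    // -1 lets the scheduler auto-compute the delay-scheduling wait from the
    // heartbeat interval, capped at MAX_AUTOCOMPUTED_LOCALITY_DELAY (15s).
    conf.setLong("mapred.yunti3scheduler.locality.delay", -1);
    conf.setBoolean("mapred.yunti3scheduler.sizebasedweight", false);
    System.out.println("locality delay = "
        + conf.getLong("mapred.yunti3scheduler.locality.delay", -1));
  }
}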
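
getAllowedLocalityLevel() implements delay scheduling: a job whose last map was node-local may only fall back to rack-local after waiting localityDelay milliseconds, and to an arbitrary host after 2 * localityDelay milliseconds. The standalone sketch below reproduces only that threshold logic so the progression is visible in isolation; the local LocalityLevel enum is a stand-in for the scheduler's class, not the real one.

// Sketch of the delay-scheduling thresholds used by getAllowedLocalityLevel().
public class DelaySchedulingSketch {
  enum LocalityLevel { NODE, RACK, ANY } // stand-in for the scheduler's enum

  /** Mirrors the NODE -> RACK -> ANY progression based on time waited. */
  static LocalityLevel allowedLevel(LocalityLevel lastLevel, long waited, long localityDelay) {
    switch (lastLevel) {
      case NODE:
        if (waited >= 2 * localityDelay) return LocalityLevel.ANY;  // waited long enough for any host
        if (waited >= localityDelay)     return LocalityLevel.RACK; // may fall back to rack-local
        return LocalityLevel.NODE;                                  // keep insisting on node-local
      case RACK:
        return waited >= localityDelay ? LocalityLevel.ANY : LocalityLevel.RACK;
      default:
        return LocalityLevel.ANY;                                   // already non-local: no restriction
    }
  }

  public static void main(String[] args) {
    long delay = 15000; // MAX_AUTOCOMPUTED_LOCALITY_DELAY in the scheduler
    for (long waited : new long[] {0, 10000, 15000, 29000, 30000}) {
      System.out.println("waited " + waited + " ms -> "
          + allowedLevel(LocalityLevel.NODE, waited, delay));
    }
  }
}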
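
updateMinSlots() repeatedly splits a pool's allocation among the jobs that still need slots, first giving each job floor(slots * weight / totalWeight) and, if that floor pass hands out nothing, a second pass with ceil so at least one slot moves per iteration. The sketch below plays that loop out for three equally weighted jobs; the Job class and its fields are hypothetical simplifications of JobInfo and the pool bookkeeping, and job-level grouping and FIFO pools are deliberately left out.

// Sketch of the two-pass (floor, then ceil) distribution in updateMinSlots().
import java.util.ArrayList;
import java.util.List;

public class MinSlotSketch {
  static class Job {
    final String name;
    final int runnable;  // tasks the job could still run (pool.runnableTasks)
    final double weight; // pool.weight(job, type)
    int given;           // slots granted so far (mapFairShare/reduceFairShare)
    Job(String name, int runnable, double weight) {
      this.name = name; this.runnable = runnable; this.weight = weight;
    }
    boolean needsMore() { return runnable > given; }
  }

  // Mirrors giveMinSlots(): never give more than the job still needs or than remain.
  static int give(Job job, int slotsLeft, int share) {
    int toGive = Math.min(Math.min(slotsLeft, job.runnable - job.given), share);
    job.given += toGive;
    return slotsLeft - toGive;
  }

  public static void main(String[] args) {
    List<Job> jobs = new ArrayList<Job>();
    jobs.add(new Job("jobA", 2, 1.0)); // can only use 2 slots
    jobs.add(new Job("jobB", 50, 1.0));
    jobs.add(new Job("jobC", 50, 1.0));
    int slotsLeft = 10;                // pool allocation for this task type
    while (slotsLeft > 0) {
      double totalWeight = 0;
      List<Job> needy = new ArrayList<Job>();
      for (Job j : jobs) {
        if (j.needsMore()) { needy.add(j); totalWeight += j.weight; }
      }
      if (totalWeight == 0) break;     // nobody can use more slots
      int oldSlots = slotsLeft;
      for (Job j : needy) {            // floor pass
        slotsLeft = give(j, slotsLeft, (int) Math.floor(oldSlots * j.weight / totalWeight));
      }
      if (slotsLeft == oldSlots) {     // nothing moved: ceil pass breaks the tie
        for (Job j : needy) {
          slotsLeft = give(j, slotsLeft, (int) Math.ceil(oldSlots * j.weight / totalWeight));
        }
      }
    }
    for (Job j : jobs) {
      System.out.println(j.name + " -> " + j.given + " slots"); // jobA gets 2, the rest is split
    }
  }
}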