Java tutorial: SkewTune's job tracker (skewtune.mapreduce.STJobTracker)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package skewtune.mapreduce;

import java.io.DataInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.net.BindException;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.security.PrivilegedExceptionAction;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalDirAllocator;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.http.HttpServer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.ipc.RPC.VersionMismatch;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.JobTracker;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.ClusterMetrics;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.protocol.ClientProtocol;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenSecretManager;
import org.apache.hadoop.mapreduce.split.JobSplit;
import org.apache.hadoop.mapreduce.split.SplitMetaInfoReader;
import org.apache.hadoop.mapreduce.util.MRAsyncDiskService;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.Groups;
import org.apache.hadoop.security.RefreshUserToGroupMappingsProtocol;
import org.apache.hadoop.security.TokenStorage;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.RefreshAuthorizationPolicyProtocol;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.VersionInfo;

import skewtune.mapreduce.JobInProgress.JobType;
import skewtune.mapreduce.JobInProgress.ReactionContext;
import skewtune.mapreduce.JobInProgress.ReexecMap;
import skewtune.mapreduce.JobInProgress.ReexecReduce;
import skewtune.mapreduce.PartitionPlanner.ClusterInfo;
import skewtune.mapreduce.PartitionPlanner.Partition;
import skewtune.mapreduce.PartitionPlanner.Plan;
import skewtune.mapreduce.PartitionPlanner.PlanSpec;
import skewtune.mapreduce.lib.input.InputSplitCache;
import skewtune.mapreduce.protocol.HeartbeatResponse;
import skewtune.mapreduce.protocol.JobOnTaskTracker;
import skewtune.mapreduce.protocol.ReactiveMapOutput;
import skewtune.mapreduce.protocol.STTaskStatus;
import skewtune.mapreduce.protocol.SkewTuneClientProtocol;
import skewtune.mapreduce.protocol.SkewTuneTrackerProtocol;
import skewtune.mapreduce.protocol.TaskAction;
import skewtune.mapreduce.protocol.TaskStatusEvent;
import skewtune.mapreduce.protocol.TaskTrackerStatus;
import skewtune.mapreduce.server.jobtracker.JTConfig;
import skewtune.utils.Base64;

public class STJobTracker implements MRJobConfig, SkewTuneClientProtocol,
        SkewTuneTrackerProtocol, RefreshUserToGroupMappingsProtocol, JTConfig,
        TaskTrackerHttpResolver {

    static {
        org.apache.hadoop.mapreduce.util.ConfigUtil.loadResources();
        skewtune.mapreduce.util.ConfigUtil.loadResources();
    }

    private final long DELEGATION_TOKEN_GC_INTERVAL = 3600000; // 1 hour

    private final DelegationTokenSecretManager secretManager;

    // Approximate number of heartbeats that could arrive at the JobTracker
    // in a second
    private int NUM_HEARTBEATS_IN_SECOND;
    private final int DEFAULT_NUM_HEARTBEATS_IN_SECOND = 100;
    private final int MIN_NUM_HEARTBEATS_IN_SECOND = 1;

    // Scaling factor for heartbeats, used for testing only
    private float HEARTBEATS_SCALING_FACTOR;
    private final float MIN_HEARTBEATS_SCALING_FACTOR = 0.01f;
    private final float DEFAULT_HEARTBEATS_SCALING_FACTOR = 1.0f;

    public static enum State {
        INITIALIZING, RUNNING
    }

    State state = State.INITIALIZING;

    private static final int FS_ACCESS_RETRY_PERIOD = 10000;

    static final String JOB_INFO_FILE = "job-info";
    static final String LOCAL_SPLIT_FILE = "split.dta";
    static final String LOCAL_SPLIT_META_FILE = "split.info";
    static final String JOBFILE = "job.xml";
    static final String JOB_TOKEN_FILE = "jobToken";

    // system directory is completely owned by the JobTracker
    final static FsPermission SYSTEM_DIR_PERMISSION = FsPermission
            .createImmutable((short) 0700); // rwx------

    // system files should have 700 permission
    final static FsPermission SYSTEM_FILE_PERMISSION = FsPermission
            .createImmutable((short) 0700); // rwx------

    private MRAsyncDiskService asyncDiskService;

    public static final Log LOG = LogFactory.getLog(STJobTracker.class);

    /**
     * Start the JobTracker with given configuration.
     *
     * The conf will be modified to reflect the actual ports on which the
     * JobTracker is up and running if the user passes the port as
     * <code>zero</code>.
     *
     * @param conf
     *            configuration for the JobTracker.
     * @throws IOException
     */
    static STJobTracker startTracker(JobConf conf) throws IOException,
            InterruptedException {
        return startTracker(conf, generateNewIdentifier());
    }

    static STJobTracker startTracker(JobConf conf, String identifier)
            throws IOException, InterruptedException {
        STJobTracker result = null;
        while (true) {
            try {
                result = new STJobTracker(conf, identifier);
                break;
            } catch (VersionMismatch e) {
                throw e;
            } catch (BindException e) {
                throw e;
            } catch (UnknownHostException e) {
                throw e;
            } catch (AccessControlException ace) {
                // in case of jobtracker not having right access
                // bail out
                throw ace;
            } catch (IOException e) {
                LOG.warn("Error starting tracker: "
                        + StringUtils.stringifyException(e));
            }
            Thread.sleep(1000);
        }
        return result;
    }

    public void stopTracker() throws IOException {
        close();
    }

    @Override
    public long getProtocolVersion(String protocol, long clientVersion)
            throws IOException {
        if (protocol.equals(SkewTuneTrackerProtocol.class.getName())) {
            return SkewTuneTrackerProtocol.versionID;
        } else if (protocol.equals(SkewTuneClientProtocol.class.getName())) {
            return SkewTuneClientProtocol.versionID;
        } else if (protocol.equals(RefreshAuthorizationPolicyProtocol.class.getName())) {
            return RefreshAuthorizationPolicyProtocol.versionID;
        } else if (protocol.equals(RefreshUserToGroupMappingsProtocol.class.getName())) {
            return RefreshUserToGroupMappingsProtocol.versionID;
        } else {
            throw new IOException("Unknown protocol to job tracker: " + protocol);
        }
    }

    // ///////////////////////////////////////////////////////////////
    // The real JobTracker
    // //////////////////////////////////////////////////////////////
    int port;
    String localMachine;
    private final String trackerIdentifier;
    long startTime;
    int totalSubmissions = 0;

    //
    // Properties to maintain while running Jobs and Tasks:
    //
    // 1. Each Task is always contained in a single Job. A Job succeeds when
    // all its Tasks are complete.
    //
    // 2. Every running or successful Task is assigned to a Tracker. Idle
    // Tasks are not.
    //
    // 3. When a Tracker fails, all of its assigned Tasks are marked as
    // failures.
    //
    // 4. A Task might need to be reexecuted if it (or the machine it's hosted
    // on) fails before the Job is 100% complete. Sometimes an upstream Task
    // can fail without reexecution if all downstream Tasks that require its
    // output have already obtained the necessary files.
    //
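    // ------------------------------------------------------------------
    // Illustrative sketch (not part of the original source): starting the
    // tracker from code in the same package, mirroring what main() does later
    // in this file. "localhost:9012" is the documented default of
    // JT_IPC_ADDRESS; startTracker retries transient IOExceptions once a
    // second and rethrows bind/version/access failures immediately.
    //
    //   JobConf conf = new JobConf();
    //   conf.set(JTConfig.JT_IPC_ADDRESS, "localhost:9012");
    //   STJobTracker jt = STJobTracker.startTracker(conf);
    //   jt.offerService(); // blocks until the RPC server stops
    // ------------------------------------------------------------------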
    // (trackerID --> list of jobs to cleanup)
    Map<String, Set<JobID>> trackerToJobsToCleanup = new HashMap<String, Set<JobID>>();

    // (trackerID --> list of tasks to cleanup)
    Map<String, Set<TaskAttemptID>> trackerToTasksToCleanup = new HashMap<String, Set<TaskAttemptID>>();

    // jobs that are running
    Map<JobID, JobInProgress> jobs = new TreeMap<JobID, JobInProgress>();

    // (taskid --> trackerID)
    TreeMap<TaskAttemptID, String> taskidToTrackerMap = new TreeMap<TaskAttemptID, String>();

    // for caching task completion events
    HashSet<JobInProgress> originalJobs = new HashSet<JobInProgress>();

    HashSet<TaskID> pendingReactiveJob = new HashSet<TaskID>();

    HashMap<JobID, ScanTask> scanningJob = new HashMap<JobID, ScanTask>();

    // pending completed reactive map jobs
    Map<JobID, JobInProgress> pendingCompletedReactiveJob = new HashMap<JobID, JobInProgress>();

    // (trackerID --> last received heartbeat message)
    TreeMap<String, TaskTrackerStatus> trackerToLastHeartbeat = new TreeMap<String, TaskTrackerStatus>();

    TreeMap<String, Integer> trackerToHttpPort = new TreeMap<String, Integer>();

    TreeMap<TaskAttemptID, TaskInProgress> taskidToTIP = new TreeMap<TaskAttemptID, TaskInProgress>();

    TreeMap<JobID, PlannedJob> plannedJobs = new TreeMap<JobID, PlannedJob>();

    // Watch and expire TaskTracker objects using these structures.
    // We can map from Name->TaskTrackerStatus, or we can expire by time.
    int totalMaps = 0;
    int totalReduces = 0;

    // Used to provide an HTML view on Job, Task, and TaskTracker structures
    final HttpServer infoServer;
    int infoPort;

    String defaultNotificationUrl;
    String defaultSpeculationEventUrl;
    String defaultSkewReportUrl;
    String trackerHttp;
    boolean speculativeSplit;

    Server interTrackerServer;

    // Some jobs are stored in a local system directory. We can delete
    // the files when we're done with the job.
    static final String SUBDIR = "skewtune-jt";
    FileSystem fs = null;
    Path systemDir = null;
    JobConf conf;

    private final UserGroupInformation mrOwner;
    private final String supergroup;

    // TODO currently the SkewTune job tracker is associated with a single
    // Hadoop job tracker. support multiple trackers?
    private final Cluster cluster;
    private final ClientProtocol jtClient;

    private boolean dumpHeartbeat;

    private static LocalDirAllocator lDirAlloc = new LocalDirAllocator(MRConfig.LOCAL_DIR);

    int reservedMaps;
    int reservedReduces;

    // TO BE USED BY TEST CLASSES ONLY
    // ONLY BUILD THE STATE WHICH IS REQUIRED BY TESTS
    STJobTracker() {
        infoServer = null;
        supergroup = null;
        trackerIdentifier = null;
        mrOwner = null;
        secretManager = null;
        cluster = null;
        jtClient = null;
    }

    /**
     * Start the JobTracker process, listen on the indicated port
     */
    STJobTracker(JobConf conf) throws IOException, InterruptedException {
        this(conf, generateNewIdentifier());
    }

    @SuppressWarnings("unchecked")
    STJobTracker(final JobConf conf, String jobtrackerIdentifier)
            throws IOException, InterruptedException {
        // find the owner of the process
        // get the desired principal to load
        String keytabFilename = conf.get(JTConfig.JT_KEYTAB_FILE);
        UserGroupInformation.setConfiguration(conf);
        if (keytabFilename != null) {
            String desiredUser = conf.get(JTConfig.JT_USER_NAME,
                    System.getProperty("user.name"));
            UserGroupInformation.loginUserFromKeytab(desiredUser, keytabFilename);
            mrOwner = UserGroupInformation.getLoginUser();
        } else {
            mrOwner = UserGroupInformation.getCurrentUser();
        }

        supergroup = conf.get(MR_SUPERGROUP, "supergroup");
        LOG.info("Starting jobtracker with owner as " + mrOwner.getShortUserName()
                + " and supergroup as " + supergroup);

        long secretKeyInterval = conf.getLong(
                MRConfig.DELEGATION_KEY_UPDATE_INTERVAL_KEY,
                MRConfig.DELEGATION_KEY_UPDATE_INTERVAL_DEFAULT);
        long tokenMaxLifetime = conf.getLong(
                MRConfig.DELEGATION_TOKEN_MAX_LIFETIME_KEY,
                MRConfig.DELEGATION_TOKEN_MAX_LIFETIME_DEFAULT);
        long tokenRenewInterval = conf.getLong(
                MRConfig.DELEGATION_TOKEN_RENEW_INTERVAL_KEY,
                MRConfig.DELEGATION_TOKEN_RENEW_INTERVAL_DEFAULT);
        secretManager = new DelegationTokenSecretManager(secretKeyInterval,
                tokenMaxLifetime, tokenRenewInterval, DELEGATION_TOKEN_GC_INTERVAL);
        secretManager.startThreads();

        //
        // Grab some static constants
        //
        NUM_HEARTBEATS_IN_SECOND = conf.getInt(JT_HEARTBEATS_IN_SECOND,
                DEFAULT_NUM_HEARTBEATS_IN_SECOND);
        if (NUM_HEARTBEATS_IN_SECOND < MIN_NUM_HEARTBEATS_IN_SECOND) {
            NUM_HEARTBEATS_IN_SECOND = DEFAULT_NUM_HEARTBEATS_IN_SECOND;
        }

        HEARTBEATS_SCALING_FACTOR = conf.getFloat(JT_HEARTBEATS_SCALING_FACTOR,
                DEFAULT_HEARTBEATS_SCALING_FACTOR);
        if (HEARTBEATS_SCALING_FACTOR < MIN_HEARTBEATS_SCALING_FACTOR) {
            HEARTBEATS_SCALING_FACTOR = DEFAULT_HEARTBEATS_SCALING_FACTOR;
        }

        // whether to dump every heartbeat message even when DEBUG is enabled
        dumpHeartbeat = conf.getBoolean(JT_HEARTBEATS_DUMP, false);

        // This is a directory of temporary submission files. We delete it
        // on startup, and can delete any files that we're done with
        this.conf = conf;
        JobConf jobConf = new JobConf(conf);

        // Set ports, start RPC servers, setup security policy etc.
        InetSocketAddress addr = getAddress(conf);
        this.localMachine = addr.getHostName();
        this.port = addr.getPort();

        int handlerCount = conf.getInt(JT_IPC_HANDLER_COUNT, 10);
        this.interTrackerServer = RPC.getServer(SkewTuneClientProtocol.class,
                this, addr.getHostName(), addr.getPort(), handlerCount, false,
                conf, secretManager);
        if (LOG.isDebugEnabled()) {
            Properties p = System.getProperties();
            for (Iterator it = p.keySet().iterator(); it.hasNext();) {
                String key = (String) it.next();
                String val = p.getProperty(key);
                LOG.debug("Property '" + key + "' is " + val);
            }
        }

        InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(conf.get(
                JT_HTTP_ADDRESS, String.format("%s:0", this.localMachine)));
        String infoBindAddress = infoSocAddr.getHostName();
        int tmpInfoPort = infoSocAddr.getPort();
        this.startTime = System.currentTimeMillis();
        infoServer = new HttpServer("job", infoBindAddress, tmpInfoPort,
                tmpInfoPort == 0, conf);
        infoServer.setAttribute("job.tracker", this);
        infoServer.addServlet("jobcompletion", "/completion", JobCompletionServlet.class);
        infoServer.addServlet("taskspeculation", "/speculation", SpeculationEventServlet.class);
        infoServer.addServlet("skewreport", "/skew", SkewReportServlet.class);
        infoServer.addServlet("tasksplit", "/split/*", SplitTaskServlet.class);
        infoServer.addServlet("tasksplitV2", "/splitV2/*", SplitTaskV2Servlet.class);
        infoServer.start();

        this.trackerIdentifier = jobtrackerIdentifier;

        // The rpc/web-server ports can be ephemeral ports...
        // ... ensure we have the correct info
        this.port = interTrackerServer.getListenerAddress().getPort();
        this.conf.set(JT_IPC_ADDRESS, (this.localMachine + ":" + this.port));
        LOG.info("JobTracker up at: " + this.port);
        this.infoPort = this.infoServer.getPort();
        this.conf.set(JT_HTTP_ADDRESS, infoBindAddress + ":" + this.infoPort);
        LOG.info("JobTracker webserver: " + this.infoServer.getPort());

        this.defaultNotificationUrl = String.format(
                "http://%s:%d/completion?jobid=$jobId&status=$jobStatus",
                infoBindAddress, this.infoPort);
        LOG.info("JobTracker completion URI: " + defaultNotificationUrl);
        // this.defaultSpeculationEventUrl = String.format("http://%s:%d/speculation?taskid=$taskId&remainTime=$taskRemainTime",infoBindAddress,this.infoPort);
        this.defaultSpeculationEventUrl = String.format(
                "http://%s:%d/speculation?jobid=$jobId", infoBindAddress, this.infoPort);
        LOG.info("JobTracker speculation event URI: " + defaultSpeculationEventUrl);
        this.defaultSkewReportUrl = String.format("http://%s:%d/skew",
                infoBindAddress, this.infoPort);
        LOG.info("JobTracker skew report event URI: " + defaultSkewReportUrl);
        this.trackerHttp = String.format("http://%s:%d", infoBindAddress, this.infoPort);

        while (!Thread.currentThread().isInterrupted()) {
            try {
                // if we haven't contacted the namenode go ahead and do it
                if (fs == null) {
                    fs = mrOwner.doAs(new PrivilegedExceptionAction<FileSystem>() {
                        @Override
                        public FileSystem run() throws IOException {
                            return FileSystem.get(conf);
                        }
                    });
                }

                // clean up the system dir, which will only work if hdfs is
                // out of safe mode
                if (systemDir == null) {
                    systemDir = new Path(getSystemDir());
                }
                try {
                    FileStatus systemDirStatus = fs.getFileStatus(systemDir);
                    if (!systemDirStatus.getOwner().equals(mrOwner.getShortUserName())) {
                        throw new AccessControlException("The systemdir " + systemDir
                                + " is not owned by " + mrOwner.getShortUserName());
                    }
                    if (!systemDirStatus.getPermission().equals(SYSTEM_DIR_PERMISSION)) {
                        LOG.warn("Incorrect permissions on " + systemDir
                                + ". Setting it to " + SYSTEM_DIR_PERMISSION);
                        fs.setPermission(systemDir, new FsPermission(SYSTEM_DIR_PERMISSION));
                    } else {
                        break;
                    }
                } catch (FileNotFoundException fnf) {
                    // ignore
                }
            } catch (AccessControlException ace) {
                LOG.warn("Failed to operate on " + JTConfig.JT_SYSTEM_DIR + "("
                        + systemDir + ") because of permissions.");
                LOG.warn("Manually delete the " + JTConfig.JT_SYSTEM_DIR + "("
                        + systemDir + ") and then start the JobTracker.");
                LOG.warn("Bailing out ... ");
                throw ace;
            } catch (IOException ie) {
                LOG.info("problem cleaning system directory: " + systemDir, ie);
            }
            Thread.sleep(FS_ACCESS_RETRY_PERIOD);
        }

        if (Thread.currentThread().isInterrupted()) {
            throw new InterruptedException();
        }

        // initialize cluster variable
        cluster = new Cluster(this.conf);

        // now create a job client proxy
        jtClient = (ClientProtocol) RPC.getProxy(ClientProtocol.class,
                ClientProtocol.versionID, JobTracker.getAddress(conf), mrOwner,
                this.conf, NetUtils.getSocketFactory(conf, ClientProtocol.class));

        new SpeculativeScheduler().start();

        // initialize task event fetcher
        new TaskCompletionEventFetcher().start();

        // Like 'localDir', except this always works on the local disk.
        asyncDiskService = new MRAsyncDiskService(FileSystem.getLocal(conf),
                conf.getLocalDirs());
        asyncDiskService.moveAndDeleteFromEachVolume(SUBDIR);

        // keep at least one asynchronous worker per CPU core
        int numProcs = Runtime.getRuntime().availableProcessors();
        LOG.info("# of available processors = " + numProcs);
        int maxFactor = conf.getInt(JT_MAX_ASYNC_WORKER_FACTOR, 2);
        asyncWorkers = new ThreadPoolExecutor(numProcs, numProcs * maxFactor, 30,
                TimeUnit.SECONDS, new SynchronousQueue<Runnable>(true),
                new ThreadPoolExecutor.CallerRunsPolicy());

        speculativeSplit = conf.getBoolean(JT_SPECULATIVE_SPLIT, false);
    }

    private static SimpleDateFormat getDateFormat() {
        return new SimpleDateFormat("yyyyMMddHHmm");
    }

    private static String generateNewIdentifier() {
        return getDateFormat().format(new Date());
    }

    static boolean validateIdentifier(String id) {
        try {
            // the jobtracker id should be 'date' parseable
            getDateFormat().parse(id);
            return true;
        } catch (ParseException pe) {
        }
        return false;
    }

    static boolean validateJobNumber(String id) {
        try {
            // the job number should be integer parseable
            Integer.parseInt(id);
            return true;
        } catch (IllegalArgumentException pe) {
        }
        return false;
    }

    /**
     * Get JobTracker's FileSystem. This is the filesystem for
     * mapreduce.system.dir.
     */
    FileSystem getFileSystem() {
        return fs;
    }

    /**
     * Get JobTracker's LocalFileSystem handle. This is used by jobs for
     * localizing job files to the local disk.
     */
    LocalFileSystem getLocalFileSystem() throws IOException {
        return FileSystem.getLocal(conf);
    }

    public static InetSocketAddress getAddress(Configuration conf) {
        String jobTrackerStr = conf.get(JT_IPC_ADDRESS, "localhost:9012");
        return NetUtils.createSocketAddr(jobTrackerStr);
    }
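    // Illustrative sketch (not part of the original source): the tracker
    // identifier produced by generateNewIdentifier() above is just the start
    // time formatted as yyyyMMddHHmm. For example, a tracker started at 13:45
    // on May 1, 2012 gets the identifier "201205011345", and
    // validateIdentifier("201205011345") returns true because the string
    // parses back as a date.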
    /**
     * Run forever
     */
    public void offerService() throws InterruptedException, IOException {
        // Prepare for recovery. This is done irrespective of the status of
        // the restart flag.

        // start the inter-tracker server once the jt is ready
        this.interTrackerServer.start();

        synchronized (this) {
            state = State.RUNNING;
        }
        LOG.info("Starting RUNNING");

        this.interTrackerServer.join();
        LOG.info("Stopped interTrackerServer");
    }

    void close() throws IOException {
        if (this.infoServer != null) {
            LOG.info("Stopping infoServer");
            try {
                this.infoServer.stop();
            } catch (Exception ex) {
                LOG.warn("Exception shutting down JobTracker", ex);
            }
        }
        if (this.interTrackerServer != null) {
            LOG.info("Stopping interTrackerServer");
            this.interTrackerServer.stop();
        }
        LOG.info("stopped all jobtracker services");
        return;
    }

    // /////////////////////////////////////////////////////
    // Accessors for objects that want info on jobs, tasks,
    // trackers, etc.
    // /////////////////////////////////////////////////////
    public int getTotalSubmissions() {
        return totalSubmissions;
    }

    public String getJobTrackerMachine() {
        return localMachine;
    }

    /**
     * Get the unique identifier (ie. timestamp) of this job tracker start.
     *
     * @return a string with a unique identifier
     */
    public String getTrackerIdentifier() {
        return trackerIdentifier;
    }

    public int getTrackerPort() {
        return port;
    }

    public int getInfoPort() {
        return infoPort;
    }

    public long getStartTime() {
        return startTime;
    }

    public Cluster getCluster() {
        return cluster;
    }

    // //////////////////////////////////////////////////
    // InterTrackerProtocol
    // //////////////////////////////////////////////////
    public String getBuildVersion() throws IOException {
        return VersionInfo.getBuildVersion();
    }

    /**
     * Grab the local fs name
     */
    public synchronized String getFilesystemName() throws IOException {
        if (fs == null) {
            throw new IllegalStateException("FileSystem object not available yet");
        }
        return fs.getUri().toString();
    }

    /**
     * Remove the job_ prefix from job ids to get the unique string.
     */
    static String getJobUniqueString(String jobid) {
        return jobid.substring(4);
    }

    /**
     * @see org.apache.hadoop.mapreduce.protocol.ClientProtocol#getSystemDir()
     */
    public String getSystemDir() {
        Path sysDir = new Path(conf.get(JTConfig.JT_SYSTEM_DIR,
                "/tmp/hadoop/mapred/system"));
        return fs.makeQualified(sysDir).toString();
    }

    /**
     * @throws LoginException
     * @see org.apache.hadoop.mapreduce.protocol.ClientProtocol#getStagingAreaDir()
     */
    public String getStagingAreaDir() throws IOException {
        try {
            final String user = UserGroupInformation.getCurrentUser().getShortUserName();
            return mrOwner.doAs(new PrivilegedExceptionAction<String>() {
                @Override
                public String run() throws Exception {
                    Path stagingRootDir = new Path(conf.get(
                            JTConfig.JT_STAGING_AREA_ROOT,
                            "/tmp/hadoop/mapred/staging"));
                    FileSystem fs = stagingRootDir.getFileSystem(conf);
                    return fs.makeQualified(new Path(stagingRootDir,
                            user + "/.staging")).toString();
                }
            });
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        }
    }

    // /////////////////////////////////////////////////////////////
    // JobTracker methods
    // /////////////////////////////////////////////////////////////

    // Get the job directory in system directory
    Path getSystemDirectoryForJob(org.apache.hadoop.mapreduce.JobID id) {
        return new Path(getSystemDir(), id.toString());
    }

    // Get the job token file in system directory
    Path getSystemFileForJob(org.apache.hadoop.mapreduce.JobID id) {
        return new Path(getSystemDirectoryForJob(id) + "/" + JOB_INFO_FILE);
    }
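    // Illustrative sketch (not part of the original source): the per-job
    // layout produced by the two helpers above, with the default
    // JT_SYSTEM_DIR ("/tmp/hadoop/mapred/system") and a hypothetical job id:
    //
    //   /tmp/hadoop/mapred/system/job_201205011345_0007           <- getSystemDirectoryForJob
    //   /tmp/hadoop/mapred/system/job_201205011345_0007/job-info  <- getSystemFileForJob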
    /**
     * Is the calling user a super user? Or part of the supergroup?
     *
     * @return true, if it is a super user
     */
    static boolean isSuperUserOrSuperGroup(UserGroupInformation callerUGI,
            UserGroupInformation superUser, String superGroup) {
        if (superUser.getShortUserName().equals(callerUGI.getShortUserName())) {
            return true;
        }
        String[] groups = callerUGI.getGroupNames();
        for (int i = 0; i < groups.length; ++i) {
            if (groups[i].equals(superGroup)) {
                return true;
            }
        }
        return false;
    }

    UserGroupInformation getMROwner() {
        return mrOwner;
    }

    String getSuperGroup() {
        return supergroup;
    }

    public static String getUserDir(String user) {
        return SUBDIR + Path.SEPARATOR + user;
    }

    public static String getLocalJobDir(String user, String jobid) {
        return getUserDir(user) + Path.SEPARATOR + jobid;
    }

    public static String getLocalJobTokenFile(String user, String jobid) {
        return getLocalJobDir(user, jobid) + Path.SEPARATOR + JOB_TOKEN_FILE;
    }

    public static String getLocalSplitFile(String user, String jobid) {
        return getLocalJobDir(user, jobid) + Path.SEPARATOR + LOCAL_SPLIT_FILE;
    }

    public static String getLocalSplitMetaFile(String user, String jobid) {
        return getLocalJobDir(user, jobid) + Path.SEPARATOR + LOCAL_SPLIT_META_FILE;
    }

    // //////////////////////////////////////////////////////////
    // main()
    // //////////////////////////////////////////////////////////

    /**
     * Start the JobTracker process. This is used only for debugging. As a
     * rule, JobTracker should be run as part of the DFS Namenode process.
     */
    public static void main(String argv[]) throws IOException, InterruptedException {
        StringUtils.startupShutdownMessage(STJobTracker.class, argv, LOG);
        try {
            if (argv.length == 0) {
                STJobTracker tracker = startTracker(new JobConf());
                tracker.offerService();
            } else {
                if ("-dumpConfiguration".equals(argv[0]) && argv.length == 1) {
                    dumpConfiguration(new PrintWriter(System.out));
                    System.out.println();
                } else {
                    System.out.println("usage: JobTracker [-dumpConfiguration]");
                    System.exit(-1);
                }
            }
        } catch (Throwable e) {
            LOG.fatal(StringUtils.stringifyException(e));
            System.exit(-1);
        }
    }

    /**
     * Dumps the configuration properties in Json format
     *
     * @param writer
     *            {@link Writer} object to which the output is written
     * @throws IOException
     */
    private static void dumpConfiguration(Writer writer) throws IOException {
        Configuration.dumpConfiguration(new JobConf(), writer);
        writer.write("\n");
    }

    @Override
    public void refreshUserToGroupsMappings(Configuration conf) throws IOException {
        LOG.info("Refreshing all user-to-groups mappings. Requested by user: "
                + UserGroupInformation.getCurrentUser().getShortUserName());
        Groups.getUserToGroupsMappingService(conf).refresh();
    }

    /**
     * Discard a current delegation token.
     */
    @Override
    public void cancelDelegationToken(Token<DelegationTokenIdentifier> token)
            throws IOException, InterruptedException {
        String user = UserGroupInformation.getCurrentUser().getUserName();
        secretManager.cancelToken(token, user);
    }

    /**
     * Get a new delegation token.
     */
    @Override
    public Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
            throws IOException, InterruptedException {
        UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
        Text owner = new Text(ugi.getUserName());
        Text realUser = null;
        if (ugi.getRealUser() != null) {
            realUser = new Text(ugi.getRealUser().getUserName());
        }
        DelegationTokenIdentifier ident = new DelegationTokenIdentifier(owner,
                renewer, realUser);
        return new Token<DelegationTokenIdentifier>(ident, secretManager);
    }

    /**
     * Renew a delegation token to extend its lifetime.
     */
    @Override
    public long renewDelegationToken(Token<DelegationTokenIdentifier> token)
            throws IOException, InterruptedException {
        String user = UserGroupInformation.getCurrentUser().getUserName();
        return secretManager.renewToken(token, user);
    }
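    // Illustrative sketch (not part of the original source): how a client
    // might drive the three delegation-token RPCs above. `stClient` is a
    // hypothetical RPC proxy implementing SkewTuneClientProtocol.
    //
    //   Token<DelegationTokenIdentifier> token =
    //           stClient.getDelegationToken(new Text("renewer"));
    //   long nextExpiry = stClient.renewDelegationToken(token); // extend lifetime
    //   stClient.cancelDelegationToken(token);                  // discard when done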
    /*
     * skew reduce logic
     */

    /**
     * Caller should hold the lock on the tracker object (as heartbeat() does).
     *
     * @param tip
     * @param trackerName
     */
    void createTaskEntry(TaskAttemptID taskid, String taskTracker, TaskInProgress tip) {
        this.taskidToTIP.put(taskid, tip);

        JobID jobid = taskid.getJobID();
        synchronized (plannedJobs) {
            PlannedJob job = this.plannedJobs.get(jobid);
            if (job != null && job.remove(taskid)) {
                this.plannedJobs.remove(jobid);
            }
        }
    }

    void removeTaskEntry(TaskAttemptID taskid) {
        taskidToTIP.remove(taskid);
    }

    PartitionPlanner.ClusterInfo getClusterAvailability(ReactionContext context, long now)
            throws IOException, InterruptedException {
        ClusterMetrics metrics = cluster.getClusterStatus();
        TaskAttemptID attemptId = context.getTargetAttemptID();
        TaskType type = attemptId == null ? context.getTaskID().getTaskType()
                : attemptId.getTaskType();
        int maxSlots = type == TaskType.MAP ? metrics.getMapSlotCapacity()
                : metrics.getReduceSlotCapacity();
        int runningSlots = type == TaskType.MAP ? metrics.getRunningMaps()
                : metrics.getRunningReduces();
        int runningSkewTune = 0;

        double[] remainingTimes = new double[maxSlots];
        int from = maxSlots;

        // if this is a speculative REDUCE, the original slot becomes
        // available. We should make it available.
        boolean availRightNow = attemptId != null && type == TaskType.REDUCE
                && context.getTimePerByte() == 0.f;

        synchronized (this) {
            // FIXME this only involves tasks that are scheduled and running.
            // we should keep the expected information as well.
            // on planning, we should add the planned tasks, and
            // getClusterAvailability should incorporate anything planned.
            // the information required:
            // Map<JobID, [long planned at, for tasks -- estimated runtime]>
            // on first heartbeat from each task, we remove each information.
            for (Map.Entry<TaskAttemptID, TaskInProgress> e : taskidToTIP.entrySet()) {
                TaskAttemptID taskid = e.getKey();
                if (taskid.getTaskType() == type) { // extra check
                    if (availRightNow && taskid.equals(attemptId))
                        continue; // this will become available immediately
                    TaskInProgress tip = e.getValue();
                    double t = tip.getRemainingTime(taskid, now);
                    if (t > 0.) {
                        remainingTimes[--from] = tip.getRemainingTime(taskid, now);
                        ++runningSkewTune;
                        if (from == 0)
                            break;
                    }
                }
            }

            if (from > 0) {
                synchronized (plannedJobs) {
                    for (Map.Entry<JobID, PlannedJob> e : this.plannedJobs.entrySet()) {
                        PlannedJob plan = e.getValue();
                        from = plan.fillCompletionTime(type, now, remainingTimes, from);
                        if (from == 0)
                            break;
                    }
                }
            }
        }

        Arrays.sort(remainingTimes, from, maxSlots);

        if (LOG.isDebugEnabled()) {
            LOG.debug("cluster availability = " + Arrays.toString(remainingTimes));
        }

        // FIXME incorporate other tasks that are not SkewTune
        return new PartitionPlanner.ClusterInfo(type, maxSlots, runningSlots,
                runningSkewTune, remainingTimes, maxSlots);
    }

    /**
     * SkewTune heartbeat protocol
     *
     * REQUEST (Heartbeat)
     *
     * HOST TaskAttemptID -- status report (initialization|mapoutput|completed)
     * progress [split] TaskAttemptID (initialization|mapoutput|completed)
     * progress [split] ...
     *
     * RESPONSE
     *
     * TaskAttemptID (keep going | new map output [] | cancel)
     *
     * .skewtune/m-0000?/part-m-XXXXX ...
     *
     * The protocol is soft-state. The jobtracker responds to each heartbeat
     * with the tasks to cancel and the list of jobs in the heartbeat message
     * that are unknown to it.
     * The task tracker is supposed to reclaim space occupied by the unknown
     * jobs.
     */
    @Override
    public synchronized HeartbeatResponse heartbeat(TaskTrackerStatus status,
            boolean justStarted, boolean justInited, short responseId)
            throws IOException, InterruptedException {
        if (LOG.isDebugEnabled() && dumpHeartbeat) {
            LOG.debug("Got heartbeat from: " + status.getTrackerName()
                    + " with responseId: " + responseId);
        }

        String trackerName = status.getTrackerName();
        long now = System.currentTimeMillis();
        short newResponseId = (short) (responseId + 1);

        status.setLastSeen(now);
        trackerToLastHeartbeat.put(trackerName, status);
        trackerToHttpPort.put(trackerName, status.getHttpPort());

        HashSet<JobID> unknownJobs = new HashSet<JobID>();
        ArrayList<ReactiveMapOutput> newMapOutput = new ArrayList<ReactiveMapOutput>();
        // ArrayList<TaskAttemptID> cancelledTasks = new ArrayList<TaskAttemptID>();
        ArrayList<TaskAction> taskActions = new ArrayList<TaskAction>();
        ArrayList<TaskStatusEvent> newTakeOver = new ArrayList<TaskStatusEvent>();

        // per job -- processing
        // FIXME retrieve task tracker
        // FIXME for each job, update task status, build host-task map
        for (JobOnTaskTracker jobReport : status.getJobReports()) {
            JobID jobid = jobReport.getJobID();
            JobInProgress jip = null;
            boolean pendingReactive = false;
            synchronized (jobs) {
                jip = jobs.get(jobid);
            }
            if (jip == null) {
                synchronized (pendingCompletedReactiveJob) {
                    jip = pendingCompletedReactiveJob.get(jobid);
                }
                pendingReactive = jip != null;
            }

            if (jip == null) {
                // FIXME check the pending completion list
                unknownJobs.add(jobid); // this job must be cleared
            } else {
                int from = jobReport.getFromIndex();
                int fromTakeOver = jobReport.getFromIndexOfTakeOver();
                final JobType jobType = jip.getJobType();
                BitSet completed = new BitSet(jip.getNumMapTasks());

                synchronized (jip) {
                    // load job token into this node
                    if (jobType == JobType.ORIGINAL || jobType == JobType.REDUCE_REACTIVE) {
                        // we only need to load it for the original job
                        // FIXME we need to load it for other jobs if we
                        // support recursive split
                        scheduleJobTokenLoading(jip);
                    }

                    // update statistics of this task
                    for (STTaskStatus taskStatus : jobReport.getTaskReports()) {
                        int action = jip.handleTaskHeartbeat(taskStatus,
                                status.getHostName(), completed);
                        if (action != 0) {
                            taskActions.add(new TaskAction(taskStatus.getTaskID(), action));
                        }
                        // if ( jip.handleTaskHeartbeat(taskStatus,status.getHostName(),completed) != 0 ) {
                        //     cancelledTasks.add(taskStatus.getTaskID()); // FIXME create task action
                        // }
                    }

                    // fetch all available new map output from FROM
                    if (from >= 0) {
                        jip.retrieveNewMapOutput(newMapOutput, from);
                    }
                    if (fromTakeOver >= 0) {
                        jip.retrieveNewTakeOver(newTakeOver, fromTakeOver);
                    }

                    if (jobType == JobType.MAP_REACTIVE && pendingReactive) {
                        if (jip.isAllMapOutputIndexAvailable()) {
                            synchronized (pendingCompletedReactiveJob) {
                                pendingCompletedReactiveJob.remove(jobid);
                            }
                            cleanupPendingReactiveMap(jip);
                        }
                    }
                }

                // if ( jobType == JobType.ORIGINAL ) {
                //     jip.notifyMapCompletion(completed);
                // }
            }
        }

        int nextInterval = getNextHeartbeatInterval();

        return new HeartbeatResponse(newResponseId, nextInterval,
                newMapOutput.toArray(new ReactiveMapOutput[newMapOutput.size()]),
                // cancelledTasks.toArray(new TaskAttemptID[cancelledTasks.size()]),
                taskActions.toArray(new TaskAction[taskActions.size()]),
                unknownJobs.toArray(new JobID[unknownJobs.size()]),
                newTakeOver.toArray(new TaskStatusEvent[newTakeOver.size()]));
    }

    /**
     * Calculates the next heartbeat interval using cluster size. The interval
     * grows by 1 second for every 100 nodes by default.
     *
     * @return next heartbeat interval.
     */
    public int getNextHeartbeatInterval() {
        // get the no of task trackers
        int clusterSize = clusterMetrics == null ? 0 : clusterMetrics.getTaskTrackerCount();
        int heartbeatInterval = Math.max((int) (1000 * HEARTBEATS_SCALING_FACTOR
                * Math.ceil((double) clusterSize / NUM_HEARTBEATS_IN_SECOND)), 3000);
        return heartbeatInterval;
    }
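    // Illustrative sketch (not part of the original source): the tracker side
    // of the soft-state protocol above. `proxy` is a hypothetical RPC proxy
    // for SkewTuneTrackerProtocol, and the accessor names on
    // HeartbeatResponse are assumptions.
    //
    //   short responseId = 0;
    //   while (running) {
    //       HeartbeatResponse r = proxy.heartbeat(status, justStarted, justInited, responseId);
    //       responseId = r.getResponseId();          // echo back the new id
    //       apply(r);                                // cancel/split tasks, purge unknown jobs
    //       Thread.sleep(r.getHeartbeatInterval());  // interval from getNextHeartbeatInterval()
    //   }
    //
    // With the defaults (100 heartbeats/sec, scaling factor 1.0), a
    // 250-tracker cluster yields ceil(250/100) = 3, so 1000 * 1.0 * 3 =
    // 3000 ms; max(..., 3000) enforces the same 3-second floor.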
    @Override
    public void submitJob(JobID jobId, Configuration conf)
            throws IOException, InterruptedException {
        // initialize task
        // setup notification URL
        conf.set(END_NOTIFICATION_URL, this.defaultNotificationUrl);

        Job runningJob = cluster.getJob(jobId);
        JobInProgress jip = new JobInProgress(this, runningJob, conf);
        jip.eventSplitMeta = this.asyncWorkers.submit(new LoadInputSplitMeta(jip));

        // now append to the job
        synchronized (jobs) {
            jobs.put(jobId, jip);
        }
        synchronized (originalJobs) {
            originalJobs.add(jip);
        }

        // FIXME reserve maps and reduces accordingly
        LOG.info("job " + jobId + " has been submitted");
        // localizeJobFiles(jip,true); // should we always cache the split?
    }

    @Override
    public void killJob(JobID jobid) throws IOException, InterruptedException {
        JobInProgress jip = null;
        synchronized (jobs) {
            jip = jobs.get(jobid);
        }
        if (jip == null) {
            LOG.warn("Unknown jobid: " + jobid.toString());
        } else {
            jip.kill(true, pendingCompletedReactiveJob);
        }
    }

    private Future<JobID> fastSplitTask(TaskID taskid, int n)
            throws IOException, InterruptedException {
        JobInProgress jip = null;
        synchronized (jobs) {
            jip = jobs.get(taskid.getJobID());
        }
        if (jip == null) {
            String msg = "unknown task " + taskid;
            LOG.error(msg);
            throw new IOException(msg);
        }
        TaskInProgress tip = jip.getTaskInProgress(taskid);
        ReactionContext context = taskid.getTaskType() == TaskType.MAP
                ? new ReexecMap(tip, n) : new ReexecReduce(tip);
        return fastSplitTask(context, true);
        // return fastSplitTask(taskid,n,true);
    }

    /*
    private Future<JobID> fastSplitTask(TaskID taskid, int n, boolean speculative)
            throws IOException, InterruptedException {
        JobInProgress jip = null;
        synchronized (jobs) {
            jip = jobs.get(taskid.getJobID());
        }
        if ( jip == null ) {
            String msg = "unknown task " + taskid;
            LOG.error(msg);
            throw new IOException(msg);
        }

        synchronized (pendingReactiveJob) {
            if ( ! jip.canSpeculateThis(taskid)
                    || pendingReactiveJob.contains(taskid)
                    || jip.hasReactiveJob(taskid) ) {
                // being paranoid.
                LOG.warn("reactive job is already scheduled or running for "+taskid);
                return null;
            }
            pendingReactiveJob.add(taskid);
        }

        if ( LOG.isInfoEnabled() ) {
            LOG.info(String.format("split task %s into %d tasks",taskid.toString(),n));
        }

        // FIXME split the task using asynchronous task
        // check whether both job token and meta data have been loaded
        JobID jobid = null;
        try {
            jip.waitUntilReadyToSplit(taskid);
            if ( LOG.isDebugEnabled() ) {
                LOG.debug("scheduling asynchronous split task for task "+taskid);
            }
            TaskInProgress tip = jip.getTaskInProgress(taskid);
            ReactionContext context = taskid.getTaskType() == TaskType.MAP
                    ? new ReexecMap(tip,n) : new ReexecReduce(tip);
            return this.asyncWorkers.submit(new SplitTask(context, speculative));
            // return this.asyncWorkers.submit(new SplitTask(jip, taskid, n, context, speculative));
        } catch (ExecutionException e) {
            throw new IOException(e.getCause()); // wrap again!
        }
    }
    */
    private Future<JobID> fastSplitTask(ReactionContext context, boolean speculative)
            throws IOException, InterruptedException {
        JobInProgress jip = context.getJob();
        TaskID taskid = context.getTaskID();
        synchronized (pendingReactiveJob) {
            if (!jip.canSpeculateThis(taskid) || pendingReactiveJob.contains(taskid)
                    || jip.hasReactiveJob(taskid)) {
                // being paranoid.
                LOG.warn("reactive job is already scheduled or running for " + taskid);
                return null;
            }
            pendingReactiveJob.add(taskid);
        }

        // FIXME split the task using asynchronous task
        // check whether both job token and meta data have been loaded
        JobID jobid = null;
        try {
            jip.waitUntilReadyToSplit(taskid);
            if (LOG.isDebugEnabled()) {
                LOG.debug("scheduling asynchronous split task for task " + taskid);
            }
            //
            // long now = System.currentTimeMillis();
            // ClusterInfo clusterInfo = this.getClusterInfo(context,now);
            // Plan p = PartitionPlanner.plan(context, clusterInfo, now);
            //
            // if ( LOG.isInfoEnabled() ) {
            //     LOG.info(String.format("split task %s into %d tasks",taskid.toString(),p.getNumPartitions()));
            // }
            return this.asyncWorkers.submit(new SplitTask(context, speculative));
            // return this.asyncWorkers.submit(new SplitTask(jip, taskid, p.getNumPartitions(), context, speculative));
        } catch (ExecutionException e) {
            throw new IOException(e.getCause()); // wrap again!
        }
    }

    private Future<JobID> launchScanTask(JobInProgress jip, TaskID taskid,
            JobInProgress.ReactionContext action) throws IOException, InterruptedException {
        return this.asyncWorkers.submit(new ScanTask(jip, taskid, action));
    }

    private Future<JobID> launchPlanAndLaunchTask(JobInProgress jip, TaskID taskid,
            JobInProgress.ReactionContext action) throws IOException, InterruptedException {
        return this.asyncWorkers.submit(new PlanAndLaunchTask(jip, taskid, action));
    }

    private Future<JobID> launchPlanAndLaunchTask(ScanTask scanTask)
            throws IOException, InterruptedException {
        return this.asyncWorkers.submit(new PlanAndLaunchTask(scanTask));
    }

    @Override
    public JobID splitTask(TaskID taskid, int n) throws IOException, InterruptedException {
        try {
            JobID jobid = fastSplitTask(taskid, n).get();
            if (jobid != null) {
                LOG.info("new split job " + jobid);
            }
            return jobid;
        } catch (ExecutionException e) {
            throw new IOException(e.getCause()); // wrap again!
        }
    }
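    // Illustrative sketch (not part of the original source): forcing a split
    // through the public splitTask RPC above, e.g. splitting a straggling
    // reduce into 4 reactive tasks. `stClient` is a hypothetical
    // SkewTuneClientProtocol proxy and the task id is made up.
    //
    //   JobID reactive = stClient.splitTask(
    //           TaskID.forName("task_201205011345_0007_r_000001"), 4);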
    /*
    private Future<JobID> scheduleSpeculativeTask(JobID jobid)
            throws IOException, InterruptedException {
        JobInProgress jip = null;
        synchronized (jobs) {
            jip = jobs.get(jobid);
        }
        if ( jip == null ) {
            String msg = "unknown job "+jobid;
            LOG.error(msg);
            throw new IOException(msg);
        }

        STTaskStatus taskStatus = jip.findSpeculativeTask();
        if ( taskStatus == null ) {
            LOG.debug("Nothing to speculate for "+jobid);
            return null;
        }

        TaskID taskid = taskStatus.getTaskID().getTaskID();
        synchronized (pendingReactiveJob) {
            if ( pendingReactiveJob.contains(taskid) || jip.hasReactiveJob(taskid) ) {
                LOG.warn("reactive job is already running for "+taskid);
                return null;
            }
            pendingReactiveJob.add(taskid);
        }

        int n;
        if ( taskid.getTaskType() == TaskType.MAP ) {
            n = clusterMetrics.getMapSlotCapacity() - 1; // exclude original task
        } else {
            n = clusterMetrics.getReduceSlotCapacity() - 1; // exclude original task
        }

        if ( LOG.isInfoEnabled() ) {
            LOG.info(String.format("split task %s into %d tasks",taskid.toString(),n));
        }

        // FIXME split the task using asynchronous task
        // check whether both job token and meta data have been loaded
        try {
            jip.waitUntilReadyToSplit(taskid);
            if ( LOG.isDebugEnabled() ) {
                LOG.debug("scheduling asynchronous split task for task "+taskid);
            }
            return this.asyncWorkers.submit(new SplitTask(jip, taskid, n));
        } catch (ExecutionException e) {
            throw new IOException(e.getCause()); // wrap again!
        }
    }
    */

    ThreadPoolExecutor asyncWorkers;

    // mirroring job token?

    private void cleanupPendingReactiveMap(JobInProgress jip) {
        if (jip.getJobType() != JobType.MAP_REACTIVE) {
            throw new IllegalStateException("Invalid job type " + jip.getJobType());
        }
        if (!jip.isAllMapOutputIndexAvailable()) {
            throw new IllegalStateException("Reactive map " + jip.getJobID()
                    + " is cleaned up before receiving all map output indexes");
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("Pending reactive map " + jip.getJobID()
                    + " received all map output indexes");
        }
        jip.notifyParentJob(false);
        // we don't need a map at the end of the parent job
        if (jip.parent.getNumReduceTasks() > 0) {
            jip.parent.addPathToDelete(jip.getOutputPath());
        }
        jip.cleanup();
    }

    public static class JobCompletionServlet extends HttpServlet {
        private static final long serialVersionUID = -7533419814261205808L;

        @Override
        protected void doGet(HttpServletRequest req, HttpServletResponse resp)
                throws ServletException, IOException {
            String jobIdStr = req.getParameter("jobid");
            if (jobIdStr == null)
                return;
            final JobID jobId = JobID.forName(jobIdStr);
            String jobStatus = req.getParameter("status");

            if (LOG.isInfoEnabled()) {
                LOG.info("jobid = " + jobId + "; jobStatus = " + jobStatus);
            }

            final STJobTracker tracker = (STJobTracker) getServletContext()
                    .getAttribute("job.tracker");

            // look up this job
            JobInProgress thisJob = null;
            synchronized (tracker.jobs) {
                // this job has been completed, so we can safely remove it
                thisJob = tracker.jobs.remove(jobId);
            }
            synchronized (tracker.plannedJobs) {
                tracker.plannedJobs.remove(jobId);
            }

            boolean cleanup = true;

            if (thisJob == null) {
                ScanTask scanTask = null;
                synchronized (tracker.scanningJob) {
                    scanTask = tracker.scanningJob.remove(jobId);
                }
                if (scanTask == null) {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("completion of unknown job " + jobId);
                    }
                    resp.setStatus(HttpServletResponse.SC_NOT_FOUND);
                } else {
                    // scan is done. schedule planning and repartition task
                    if ("SUCCEEDED".equals(jobStatus)) {
                        // schedule planning
                        tracker.schedulePlanAndLaunch(scanTask);
                    } else {
                        // scan failed
                        LOG.warn("scan failed for task " + scanTask.getTaskID());
                        try {
                            scanTask.getJobInProgress().getOriginalJob().kill();
                        } catch (InterruptedException ignore) {
                        }
                    }
                }
            } else {
                JobInProgress.JobType jobType = thisJob.getJobType();
                // TODO schedule suicide!!!
                if (jobType == JobInProgress.JobType.ORIGINAL) {
                    // FIXME propagate notification URL
                    // FIXME should remove temporary files on exit
                    // if dependent jobs are running, then we should wait until
                    // all dependent jobs are killed, then remove the final
                    // output directory.
                    synchronized (tracker.originalJobs) {
                        tracker.originalJobs.remove(thisJob); // just in case
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Original job " + jobId + " has been " + jobStatus);
                    }
                    try {
                        // in case there are any remaining reactive jobs
                        thisJob.kill(true, tracker.pendingCompletedReactiveJob);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    tracker.pendingTakeOvers.unregister(jobId);

                    // if ( ! "SUCCEEDED".equals(jobStatus) ) {
                    // purge all temporary output
                    thisJob.deleteAll(tracker.getFileSystem());
                    // }
                } else {
                    int partition = thisJob.getPartition();
                    boolean cleanOutputFile = false;

                    if (jobType == JobInProgress.JobType.MAP_REACTIVE) {
                        if (thisJob.isAllMapOutputIndexAvailable()) {
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Reactive map " + jobId + " has been " + jobStatus);
                            }
                            if ("SUCCEEDED".equals(jobStatus)) {
                                thisJob.notifyParentJob(false);
                            }
                            // we don't need a map at the end of the parent job
                            if (thisJob.parent.getNumReduceTasks() > 0) {
                                cleanOutputFile = true;
                            }
                        } else {
                            if ("SUCCEEDED".equals(jobStatus)) {
                                if (LOG.isInfoEnabled()) {
                                    LOG.info("Reactive map " + jobId + " has been " + jobStatus
                                            + " but not all map output indexes have been retrieved");
                                }
                                // append this job to pending completion. should
                                // wait until we retrieve all map output indexes
                                synchronized (tracker.pendingCompletedReactiveJob) {
                                    tracker.pendingCompletedReactiveJob.put(jobId, thisJob);
                                }
                                cleanup = false;
                            } else {
                                // handover job failed. this should result in
                                // failure of the entire job.
                                if (!thisJob.isSpeculative()) {
                                    // FIXME what should we do? if this is a
                                    // take over, either retry or halt.
                                    try {
                                        thisJob.parent.kill();
                                    } catch (InterruptedException ignore) {
                                    }
                                }
                                cleanup = true;
                            }
                        }
                    } else {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Reactive reduce " + jobId + " has been " + jobStatus);
                        }
                        if ("SUCCEEDED".equals(jobStatus)) {
                            thisJob.notifyParentJob(true); // complete the original
                        } else {
                            cleanOutputFile = true;
                        }
                    }

                    if (cleanOutputFile) {
                        thisJob.parent.addPathToDelete(thisJob.getOutputPath());
                    }

                    // setup job status
                    thisJob.setState(Enum.valueOf(JobInProgress.State.class, jobStatus));
                }

                if (cleanup)
                    thisJob.cleanup();

                resp.setStatus(HttpServletResponse.SC_OK);
            }

            resp.setContentType("text/html");
            resp.setContentLength(0);
            resp.flushBuffer();
        }
    }

    public static class SpeculationEventServlet extends HttpServlet {
        private static final long serialVersionUID = 6460188604896069661L;

        @Override
        protected void doGet(HttpServletRequest req, HttpServletResponse resp)
                throws ServletException, IOException {
            // String taskIdStr = req.getParameter("taskid");
            // if (taskIdStr == null)
            //     return;
            // final TaskID taskId = TaskID.forName(taskIdStr);
            // String rt = req.getParameter("remainTime");
            // if ( rt != null ) {
            //     float remainTime = Float.parseFloat(rt);
            // }
            /*
            String jobIdStr = req.getParameter("jobid");
            final JobID jobId = JobID.forName(jobIdStr);

            if ( LOG.isInfoEnabled() ) {
                // LOG.info("speculative taskid = "+taskId + "; remain time = "+rt);
                LOG.info("speculative execution is available for "+jobIdStr);
            }

            final STJobTracker tracker = (STJobTracker) getServletContext().getAttribute("job.tracker");
            if ( tracker.speculativeSplit ) {
                try {
                    // FIXME how to split it? always binary?
                    // tracker.fastSplitTask(taskId, 2);
                    tracker.scheduleSpeculativeTask(jobId);
                } catch (InterruptedException e) {
                    throw new IOException(e);
                }
            }

            resp.setStatus(HttpServletResponse.SC_OK);
            resp.setContentType("text/html");
            resp.setContentLength(0);
            resp.flushBuffer();
            */
        }
    }

    public static class SkewReportServlet extends HttpServlet {
        @Override
        protected void doGet(HttpServletRequest req, HttpServletResponse resp)
                throws ServletException, IOException {
            String taskidStr = req.getParameter("i");
            String factorStr = req.getParameter("f");
            String offsetStr = req.getParameter("o");
            String lengthStr = req.getParameter("l");
            if (taskidStr == null || factorStr == null || offsetStr == null
                    || lengthStr == null)
                return;

            final TaskID taskId = TaskID.forName(taskidStr);
            float factor = Float.parseFloat(factorStr);
            long offset = Long.parseLong(offsetStr);
            int len = Integer.parseInt(lengthStr);

            if (LOG.isInfoEnabled()) {
                LOG.info("reporting a large record " + taskidStr + " factor=" + factorStr
                        + " offset=" + offsetStr + " len=" + lengthStr);
            }

            final STJobTracker tracker = (STJobTracker) getServletContext()
                    .getAttribute("job.tracker");
            final JobID jobId = taskId.getJobID();
            JobInProgress thisJob = null;
            synchronized (tracker.jobs) {
                thisJob = tracker.jobs.get(jobId);
            }
            if (thisJob == null)
                return; // unknown

            TaskInProgress tip = thisJob.getTaskInProgress(taskId);
            if (tip.addLargeRecord(factor, offset, len)) {
                LOG.warn("task " + taskId + " has too many tasks "
                        + tip.getReactiveJob().getNumMapTasks() + ". only "
                        + factor + " is needed.");
            }
            // append the given event.
            // if factor is less than 1, do not schedule it.
            resp.setStatus(HttpServletResponse.SC_OK);
            resp.setContentType("text/html");
            resp.setContentLength(0);
            resp.flushBuffer();
        }
    }
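    // Illustrative sketch (not part of the original source): a skew report is
    // a plain HTTP GET against the /skew servlet above; host, port, and task
    // id here are made up:
    //
    //   http://jobtracker:50030/skew?i=task_201205011345_0007_m_000003&f=2.5&o=1048576&l=65536
    //
    // i = TaskID, f = skew factor, o = byte offset of the large record,
    // l = its length in bytes.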
    /**
     * Each task attempt requested to stop sends its stopping position here:
     *
     * /split/[taskid]/[responsecode]
     *
     * POST body is either input splits or the minimum key.
     *
     * code
     *
     * 0: successfully split -- expect the binary in the body
     * 1: cannot split the task -- fall back to speculative execution
     * 2: already completed
     *
     * @author yongchul
     */
    public static class SplitTaskServlet extends HttpServlet {
        @Override
        protected void doGet(HttpServletRequest req, HttpServletResponse resp)
                throws ServletException, IOException {
            doPost(req, resp);
        }

        @Override
        protected void doPost(HttpServletRequest req, HttpServletResponse resp)
                throws ServletException, IOException {
            String pi = req.getPathInfo();
            int pos = pi.lastIndexOf('/');
            if (LOG.isDebugEnabled()) {
                LOG.debug("Path Info = " + pi + " pos=" + pos);
            }
            // String attemptIdStr = req.getParameter("i");
            // String codeStr = req.getParameter("c");
            String attemptIdStr = pi.substring(1, pos);
            String codeStr = pi.substring(pos + 1);

            if (LOG.isDebugEnabled()) {
                LOG.debug("attempt id = " + attemptIdStr + " code = " + codeStr);
            }

            if (attemptIdStr == null || codeStr == null) {
                resp.setStatus(HttpServletResponse.SC_BAD_REQUEST);
                return;
            }

            final TaskAttemptID attemptId = TaskAttemptID.forName(attemptIdStr);
            final boolean isMap = attemptId.getTaskType() == TaskType.MAP;
            int code = Integer.parseInt(codeStr);

            if (LOG.isInfoEnabled()) {
                LOG.info("split response from " + attemptId + " code=" + code);
            }

            final STJobTracker tracker = (STJobTracker) getServletContext()
                    .getAttribute("job.tracker");
            final JobID jobId = attemptId.getJobID();
            JobInProgress thisJob = null;
            synchronized (tracker.jobs) {
                thisJob = tracker.jobs.get(jobId);
            }
            if (thisJob == null) {
                resp.setStatus(HttpServletResponse.SC_NOT_FOUND);
                return; // unknown
            }

            TaskInProgress tip = thisJob.getTaskInProgress(attemptId);
            int capacity = isMap ? tracker.clusterMetrics.getMapSlotCapacity()
                    : tracker.clusterMetrics.getReduceSlotCapacity();

            JobInProgress.ReactionContext action = null;
            boolean speculative = true;
            boolean scheduleReactive = true;

            int sz = req.getContentLength();
            LOG.info("available response bytes = " + sz);

            DataInputStream in = new DataInputStream(req.getInputStream());
            byte[] body;
            if (sz > (4 + 8)) {
                body = new byte[sz - (4 + 8)];
                in.readFully(body);
            } else {
                body = new byte[0];
            }
            float tpb = in.readFloat();
            long remainBytes = in.readLong();

            if (code == 0) {
                // STOP was successful. retrieve the body and split
                if (LOG.isInfoEnabled()) {
                    LOG.info(attemptId + " time per byte = " + tpb
                            + " remaining bytes = " + remainBytes);
                }
                // if map, this is split info
                // if reduce, this is the key
                if (isMap) {
                    // convert the current status into the next thing to read
                    // FIXME construct action. split the content
                    action = new JobInProgress.TakeOverMap(tip, attemptId, body,
                            capacity, tpb, remainBytes, code);
                } else {
                    // String enc = Base64.encodeToString(body, false);
                    // LOG.info("min reduce key = "+enc);
                    action = new JobInProgress.TakeOverReduce(tip, attemptId, body,
                            tpb, remainBytes, code);
                }
                // let downstream reducers know that they should receive the
                // reactive output as well.
                thisJob.addTaskStateEvent(tip, TaskState.TAKEOVER);
                speculative = false;
            } else if (code == 1) {
                // cannot split. fall back to speculative execution. only
                // happens for MAP tasks
                --capacity; // leave room for the currently executing one
                action = isMap
                        ? new ReexecMap(tip, attemptId, capacity, tpb, remainBytes)
                        : new ReexecReduce(tip, attemptId, tpb, remainBytes);
                thisJob.addTaskStateEvent(tip, TaskState.CANCEL_TAKEOVER);
            } else {
                // BAD request
                resp.setStatus(HttpServletResponse.SC_BAD_REQUEST);
                return;
            }
            if (scheduleReactive) {
                try {
                    if (speculative) {
                        // tracker.fastSplitTask(attemptId.getTaskID(), capacity, action, speculative);
                        tracker.fastSplitTask(action, speculative);
                    } else {
                        PlanSpec spec = action.getPlanSpec();
                        if (spec.requireScan()) {
                            tracker.launchScanTask(thisJob, attemptId.getTaskID(), action);
                        } else {
                            // tracker.fastSplitTask(attemptId.getTaskID(), capacity, action, speculative);
                            tracker.launchPlanAndLaunchTask(thisJob, attemptId.getTaskID(), action);
                        }
                    }
                } catch (InterruptedException ex) {
                    LOG.error(ex);
                }
            }

            // if STOP was successful and the task is the last reduce, send wait.
            resp.setStatus(HttpServletResponse.SC_OK);
            resp.setContentType("text/html");
            resp.setContentLength(0);
            resp.flushBuffer();
        }
    }

    // localize key files and etc.
    private void initializeJobDirs(String user, String jobId) throws IOException {
        boolean initJobDirStatus = false;
        String jobDirPath = getLocalJobDir(user, jobId);
        for (String localDir : conf.getLocalDirs()) {
            Path jobDir = new Path(localDir, jobDirPath);
            if (fs.exists(jobDir)) {
                // this will happen on a partial execution of localizeJob.
                // Sometimes copying job.xml to the local disk succeeds but
                // copying job.jar might throw an exception. We should clean
                // up and then try again.
                fs.delete(jobDir, true);
            }
            boolean jobDirStatus = fs.mkdirs(jobDir);
            if (!jobDirStatus) {
                LOG.warn("Not able to create job directory " + jobDir.toString());
            }
            initJobDirStatus = initJobDirStatus || jobDirStatus;

            // job-dir has to be private to the TT
            // Localizer.PermissionsHandler.setPermissions(new File(jobDir.toUri()
            //         .getPath()), Localizer.PermissionsHandler.sevenZeroZero);
            // FIXME properly set permission!
        }

        if (!initJobDirStatus) {
            throw new IOException("Not able to initialize job directories "
                    + "in any of the configured local directories for job " + jobId);
        }
    }
    private void scheduleJobTokenLoading(JobInProgress jip) {
        if (jip.eventJobToken == null) {
            synchronized (jip) {
                if (jip.eventJobToken == null) {
                    jip.eventJobToken = this.asyncWorkers.submit(new LoadJobTokens(jip));
                    jip.notifyAll(); // wake up split jobs if there are any
                }
            }
        }
    }

    private void scheduleLoadCompletionEvents(JobInProgress jip) {
        if (jip.eventCompletionEvents == null) {
            synchronized (jip) {
                if (jip.eventCompletionEvents == null) {
                    jip.eventCompletionEvents = this.asyncWorkers
                            .submit(new LoadCompletionEvents(jip));
                    jip.notifyAll();
                }
            }
        }
    }

    /*
    class CopyJobTokens implements Runnable {
        final JobInProgress job;

        CopyJobTokens(JobInProgress job) {
            this.job = job;
        }

        @Override
        public void run() {
            JobID jobid = job.getDowngradeJobID();
            if ( LOG.isInfoEnabled() ) {
                LOG.info("copying job token file for job "+jobid.toString());
            }
            try {
                String jobTokenFile = localizeJobTokenFile(job.getUser(),job.getJobID().toString());
                job.setTokenStorage(TokenCache.loadTokens(jobTokenFile, conf));
            } catch (IOException ex) {
                LOG.error("failed to copy job token file for job "+jobid.toString(),ex);
            } finally {
                synchronized ( loadingJobToken ) {
                    loadingJobToken.remove(jobid);
                }
            }
        }
    }
    */

    // TODO the following should run in some sort of event delivery mechanism
    class LoadInputSplitMeta implements Callable<Boolean> {
        final JobInProgress job;

        LoadInputSplitMeta(JobInProgress job) {
            this.job = job;
        }

        @Override
        public Boolean call() throws Exception {
            try {
                JobSplit.TaskSplitMetaInfo[] metaSplits = SplitMetaInfoReader
                        .readSplitMetaInfo(job.getJobID(), getFileSystem(), conf,
                                job.getJobDir());
                job.setSplits(metaSplits);
            } catch (Exception e) {
                LOG.error("failed to load meta split information for job "
                        + job.getJobID(), e);
                throw e;
            }
            return true;
        }
    }

    /**
     * Load job tokens from the job submission directory. This is required in
     * order to split a reduce.
     */
    class LoadJobTokens implements Callable<Boolean> {
        final JobInProgress job;

        LoadJobTokens(JobInProgress job) {
            this.job = job;
        }

        @Override
        public Boolean call() throws Exception {
            JobID jobId = job.getDowngradeJobID();
            FSDataInputStream input = null;
            TokenStorage ts = new TokenStorage();
            try {
                Path skPath = new Path(systemDir, jobId.toString()
                        + Path.SEPARATOR + TokenCache.JOB_TOKEN_HDFS_FILE);
                // FileStatus status = null;
                // long jobTokenSize = -1;
                // status = getFileSystem().getFileStatus(skPath); // throws FileNotFoundException
                // jobTokenSize = status.getLen();
                input = getFileSystem().open(skPath, 65536);
                ts.readFields(input);
                LOG.info("job token has been successfully loaded for " + jobId);
                job.setTokenStorage(ts);
            } catch (Exception ex) {
                LOG.error("failed to copy job token file for job " + jobId.toString(), ex);
                throw ex;
            } finally {
                if (input != null)
                    try {
                        input.close();
                    } catch (IOException ignore) {
                    }
                input = null;
            }
            return true;
        }
    }

    class LoadCompletionEvents implements Callable<Boolean> {
        final JobInProgress job;

        LoadCompletionEvents(JobInProgress jip) {
            this.job = jip;
        }

        @Override
        public Boolean call() throws Exception {
            return job.updateMapCompletionEvents();
        }
    }
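    // Illustrative sketch (not part of the original source): the loaders
    // above all follow one pattern -- submit a Callable to the shared worker
    // pool, stash the Future, and block only when the result is first needed:
    //
    //   jip.eventJobToken = asyncWorkers.submit(new LoadJobTokens(jip));
    //   ...
    //   jip.eventJobToken.get(); // waits until the token file has been read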
    /**
     * Parallel speculative execution.
     *
     * @author yongchul
     */
    class SplitTask implements Callable<JobID> {
        // final JobInProgress job;
        // final TaskID taskid;
        // final int numSplits;
        final boolean speculative;
        final ReactionContext context;

        /*
         * SplitTask(JobInProgress jip, TaskID taskid, int n) {
         *     this.job = jip;
         *     this.taskid = taskid;
         *     this.numSplits = n;
         *     this.speculative = true;
         *     this.action = taskid.getTaskType() == TaskType.MAP ? new JobInProgress.SplitMap(n) : null;
         * }
         *
         * SplitTask(JobInProgress jip, TaskID taskid, int n, JobInProgress.ReactionContext hook, boolean speculative) {
         *     this.job = jip;
         *     this.taskid = taskid;
         *     this.numSplits = n;
         *     this.action = hook;
         *     this.speculative = speculative;
         * }
         */

        SplitTask(ReactionContext context, boolean speculative) {
            this.context = context;
            this.speculative = speculative;
        }

        @Override
        public JobID call() throws Exception {
            JobID jobid = null;
            JobInProgress job = context.getJob();
            TaskInProgress tip = context.getTaskInProgress();
            TaskID taskid = context.getTaskID();
            try {
                if (tip.hasCommitted()) {
                    // last check
                    LOG.info("task " + taskid + " has already been completed. cancel split.");
                    return null;
                }
                // if ( ! job.getJobID().equals(taskid.getJobID()) ) {
                //     throw new IOException("Job ID does not match: " + job.getJobID() + "/" + taskid.getJobID());
                // }

                long now = System.currentTimeMillis();
                ClusterInfo clusterInfo = getClusterInfo(context, now);
                Plan p = PartitionPlanner.plan(context, clusterInfo, now);
                LOG.debug("Splitting task " + taskid + " into " + p.getNumPartitions());

                // int newNumSplits = tip.adjustNumSplits(numSplits);
                // if ( numSplits != newNumSplits && LOG.isInfoEnabled() ) {
                //     LOG.info("adjusting split size from " + numSplits + " to " + newNumSplits);
                // }
                // TaskAttemptWithHost attemptHost = job.getSplittableTask(taskid);
                // boolean speculative = attemptHost == null;
                if (!speculative) {
                    // set action flag
                    // tip.setAction(new TaskAction(attemptHost.getTaskAttemptID(), numSplits, false));
                    // LOG.info("accelerate " + tip.getTaskID() + " by splitting remaining input into " + numSplits);
                }

                // FIXME is this speculative, or accelerate?
                JobInProgress subJob = context.getJob().createReactiveJob(
                        taskid, p.getNumPartitions(), speculative, context);
                // if ( !speculative ) {
                //     int httpPort = trackerToHttpPort.get(attemptHost.getHost());
                //     String url = "http://" + attemptHost.getHost() + ":" + httpPort;
                //     subJob.getConfiguration().set(SkewTuneJobConfig.ORIGINAL_TASK_TRACKER_HTTP_ATTR, url);
                //     LOG.info("the reactive mappers will retrieve split information from " + url);
                // }
                subJob.getJob().submit();
                subJob.initialize();

                // now add the subjob to the data structures
                synchronized (jobs) {
                    jobs.put(subJob.getDowngradeJobID(), subJob);
                }
                context.getJob().registerReactiveJob(taskid, subJob);
                jobid = subJob.getDowngradeJobID();

                PlannedJob plan = context.getPlan().getPlannedJob(subJob);
                synchronized (plannedJobs) {
                    plannedJobs.put(jobid, plan);
                }

                if (!speculative && taskid.getTaskType() == TaskType.REDUCE) {
                    // on reduce handover, release the original slot so that it
                    // can be reused by other reduce tasks.
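                    // (markCancel presumably delivers a cancel directive for
                    // the original attempt on its next heartbeat; the slot
                    // frees once that attempt exits.)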
                    job.markCancel(taskid);
                }
            } catch (Exception e) {
                LOG.error("Failed to split job " + job.getJobID(), e);
                throw e;
            } finally {
                synchronized (pendingReactiveJob) {
                    pendingReactiveJob.remove(taskid);
                }
                // if ( !speculative ) {
                pendingTakeOvers.unregister(taskid);
                // }
            }
            return jobid;
        }
    }

    @Override
    public String getHttpAddress() throws IOException, InterruptedException {
        return this.trackerHttp;
    }

    @Override
    public String getCompletionUrl() throws IOException, InterruptedException {
        return this.defaultNotificationUrl;
    }

    @Override
    public String getSpeculationEventUrl() throws IOException, InterruptedException {
        return this.defaultSpeculationEventUrl;
    }

    Set<TaskID> candidates = new HashSet<TaskID>();

    // continuously monitored cluster size
    volatile ClusterMetrics clusterMetrics;

    class PendingTakeOverJobs {
        private Map<JobID, Set<TaskID>> taskmap = new HashMap<JobID, Set<TaskID>>();
        private int pendingTasks;

        public synchronized boolean hasPendingTasks() {
            return pendingTasks > 0;
        }

        public synchronized void unregister(JobID jobid) {
            Set<TaskID> tasks = taskmap.remove(jobid);
            if (tasks != null) {
                if (pendingTasks < tasks.size()) {
                    LOG.warn("pending task count would go below zero: " + pendingTasks + " - " + tasks.size());
                    // recalculate
                    pendingTasks = 0;
                    for (Set<TaskID> x : taskmap.values()) {
                        pendingTasks += x.size();
                    }
                    LOG.warn("recomputed pending tasks = " + pendingTasks);
                } else {
                    pendingTasks -= tasks.size();
                }
            }
        }

        public synchronized void register(TaskID taskid) {
            JobID jobid = taskid.getJobID();
            Set<TaskID> tasks = taskmap.get(jobid);
            if (tasks == null) {
                tasks = new HashSet<TaskID>();
                taskmap.put(jobid, tasks);
            }
            if (tasks.add(taskid)) {
                ++pendingTasks;
            }
        }

        public synchronized void unregister(TaskID taskid) {
            JobID jobid = taskid.getJobID();
            Set<TaskID> tasks = taskmap.get(jobid);
            if (tasks == null) {
                LOG.warn("can't find pending takeover: " + taskid);
            } else {
                if (tasks.remove(taskid)) {
                    --pendingTasks;
                }
            }
        }

        public synchronized int getNumPendingTasks() {
            return pendingTasks;
        }
    }

    final PendingTakeOverJobs pendingTakeOvers = new PendingTakeOverJobs();

    // pending speculative tasks
    class SpeculativeScheduler extends Thread {
        SpeculativeScheduler() {
            super("SpeculativeScheduler");
            setDaemon(true);
        }

        private void checkReservedTasks(int[] counts) {
            JobInProgress[] jips = null;
            synchronized (jobs) {
                jips = jobs.values().toArray(new JobInProgress[0]);
            }
            for (JobInProgress jip : jips) {
                jip.getNumberOfReservedTasks(counts);
            }
        }

        @Override
        public void run() {
            int[] reservedTasks = new int[2];
            while (true) {
                try {
                    boolean hasJobs = false;
                    synchronized (originalJobs) {
                        hasJobs = !originalJobs.isEmpty();
                    }
                    if (hasJobs && !pendingTakeOvers.hasPendingTasks()) {
                        clusterMetrics = cluster.getClusterStatus();
                        reservedTasks[0] = 0;
                        reservedTasks[1] = 0;
                        checkReservedTasks(reservedTasks);
                        // check whether we have any idle slots
                        int availMaps = clusterMetrics.getMapSlotCapacity() - reservedTasks[0];
                        int availReduces = clusterMetrics.getReduceSlotCapacity() - reservedTasks[1];
                        if (availMaps > 0 || availReduces > 0) {
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("trying to speculate tasks for map = " + availMaps
                                        + " and reduce = " + availReduces);
                            }
                            Set<Future<JobID>> speculated = speculateSlowest(availMaps, availReduces);
                            for (Future<JobID> fjobid : speculated) {
                                try {
                                    JobID jobid = fjobid.get();
                                    if (jobid == null) {
                                        // failed to speculate
                                    } else {
                                        LOG.info("speculative job " + jobid + " has been scheduled");
                                    }
                                } catch (ExecutionException ex) {
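                                    // ExecutionException from Future.get() wraps whatever
                                    // the SplitTask callable threw; getCause() carries the
                                    // underlying error.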
LOG.error("failed to retrieve job id for reactive job", ex); } } } } } catch (InterruptedException x) { // silently ignore. this thread is a daemon. will reclaim itself gracefully } catch (IOException iox) { LOG.error(iox); } catch (Exception ex) { LOG.error("failed to contact to job tracker?", ex); } finally { try { // Thread.sleep(3000); // refresh every minute Thread.sleep(5000); // refresh every 5 secs } catch (InterruptedException ignore) { break; } } } } public void runOld() { try { clusterMetrics = cluster.getClusterStatus(); // check whether we have any idle slots int availMaps = clusterMetrics.getMapSlotCapacity() - clusterMetrics.getRunningMaps(); int availReduces = clusterMetrics.getReduceSlotCapacity() - clusterMetrics.getRunningReduces(); if ((availMaps > 0 || availReduces > 0) && !pendingTakeOvers.hasPendingTasks()) { // no take over is pending boolean hasJobs = false; synchronized (originalJobs) { hasJobs = !originalJobs.isEmpty(); } if (hasJobs) { if (LOG.isDebugEnabled()) { LOG.debug("trying to speculate tasks for map = " + availMaps + " and reduce = " + availReduces); } // asyncWorkers.submit(new SpeculateTask(availMaps,availReduces)); Set<Future<JobID>> speculated = speculateSlowest(availMaps, availReduces); int to = pendingTakeOvers.getNumPendingTasks(); if (to > 0) { LOG.info(to + " take over tasks have been scheduled "); } for (Future<JobID> fjobid : speculated) { try { JobID jobid = fjobid.get(); if (jobid == null) { // failed to speculate } else { LOG.info("speculative job " + jobid + " has been scheduled"); } } catch (ExecutionException ex) { LOG.error("failed to retrieve job id for reactive job", ex); } } } } } catch (InterruptedException ignore) { // break; } catch (Exception ex) { LOG.error("failed to contact to job tracker?", ex); } finally { try { // Thread.sleep(3000); // refresh every minute Thread.sleep(5000); // refresh every 5 secs } catch (InterruptedException ignore) { // break; } } } // default private boolean checkSpeculation(JobInProgress job, TaskID taskid, int numSplits) { if (!job.canTakeover()) return true; TaskAttemptWithHost attemptHost = job.getSplittableTask(taskid); boolean speculative = attemptHost == null; if (!speculative) { // set action flag. will be fetched by next heartbeat message pendingTakeOvers.register(taskid); TaskInProgress tip = job.getTaskInProgress(taskid); tip.setAction(new TaskAction(attemptHost.getTaskAttemptID(), numSplits, false)); job.addTaskStateEvent(tip, TaskState.PREPARE_TAKEOVER); LOG.info("accelerate " + attemptHost.getTaskAttemptID() + " by splitting remaining input into " + numSplits); } return speculative; } public Set<Future<JobID>> speculateSlowest(int availMaps, int availReduces) throws InterruptedException, IOException { Set<Future<JobID>> speculated = new HashSet<Future<JobID>>(); // try { // ideally we want to use a full scheduler but let's just do a poor man's one HashSet<JobInProgress> jobs = new HashSet<JobInProgress>(); synchronized (originalJobs) { jobs.addAll(originalJobs); } final int mapCapacity = clusterMetrics.getMapSlotCapacity(); final int reduceCapacity = clusterMetrics.getReduceSlotCapacity(); // do one at a time availMaps = availMaps > 0 ? 1 : 0; availReduces = availReduces > 0 ? 
            int scheduled = 0;
            HashSet<TaskID> newCandidates = new HashSet<TaskID>();
            long now = System.currentTimeMillis();
            for (JobInProgress jip : jobs) {
                if (jip.doNotSpeculate)
                    continue;

                List<STTaskStatus> tips = jip.findSpeculativeTaskNew(availMaps, availReduces);
                if (LOG.isDebugEnabled()) {
                    for (STTaskStatus s : tips) {
                        LOG.debug(s + ": remaining time = " + s.getRemainTime(now));
                    }
                }

                for (STTaskStatus tip : tips) {
                    TaskID taskid = tip.getTaskID().getTaskID();
                    if (!candidates.contains(taskid) && !jip.isRequired(taskid)) {
                        LOG.info(taskid + " was not previously a candidate for speculative execution");
                        newCandidates.add(taskid);
                        continue;
                    }

                    // the job may now be different from the root
                    JobInProgress realJip;
                    synchronized (STJobTracker.this.jobs) {
                        realJip = STJobTracker.this.jobs.get(taskid.getJobID());
                    }

                    if (taskid.getTaskType() == TaskType.MAP) {
                        // int maxCapacity = jip.hasCombiner() ? 3 : mapCapacity - 1;
                        int maxCapacity = mapCapacity - 1;
                        if (checkSpeculation(realJip, taskid, mapCapacity)) {
                            // yes, it is a speculative execution
                            Future<JobID> jobid = fastSplitTask(taskid, maxCapacity);
                            if (jobid == null) {
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("task is already speculated: " + taskid);
                                }
                            } else {
                                ++scheduled;
                                speculated.add(jobid);
                            }
                            --availMaps;
                        } else {
                            // we first try to stop
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("handover " + taskid);
                            }
                            ++scheduled;
                            --availMaps;
                        }
                    } else {
                        // must be a reduce
                        if (checkSpeculation(realJip, taskid, reduceCapacity)) {
                            // speculative execution
                            Future<JobID> jobid = fastSplitTask(taskid, reduceCapacity - 1);
                            if (jobid == null) {
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("task is already speculated: " + taskid);
                                }
                            } else {
                                ++scheduled;
                                speculated.add(jobid);
                            }
                            --availReduces;
                        } else {
                            // when available, a reduce will always do handover
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("handover " + taskid);
                            }
                            if (realJip.isReactiveJob()) // this will start recursion in the reduce job
                                scheduleLoadCompletionEvents(realJip);
                            ++scheduled;
                            --availReduces;
                        }
                    }
                }
                if (availMaps == 0 && availReduces == 0)
                    break;
            }
            candidates = newCandidates;
            if (scheduled > 0 && LOG.isInfoEnabled()) {
                int to = pendingTakeOvers.getNumPendingTasks();
                LOG.info(scheduled + " tasks have been scheduled (" + to + " handovers)");
            }
            return speculated;
        }
    }
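    // Note on the candidates handshake above: speculateSlowest() acts as a
    // two-scan filter. A slow task seen for the first time is only recorded in
    // newCandidates; it is acted upon only if it still ranks among the slowest
    // on the next scan (roughly 5 seconds later, per the scheduler loop).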
    /**
     * Fetch task completion events for the original job and cache them for a
     * reduce split.
     *
     * @author yongchul
     */
    class TaskCompletionEventFetcher extends Thread {
        TaskCompletionEventFetcher() {
            super("TaskCompletionEventFetcher");
            setDaemon(true);
        }

        @Override
        public void run() {
            LOG.info("starting task completion event fetcher");
            try {
                List<JobInProgress> targets = new ArrayList<JobInProgress>();
                HashSet<JobInProgress> removed = new HashSet<JobInProgress>();
                while (true) {
                    targets.clear();
                    synchronized (originalJobs) {
                        targets.addAll(originalJobs);
                    }
                    if (LOG.isTraceEnabled() && targets.size() > 0) {
                        LOG.trace("retrieving task completion events for " + targets);
                    }
                    for (JobInProgress jip : targets) {
                        // check whether this job has completed or not
                        try {
                            if (!jip.updateTaskCompletionEvent()) {
                                removed.add(jip);
                            }
                        } catch (IOException ex) {
                            LOG.error("failed to retrieve completion event for " + jip.getJobID(), ex);
                        }
                    }
                    if (!removed.isEmpty()) {
                        synchronized (originalJobs) {
                            originalJobs.removeAll(removed);
                        }
                        removed.clear();
                    }
                    Thread.sleep(3000); // FIXME or for heartbeat interval?
                }
            } catch (InterruptedException x) {
                LOG.error(x);
            } catch (Throwable x) {
                LOG.error(x);
            }
        }
    }

    @Override
    public String getTaskTrackerURI(String tracker, String path) {
        int port = 0;
        synchronized (this) {
            TaskTrackerStatus status = trackerToLastHeartbeat.get(tracker);
            port = status.getHttpPort();
        }
        return "http://" + tracker + ':' + port + path;
    }

    class ScanTask implements Callable<JobID> {
        final JobInProgress job;
        final TaskID taskid;
        final JobInProgress.ReactionContext action;

        ScanTask(JobInProgress jip, TaskID taskid, JobInProgress.ReactionContext hook) {
            this.job = jip;
            this.taskid = taskid;
            this.action = hook;
        }

        public JobInProgress getJobInProgress() {
            return job;
        }

        public TaskID getTaskID() {
            return taskid;
        }

        public JobInProgress.ReactionContext getAction() {
            return action;
        }

        @Override
        public JobID call() throws Exception {
            JobID jobid = null;
            try {
                if (!job.getJobID().equals(taskid.getJobID())) {
                    throw new IOException("Job ID does not match: " + job.getJobID() + "/" + taskid.getJobID());
                }
                if (taskid.getTaskType() == TaskType.MAP) {
                    // MAP
                    PartitionMapInput partMapInJob = new PartitionMapInput();
                    Job newJob = partMapInJob.prepareJob(job, taskid, action,
                            clusterMetrics.getMapSlotCapacity());
                    newJob.submit();
                    jobid = org.apache.hadoop.mapred.JobID.downgrade(newJob.getJobID());
                    synchronized (scanningJob) {
                        scanningJob.put(jobid, this);
                    }
                } else {
                    // REDUCE
                    PartitionMapOutput partMapOutJob = new PartitionMapOutput();
                    Job newJob = partMapOutJob.prepareJob(job, taskid, action,
                            clusterMetrics.getReduceSlotCapacity());
                    // set up an appropriate notification URL; on completion,
                    // launch the planning and reactive task.
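                    // A hedged sketch of such a hook via Hadoop's job-end
                    // notification (whether prepareJob() wires this internally,
                    // and the exact property name for this Hadoop version, are
                    // assumptions):
                    //
                    //   newJob.getConfiguration().set("mapreduce.job.end-notification.url",
                    //           defaultNotificationUrl + "?jobid=$jobId&status=$jobStatus");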
                    // submit it
                    newJob.submit();
                    jobid = org.apache.hadoop.mapred.JobID.downgrade(newJob.getJobID());
                    synchronized (scanningJob) {
                        scanningJob.put(jobid, this);
                    }
                    job.markCancel(taskid);
                }
            } catch (Exception e) {
                LOG.error("Failed to schedule scan task for " + taskid, e);
                throw e;
            }
            return jobid;
        }
    }

    ClusterInfo getClusterInfo(ReactionContext context, long now) throws IOException, InterruptedException {
        if (context.getPlanSpec().requireClusterInfo()) {
            return getClusterAvailability(context, now);
        }
        return new ClusterInfo(context.getTaskType(), clusterMetrics);
    }

    class PlanAndLaunchTask implements Callable<JobID> {
        final Log LOG = LogFactory.getLog(PlanAndLaunchTask.class);
        final JobInProgress job;
        final TaskID taskid;
        final JobInProgress.ReactionContext action;

        PlanAndLaunchTask(JobInProgress job, TaskID taskid, JobInProgress.ReactionContext hook) {
            this.job = job;
            this.taskid = taskid;
            this.action = hook;
        }

        public PlanAndLaunchTask(ScanTask scanTask) {
            this.job = scanTask.job;
            this.taskid = scanTask.taskid;
            this.action = scanTask.action;
        }

        Plan doPlan() throws IOException, InterruptedException {
            List<Partition> partitions = null;
            boolean isMap = TaskType.MAP == taskid.getTaskType();
            if (action.getPlanSpec().requireScan()) {
                if (action.getPartitions().isEmpty()) {
                    Path pfpath = null;
                    if (isMap) {
                        pfpath = PartitionMapInput.getPartitionFile(job, taskid);
                        if (LOG.isInfoEnabled()) {
                            LOG.info("loading partition file from " + pfpath);
                        }
                        // read the partition file
                        partitions = PartitionMapInput.loadPartitionFile(fs, pfpath, conf);
                    } else {
                        pfpath = PartitionMapOutput.getPartitionFile(job, taskid);
                        if (LOG.isInfoEnabled()) {
                            LOG.info("loading partition file from " + pfpath);
                        }
                        // read the partition file
                        partitions = PartitionMapOutput.loadPartitionFile(fs, pfpath, conf);
                    }
                } else {
                    partitions = action.getPartitions();
                    if (LOG.isInfoEnabled()) {
                        LOG.info("loading partitions from takeover response: " + partitions.size());
                    }
                }

                long totalBytes = 0;
                for (Partition p : partitions) {
                    totalBytes += p.getLength();
                }
                action.setRemainBytes(totalBytes);

                if (LOG.isTraceEnabled()) {
                    for (Partition p : partitions) {
                        LOG.trace(p);
                    }
                }
            } else {
                partitions = new ArrayList<Partition>(
                        isMap ? clusterMetrics.getMapSlotCapacity() : clusterMetrics.getReduceSlotCapacity());
            }
            long now = System.currentTimeMillis();
            ClusterInfo clusterInfo = getClusterInfo(action, now);
            return PartitionPlanner.plan(action, clusterInfo, partitions, now);
        }

        @Override
        public JobID call() throws Exception {
            JobID jobid = null;
            Plan plan = null;
            try {
                plan = doPlan();
                // now create the subtask
                JobInProgress subJob = job.createReactiveJob(taskid, plan.getNumPartitions(), false, action);
                subJob.getJob().submit();
                subJob.initialize();

                // now add the subjob to the data structures
                synchronized (jobs) {
                    jobs.put(subJob.getDowngradeJobID(), subJob);
                }
                job.registerReactiveJob(taskid, subJob);
                jobid = subJob.getDowngradeJobID();

                if (taskid.getTaskType() == TaskType.REDUCE) {
                    // on reduce handover, release the original slot so that it
                    // can be reused by other reduce tasks.
                    job.markCancel(taskid);
                }

                PlannedJob planJob = plan.getPlannedJob(subJob);
                synchronized (plannedJobs) {
                    plannedJobs.put(jobid, planJob);
                }
                // if ( taskid.getTaskType() == TaskType.REDUCE ) {
                //     scheduleJobTokenLoading(subJob);
                // }
            } catch (Exception e) {
                LOG.error("Failed to split job " + job.getJobID(), e);
                throw e;
            } finally {
                synchronized (pendingReactiveJob) {
                    pendingReactiveJob.remove(taskid);
                }
                pendingTakeOvers.unregister(taskid);
            }
            if (LOG.isInfoEnabled()) {
                LOG.info("reactive job " + jobid + " has been scheduled for " + taskid);
            }
            return jobid;
        }
    }

    /**
     * Schedule a scan task: scan the input data and collect information. Once
     * the information is collected, the scheduling algorithm runs and launches
     * a reactive task. This only happens on handover.
     *
     * @param taskid the task whose input should be scanned
     * @param action the reaction context for the task
     * @return a future holding the job ID of the scheduled scan job
     * @throws IOException
     * @throws InterruptedException
     */
    private Future<JobID> scheduleScanTask(TaskID taskid, JobInProgress.ReactionContext action)
            throws IOException, InterruptedException {
        JobInProgress jip = null;
        synchronized (jobs) {
            jip = jobs.get(taskid.getJobID());
        }
        if (jip == null) {
            String msg = "unknown task " + taskid;
            LOG.error(msg);
            throw new IOException(msg);
        }
        JobID jobid = null;
        try {
            jip.waitUntilReadyToSplit(taskid);
            if (LOG.isDebugEnabled()) {
                LOG.debug("scheduling asynchronous scan task for task " + taskid);
            }
            return this.asyncWorkers.submit(new ScanTask(jip, taskid, action));
        } catch (ExecutionException e) {
            throw new IOException(e.getCause()); // wrap again!
        }
    }

    private Future<JobID> schedulePlanAndLaunch(ScanTask scanTask) {
        return this.asyncWorkers.submit(new PlanAndLaunchTask(scanTask));
    }

    public static class SplitTaskV2Servlet extends HttpServlet {
        @Override
        protected void doGet(HttpServletRequest req, HttpServletResponse resp)
                throws ServletException, IOException {
            doPost(req, resp);
        }

        @Override
        protected void doPost(HttpServletRequest req, HttpServletResponse resp)
                throws ServletException, IOException {
            String pi = req.getPathInfo();
            int pos = pi.lastIndexOf('/');
            if (LOG.isDebugEnabled()) {
                LOG.debug("Path Info = " + pi + " pos=" + pos);
            }
            // the attempt ID and response code are encoded in the path: /<attemptid>/<code>
            String attemptIdStr = pi.substring(1, pos);
            String codeStr = pi.substring(pos + 1);
            if (LOG.isDebugEnabled()) {
                LOG.debug("attempt id = " + attemptIdStr + " code = " + codeStr);
            }
            if (attemptIdStr == null || codeStr == null) {
                resp.setStatus(HttpServletResponse.SC_BAD_REQUEST);
                return;
            }

            final TaskAttemptID attemptId = TaskAttemptID.forName(attemptIdStr);
            final boolean isMap = attemptId.getTaskType() == TaskType.MAP;
            int code = Integer.parseInt(codeStr);
            if (LOG.isInfoEnabled()) {
                LOG.info("split response from " + attemptId + " code=" + code);
            }

            final STJobTracker tracker = (STJobTracker) getServletContext().getAttribute("job.tracker");
            final JobID jobId = attemptId.getJobID();
            JobInProgress thisJob = null;
            synchronized (tracker.jobs) {
                thisJob = tracker.jobs.get(jobId);
            }
            if (thisJob == null) {
                resp.setStatus(HttpServletResponse.SC_NOT_FOUND);
                return; // unknown job
            }

            TaskInProgress tip = thisJob.getTaskInProgress(attemptId);
            int capacity = isMap ? tracker.clusterMetrics.getMapSlotCapacity()
                                 : tracker.clusterMetrics.getReduceSlotCapacity();
            JobInProgress.ReactionContext action = null;
            boolean speculative = true;
            boolean scheduleReactive = true;

            int sz = req.getContentLength();
            LOG.info("available response bytes = " + sz);
            DataInputStream in = new DataInputStream(req.getInputStream());

            // NPARTITIONS, TPB, REMAIN BYTES
            // response code:
            //   0: successful -- parallel scan  [offset info][TPB][REMAINBYTES]
            //   1: cannot split
            //   2: successful -- local scan (encoded partition information)
            //      [p1 p2 ... pN numPartitions][TPB][REMAINBYTES]
            byte[] body;
            if (sz > (4 + 8)) {
                body = new byte[sz - (4 + 8)];
                in.readFully(body);
            } else {
                body = new byte[0];
            }
            float tpb = in.readFloat();
            long remainBytes = in.readLong();

            if (code == 0) {
                // STOP was successful; retrieve the body and split.
                if (LOG.isInfoEnabled()) {
                    LOG.info(attemptId + " time per byte = " + tpb + " remaining bytes = " + remainBytes);
                }
                // if map, the body is split info; if reduce, it is a key
                if (isMap) {
                    // convert the current status into the next thing to read
                    // FIXME construct action; split the content
                    action = new JobInProgress.TakeOverMap(tip, attemptId, body, capacity, tpb,
                            remainBytes, code);
                } else {
                    action = new JobInProgress.TakeOverReduce(tip, attemptId, body, tpb, remainBytes, code);
                }
                // let downstream reducers know that they should receive the reactive output as well
                thisJob.addTaskStateEvent(tip, TaskState.TAKEOVER);
                speculative = false;
            } else if (code == 1) {
                // cannot split; fall back to speculative execution. only happens for MAP tasks.
                --capacity; // leave room for the currently executing one
                action = isMap ? new ReexecMap(tip, attemptId, capacity, tpb, remainBytes)
                               : new ReexecReduce(tip, attemptId, tpb, remainBytes);
                thisJob.addTaskStateEvent(tip, TaskState.CANCEL_TAKEOVER);
            }
            /*
             * else if ( code == 2 ) {
             *     // STOP was successful; retrieve the body and split.
             *     if ( LOG.isInfoEnabled() ) {
             *         LOG.info(attemptId + " time per byte = " + tpb + " remaining bytes = " + remainBytes);
             *     }
             *     // if map, the body is split info; if reduce, it is a key
             *     if ( isMap ) {
             *         action = new JobInProgress.TakeOverMap(tip, attemptId, body, capacity, tpb, remainBytes, code);
             *     } else {
             *         action = new JobInProgress.TakeOverReduce(tip, attemptId, body, tpb, remainBytes, code);
             *     }
             *     // let downstream reducers know that they should receive the reactive output as well
             *     thisJob.addTaskStateEvent(tip, TaskState.TAKEOVER);
             *     speculative = false;
             * }
             */
            else {
                // bad request
                resp.setStatus(HttpServletResponse.SC_BAD_REQUEST);
                return;
            }

            if (scheduleReactive) {
                try {
                    if (speculative) {
                        tracker.fastSplitTask(action, speculative);
                    } else {
                        PlanSpec spec = action.getPlanSpec();
                        if (spec.requireScan() && action.getPartitions().isEmpty()) {
                            tracker.launchScanTask(thisJob, attemptId.getTaskID(), action);
                        } else {
                            tracker.launchPlanAndLaunchTask(thisJob, attemptId.getTaskID(), action);
                        }
                    }
                } catch (InterruptedException ex) {
                    LOG.error(ex);
                }
            }

            // if STOP was successful and the task is the last reduce, send wait.
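            // (No explicit "wait" directive is sent here; the empty 200 OK
            // below simply acknowledges the report.)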
            resp.setStatus(HttpServletResponse.SC_OK);
            resp.setContentType("text/html");
            resp.setContentLength(0);
            resp.flushBuffer();
        }
    }
}
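
/*
 * A minimal, hypothetical sketch of the task-side encoding that
 * SplitTaskV2Servlet.doPost() above decodes: an optional body, followed by a
 * 4-byte float (time per byte, TPB) and an 8-byte long (remaining bytes).
 * This class is illustrative only; SkewTune's actual sender lives in the task
 * runtime, not in this file.
 */
class SplitResponseEncoderSketch {
    static byte[] encode(byte[] body, float timePerByte, long remainBytes) throws IOException {
        java.io.ByteArrayOutputStream buf = new java.io.ByteArrayOutputStream();
        java.io.DataOutputStream out = new java.io.DataOutputStream(buf);
        out.write(body);             // offset info (map) or key (reduce); may be empty
        out.writeFloat(timePerByte); // TPB: observed processing rate
        out.writeLong(remainBytes);  // bytes left in the task's input
        out.flush();
        return buf.toByteArray();
    }
}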