org.apache.hadoop.mapreduce.v2.app.MRAppMaster.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapreduce.v2.app.MRAppMaster.java, the MapReduce Application Master that drives a single MapReduce job on YARN. This copy includes Hops-specific imports (for example io.hops.security.HopsUtil) but otherwise largely follows the familiar Apache Hadoop class.
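
Before the listing, a rough sketch of how the class is driven. MRAppMaster extends CompositeService, so it follows the standard YARN service life-cycle (construct, init, start, stop). The snippet below is not part of the original source: the application and container IDs, host name, and port numbers are illustrative placeholders, and the real entry point (the class's main() method, which falls outside this excerpt) derives those values from the container environment and performs security setup that is omitted here.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.v2.app.MRAppMaster;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class MRAppMasterSketch {
    public static void main(String[] args) throws Exception {
        // Identify this AM attempt; real values come from the container env.
        ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
        ApplicationAttemptId attemptId = ApplicationAttemptId.newInstance(appId, 1);
        // ContainerId.newContainerId is the Hadoop 2.6+ factory; older
        // releases use ContainerId.newInstance instead.
        ContainerId containerId = ContainerId.newContainerId(attemptId, 1);

        // Host and ports are placeholders for the NodeManager address.
        MRAppMaster appMaster = new MRAppMaster(attemptId, containerId,
                "localhost", 45454, 8042, System.currentTimeMillis());

        // serviceInit() casts the conf to JobConf in initJobCredentialsAndUGI(),
        // so a JobConf (not a bare Configuration) must be passed.
        JobConf conf = new JobConf(new YarnConfiguration());
        appMaster.init(conf);  // wires up dispatcher, allocator, launcher, ...
        appMaster.start();     // serviceStart(): recovery, job creation, JOB_INIT
    }
}

The listing that follows shows what those two life-cycle calls actually do: serviceInit() assembles the services and event dispatchers, and serviceStart() performs recovery, creates the Job, and kicks off execution.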

Source

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.mapreduce.v2.app;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.file.Paths;
import java.security.NoSuchAlgorithmException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import io.hops.security.HopsUtil;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.CallerContext;
import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LocalContainerLauncher;
import org.apache.hadoop.mapred.TaskAttemptListenerImpl;
import org.apache.hadoop.mapred.TaskLog;
import org.apache.hadoop.mapred.TaskUmbilicalProtocol;
import org.apache.hadoop.mapreduce.CryptoUtils;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.jobhistory.AMStartedEvent;
import org.apache.hadoop.mapreduce.jobhistory.EventReader;
import org.apache.hadoop.mapreduce.jobhistory.EventType;
import org.apache.hadoop.mapreduce.jobhistory.HistoryEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryCopyService;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEventHandler;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager;
import org.apache.hadoop.mapreduce.task.JobContextImpl;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.mapreduce.v2.api.records.AMInfo;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.api.records.JobReport;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskType;
import org.apache.hadoop.mapreduce.v2.app.client.ClientService;
import org.apache.hadoop.mapreduce.v2.app.client.MRClientService;
import org.apache.hadoop.mapreduce.v2.app.commit.CommitterEvent;
import org.apache.hadoop.mapreduce.v2.app.commit.CommitterEventHandler;
import org.apache.hadoop.mapreduce.v2.app.commit.CommitterEventType;
import org.apache.hadoop.mapreduce.v2.app.job.Job;
import org.apache.hadoop.mapreduce.v2.app.job.JobStateInternal;
import org.apache.hadoop.mapreduce.v2.app.job.Task;
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobFinishEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobStartEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEventType;
import org.apache.hadoop.mapreduce.v2.app.job.impl.JobImpl;
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncher;
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherEvent;
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherImpl;
import org.apache.hadoop.mapreduce.v2.app.local.LocalContainerAllocator;
import org.apache.hadoop.mapreduce.v2.app.metrics.MRAppMetrics;
import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocator;
import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocatorEvent;
import org.apache.hadoop.mapreduce.v2.app.rm.RMCommunicator;
import org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator;
import org.apache.hadoop.mapreduce.v2.app.rm.RMContainerRequestor;
import org.apache.hadoop.mapreduce.v2.app.rm.RMHeartbeatHandler;
import org.apache.hadoop.mapreduce.v2.app.speculate.DefaultSpeculator;
import org.apache.hadoop.mapreduce.v2.app.speculate.Speculator;
import org.apache.hadoop.mapreduce.v2.app.speculate.SpeculatorEvent;
import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
import org.apache.hadoop.mapreduce.v2.util.MRApps;
import org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils;
import org.apache.hadoop.mapreduce.v2.util.MRWebAppUtil;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.service.Service;
import org.apache.hadoop.service.ServiceOperations;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.StringInterner;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.Event;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import org.apache.hadoop.yarn.security.client.ClientToAMTokenSecretManager;
import org.apache.hadoop.yarn.util.Clock;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.SystemClock;
import org.apache.log4j.LogManager;

import com.google.common.annotations.VisibleForTesting;

import javax.crypto.KeyGenerator;

/**
 * The Map-Reduce Application Master.
 * The state machine is encapsulated in the implementation of the Job
 * interface. All state changes happen via the Job interface, and each event
 * results in a finite state transition in the Job.
 *
 * The MR AppMaster is a composition of loosely coupled services. The
 * services interact with each other via events, in a style that resembles
 * the Actor model: each component acts on the events it receives and sends
 * events out to other components.
 * This keeps the AppMaster highly concurrent, with no or minimal
 * synchronization needs.
 *
 * The events are dispatched by a central dispatch mechanism. All components
 * register with the Dispatcher.
 *
 * Information is shared across the different components using the
 * AppContext.
 */

@SuppressWarnings("rawtypes")
public class MRAppMaster extends CompositeService {

    private static final Log LOG = LogFactory.getLog(MRAppMaster.class);

    /**
     * Priority of the MRAppMaster shutdown hook.
     */
    public static final int SHUTDOWN_HOOK_PRIORITY = 30;
    public static final String INTERMEDIATE_DATA_ENCRYPTION_ALGO = "HmacSHA1";

    private Clock clock;
    private final long startTime;
    private final long appSubmitTime;
    private String appName;
    private final ApplicationAttemptId appAttemptID;
    private final ContainerId containerID;
    private final String nmHost;
    private final int nmPort;
    private final int nmHttpPort;
    protected final MRAppMetrics metrics;
    private Map<TaskId, TaskInfo> completedTasksFromPreviousRun;
    private List<AMInfo> amInfos;
    private AppContext context;
    private Dispatcher dispatcher;
    private ClientService clientService;
    private ContainerAllocator containerAllocator;
    private ContainerLauncher containerLauncher;
    private EventHandler<CommitterEvent> committerEventHandler;
    private Speculator speculator;
    private TaskAttemptListener taskAttemptListener;
    private JobTokenSecretManager jobTokenSecretManager = new JobTokenSecretManager();
    private JobId jobId;
    private boolean newApiCommitter;
    private ClassLoader jobClassLoader;
    private OutputCommitter committer;
    private JobEventDispatcher jobEventDispatcher;
    private JobHistoryEventHandler jobHistoryEventHandler;
    private SpeculatorEventDispatcher speculatorEventDispatcher;
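    // Key material for encrypting intermediate spill files when encrypted
    // intermediate data is enabled; generated in initJobCredentialsAndUGI()
    // and handed to the TaskAttemptListener and, in uber mode, the
    // LocalContainerLauncher.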
    private byte[] encryptedSpillKey;

    // After a task attempt completes from TaskUmbilicalProtocol's point of
    // view, it is transitioned to the finishing state.
    // taskAttemptFinishingMonitor is just a timer for attempts in the
    // finishing state: if an attempt stays in the finishing state for too
    // long, the monitor notifies the attempt via a TA_TIMED_OUT event.
    private TaskAttemptFinishingMonitor taskAttemptFinishingMonitor;

    private Job job;
    private Credentials jobCredentials = new Credentials(); // Filled during init
    protected UserGroupInformation currentUser; // Will be setup during init

    @VisibleForTesting
    protected volatile boolean isLastAMRetry = false;
    //Something happened and we should shut down right after we start up.
    boolean errorHappenedShutDown = false;
    private String shutDownMessage = null;
    JobStateInternal forcedState = null;
    private final ScheduledExecutorService logSyncer;

    private long recoveredJobStartTime = -1L;
    private static boolean mainStarted = false;

    @VisibleForTesting
    protected AtomicBoolean successfullyUnregistered = new AtomicBoolean(false);

    public MRAppMaster(ApplicationAttemptId applicationAttemptId, ContainerId containerId, String nmHost,
            int nmPort, int nmHttpPort, long appSubmitTime) {
        this(applicationAttemptId, containerId, nmHost, nmPort, nmHttpPort, new SystemClock(), appSubmitTime);
    }

    public MRAppMaster(ApplicationAttemptId applicationAttemptId, ContainerId containerId, String nmHost,
            int nmPort, int nmHttpPort, Clock clock, long appSubmitTime) {
        super(MRAppMaster.class.getName());
        this.clock = clock;
        this.startTime = clock.getTime();
        this.appSubmitTime = appSubmitTime;
        this.appAttemptID = applicationAttemptId;
        this.containerID = containerId;
        this.nmHost = nmHost;
        this.nmPort = nmPort;
        this.nmHttpPort = nmHttpPort;
        this.metrics = MRAppMetrics.create();
        logSyncer = TaskLog.createLogSyncer();
        LOG.info("Created MRAppMaster for application " + applicationAttemptId);
    }

    protected TaskAttemptFinishingMonitor createTaskAttemptFinishingMonitor(EventHandler eventHandler) {
        TaskAttemptFinishingMonitor monitor = new TaskAttemptFinishingMonitor(eventHandler);
        return monitor;
    }

    @Override
    protected void serviceInit(final Configuration conf) throws Exception {
        // create the job classloader if enabled
        createJobClassLoader(conf);

        conf.setBoolean(Dispatcher.DISPATCHER_EXIT_ON_ERROR_KEY, true);

        initJobCredentialsAndUGI(conf);

        dispatcher = createDispatcher();
        addIfService(dispatcher);
        taskAttemptFinishingMonitor = createTaskAttemptFinishingMonitor(dispatcher.getEventHandler());
        addIfService(taskAttemptFinishingMonitor);
        context = new RunningAppContext(conf, taskAttemptFinishingMonitor);

        // Job name is the same as the app name until we support a DAG of jobs
        // for an app later
        appName = conf.get(MRJobConfig.JOB_NAME, "<missing app name>");

        conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, appAttemptID.getAttemptId());

        newApiCommitter = false;
        jobId = MRBuilderUtils.newJobId(appAttemptID.getApplicationId(), appAttemptID.getApplicationId().getId());
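        // Decide whether the new-API (org.apache.hadoop.mapreduce) committer
        // applies: with reducers present the reducer's API choice governs;
        // for map-only jobs the mapper's API choice governs.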
        int numReduceTasks = conf.getInt(MRJobConfig.NUM_REDUCES, 0);
        if ((numReduceTasks > 0 && conf.getBoolean("mapred.reducer.new-api", false))
                || (numReduceTasks == 0 && conf.getBoolean("mapred.mapper.new-api", false))) {
            newApiCommitter = true;
            LOG.info("Using mapred newApiCommitter.");
        }

        boolean copyHistory = false;
        committer = createOutputCommitter(conf);
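        // The commit protocol leaves marker files in the staging area: a
        // "start commit" marker recorded before the job commit begins, and an
        // end-commit success/failure marker written when it finishes. Checking
        // them tells a restarted AM whether a previous attempt died mid-commit.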
        try {
            String user = UserGroupInformation.getCurrentUser().getShortUserName();
            Path stagingDir = MRApps.getStagingAreaDir(conf, user);
            FileSystem fs = getFileSystem(conf);

            boolean stagingExists = fs.exists(stagingDir);
            Path startCommitFile = MRApps.getStartJobCommitFile(conf, user, jobId);
            boolean commitStarted = fs.exists(startCommitFile);
            Path endCommitSuccessFile = MRApps.getEndJobCommitSuccessFile(conf, user, jobId);
            boolean commitSuccess = fs.exists(endCommitSuccessFile);
            Path endCommitFailureFile = MRApps.getEndJobCommitFailureFile(conf, user, jobId);
            boolean commitFailure = fs.exists(endCommitFailureFile);
            if (!stagingExists) {
                isLastAMRetry = true;
                LOG.info("Attempt num: " + appAttemptID.getAttemptId() + " is last retry: " + isLastAMRetry
                        + " because the staging dir doesn't exist.");
                errorHappenedShutDown = true;
                forcedState = JobStateInternal.ERROR;
                shutDownMessage = "Staging dir does not exist " + stagingDir;
                LOG.fatal(shutDownMessage);
            } else if (commitStarted) {
                // A commit was started, so this is the last attempt; we just need
                // to know what result to use for notification and how to unregister.
                errorHappenedShutDown = true;
                isLastAMRetry = true;
                LOG.info("Attempt num: " + appAttemptID.getAttemptId() + " is last retry: " + isLastAMRetry
                        + " because a commit was started.");
                copyHistory = true;
                if (commitSuccess) {
                    shutDownMessage = "Job commit succeeded in a prior MRAppMaster attempt "
                            + "before it crashed. Recovering.";
                    forcedState = JobStateInternal.SUCCEEDED;
                } else if (commitFailure) {
                    shutDownMessage = "Job commit failed in a prior MRAppMaster attempt "
                            + "before it crashed. Not retrying.";
                    forcedState = JobStateInternal.FAILED;
                } else {
                    if (isCommitJobRepeatable()) {
                        // cleanup previous half done commits if committer supports
                        // repeatable job commit.
                        errorHappenedShutDown = false;
                        cleanupInterruptedCommit(conf, fs, startCommitFile);
                    } else {
                        //The commit is still pending, commit error
                        shutDownMessage = "Job commit from a prior MRAppMaster attempt is "
                                + "potentially in progress. Preventing multiple commit executions";
                        forcedState = JobStateInternal.ERROR;
                    }
                }
            }
        } catch (IOException e) {
            throw new YarnRuntimeException("Error while initializing", e);
        }

        if (errorHappenedShutDown) {
            NoopEventHandler eater = new NoopEventHandler();
            //We do not have a JobEventDispatcher in this path
            dispatcher.register(JobEventType.class, eater);

            EventHandler<JobHistoryEvent> historyService = null;
            if (copyHistory) {
                historyService = createJobHistoryHandler(context);
                dispatcher.register(org.apache.hadoop.mapreduce.jobhistory.EventType.class, historyService);
            } else {
                dispatcher.register(org.apache.hadoop.mapreduce.jobhistory.EventType.class, eater);
            }

            if (copyHistory) {
                // Now that the RM has a FINISHING state for applications, giving AMs
                // plenty of time to clean up after unregistering, it is safe to clean
                // the staging directory after unregistering with the RM. So we start
                // the staging-dir cleaner BEFORE the ContainerAllocator, so that on
                // shut-down the ContainerAllocator unregisters first and then the
                // staging-dir cleaner deletes the staging directory.
                addService(createStagingDirCleaningService());
            }

            // service to allocate containers from RM (if non-uber) or to fake it (uber)
            containerAllocator = createContainerAllocator(null, context);
            addIfService(containerAllocator);
            dispatcher.register(ContainerAllocator.EventType.class, containerAllocator);

            if (copyHistory) {
                // Add the JobHistoryEventHandler last so that it is properly stopped first.
                // This will guarantee that all history-events are flushed before AM goes
                // ahead with shutdown.
                // Note: Even though JobHistoryEventHandler is started last, if any
                // component creates a JobHistoryEvent in the meanwhile, it will just
                // be queued inside the JobHistoryEventHandler.
                addIfService(historyService);

                JobHistoryCopyService cpHist = new JobHistoryCopyService(appAttemptID,
                        dispatcher.getEventHandler());
                addIfService(cpHist);
            }
        } else {

            //service to handle requests from JobClient
            clientService = createClientService(context);
            // Init ClientService separately so that we can stop it separately:
            // this service needs to wait some time before it stops, so that
            // clients can learn the final job state.
            clientService.init(conf);

            containerAllocator = createContainerAllocator(clientService, context);

            //service to handle the output committer
            committerEventHandler = createCommitterEventHandler(context, committer);
            addIfService(committerEventHandler);

            //service to handle requests to TaskUmbilicalProtocol
            taskAttemptListener = createTaskAttemptListener(context);
            addIfService(taskAttemptListener);

            //service to log job history events
            EventHandler<JobHistoryEvent> historyService = createJobHistoryHandler(context);
            dispatcher.register(org.apache.hadoop.mapreduce.jobhistory.EventType.class, historyService);

            this.jobEventDispatcher = new JobEventDispatcher();

            //register the event dispatchers
            dispatcher.register(JobEventType.class, jobEventDispatcher);
            dispatcher.register(TaskEventType.class, new TaskEventDispatcher());
            dispatcher.register(TaskAttemptEventType.class, new TaskAttemptEventDispatcher());
            dispatcher.register(CommitterEventType.class, committerEventHandler);

            if (conf.getBoolean(MRJobConfig.MAP_SPECULATIVE, false)
                    || conf.getBoolean(MRJobConfig.REDUCE_SPECULATIVE, false)) {
                //optional service to speculate on task attempts' progress
                speculator = createSpeculator(conf, context);
                addIfService(speculator);
            }

            speculatorEventDispatcher = new SpeculatorEventDispatcher(conf);
            dispatcher.register(Speculator.EventType.class, speculatorEventDispatcher);

            // Now that the RM has a FINISHING state for applications, giving AMs
            // plenty of time to clean up after unregistering, it is safe to clean
            // the staging directory after unregistering with the RM. So we start
            // the staging-dir cleaner BEFORE the ContainerAllocator, so that on
            // shut-down the ContainerAllocator unregisters first and then the
            // staging-dir cleaner deletes the staging directory.
            addService(createStagingDirCleaningService());

            // service to allocate containers from RM (if non-uber) or to fake it (uber)
            addIfService(containerAllocator);
            dispatcher.register(ContainerAllocator.EventType.class, containerAllocator);

            // corresponding service to launch allocated containers via NodeManager
            containerLauncher = createContainerLauncher(context);
            addIfService(containerLauncher);
            dispatcher.register(ContainerLauncher.EventType.class, containerLauncher);

            // Add the JobHistoryEventHandler last so that it is properly stopped first.
            // This will guarantee that all history-events are flushed before AM goes
            // ahead with shutdown.
            // Note: Even though JobHistoryEventHandler is started last, if any
            // component creates a JobHistoryEvent in the meanwhile, it will just
            // be queued inside the JobHistoryEventHandler.
            addIfService(historyService);
        }
        super.serviceInit(conf);
    } // end of init()

    protected Dispatcher createDispatcher() {
        return new AsyncDispatcher();
    }

    private boolean isCommitJobRepeatable() throws IOException {
        boolean isRepeatable = false;
        Configuration conf = getConfig();
        if (committer != null) {
            final JobContext jobContext = getJobContextFromConf(conf);

            isRepeatable = callWithJobClassLoader(conf, new ExceptionAction<Boolean>() {
                public Boolean call(Configuration conf) throws IOException {
                    return committer.isCommitJobRepeatable(jobContext);
                }
            });
        }
        return isRepeatable;
    }

    private JobContext getJobContextFromConf(Configuration conf) {
        if (newApiCommitter) {
            return new JobContextImpl(conf, TypeConverter.fromYarn(getJobId()));
        } else {
            return new org.apache.hadoop.mapred.JobContextImpl(new JobConf(conf),
                    TypeConverter.fromYarn(getJobId()));
        }
    }

    private void cleanupInterruptedCommit(Configuration conf, FileSystem fs, Path startCommitFile)
            throws IOException {
        LOG.info("Delete startJobCommitFile in case commit is not finished as " + "successful or failed.");
        fs.delete(startCommitFile, false);
    }

    private OutputCommitter createOutputCommitter(Configuration conf) {
        return callWithJobClassLoader(conf, new Action<OutputCommitter>() {
            public OutputCommitter call(Configuration conf) {
                OutputCommitter committer = null;

                LOG.info("OutputCommitter set in config " + conf.get("mapred.output.committer.class"));

                if (newApiCommitter) {
                    org.apache.hadoop.mapreduce.v2.api.records.TaskId taskID = MRBuilderUtils.newTaskId(jobId, 0,
                            TaskType.MAP);
                    org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID = MRBuilderUtils
                            .newTaskAttemptId(taskID, 0);
                    TaskAttemptContext taskContext = new TaskAttemptContextImpl(conf,
                            TypeConverter.fromYarn(attemptID));
                    OutputFormat outputFormat;
                    try {
                        outputFormat = ReflectionUtils.newInstance(taskContext.getOutputFormatClass(), conf);
                        committer = outputFormat.getOutputCommitter(taskContext);
                    } catch (Exception e) {
                        throw new YarnRuntimeException(e);
                    }
                } else {
                    committer = ReflectionUtils.newInstance(conf.getClass("mapred.output.committer.class",
                            FileOutputCommitter.class, org.apache.hadoop.mapred.OutputCommitter.class), conf);
                }
                LOG.info("OutputCommitter is " + committer.getClass().getName());
                return committer;
            }
        });
    }

    private boolean isJobNamePatternMatch(JobConf conf, String jobTempDir) {
        // Matched staging files should be preserved after job is finished.
        if (conf.getKeepTaskFilesPattern() != null && jobTempDir != null) {
            String jobFileName = Paths.get(jobTempDir).getFileName().toString();
            Pattern pattern = Pattern.compile(conf.getKeepTaskFilesPattern());
            Matcher matcher = pattern.matcher(jobFileName);
            return matcher.find();
        } else {
            return false;
        }
    }

    private boolean isKeepFailedTaskFiles(JobConf conf) {
        // TODO: Decide which of the failed task files should be kept in the
        // application log directory.
        return conf.getKeepFailedTaskFiles();
    }

    protected boolean keepJobFiles(JobConf conf, String jobTempDir) {
        return isJobNamePatternMatch(conf, jobTempDir) || isKeepFailedTaskFiles(conf);
    }

    /**
     * Create the default file system for this job.
     * @param conf the conf object
     * @return the default filesystem for this job
     * @throws IOException
     */
    protected FileSystem getFileSystem(Configuration conf) throws IOException {
        return FileSystem.get(conf);
    }

    protected Credentials getCredentials() {
        return jobCredentials;
    }

    /**
     * Clean up the staging directory for the job.
     * @throws IOException
     */
    public void cleanupStagingDir() throws IOException {
        /* make sure we clean the staging files */
        String jobTempDir = getConfig().get(MRJobConfig.MAPREDUCE_JOB_DIR);
        FileSystem fs = getFileSystem(getConfig());
        try {
            if (!keepJobFiles(new JobConf(getConfig()), jobTempDir)) {
                if (jobTempDir == null) {
                    LOG.warn("Job Staging directory is null");
                    return;
                }
                Path jobTempDirPath = new Path(jobTempDir);
                LOG.info("Deleting staging directory " + FileSystem.getDefaultUri(getConfig()) + " " + jobTempDir);
                fs.delete(jobTempDirPath, true);
            }
        } catch (IOException io) {
            LOG.error("Failed to cleanup staging dir " + jobTempDir, io);
        }
    }

    /**
     * Exit call, wrapped in a method to enable testing.
     */
    protected void sysexit() {
        System.exit(0);
    }

    @VisibleForTesting
    public void shutDownJob() {
        // The job has finished, and it is the only job, so shut down the
        // AppMaster. Note that in a workflow scenario this may lead to the
        // creation of a new job (FIXME?).

        try {
            //if isLastAMRetry comes as true, should never set it to false
            if (!isLastAMRetry) {
                if (((JobImpl) job).getInternalState() != JobStateInternal.REBOOT) {
                    LOG.info("Job finished cleanly, recording last MRAppMaster retry");
                    isLastAMRetry = true;
                }
            }
            notifyIsLastAMRetry(isLastAMRetry);
            // Stop all services
            // This will also send the final report to the ResourceManager
            LOG.info("Calling stop for all the services");
            MRAppMaster.this.stop();

            if (isLastAMRetry) {
                // Send job-end notification when it is safe to report termination to
                // users and it is the last AM retry
                if (getConfig().get(MRJobConfig.MR_JOB_END_NOTIFICATION_URL) != null) {
                    try {
                        LOG.info("Job end notification started for jobID : " + job.getReport().getJobId());
                        JobEndNotifier notifier = new JobEndNotifier();
                        notifier.setConf(getConfig());
                        JobReport report = job.getReport();
                        // If unregistration fails, the final state is unavailable. However,
                        // at the last AM Retry, the client will finally be notified FAILED
                        // from RM, so we should let users know FAILED via notifier as well
                        if (!context.hasSuccessfullyUnregistered()) {
                            report.setJobState(JobState.FAILED);
                        }
                        notifier.notify(report);
                    } catch (InterruptedException ie) {
                        LOG.warn("Job end notification interrupted for jobID : " + job.getReport().getJobId(), ie);
                    }
                }
            }

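            // Give connected clients a moment to fetch the final job state
            // before the client service is stopped (see the note where
            // ClientService is initialized in serviceInit()).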
            try {
                Thread.sleep(5000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            clientService.stop();
        } catch (Throwable t) {
            LOG.warn("Graceful stop failed. Exiting.. ", t);
            exitMRAppMaster(1, t);
        }
        exitMRAppMaster(0, null);
    }

    /** MRAppMaster exit method which has been instrumented for both runtime and
     *  unit testing.
     * If the main thread has not been started, this method was called from a
     * test. In that case, configure the ExitUtil object to not exit the JVM.
     *
     * @param status integer indicating exit status
     * @param t throwable exception that could be null
     */
    private void exitMRAppMaster(int status, Throwable t) {
        if (!mainStarted) {
            ExitUtil.disableSystemExit();
        }
        try {
            if (t != null) {
                ExitUtil.terminate(status, t);
            } else {
                ExitUtil.terminate(status);
            }
        } catch (ExitUtil.ExitException ee) {
            // ExitUtil.ExitException is only thrown from the ExitUtil test code when
            // SystemExit has been disabled. It is always thrown in the test code,
            // even when no error occurs. Ignore the exception so that tests don't
            // need to handle it.
        }
    }

    private class JobFinishEventHandler implements EventHandler<JobFinishEvent> {
        @Override
        public void handle(JobFinishEvent event) {
            // Create a new thread to shutdown the AM. We should not do it in-line
            // to avoid blocking the dispatcher itself.
            new Thread() {

                @Override
                public void run() {
                    shutDownJob();
                }
            }.start();
        }
    }

    /**
     * create an event handler that handles the job finish event.
     * @return the job finish event handler.
     */
    protected EventHandler<JobFinishEvent> createJobFinishEventHandler() {
        return new JobFinishEventHandler();
    }

    /** Create and initialize (but don't start) a single job. 
     * @param forcedState a state to force the job into or null for normal operation. 
     * @param diagnostic a diagnostic message to include with the job.
     */
    protected Job createJob(Configuration conf, JobStateInternal forcedState, String diagnostic) {

        // create single job
        Job newJob = new JobImpl(jobId, appAttemptID, conf, dispatcher.getEventHandler(), taskAttemptListener,
                jobTokenSecretManager, jobCredentials, clock, completedTasksFromPreviousRun, metrics, committer,
                newApiCommitter, currentUser.getUserName(), appSubmitTime, amInfos, context, forcedState,
                diagnostic);
        ((RunningAppContext) context).jobs.put(newJob.getID(), newJob);

        dispatcher.register(JobFinishEvent.Type.class, createJobFinishEventHandler());
        return newJob;
    } // end createJob()

    /**
     * Obtain the tokens needed by the job and put them in the UGI.
     * @param conf the job configuration
     */
    protected void initJobCredentialsAndUGI(Configuration conf) {

        try {
            this.currentUser = UserGroupInformation.getCurrentUser();
            this.jobCredentials = ((JobConf) conf).getCredentials();
            if (CryptoUtils.isEncryptedSpillEnabled(conf)) {
                int keyLen = conf.getInt(MRJobConfig.MR_ENCRYPTED_INTERMEDIATE_DATA_KEY_SIZE_BITS,
                        MRJobConfig.DEFAULT_MR_ENCRYPTED_INTERMEDIATE_DATA_KEY_SIZE_BITS);
                KeyGenerator keyGen = KeyGenerator.getInstance(INTERMEDIATE_DATA_ENCRYPTION_ALGO);
                keyGen.init(keyLen);
                encryptedSpillKey = keyGen.generateKey().getEncoded();
            } else {
                encryptedSpillKey = new byte[] { 0 };
            }
        } catch (IOException e) {
            throw new YarnRuntimeException(e);
        } catch (NoSuchAlgorithmException e) {
            throw new YarnRuntimeException(e);
        }
    }

    protected EventHandler<JobHistoryEvent> createJobHistoryHandler(AppContext context) {
        this.jobHistoryEventHandler = new JobHistoryEventHandler(context, getStartCount());
        return this.jobHistoryEventHandler;
    }

    protected AbstractService createStagingDirCleaningService() {
        return new StagingDirCleaningService();
    }

    protected Speculator createSpeculator(Configuration conf, final AppContext context) {
        return callWithJobClassLoader(conf, new Action<Speculator>() {
            public Speculator call(Configuration conf) {
                Class<? extends Speculator> speculatorClass;
                try {
                    // "yarn.mapreduce.job.speculator.class"
                    speculatorClass = conf.getClass(MRJobConfig.MR_AM_JOB_SPECULATOR,
                            DefaultSpeculator.class, Speculator.class);
                    Constructor<? extends Speculator> speculatorConstructor = speculatorClass
                            .getConstructor(Configuration.class, AppContext.class);
                    return speculatorConstructor.newInstance(conf, context);
                } catch (InstantiationException | IllegalAccessException
                        | InvocationTargetException | NoSuchMethodException ex) {
                    LOG.error("Can't make a speculator -- check " + MRJobConfig.MR_AM_JOB_SPECULATOR, ex);
                    throw new YarnRuntimeException(ex);
                }
            }
        });
    }

    protected TaskAttemptListener createTaskAttemptListener(AppContext context) {
        TaskAttemptListener lis = new TaskAttemptListenerImpl(context, jobTokenSecretManager,
                getRMHeartbeatHandler(), encryptedSpillKey);
        return lis;
    }

    protected EventHandler<CommitterEvent> createCommitterEventHandler(AppContext context,
            OutputCommitter committer) {
        return new CommitterEventHandler(context, committer, getRMHeartbeatHandler(), jobClassLoader);
    }

    protected ContainerAllocator createContainerAllocator(final ClientService clientService,
            final AppContext context) {
        return new ContainerAllocatorRouter(clientService, context);
    }

    protected RMHeartbeatHandler getRMHeartbeatHandler() {
        return (RMHeartbeatHandler) containerAllocator;
    }

    protected ContainerLauncher createContainerLauncher(final AppContext context) {
        return new ContainerLauncherRouter(context);
    }

    // TODO: should have an interface for MRClientService
    protected ClientService createClientService(AppContext context) {
        return new MRClientService(context);
    }

    public ApplicationId getAppID() {
        return appAttemptID.getApplicationId();
    }

    public ApplicationAttemptId getAttemptID() {
        return appAttemptID;
    }

    public JobId getJobId() {
        return jobId;
    }

    public OutputCommitter getCommitter() {
        return committer;
    }

    public boolean isNewApiCommitter() {
        return newApiCommitter;
    }

    public int getStartCount() {
        return appAttemptID.getAttemptId();
    }

    public AppContext getContext() {
        return context;
    }

    public Dispatcher getDispatcher() {
        return dispatcher;
    }

    public Map<TaskId, TaskInfo> getCompletedTaskFromPreviousRun() {
        return completedTasksFromPreviousRun;
    }

    public List<AMInfo> getAllAMInfos() {
        return amInfos;
    }

    public ContainerAllocator getContainerAllocator() {
        return containerAllocator;
    }

    public ContainerLauncher getContainerLauncher() {
        return containerLauncher;
    }

    public TaskAttemptListener getTaskAttemptListener() {
        return taskAttemptListener;
    }

    public Boolean isLastAMRetry() {
        return isLastAMRetry;
    }

    /**
     * By the time the life-cycle of this router starts, job-init will have
     * already happened.
     */
    private final class ContainerAllocatorRouter extends AbstractService
            implements ContainerAllocator, RMHeartbeatHandler {
        private final ClientService clientService;
        private final AppContext context;
        private ContainerAllocator containerAllocator;

        ContainerAllocatorRouter(ClientService clientService, AppContext context) {
            super(ContainerAllocatorRouter.class.getName());
            this.clientService = clientService;
            this.context = context;
        }

        @Override
        protected void serviceStart() throws Exception {
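            // Uber jobs run their tasks inside this AM's own container, so
            // allocation is faked locally; otherwise containers are requested
            // from the ResourceManager.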
            if (job.isUber()) {
                MRApps.setupDistributedCacheLocal(getConfig());
                this.containerAllocator = new LocalContainerAllocator(this.clientService, this.context, nmHost,
                        nmPort, nmHttpPort, containerID);
            } else {
                this.containerAllocator = new RMContainerAllocator(this.clientService, this.context);
            }
            ((Service) this.containerAllocator).init(getConfig());
            ((Service) this.containerAllocator).start();
            super.serviceStart();
        }

        @Override
        protected void serviceStop() throws Exception {
            ServiceOperations.stop((Service) this.containerAllocator);
            super.serviceStop();
        }

        @Override
        public void handle(ContainerAllocatorEvent event) {
            this.containerAllocator.handle(event);
        }

        public void setSignalled(boolean isSignalled) {
            ((RMCommunicator) containerAllocator).setSignalled(isSignalled);
        }

        public void setShouldUnregister(boolean shouldUnregister) {
            ((RMCommunicator) containerAllocator).setShouldUnregister(shouldUnregister);
        }

        @Override
        public long getLastHeartbeatTime() {
            return ((RMCommunicator) containerAllocator).getLastHeartbeatTime();
        }

        @Override
        public void runOnNextHeartbeat(Runnable callback) {
            ((RMCommunicator) containerAllocator).runOnNextHeartbeat(callback);
        }
    }

    /**
     * By the time the life-cycle of this router starts, job-init will have
     * already happened.
     */
    private final class ContainerLauncherRouter extends AbstractService implements ContainerLauncher {
        private final AppContext context;
        private ContainerLauncher containerLauncher;

        ContainerLauncherRouter(AppContext context) {
            super(ContainerLauncherRouter.class.getName());
            this.context = context;
        }

        @Override
        protected void serviceStart() throws Exception {
            if (job.isUber()) {
                this.containerLauncher = new LocalContainerLauncher(context,
                        (TaskUmbilicalProtocol) taskAttemptListener, jobClassLoader);
                ((LocalContainerLauncher) this.containerLauncher).setEncryptedSpillKey(encryptedSpillKey);
            } else {
                this.containerLauncher = new ContainerLauncherImpl(context);
            }
            ((Service) this.containerLauncher).init(getConfig());
            ((Service) this.containerLauncher).start();
            super.serviceStart();
        }

        @Override
        public void handle(ContainerLauncherEvent event) {
            this.containerLauncher.handle(event);
        }

        @Override
        protected void serviceStop() throws Exception {
            ServiceOperations.stop((Service) this.containerLauncher);
            super.serviceStop();
        }
    }

    private final class StagingDirCleaningService extends AbstractService {
        StagingDirCleaningService() {
            super(StagingDirCleaningService.class.getName());
        }

        @Override
        protected void serviceStop() throws Exception {
            try {
                if (isLastAMRetry) {
                    cleanupStagingDir();
                } else {
                    LOG.info("Skipping cleaning up the staging dir. " + "assuming AM will be retried.");
                }
            } catch (IOException io) {
                LOG.error("Failed to cleanup staging dir: ", io);
            }
            super.serviceStop();
        }
    }

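    // Concrete AppContext handed to every component created above: the class
    // javadoc's "information is shared across the different components using
    // the AppContext" refers to this object.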
    public class RunningAppContext implements AppContext {

        private final Map<JobId, Job> jobs = new ConcurrentHashMap<JobId, Job>();
        private final Configuration conf;
        private final ClusterInfo clusterInfo = new ClusterInfo();
        private final ClientToAMTokenSecretManager clientToAMTokenSecretManager;

        private final TaskAttemptFinishingMonitor taskAttemptFinishingMonitor;

        public RunningAppContext(Configuration config, TaskAttemptFinishingMonitor taskAttemptFinishingMonitor) {
            this.conf = config;
            this.clientToAMTokenSecretManager = new ClientToAMTokenSecretManager(appAttemptID, null);
            this.taskAttemptFinishingMonitor = taskAttemptFinishingMonitor;
        }

        @Override
        public ApplicationAttemptId getApplicationAttemptId() {
            return appAttemptID;
        }

        @Override
        public ApplicationId getApplicationID() {
            return appAttemptID.getApplicationId();
        }

        @Override
        public String getApplicationName() {
            return appName;
        }

        @Override
        public long getStartTime() {
            return startTime;
        }

        @Override
        public Job getJob(JobId jobID) {
            return jobs.get(jobID);
        }

        @Override
        public Map<JobId, Job> getAllJobs() {
            return jobs;
        }

        @Override
        public EventHandler getEventHandler() {
            return dispatcher.getEventHandler();
        }

        @Override
        public CharSequence getUser() {
            return this.conf.get(MRJobConfig.USER_NAME);
        }

        @Override
        public Clock getClock() {
            return clock;
        }

        @Override
        public ClusterInfo getClusterInfo() {
            return this.clusterInfo;
        }

        @Override
        public Set<String> getBlacklistedNodes() {
            return ((RMContainerRequestor) containerAllocator).getBlacklistedNodes();
        }

        @Override
        public ClientToAMTokenSecretManager getClientToAMTokenSecretManager() {
            return clientToAMTokenSecretManager;
        }

        @Override
        public boolean isLastAMRetry() {
            return isLastAMRetry;
        }

        @Override
        public boolean hasSuccessfullyUnregistered() {
            return successfullyUnregistered.get();
        }

        public void markSuccessfulUnregistration() {
            successfullyUnregistered.set(true);
        }

        public void resetIsLastAMRetry() {
            isLastAMRetry = false;
        }

        @Override
        public String getNMHostname() {
            return nmHost;
        }

        @Override
        public TaskAttemptFinishingMonitor getTaskAttemptFinishingMonitor() {
            return taskAttemptFinishingMonitor;
        }

    }

    @SuppressWarnings("unchecked")
    @Override
    protected void serviceStart() throws Exception {

        amInfos = new LinkedList<AMInfo>();
        completedTasksFromPreviousRun = new HashMap<TaskId, TaskInfo>();
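        // On attempts after the first, processRecovery() repopulates
        // completedTasksFromPreviousRun and amInfos from the previous
        // attempt's job history file.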
        processRecovery();

        // Create an AMInfo for the current AM generation.
        AMInfo amInfo = MRBuilderUtils.newAMInfo(appAttemptID, startTime, containerID, nmHost, nmPort, nmHttpPort);

        // /////////////////// Create the job itself.
        job = createJob(getConfig(), forcedState, shutDownMessage);

        // End of creating the job.

        // Send out an MR AM inited event for all previous AMs.
        for (AMInfo info : amInfos) {
            dispatcher.getEventHandler()
                    .handle(new JobHistoryEvent(job.getID(),
                            new AMStartedEvent(info.getAppAttemptId(), info.getStartTime(), info.getContainerId(),
                                    info.getNodeManagerHost(), info.getNodeManagerPort(),
                                    info.getNodeManagerHttpPort(), appSubmitTime)));
        }

        // Send out an MR AM inited event for this AM.
        dispatcher.getEventHandler().handle(new JobHistoryEvent(job.getID(),
                new AMStartedEvent(amInfo.getAppAttemptId(), amInfo.getStartTime(), amInfo.getContainerId(),
                        amInfo.getNodeManagerHost(), amInfo.getNodeManagerPort(), amInfo.getNodeManagerHttpPort(),
                        this.forcedState == null ? null : this.forcedState.toString(), appSubmitTime)));
        amInfos.add(amInfo);

        // metrics system init is really init & start.
        // It's more test friendly to put it here.
        DefaultMetricsSystem.initialize("MRAppMaster");

        boolean initFailed = false;
        if (!errorHappenedShutDown) {
            // create a job event for job initialization
            JobEvent initJobEvent = new JobEvent(job.getID(), JobEventType.JOB_INIT);
            // Send init to the job (this does NOT trigger job execution)
            // This is a synchronous call, not an event through dispatcher. We want
            // job-init to be done completely here.
            jobEventDispatcher.handle(initJobEvent);

            // If job is still not initialized, an error happened during
            // initialization. Must complete starting all of the services so failure
            // events can be processed.
            initFailed = (((JobImpl) job).getInternalState() != JobStateInternal.INITED);

            // JobImpl's InitTransition is done (call above is synchronous), so the
            // "uber-decision" (MR-1220) has been made.  Query job and switch to
            // ubermode if appropriate (by registering different container-allocator
            // and container-launcher services/event-handlers).

            if (job.isUber()) {
                speculatorEventDispatcher.disableSpeculation();
                LOG.info("MRAppMaster uberizing job " + job.getID() + " in local container (\"uber-AM\") on node "
                        + nmHost + ":" + nmPort + ".");
            } else {
                // send init to speculator only for non-uber jobs. 
                // This won't yet start as dispatcher isn't started yet.
                dispatcher.getEventHandler().handle(new SpeculatorEvent(job.getID(), clock.getTime()));
                LOG.info("MRAppMaster launching normal, non-uberized, multi-container " + "job " + job.getID()
                        + ".");
            }
            // Start ClientService here, since it's not initialized if
            // errorHappenedShutDown is true
            clientService.start();
        }
        //start all the components
        super.serviceStart();

        // finally set the job classloader
        MRApps.setClassLoader(jobClassLoader, getConfig());

        if (initFailed) {
            JobEvent initFailedEvent = new JobEvent(job.getID(), JobEventType.JOB_INIT_FAILED);
            jobEventDispatcher.handle(initFailedEvent);
        } else {
            // All components have started, start the job.
            startJobs();
        }
    }

    protected void shutdownTaskLog() {
        TaskLog.syncLogsShutdown(logSyncer);
    }

    @Override
    public void stop() {
        super.stop();
        shutdownTaskLog();
    }

    private boolean isRecoverySupported() throws IOException {
        boolean isSupported = false;
        Configuration conf = getConfig();
        if (committer != null) {
            final JobContext _jobContext = getJobContextFromConf(conf);
            isSupported = callWithJobClassLoader(conf, new ExceptionAction<Boolean>() {
                public Boolean call(Configuration conf) throws IOException {
                    return committer.isRecoverySupported(_jobContext);
                }
            });
        }
        return isSupported;
    }

    private void processRecovery() throws IOException {
        if (appAttemptID.getAttemptId() == 1) {
            return; // no need to recover on the first attempt
        }

        boolean recoveryEnabled = getConfig().getBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE,
                MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE_DEFAULT);

        boolean recoverySupportedByCommitter = isRecoverySupported();

        // If a shuffle secret was not provided by the job client then this app
        // attempt will generate one.  However that disables recovery if there
        // are reducers as the shuffle secret would be app attempt specific.
        int numReduceTasks = getConfig().getInt(MRJobConfig.NUM_REDUCES, 0);
        boolean shuffleKeyValidForRecovery = TokenCache.getShuffleSecretKey(jobCredentials) != null;

        if (recoveryEnabled && recoverySupportedByCommitter
                && (numReduceTasks <= 0 || shuffleKeyValidForRecovery)) {
            LOG.info("Recovery is enabled. " + "Will try to recover from previous life on best effort basis.");
            try {
                parsePreviousJobHistory();
            } catch (IOException e) {
                LOG.warn("Unable to parse prior job history, aborting recovery", e);
                // try to get just the AMInfos
                amInfos.addAll(readJustAMInfos());
            }
        } else {
            LOG.info("Will not try to recover. recoveryEnabled: " + recoveryEnabled
                    + " recoverySupportedByCommitter: " + recoverySupportedByCommitter + " numReduceTasks: "
                    + numReduceTasks + " shuffleKeyValidForRecovery: " + shuffleKeyValidForRecovery
                    + " ApplicationAttemptID: " + appAttemptID.getAttemptId());
            // Get the amInfos anyway, whether or not recovery is enabled
            amInfos.addAll(readJustAMInfos());
        }
    }

    private static FSDataInputStream getPreviousJobHistoryStream(Configuration conf,
            ApplicationAttemptId appAttemptId) throws IOException {
        Path historyFile = JobHistoryUtils.getPreviousJobHistoryPath(conf, appAttemptId);
        LOG.info("Previous history file is at " + historyFile);
        return historyFile.getFileSystem(conf).open(historyFile);
    }

    private void parsePreviousJobHistory() throws IOException {
        FSDataInputStream in = getPreviousJobHistoryStream(getConfig(), appAttemptID);
        JobHistoryParser parser = new JobHistoryParser(in);
        JobInfo jobInfo = parser.parse();
        Exception parseException = parser.getParseException();
        if (parseException != null) {
            LOG.info("Got an error parsing job-history file" + ", ignoring incomplete events.", parseException);
        }
        Map<org.apache.hadoop.mapreduce.TaskID, TaskInfo> taskInfos = jobInfo.getAllTasks();
        for (TaskInfo taskInfo : taskInfos.values()) {
            if (TaskState.SUCCEEDED.toString().equals(taskInfo.getTaskStatus())) {
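                // Drop task attempts that never completed so they are not
                // replayed as finished work during recovery.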
                Iterator<Entry<TaskAttemptID, TaskAttemptInfo>> taskAttemptIterator = taskInfo.getAllTaskAttempts()
                        .entrySet().iterator();
                while (taskAttemptIterator.hasNext()) {
                    Map.Entry<TaskAttemptID, TaskAttemptInfo> currentEntry = taskAttemptIterator.next();
                    if (!jobInfo.getAllCompletedTaskAttempts().containsKey(currentEntry.getKey())) {
                        taskAttemptIterator.remove();
                    }
                }
                completedTasksFromPreviousRun.put(TypeConverter.toYarn(taskInfo.getTaskId()), taskInfo);
                LOG.info("Read from history task " + TypeConverter.toYarn(taskInfo.getTaskId()));
            }
        }
        LOG.info("Read completed tasks from history " + completedTasksFromPreviousRun.size());
        recoveredJobStartTime = jobInfo.getLaunchTime();

        // recover AMInfos
        List<JobHistoryParser.AMInfo> jhAmInfoList = jobInfo.getAMInfos();
        if (jhAmInfoList != null) {
            for (JobHistoryParser.AMInfo jhAmInfo : jhAmInfoList) {
                AMInfo amInfo = MRBuilderUtils.newAMInfo(jhAmInfo.getAppAttemptId(), jhAmInfo.getStartTime(),
                        jhAmInfo.getContainerId(), jhAmInfo.getNodeManagerHost(), jhAmInfo.getNodeManagerPort(),
                        jhAmInfo.getNodeManagerHttpPort());
                amInfos.add(amInfo);
            }
        }
    }

    private List<AMInfo> readJustAMInfos() {
        List<AMInfo> amInfos = new ArrayList<AMInfo>();
        FSDataInputStream inputStream = null;
        try {
            inputStream = getPreviousJobHistoryStream(getConfig(), appAttemptID);
            EventReader jobHistoryEventReader = new EventReader(inputStream);

            // All AMInfos are contiguous. Track when the first AMStartedEvent
            // appears.
            boolean amStartedEventsBegan = false;

            HistoryEvent event;
            while ((event = jobHistoryEventReader.getNextEvent()) != null) {
                if (event.getEventType() == EventType.AM_STARTED) {
                    // Mark the start of the contiguous run of AMStartedEvents
                    // (idempotent on subsequent AM_STARTED events).
                    amStartedEventsBegan = true;
                    AMStartedEvent amStartedEvent = (AMStartedEvent) event;
                    amInfos.add(MRBuilderUtils.newAMInfo(amStartedEvent.getAppAttemptId(),
                            amStartedEvent.getStartTime(), amStartedEvent.getContainerId(),
                            StringInterner.weakIntern(amStartedEvent.getNodeManagerHost()),
                            amStartedEvent.getNodeManagerPort(), amStartedEvent.getNodeManagerHttpPort()));
                } else if (amStartedEventsBegan) {
                    // AMStartedEvents began and this is a non-AMStarted event,
                    // so there is no need to read any further events.
                    break;
                }
            }
        } catch (IOException e) {
            LOG.warn("Could not parse the old history file. " + "Will not have old AMinfos ", e);
        } finally {
            // closeQuietly is null-safe, so no explicit null check is needed.
            IOUtils.closeQuietly(inputStream);
        }
        return amInfos;
    }
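
    // Illustrative note (not part of the original source): the loop above skips
    // any events that precede the first AM_STARTED and stops at the first
    // non-AM_STARTED event once the run has begun. For a hypothetical stream
    // such as
    //   JOB_SUBMITTED, AM_STARTED, AM_STARTED, JOB_INITED, ...
    // readJustAMInfos() would collect exactly the two AMInfos.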

    /**
     * This can be overridden to instantiate multiple jobs and create a
     * workflow.
     *
     * TODO: Rework the design to actually support this. Currently much of the
     * job setup has been moved to init() above to support uberization (MR-1220).
     * In a typical workflow, one presumably would want to uberize only a subset
     * of the jobs (the "small" ones), which is awkward with the current design.
     */
    @SuppressWarnings("unchecked")
    protected void startJobs() {
        // Create a job-start event to get the ball rolling.
        JobEvent startJobEvent = new JobStartEvent(job.getID(), recoveredJobStartTime);
        // Send the job-start event; this triggers the job execution.
        dispatcher.getEventHandler().handle(startJobEvent);
    }

    private class JobEventDispatcher implements EventHandler<JobEvent> {
        @SuppressWarnings("unchecked")
        @Override
        public void handle(JobEvent event) {
            ((EventHandler<JobEvent>) context.getJob(event.getJobId())).handle(event);
        }
    }

    private class TaskEventDispatcher implements EventHandler<TaskEvent> {
        @SuppressWarnings("unchecked")
        @Override
        public void handle(TaskEvent event) {
            Task task = context.getJob(event.getTaskID().getJobId()).getTask(event.getTaskID());
            ((EventHandler<TaskEvent>) task).handle(event);
        }
    }

    private class TaskAttemptEventDispatcher implements EventHandler<TaskAttemptEvent> {
        @SuppressWarnings("unchecked")
        @Override
        public void handle(TaskAttemptEvent event) {
            Job job = context.getJob(event.getTaskAttemptID().getTaskId().getJobId());
            Task task = job.getTask(event.getTaskAttemptID().getTaskId());
            TaskAttempt attempt = task.getAttempt(event.getTaskAttemptID());
            ((EventHandler<TaskAttemptEvent>) attempt).handle(event);
        }
    }

    private class SpeculatorEventDispatcher implements EventHandler<SpeculatorEvent> {
        private final Configuration conf;
        private volatile boolean disabled;

        public SpeculatorEventDispatcher(Configuration config) {
            this.conf = config;
        }

        @Override
        public void handle(final SpeculatorEvent event) {
            if (disabled) {
                return;
            }

            TaskId tId = event.getTaskID();
            TaskType tType = null;
            /* event's TaskId will be null if the event type is JOB_CREATE or
             * ATTEMPT_STATUS_UPDATE
             */
            if (tId != null) {
                tType = tId.getTaskType();
            }
            boolean shouldMapSpec = conf.getBoolean(MRJobConfig.MAP_SPECULATIVE, false);
            boolean shouldReduceSpec = conf.getBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);

            /* The point of the following is to allow the MAP and REDUCE speculative
             * config values to be independent:
             * IF spec-exec is turned on for maps AND the task is a map task
             * OR IF spec-exec is turned on for reduces AND the task is a reduce task
             * THEN call the speculator to handle the event.
             */
            if ((shouldMapSpec && (tType == null || tType == TaskType.MAP))
                    || (shouldReduceSpec && (tType == null || tType == TaskType.REDUCE))) {
                // Speculation IS enabled for this task type; route the event to the speculator.
                callWithJobClassLoader(conf, new Action<Void>() {
                    public Void call(Configuration conf) {
                        speculator.handle(event);
                        return null;
                    }
                });
            }
        }

        public void disableSpeculation() {
            disabled = true;
        }

    }
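
    // Illustrative sketch, not part of the original source: the gating logic in
    // handle() above, extracted as a pure predicate. The helper name below is
    // hypothetical and is shown only to make the MAP/REDUCE independence
    // explicit: a null task type (a JOB_CREATE or ATTEMPT_STATUS_UPDATE event)
    // passes whenever either kind of speculation is enabled.
    private static boolean speculationApplies(boolean mapSpec, boolean reduceSpec, TaskType type) {
        return (mapSpec && (type == null || type == TaskType.MAP))
                || (reduceSpec && (type == null || type == TaskType.REDUCE));
    }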

    /**
     * Eats events that are not needed in some error cases.
     */
    private static class NoopEventHandler implements EventHandler<Event> {

        @Override
        public void handle(Event event) {
            //Empty
        }
    }

    private static void validateInputParam(String value, String param) throws IOException {
        if (value == null) {
            String msg = param + " is null";
            LOG.error(msg);
            throw new IOException(msg);
        }
    }

    public static void main(String[] args) {
        try {
            mainStarted = true;
            Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
            String containerIdStr = System.getenv(Environment.CONTAINER_ID.name());
            String nodeHostString = System.getenv(Environment.NM_HOST.name());
            String nodePortString = System.getenv(Environment.NM_PORT.name());
            String nodeHttpPortString = System.getenv(Environment.NM_HTTP_PORT.name());
            String appSubmitTimeStr = System.getenv(ApplicationConstants.APP_SUBMIT_TIME_ENV);

            validateInputParam(containerIdStr, Environment.CONTAINER_ID.name());
            validateInputParam(nodeHostString, Environment.NM_HOST.name());
            validateInputParam(nodePortString, Environment.NM_PORT.name());
            validateInputParam(nodeHttpPortString, Environment.NM_HTTP_PORT.name());
            validateInputParam(appSubmitTimeStr, ApplicationConstants.APP_SUBMIT_TIME_ENV);

            ContainerId containerId = ContainerId.fromString(containerIdStr);
            ApplicationAttemptId applicationAttemptId = containerId.getApplicationAttemptId();
            if (applicationAttemptId != null) {
                CallerContext.setCurrent(
                        new CallerContext.Builder("mr_appmaster_" + applicationAttemptId.toString()).build());
            }
            long appSubmitTime = Long.parseLong(appSubmitTimeStr);

            MRAppMaster appMaster = new MRAppMaster(applicationAttemptId, containerId, nodeHostString,
                    Integer.parseInt(nodePortString), Integer.parseInt(nodeHttpPortString), appSubmitTime);
            ShutdownHookManager.get().addShutdownHook(new MRAppMasterShutdownHook(appMaster),
                    SHUTDOWN_HOOK_PRIORITY);
            JobConf conf = new JobConf(new YarnConfiguration());
            conf.addResource(new Path(MRJobConfig.JOB_CONF_FILE));

            MRWebAppUtil.initialize(conf);
            // log the system properties
            String systemPropsToLog = MRApps.getSystemPropertiesToLog(conf);
            if (systemPropsToLog != null) {
                LOG.info(systemPropsToLog);
            }

            String jobUserName = System.getenv(ApplicationConstants.Environment.USER.name());
            conf.set(MRJobConfig.USER_NAME, jobUserName);
            initAndStartAppMaster(appMaster, conf, jobUserName);
        } catch (Throwable t) {
            LOG.fatal("Error starting MRAppMaster", t);
            ExitUtil.terminate(1, t);
        }
    }

    // The shutdown hook that runs when a signal is received AND during normal
    // close of the JVM.
    static class MRAppMasterShutdownHook implements Runnable {
        MRAppMaster appMaster;

        MRAppMasterShutdownHook(MRAppMaster appMaster) {
            this.appMaster = appMaster;
        }

        public void run() {
            LOG.info("MRAppMaster received a signal. Signaling RMCommunicator and " + "JobHistoryEventHandler.");

            // Notify the JHEH and RMCommunicator that a SIGTERM has been received
            // so that they don't take too long shutting down.
            if (appMaster.containerAllocator instanceof ContainerAllocatorRouter) {
                ((ContainerAllocatorRouter) appMaster.containerAllocator).setSignalled(true);
            }
            appMaster.notifyIsLastAMRetry(appMaster.isLastAMRetry);
            appMaster.stop();
        }
    }

    public void notifyIsLastAMRetry(boolean isLastAMRetry) {
        if (containerAllocator instanceof ContainerAllocatorRouter) {
            LOG.info("Notify RMCommunicator isAMLastRetry: " + isLastAMRetry);
            ((ContainerAllocatorRouter) containerAllocator).setShouldUnregister(isLastAMRetry);
        }
        if (jobHistoryEventHandler != null) {
            LOG.info("Notify JHEH isAMLastRetry: " + isLastAMRetry);
            jobHistoryEventHandler.setForcejobCompletion(isLastAMRetry);
        }
    }

    protected static void initAndStartAppMaster(final MRAppMaster appMaster, final JobConf conf, String jobUserName)
            throws IOException, InterruptedException {
        UserGroupInformation.setConfiguration(conf);
        // Security framework already loaded the tokens into current UGI, just use
        // them
        Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
        LOG.info("Executing with tokens:");
        for (Token<?> token : credentials.getAllTokens()) {
            LOG.info(token);
        }

        UserGroupInformation appMasterUgi = UserGroupInformation.createRemoteUser(jobUserName);
        appMasterUgi.addCredentials(credentials);

        // Now remove the AM->RM token so tasks don't have it
        Iterator<Token<?>> iter = credentials.getAllTokens().iterator();
        while (iter.hasNext()) {
            Token<?> token = iter.next();
            if (token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) {
                iter.remove();
            }
        }
        conf.getCredentials().addAll(credentials);
        if (conf.getBoolean(CommonConfigurationKeys.IPC_SERVER_SSL_ENABLED,
                CommonConfigurationKeys.IPC_SERVER_SSL_ENABLED_DEFAULT)) {
            HopsUtil.generateContainerSSLServerConfiguration(conf);
        }
        appMasterUgi.doAs(new PrivilegedExceptionAction<Object>() {
            @Override
            public Object run() throws Exception {
                appMaster.init(conf);
                appMaster.start();
                if (appMaster.errorHappenedShutDown) {
                    throw new IOException("Was asked to shut down.");
                }
                return null;
            }
        });
    }

    /**
     * Creates a job classloader based on the configuration if the job classloader
     * is enabled. It is a no-op if the job classloader is not enabled.
     */
    private void createJobClassLoader(Configuration conf) throws IOException {
        jobClassLoader = MRApps.createJobClassLoader(conf);
    }

    /**
     * Executes the given action with the job classloader set as the configuration
     * classloader as well as the thread context class loader if the job
     * classloader is enabled. After the call, the original classloader is
     * restored.
     *
     * If the job classloader is enabled and the code needs to load user-supplied
     * classes via configuration or thread context classloader, this method should
     * be used in order to load them.
     *
     * @param conf the configuration on which the classloader will be set
     * @param action the callable action to be executed
     */
    <T> T callWithJobClassLoader(Configuration conf, Action<T> action) {
        // if the job classloader is enabled, we may need it to load the (custom)
        // classes; we make the job classloader available and unset it once it is
        // done
        ClassLoader currentClassLoader = conf.getClassLoader();
        boolean setJobClassLoader = jobClassLoader != null && currentClassLoader != jobClassLoader;
        if (setJobClassLoader) {
            MRApps.setClassLoader(jobClassLoader, conf);
        }
        try {
            return action.call(conf);
        } finally {
            if (setJobClassLoader) {
                // restore the original classloader
                MRApps.setClassLoader(currentClassLoader, conf);
            }
        }
    }
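
    // Usage sketch (hedged, not from the original source): wrapping a call that
    // may resolve user-supplied classes named in the configuration, e.g. a
    // custom committer. The createOutputCommitter helper below is hypothetical
    // and shown only for illustration.
    //
    //     OutputCommitter committer = callWithJobClassLoader(conf,
    //             new Action<OutputCommitter>() {
    //                 public OutputCommitter call(Configuration conf) {
    //                     return createOutputCommitter(conf);
    //                 }
    //             });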

    /**
     * Executes the given action that can throw a checked exception with the job
     * classloader set as the configuration classloader as well as the thread
     * context class loader if the job classloader is enabled. After the call, the
     * original classloader is restored.
     *
     * If the job classloader is enabled and the code needs to load user-supplied
     * classes via configuration or thread context classloader, this method should
     * be used in order to load them.
     *
     * @param conf the configuration on which the classloader will be set
     * @param action the callable action to be executed
     * @throws IOException if the underlying action throws an IOException
     * @throws YarnRuntimeException if the underlying action throws an exception
     * other than an IOException
     */
    <T> T callWithJobClassLoader(Configuration conf, ExceptionAction<T> action) throws IOException {
        // if the job classloader is enabled, we may need it to load the (custom)
        // classes; we make the job classloader available and unset it once it is
        // done
        ClassLoader currentClassLoader = conf.getClassLoader();
        boolean setJobClassLoader = jobClassLoader != null && currentClassLoader != jobClassLoader;
        if (setJobClassLoader) {
            MRApps.setClassLoader(jobClassLoader, conf);
        }
        try {
            return action.call(conf);
        } catch (IOException e) {
            throw e;
        } catch (YarnRuntimeException e) {
            throw e;
        } catch (Exception e) {
            // wrap it with a YarnRuntimeException
            throw new YarnRuntimeException(e);
        } finally {
            if (setJobClassLoader) {
                // restore the original classloader
                MRApps.setClassLoader(currentClassLoader, conf);
            }
        }
    }
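
    // Usage sketch (hedged, not from the original source): the checked variant
    // suits actions that perform I/O. Any exception other than IOException or
    // YarnRuntimeException is wrapped in a YarnRuntimeException, so callers
    // only need to declare IOException.
    //
    //     FileSystem fs = callWithJobClassLoader(conf,
    //             new ExceptionAction<FileSystem>() {
    //                 public FileSystem call(Configuration conf) throws Exception {
    //                     return FileSystem.get(conf); // may throw IOException
    //                 }
    //             });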

    /**
     * Action to be wrapped with setting and unsetting the job classloader.
     */
    private interface Action<T> {
        T call(Configuration conf);
    }

    /**
     * Same as {@link Action}, but the action may throw a checked exception,
     * which the caller rethrows as an IOException or wraps in a
     * YarnRuntimeException.
     */
    private interface ExceptionAction<T> {
        T call(Configuration conf) throws Exception;
    }

    protected void shutdownLogManager() {
        LogManager.shutdown();
    }

    @Override
    protected void serviceStop() throws Exception {
        super.serviceStop();
        shutdownLogManager();
    }

    public ClientService getClientService() {
        return clientService;
    }
}