Source code

Java tutorial


Here is the source code for org.apache.hadoop.mapred.RemoteJTProxy.java


/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.corona.ResourceGrant;
import org.apache.hadoop.corona.ResourceRequest;
import org.apache.hadoop.corona.SessionDriver;
import org.apache.hadoop.corona.Utilities;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.ipc.ProtocolSignature;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.mapred.CoronaCommitPermission.CommitPermissionServer;
import org.apache.hadoop.mapred.CoronaSessionInfo.InetSocketAddressWritable;

/**
 * The proxy used by the CoronaJobTracker in the client to communicate
 * with the CoronaJobTracker running on the TaskTracker in case of a
 * remote CoronaJobTracker.
 */
public class RemoteJTProxy implements InterCoronaJobTrackerProtocol, JobSubmissionProtocol {
    /** Logger for this class. */
    public static final Log LOG = LogFactory.getLog(RemoteJTProxy.class);
    /** Configuration key: amount of time (seconds) to wait for remote JT to launch. */
    public static final String REMOTE_JT_TIMEOUT_SEC_CONF = "mapred.coronajobtracker.remotejobtracker.wait";
    /**
     * Configuration key of the flag used for tests: whether to exclude the
     * failed remote job tracker.
     * NOTE(review): the key starts with "mpared", which looks like a typo for
     * "mapred". It is left unchanged because renaming it would silently ignore
     * any existing configuration that sets the misspelled key.
     */
    public static final String REMOTE_JT_EXCLUDE_FAILED = "mpared.coronajobtracker.remotejobtracker.exclude";
    /** Default amount of time (seconds) to wait for remote JT to launch. */
    public static final int REMOTE_JT_TIMEOUT_SEC_DEFAULT = 60;
    /** Configuration key: amount of time (seconds) for a RPC call timeout to remote JT. */
    public static final String REMOTE_JT_RPC_TIMEOUT_SEC_CONF = "mapred.coronajobtracker.remotejobtracker.rpc.timeout";
    /** Default RPC call timeout (seconds) to remote JT. */
    public static final int REMOTE_JT_RPC_TIMEOUT_SEC_DEFAULT = 3600;
    /** Configuration key: boolean, determines whether remote JT restart should restore state. */
    public static final String REMOTE_JT_STATE_RESTORING_CONF = "mapred.coronajobtracker.remote.state.restoring";
    /** By default the state restoring mechanism is used. */
    public static final boolean REMOTE_JT_STATE_RESTORING_DEFAULT = true;

    /** The proxy object to the CoronaJobTracker running in the cluster; null while no client exists. */
    private volatile JobSubmissionProtocol client;
    /**
     * Lock associated with the JobSubmissionProtocol client. It is created in
     * fair mode ({@code new ReentrantReadWriteLock(true)}); the original author
     * notes a fair lock is needed to enable early recovery after remote JT crash.
     */
    private ReadWriteLock clientLock = new ReentrantReadWriteLock(true);
    // TODO Giving priority to writers will increase performance.
    /** The host where the remote Job Tracker is running. */
    private String remoteJTHost;
    /** The port where the remote Job Tracker is running. */
    private int remoteJTPort;
    /** The task id for the current attempt of running CJT */
    private TaskAttemptID currentAttemptId;
    /** The number of the current attempt */
    private int attempt;
    /** Job configuration */
    private final JobConf conf;
    /** Parent JobTracker */
    private final CoronaJobTracker jt;
    /** The remote JT resource grant. */
    private ResourceGrant remoteJTGrant;
    /** The id of the job */
    private final JobID jobId;
    /** Is true iff the job has been submitted */
    private boolean isJobSubmitted = false;
    /** The session id for the job tracker running in the cluster */
    private String remoteSessionId;
    /** The number of remote JT restart attempts. */
    private volatile int numRemoteJTFailures;
    /** The limit for remote JT restart attempts number; 0 disables restarting. */
    private final int maxRemoteJTFailures;
    /** Current job attempt id; this is the id forwarded to the remote JT. */
    private JobID attemptJobId;
    /** Address of remote JT */
    private InetSocketAddress remoteJTAddr;
    /** Holds the exception raised by the most recent failed restart, if any. */
    public volatile IOException restartingException = null;
    /** Saved state updates from remote JT; null when state restoring is disabled. */
    private final CoronaJTState remoteJTState;
    /**
     * Authority that gives permission to commit; null when state restoring is
     * disabled. (The field name is spelled with three m's throughout the file.)
     */
    private final CommitPermissionServer commmitPermissionServer;

    private enum RemoteJTStatus {

    private RemoteJTStatus remoteJTStatus;

    // Internal control flag: the default answer to "should a failure of this
    // API call trigger remote JT (RJT) failover?" when failover is enabled.
    // Calls such as killJob and killTask override the answer and never
    // trigger failover.
    protected volatile boolean isRestartable = true;

     * Construct a proxy for the remote job tracker
     * @param jt parent job tracker
     * @param jobId id of the job the proxy is created for
     * @param conf job configuration
     * @throws IOException
    RemoteJTProxy(CoronaJobTracker jt, JobID jobId, JobConf conf) throws IOException {
        this.maxRemoteJTFailures = conf.getInt(CoronaJobTracker.MAX_JT_FAILURES_CONF,
        this.conf = conf;
        this.jt = jt;
        this.jobId = jobId;
        // Prepare first attempt.
        this.attemptJobId = jobId;
        attempt = 0;
        int partitionId = conf.getNumMapTasks() + 100000;
        currentAttemptId = new TaskAttemptID(new TaskID(attemptJobId, true, partitionId), attempt);
        remoteJTStatus = RemoteJTStatus.UNINITIALIZED;

        // Prepare stuff for restoring state if necessary
        if (isStateRestoringEnabled(conf)) {
            remoteJTState = new CoronaJTState();
            commmitPermissionServer = new CommitPermissionServer();
        } else {
            remoteJTState = null;
            commmitPermissionServer = null;

    public String getRemoteSessionId() {
        return remoteSessionId;

    // ///////////////////////////////////////////////////////////////////////////
    // InterCoronaJobTrackerProtocol
    // ///////////////////////////////////////////////////////////////////////////
    /**
     * Called when a remote CoronaJobTracker reports where it is running; creates
     * the RPC client to it while holding this object's monitor.
     * NOTE(review): this copy of the method appears to be missing lines (no
     * closing braces are present and {@code attemptId} is never used below) -
     * confirm against the upstream source before relying on it.
     */
    public void reportRemoteCoronaJobTracker(String attempt, String host, int port, String sessionId)
            throws IOException {
        // Parse the textual attempt id sent by the remote JT.
        TaskAttemptID attemptId = TaskAttemptID.forName(attempt);
        synchronized (this) {
            initializeClientUnprotected(host, port, sessionId);

    public InetSocketAddressWritable getNewJobTrackerAddress(InetSocketAddressWritable failedTracker)
            throws IOException {
        // Die immediately if restarting is disabled
        if (maxRemoteJTFailures == 0) {
            throw new IOException("Restarting remote JT is disabled.");
        assert remoteJTAddr != null : "Not started, but got request to restart.";
        if (clientLock.readLock().tryLock()) {
            // We're not restarting, check address
            InetSocketAddress seenAddr = remoteJTAddr;
            // seenAddr is safe to use, because even if restarting takes place, this
            // is the address of JT that either is fully running or dead
            // (not currently restarting)
            if (seenAddr.equals(failedTracker.getAddress())) {
                // Not restarted yet
                return null;
            } else {
      "Serving new job tracker address request with " + seenAddr + " old "
                        + failedTracker.getAddress());
                return new InetSocketAddressWritable(seenAddr);
        } else {
            // Currently restarting
            return null;

    /**
     * Receives state updates pushed by the remote job tracker.
     * Throws an IOException when state restoring is disabled (no saved state
     * object exists); otherwise iterates over the updates while holding the
     * monitor of the saved state.
     * NOTE(review): the loop body and closing braces are missing from this copy,
     * and {@code attempt} is not used in the visible lines - confirm against the
     * upstream source.
     */
    public void pushCoronaJobTrackerStateUpdate(TaskAttemptID attempt, CoronaStateUpdate[] updates)
            throws IOException {
        if (remoteJTState == null) {
            throw new IOException("Logic error: got state update but state restoring is disabled");
        } else {
            synchronized (remoteJTState) {
                for (CoronaStateUpdate update : updates) {

    /**
     * Returns the saved remote JT state, obtained from
     * {@code remoteJTState.prepare()} under the state object's monitor.
     * Throws an IOException when state restoring is disabled.
     * NOTE(review): {@code attemptId} is not used in the visible lines and the
     * closing braces are missing from this copy - confirm against upstream.
     */
    public CoronaJTState getCoronaJobTrackerState(TaskAttemptID attemptId) throws IOException {
        if (remoteJTState == null) {
            throw new IOException("Logic error: asked for remote JT state but state " + "restoring is disabled");
        } else {
            synchronized (remoteJTState) {
                return remoteJTState.prepare();

    /**
     * Forwards {@code toCommit} to the commit permission server and returns its
     * answer. Throws an IOException when no commit permission server exists
     * (it is only created when state restoring is enabled).
     * NOTE(review): {@code attemptId} is not used in the visible lines and the
     * closing braces are missing from this copy - confirm against upstream.
     */
    public TaskAttemptID[] getAndSetCommitting(TaskAttemptID attemptId, TaskAttemptID[] toCommit)
            throws IOException {
        if (commmitPermissionServer == null) {
            throw new IOException(
                    "Logic error: got getAndSet for committing attempt " + "but commit permission server is down");
        } else {
            return commmitPermissionServer.getAndSetCommitting(toCommit);

    public ProtocolSignature getProtocolSignature(String protocol, long clientVersion, int clientMethodsHash)
            throws IOException {
        return ProtocolSignature.getProtocolSignature(this, protocol, clientVersion, clientMethodsHash);

    public long getProtocolVersion(String protocol, long clientVersion) throws IOException {
        if (protocol.equals(InterCoronaJobTrackerProtocol.class.getName())) {
            return InterCoronaJobTrackerProtocol.versionID;
        } else {
            throw new IOException("Unknown protocol " + protocol);

     * Increment the attempt number for launching a remote corona job tracker.
     * Must be called only when holding the object lock.
    private void incrementAttemptUnprotected() {
        currentAttemptId = new TaskAttemptID(
                new TaskID(attemptJobId, currentAttemptId.isMap(), currentAttemptId.getTaskID().getId()), attempt);

     * Checks whether provided attempt id of remote JT matches currently set,
     * throws if not
     * @param attempt attempt id to check
     * @throws IOException
    private void checkAttempt(TaskAttemptID attemptId) throws IOException {
        if (!attemptId.equals(currentAttemptId)) {
            throw new IOException("Attempt " + attemptId + " does not match current attempt " + currentAttemptId);

    /**
     * Create the RPC client to the remote corona job tracker.
     * @param host The host running the remote corona job tracker.
     * @param port The port of the remote corona job tracker.
     * @param sessionId The session for the remote corona job tracker.
     * @throws IOException
     */
    void initializeClientUnprotected(String host, int port, String sessionId) throws IOException {
        // NOTE(review): the body of this guard and the start of the following
        // logging statement are missing from this copy - restore from upstream.
        if (client != null) {
        }"Creating JT client to " + host + ":" + port);
        // Connect timeout and per-call RPC timeout both come from the job configuration.
        long connectTimeout = RemoteJTProxy.getRemotJTTimeout(conf);
        int rpcTimeout = RemoteJTProxy.getRemoteJTRPCTimeout(conf);
        remoteJTAddr = new InetSocketAddress(host, port);
        client = RPC.waitForProtocolProxy(JobSubmissionProtocol.class, JobSubmissionProtocol.versionID,
                remoteJTAddr, conf, connectTimeout, rpcTimeout).getProxy();
        // Only reached when the proxy was obtained: record success and remember
        // where the remote JT lives so the client can be re-created later.
        remoteJTStatus = RemoteJTStatus.SUCCESS;
        remoteJTHost = host;
        remoteJTPort = port;
        remoteSessionId = sessionId;

        // NOTE(review): the body of this block is missing from this copy.
        if (remoteJTState != null) {

    /**
     * Drops the current RPC client (if any) and creates a new one for the
     * remembered host, port and session id. If no client exists afterwards the
     * status is set to FAILURE.
     * NOTE(review): closing braces (and possibly other short statements) are
     * missing from this copy - confirm against upstream.
     */
    private void reinitClientUnprotected() throws IOException {
        if (client != null) {
            client = null;
            remoteJTStatus = RemoteJTStatus.UNINITIALIZED;

        try {
            initializeClientUnprotected(remoteJTHost, remoteJTPort, remoteSessionId);
        } finally {
            // client stays null when initialization threw before assigning it.
            if (client == null) {
                remoteJTStatus = RemoteJTStatus.FAILURE;

    /**
     * Waits for the remote Corona JT to be ready.
     * This involves
     *    - getting a JOBTRACKER resource from the cluster manager.
     *    - starting the remote job tracker by connecting to the corona task
     *      tracker on the machine.
     *    - waiting for the remote job tracker to report its port back to this
     *      process.
     * @param jobConf The job configuration to use.
     * @throws IOException if the remote JT could not be started within the
     *         configured number of attempts, or if waiting was interrupted
     */
    public void waitForJTStart(JobConf jobConf) throws IOException {
        // Number of grant-and-start rounds to try before giving up (default 4).
        int maxJTAttempts = jobConf.getInt("mapred.coronajobtracker.remotejobtracker.attempts", 4);
        ResourceTracker resourceTracker = jt.getResourceTracker();
        SessionDriver sessionDriver = jt.getSessionDriver();
        List<ResourceGrant> excludeGrants = new ArrayList<ResourceGrant>();
        boolean toExcludeFailed = jobConf.getBoolean(REMOTE_JT_EXCLUDE_FAILED, true);
        // Release and blacklist failed JT grant.
        // NOTE(review): the bodies of the next two blocks are missing from this copy.
        if (remoteJTGrant != null) {
            if (toExcludeFailed) {
        for (int i = 0; i < maxJTAttempts; i++) {
            try {
                remoteJTGrant = waitForJTGrant(resourceTracker, sessionDriver, excludeGrants);
                boolean success = startRemoteJT(jobConf, remoteJTGrant);
                // NOTE(review): the success branch is empty and `released` is
                // unused in this copy; statements appear to be missing.
                if (success) {
                } else {
                    List<ResourceRequest> released = resourceTracker.getResourcesToRelease();
            } catch (InterruptedException e) {
                throw new IOException(e);
        throw new IOException("Could not start remote JT after " + maxJTAttempts + " attempts");

    /**
     * Wait for a JOBTRACKER grant.
     * @param resourceTracker The resource tracker object for getting the grant
     * @param sessionDriver The session driver for getting the grant
     * @param previousGrants Previous grants that could not be used successfully.
     * @return A new JOBTRACKER grant.
     * @throws IOException if the session driver reports a failure while waiting
     * @throws InterruptedException
     */
    // NOTE(review): several statements in this method are truncated or missing
    // in this copy (logging calls, loop and callback bodies, closing braces).
    private ResourceGrant waitForJTGrant(ResourceTracker resourceTracker, SessionDriver sessionDriver,
            List<ResourceGrant> previousGrants) throws IOException, InterruptedException {"Waiting for JT grant for " + attemptJobId);
        ResourceRequest req = resourceTracker.newJobTrackerRequest();
        for (ResourceGrant prev : previousGrants) {
  "Adding " + prev.getNodeName() + " to excluded hosts");
        List<ResourceRequest> newRequests = resourceTracker.getWantedResources();
        // Filled by the callback below - presumably with the granted resource;
        // the statement that adds to it is not present in this copy.
        final List<ResourceGrant> grants = new ArrayList<ResourceGrant>();
        ResourceTracker.ResourceProcessor proc = new ResourceTracker.ResourceProcessor() {
            public boolean processAvailableResource(ResourceGrant resource) {
                final boolean consumed = true;
                return consumed;
        while (true) {
            // Try to get JT grant while periodically checking for session driver
            // exceptions.
            long timeout = 60 * 1000; // 1 min.
            resourceTracker.processAvailableGrants(proc, 1, timeout);
            // Surface any failure recorded by the session driver.
            IOException e = sessionDriver.getFailed();
            if (e != null) {
                throw e;
            if (!grants.isEmpty()) {
                return grants.get(0);

    /**
     * Start corona job tracker on the machine provided by using the corona
     * task tracker API.
     * @param jobConf The job configuration.
     * @param grant The grant that specifies the remote machine.
     * @return A boolean indicating success.
     * @throws InterruptedException
     */
    // NOTE(review): this copy is missing closing braces, the start of two logging
    // statements, the tail of the CoronaSessionInfo constructor call and
    // (judging by the "Increment the attempt" comments) the increment calls.
    private boolean startRemoteJT(JobConf jobConf, ResourceGrant grant) throws InterruptedException {
        // Resolve the task tracker that owns the granted resource.
        org.apache.hadoop.corona.InetAddress ttAddr = Utilities.appInfoToAddress(grant.appInfo);
        CoronaTaskTrackerProtocol coronaTT = null;
        try {
            coronaTT = jt.getTaskTrackerClient(ttAddr.getHost(), ttAddr.getPort());
        } catch (IOException e) {
            LOG.error("Error while trying to connect to TT at " + ttAddr.getHost() + ":" + ttAddr.getPort(), e);
            return false;
        LOG.warn("Starting remote JT for " + attemptJobId + " on " + ttAddr.getHost());

        // Get a special map id for the JT task.
        Path systemDir = new Path(jt.getSystemDir());"startRemoteJT:systemDir " + systemDir.toString());
        String jobFile = CoronaJobInProgress.getJobFile(systemDir, attemptJobId).toString();"startRemoteJT:jobFile " + jobFile);
        String splitClass = JobClient.RawSplit.class.getName();
        // The JT is packaged as a MapTask with an empty split.
        BytesWritable split = new BytesWritable();
        Task jobTask = new MapTask(jobFile, currentAttemptId, currentAttemptId.getTaskID().getId(), splitClass,
                split, 1, jobConf.getUser());
        CoronaSessionInfo info = new CoronaSessionInfo(jt.getSessionId(), jt.getJobTrackerAddress(),
        synchronized (this) {
            try {
                coronaTT.startCoronaJobTracker(jobTask, info);
            } catch (IOException e) {
                // Increment the attempt so that the older attempt will get an error
                // in reportRemoteCoronaJobTracker().
                LOG.error("Error while performing RPC to TT at " + ttAddr.getHost() + ":" + ttAddr.getPort(), e);
                return false;

        // Now wait for the remote CJT to report its address.
        final long waitStart = System.currentTimeMillis();
        final long timeout = RemoteJTProxy.getRemotJTTimeout(jobConf);
        synchronized (this) {
            // client becomes non-null once initializeClientUnprotected() succeeds.
            while (client == null) {
                LOG.warn("Waiting for remote JT to start on " + ttAddr.getHost());
                if (client == null && System.currentTimeMillis() - waitStart > timeout) {
                    // Increment the attempt so that the older attempt will get an error
                    // in reportRemoteCoronaJobTracker().
                    LOG.warn("Could not start remote JT on " + ttAddr.getHost());
                    return false;
        return true;

     * Returns the timeout in milliseconds after which we timeout the remote job
     * tracker.
     * @param conf
     *          The configuration
     * @return The timeout in milliseconds.
    public static long getRemotJTTimeout(Configuration conf) {
        return conf.getInt(RemoteJTProxy.REMOTE_JT_TIMEOUT_SEC_CONF, RemoteJTProxy.REMOTE_JT_TIMEOUT_SEC_DEFAULT)
                * 1000;

    public static int getRemoteJTRPCTimeout(Configuration conf) {
        return conf.getInt(RemoteJTProxy.REMOTE_JT_RPC_TIMEOUT_SEC_CONF,
                RemoteJTProxy.REMOTE_JT_RPC_TIMEOUT_SEC_DEFAULT) * 1000;

    // ///////////////////////////////////////////////////////////////////////////
    // JobSubmissionProtocol
    // ///////////////////////////////////////////////////////////////////////////
    public JobID getNewJobId() throws IOException {
        throw new UnsupportedOperationException("getNewJobId not supported by proxy");

    public JobStatus submitJob(final JobID jobId) throws IOException {
        return (new Caller<JobStatus>() {
            JobStatus call(JobSubmissionProtocol myClient) throws IOException {
                // This is first time job submission. Called only once
                isJobSubmitted = true;
                return myClient.submitJob(attemptJobId);

    public ClusterStatus getClusterStatus(boolean detailed) throws IOException {
        throw new UnsupportedOperationException("getClusterStatus is not supported by proxy");

    public void killJob(final JobID jobId) throws IOException {
        (new Caller<JobID>() {

            // If the job tracker who hosting the job died,
            // will not do an automatic failover
            protected boolean isRestartableCall() {
                return false;

            JobID call(JobSubmissionProtocol myClient) throws IOException {
                return jobId;

    public void setJobPriority(JobID jobId, String priority) throws IOException {
        throw new UnsupportedOperationException("setJobPriority is not supported by proxy");

    public boolean killTask(final TaskAttemptID taskId, final boolean shouldFail) throws IOException {
        return (new Caller<Boolean>() {
            // If the job tracker who hosting the task died,
            // will not do an automatic failover
            protected boolean isRestartableCall() {
                return false;

            Boolean call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.killTask(taskId, shouldFail);

    public JobProfile getJobProfile(final JobID jobId) throws IOException {
        return (new Caller<JobProfile>() {
            JobProfile call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getJobProfile(attemptJobId);

    public JobStatus getJobStatus(final JobID jobId) throws IOException {
        return (new Caller<JobStatus>() {
            JobStatus call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getJobStatus(attemptJobId);

    public Counters getJobCounters(final JobID jobId) throws IOException {
        return (new Caller<Counters>() {
            Counters call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getJobCounters(attemptJobId);

    public TaskReport[] getMapTaskReports(final JobID jobId) throws IOException {
        return (new Caller<TaskReport[]>() {
            TaskReport[] call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getMapTaskReports(attemptJobId);

    public TaskReport[] getReduceTaskReports(final JobID jobId) throws IOException {
        return (new Caller<TaskReport[]>() {
            TaskReport[] call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getReduceTaskReports(attemptJobId);

    public TaskReport[] getCleanupTaskReports(final JobID jobId) throws IOException {
        return (new Caller<TaskReport[]>() {
            TaskReport[] call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getCleanupTaskReports(attemptJobId);

    public TaskReport[] getSetupTaskReports(final JobID jobId) throws IOException {
        return (new Caller<TaskReport[]>() {
            TaskReport[] call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getSetupTaskReports(attemptJobId);

    public String getFilesystemName() throws IOException {
        throw new UnsupportedOperationException("getFilesystemName is not supported by proxy");

    public JobStatus[] jobsToComplete() {
        throw new UnsupportedOperationException("jobsToComplete is not supported by proxy");

    public JobStatus[] getAllJobs() {
        throw new UnsupportedOperationException("getAllJobs is not supported by proxy");

    public TaskCompletionEvent[] getTaskCompletionEvents(final JobID jobid, final int fromEventId,
            final int maxEvents) throws IOException {
        return (new Caller<TaskCompletionEvent[]>() {
            TaskCompletionEvent[] call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getTaskCompletionEvents(attemptJobId, fromEventId, maxEvents);

    public String[] getTaskDiagnostics(final TaskAttemptID taskId) throws IOException {
        return (new Caller<String[]>() {
            String[] call(JobSubmissionProtocol myClient) throws IOException {
                return myClient.getTaskDiagnostics(taskId);

    public String getSystemDir() {
        throw new UnsupportedOperationException("getSystemDir not supported by proxy.");

    public JobQueueInfo[] getQueues() {
        throw new UnsupportedOperationException("getQueues method is " + "not supported by proxy.");

    public JobQueueInfo getQueueInfo(String queue) {
        throw new UnsupportedOperationException("getQueueInfo not supported by proxy.");

    public JobStatus[] getJobsFromQueue(String queue) {
        throw new UnsupportedOperationException("getJobsFromQueue not supported by proxy.");

    public QueueAclsInfo[] getQueueAclsForCurrentUser() throws IOException {
        throw new UnsupportedOperationException("getQueueAclsForCurrentUser not supported by proxy.");

    /**
     * Stop RPC client.
     */
    public void close() {
        try {
            // Drop the reference to the RPC client if one exists.
            // NOTE(review): no call that actually stops the proxy is visible here,
            // and the finally body and closing braces are missing from this copy -
            // confirm against upstream.
            if (client != null) {
                client = null;
        } finally {

    // ///////////////////////////////////////////////////////////////////////////
    // Remote CJT reincarnation.
    // ///////////////////////////////////////////////////////////////////////////
    /**
     * Generic caller: subclasses implement {@code call} and callers invoke
     * {@code makeCall}, which wraps it with retries.
     */
    private abstract class Caller<T> {
        /**
         * Perform the call. Must be overridden by a sub-class.
         * @param myClient the client to make the call with.
         * @return The generic return value.
         * @throws IOException
         */
        abstract T call(JobSubmissionProtocol myClient) throws IOException;

         * Overriding it to let the caller know if the current call is a
         * restartable one. It means if failed to call RJT, if we need to
         * do an automatic failover
        protected boolean isRestartableCall() {
            return isRestartable;

        /**
         * Template function to make the call.
         * @return The generic return value.
         * @throws IOException
         */
        public T makeCall() throws IOException {
            int curRestartNo;
            // If restart fails, exception will break this loop
            while (true) {
                // Snapshot the failure counter before calling, so a failure can be
                // attributed to the restart generation it happened in.
                curRestartNo = numRemoteJTFailures;
                try {
                    try {
                        return makeCallWithRetries();
                    // NOTE(review): the finally body, the restartable branch body
                    // and the closing braces are missing from this copy.
                    } finally {
                } catch (IOException e) {
                    LOG.error("Error on remote call with retries", e);
                    if (isRestartableCall()) {
                    } else {
                        // Non-restartable calls surface the error to the caller.
                        throw e;

        /**
         * Handles remote JT failure.
         * @param failureRestartNo numRemoteJTFailures when failure that issued this
         * call has occurred
         * @throws IOException if the restart fails or the restart limit has been
         * reached; an InterruptedException is wrapped in an IOException
         */
        // NOTE(review): several bodies and all closing braces are missing from
        // this copy (e.g. the synchronized block and the outer finally are empty).
        private void handleRemoteJTFailure(int failureRestartNo) throws IOException {
            try {
                // Equal counters mean no other thread has handled this failure yet.
                if (failureRestartNo == numRemoteJTFailures) {
                    try {
                        LOG.warn("failureRestartNo " + failureRestartNo + " maxRemoteFailures "
                                + maxRemoteJTFailures + " numFailure " + numRemoteJTFailures);

                        if (numRemoteJTFailures <= maxRemoteJTFailures) {
                            LOG.warn("JobTracker died or is unreachable." + " Restarting remote JT.");
                            synchronized (RemoteJTProxy.this) {
                        } else {
                            LOG.warn("JobTracker died or is unreachable." + " Reached restart number limit."
                                    + " Reporting to ClusterManager.");
                            if (remoteSessionId != null) {
                                // Kill remote session - it will release resources immediately
                            jt.close(false, true);
                            throw new IOException("Reached remote JT restart limit.");
                    } catch (IOException e) {
                        // Remember the failure so other threads can observe it.
                        restartingException = e;
                        throw restartingException;
                    } catch (InterruptedException e) {
                        restartingException = new IOException(e);
                        throw restartingException;
                } else {
                    // Other thread restarted remote JT, check if successfully
                    if (restartingException != null) {
                        throw new IOException(restartingException);
            } finally {

        /**
         * Restarts remote JT if there was running job and resubmits this job.
         * @throws IOException
         */
        // NOTE(review): many single statements and all closing braces are missing
        // from this copy; the comments below describe only what is visible.
        private void restartRemoteJTUnprotected() throws IOException {
            SessionDriver sessionDriver = jt.getSessionDriver();
            if (remoteSessionId != null) {
                // Kill remote session - new JT will acquire new session
            if (!isStateRestoringEnabled(conf)) {
                // Change attempt id only if we're not restoring state
                attemptJobId = prepareNextAttempt(attemptJobId);
            } else {
                // Tell the remote job tracker how many times the
                // remote job tracker has been restarted
                remoteJTState.restartNum = numRemoteJTFailures;
            // Stop RPC client.
            client = null;
            // Increment attempt to kill old JT on next connection attempt.
            if (sessionDriver != null) {
                sessionDriver.setName("Launch pending for " + conf.getJobName());
            // Restart remote JT, don't release client lock yet.
            // Resubmit job directly.
            try {
                if (isJobSubmitted) {
                    LOG.warn("Resubmitting job " + jobId.toString());
                // Set our info server url in parent JT and CM.
                // NOTE(review): `url` is not used in the visible lines.
                String url = getJobProfile(attemptJobId).getURL().toString();
                if (sessionDriver != null) {
                    sessionDriver.setName("Launched session " + getRemoteSessionId());
                // If reached this point assume success.
                LOG.warn("Successfully restarted remote JT.");
                if (remoteJTState != null) {
                    if (LOG.isInfoEnabled()) {
                        synchronized (remoteJTState) {
            } catch (IOException e) {
                // in case the new job tracker get failed when doing submitJob
                // or getJobProfile
                LOG.error("Exception happened when doing RJT restart, try it another time", e);

         * Prepares next attempt of job.
         * @param oldId a job id of last submitted attempt or id known by client
         * @return job id of next attempt
         * @throws IOException
        private JobID prepareNextAttempt(final JobID oldId) throws IOException {
            JobID newId = CoronaJobTracker.nextJobID(oldId);
            // TODO copy only necessary files
            Path oldJobDir = new Path(jt.getSystemDir(), oldId.toString()),
                    newJobDir = new Path(jt.getSystemDir(), newId.toString());
            FileSystem fs = FileSystem.get(conf);
  "oldJobDir " + oldJobDir.toString() + " newJobDir " + newJobDir.toString());
            // Copy job files.
            Path localTemp = new Path("file:///tmp/" + newId.toString());
            if (fs.exists(newJobDir)) {
      "newJobDir " + localTemp.toString() + " exists, delete it");
                fs.delete(newJobDir, true);
            if (!oldJobDir.equals(newJobDir) && fs.exists(oldJobDir)) {
                fs.copyToLocalFile(oldJobDir, localTemp);
                fs.moveFromLocalFile(localTemp, newJobDir);
  "Job files copied to " + newJobDir.toString());
            return newId;

        /**
         * Performs the call, retrying on IOException up to a fixed error limit;
         * a ConnectException is rethrown immediately without retrying.
         * NOTE(review): this copy is missing statements - nothing visible
         * increments errorCount, and the bodies of the limit check, the backoff
         * try block and the synchronized block are empty. Confirm against upstream.
         */
        private T makeCallWithRetries() throws IOException {
            int errorCount = 0;
            final int maxErrorCount = 10; // can make configurable later
            IOException lastException = null;
            while (errorCount < maxErrorCount) {
                try {
                    // Blocks until a client is available or fails if none can be.
                    JobSubmissionProtocol myClient = checkClient();
                    return call(myClient);
                } catch (ConnectException e) {
                    throw e;
                } catch (IOException e) {
                    lastException = e;
                    if (errorCount == maxErrorCount) {
                    } else {
                        // Backoff grows with the error count (milliseconds).
                        long backoff = errorCount * 1000;
                        LOG.warn("Retrying after error connecting to remote JT " + remoteJTHost + ":" + remoteJTPort
                                + " will wait " + backoff + " msec ", e);
                        try {
                        } catch (InterruptedException ie) {
                            throw new IOException(ie);
                        synchronized (RemoteJTProxy.this) {
            LOG.error("Too many errors " + errorCount + " in connecting to remote JT " + remoteJTHost + ":"
                    + remoteJTPort, lastException);
            throw lastException;

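    /**
     * Check if the RPC client to the remote job tracker is ready, and wait if
     * not.
     * @return the RPC client once it is available
     * @throws IOException if the remote job tracker is marked as failed or the
     *         wait is interrupted
     */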
    private JobSubmissionProtocol checkClient() throws IOException {
        synchronized (this) {
            // Loop until some other thread installs a client.
            // NOTE(review): the statement that actually waits (the source of the
            // InterruptedException caught below) and the closing braces are
            // missing from this copy.
            while (client == null) {
                try {
                    // Fail fast once re-initialization has been marked as failed.
                    if (remoteJTStatus == RemoteJTStatus.FAILURE) {
                        throw new IOException("Remote Job Tracker is not available");
                } catch (InterruptedException e) {
                    throw new IOException(e);
            return client;

     * Check job configuration if state restoring is enabled
     * @param conf configuration of job
     * @return true iff enabled
    public static boolean isStateRestoringEnabled(JobConf conf) {
        return isJTRestartingEnabled(conf)

     * Check job configuration if remote JT restarting is enabled
     * @param conf configuration of job
     * @return true iff enabled
    public static boolean isJTRestartingEnabled(JobConf conf) {
        return (0 < conf.getInt(CoronaJobTracker.MAX_JT_FAILURES_CONF, CoronaJobTracker.MAX_JT_FAILURES_DEFAULT));