org.apache.hadoop.mapred.ClientServiceDelegate.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapred.ClientServiceDelegate.java, the client-side delegate the MapReduce job client uses to route status and control calls to the right endpoint: the job's ApplicationMaster while it runs, the job history server after it finishes, or a local NotRunningJob stub when neither can answer.

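Before the listing, a minimal usage sketch. This is an illustration, not part of the class: in Hadoop itself the delegate is constructed by YARNRunner. The job id below is hypothetical, and the history-server proxy is left null for brevity, so the delegate falls back to NotRunningJob stubs once the ResourceManager no longer knows the job.

import org.apache.hadoop.mapred.ClientServiceDelegate;
import org.apache.hadoop.mapred.ResourceMgrDelegate;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class ClientServiceDelegateExample {
    public static void main(String[] args) throws Exception {
        YarnConfiguration conf = new YarnConfiguration();
        // Talks to the ResourceManager configured in yarn-site.xml.
        ResourceMgrDelegate rm = new ResourceMgrDelegate(conf);
        JobID jobId = JobID.forName("job_1400000000000_0001"); // hypothetical id
        ClientServiceDelegate delegate = new ClientServiceDelegate(conf, rm, jobId, null);
        try {
            JobStatus status = delegate.getJobStatus(jobId);
            System.out.println(status == null ? "no report" : status.getState());
        } finally {
            delegate.close(); // also closes the ResourceMgrDelegate
        }
    }
}
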
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.InetSocketAddress;
import java.security.PrivilegedExceptionAction;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.v2.LogParams;
import org.apache.hadoop.mapreduce.v2.api.MRClientProtocol;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.FailTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetCountersResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDiagnosticsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetDiagnosticsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetJobReportResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptCompletionEventsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptCompletionEventsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptReportRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskAttemptReportResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportsRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.GetTaskReportsResponse;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillJobRequest;
import org.apache.hadoop.mapreduce.v2.api.protocolrecords.KillTaskAttemptRequest;
import org.apache.hadoop.mapreduce.v2.api.records.AMInfo;
import org.apache.hadoop.mapreduce.v2.api.records.Counters;
import org.apache.hadoop.mapreduce.v2.api.records.JobReport;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptReport;
import org.apache.hadoop.mapreduce.v2.util.MRApps;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.AuthorizationException;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.ipc.YarnRPC;
import org.apache.hadoop.yarn.security.client.ClientToAMTokenIdentifier;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;

public class ClientServiceDelegate {
    private static final Logger LOG = LoggerFactory.getLogger(ClientServiceDelegate.class);
    private static final String UNAVAILABLE = "N/A";

    // Caches for per-user NotRunningJobs
    private HashMap<JobState, HashMap<String, NotRunningJob>> notRunningJobs;

    private final Configuration conf;
    private final JobID jobId;
    private final ApplicationId appId;
    private final ResourceMgrDelegate rm;
    private final MRClientProtocol historyServerProxy;
    private MRClientProtocol realProxy = null;
    private RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null);
    private static final String UNKNOWN_USER = "Unknown User";
    private String trackingUrl;
    private AtomicBoolean usingAMProxy = new AtomicBoolean(false);
    private int maxClientRetry;
    private boolean amAclDisabledStatusLogged = false;

    public ClientServiceDelegate(Configuration conf, ResourceMgrDelegate rm, JobID jobId,
            MRClientProtocol historyServerProxy) {
        this.conf = new Configuration(conf); // Cloning for modifying.
        // For faster redirects from AM to HS.
        this.conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, this.conf.getInt(
                MRJobConfig.MR_CLIENT_TO_AM_IPC_MAX_RETRIES, MRJobConfig.DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES));
        this.conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
                this.conf.getInt(MRJobConfig.MR_CLIENT_TO_AM_IPC_MAX_RETRIES_ON_TIMEOUTS,
                        MRJobConfig.DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES_ON_TIMEOUTS));
        this.rm = rm;
        this.jobId = jobId;
        this.historyServerProxy = historyServerProxy;
        this.appId = TypeConverter.toYarn(jobId).getAppId();
        notRunningJobs = new HashMap<JobState, HashMap<String, NotRunningJob>>();
    }

    // Get the instance of the NotRunningJob corresponding to the specified
    // user and state
    private NotRunningJob getNotRunningJob(ApplicationReport applicationReport, JobState state) {
        synchronized (notRunningJobs) {
            HashMap<String, NotRunningJob> map = notRunningJobs.get(state);
            if (map == null) {
                map = new HashMap<String, NotRunningJob>();
                notRunningJobs.put(state, map);
            }
            String user = (applicationReport == null) ? UNKNOWN_USER : applicationReport.getUser();
            NotRunningJob notRunningJob = map.get(user);
            if (notRunningJob == null) {
                notRunningJob = new NotRunningJob(applicationReport, state);
                map.put(user, notRunningJob);
            }
            return notRunningJob;
        }
    }

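    // Resolve the endpoint for MRClientProtocol calls: the live AM while the
    // application is RUNNING, the history server once it has FINISHED, and a
    // NotRunningJob stub while the job is still allocating or after it failed
    // or was killed.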
    private MRClientProtocol getProxy() throws IOException {
        if (realProxy != null) {
            return realProxy;
        }

        // Possibly allow nulls through the PB tunnel, otherwise deal with an exception
        // and redirect to the history server.
        ApplicationReport application = null;
        try {
            application = rm.getApplicationReport(appId);
        } catch (ApplicationNotFoundException e) {
            application = null;
        } catch (YarnException e2) {
            throw new IOException(e2);
        }
        if (application != null) {
            trackingUrl = application.getTrackingUrl();
        }
        InetSocketAddress serviceAddr = null;
        while (application == null || YarnApplicationState.RUNNING == application.getYarnApplicationState()) {
            if (application == null) {
                LOG.info(
                        "Could not get Job info from RM for job " + jobId + ". Redirecting to job history server.");
                return checkAndGetHSProxy(null, JobState.NEW);
            }
            try {
                if (application.getHost() == null || "".equals(application.getHost())) {
                    LOG.debug("AM not assigned to Job. Waiting to get the AM ...");
                    Thread.sleep(2000);

                    LOG.debug("Application state is " + application.getYarnApplicationState());
                    application = rm.getApplicationReport(appId);
                    continue;
                } else if (UNAVAILABLE.equals(application.getHost())) {
                    if (!amAclDisabledStatusLogged) {
                        LOG.info("Job " + jobId + " is running, but the host is unknown."
                                + " Verify user has VIEW_JOB access.");
                        amAclDisabledStatusLogged = true;
                    }
                    return getNotRunningJob(application, JobState.RUNNING);
                }
                if (!conf.getBoolean(MRJobConfig.JOB_AM_ACCESS_DISABLED, false)) {
                    UserGroupInformation newUgi = UserGroupInformation
                            .createRemoteUser(UserGroupInformation.getCurrentUser().getUserName());
                    serviceAddr = NetUtils.createSocketAddrForHost(application.getHost(), application.getRpcPort());
                    if (UserGroupInformation.isSecurityEnabled()) {
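                        // In secure mode the RM supplies a client-to-AM token;
                        // convert it into an RPC token bound to the AM address
                        // and attach it to the UGI that opens the connection.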
                        org.apache.hadoop.yarn.api.records.Token clientToAMToken = application.getClientToAMToken();
                        Token<ClientToAMTokenIdentifier> token = ConverterUtils.convertFromYarn(clientToAMToken,
                                serviceAddr);
                        newUgi.addToken(token);
                    }
                    LOG.debug("Connecting to " + serviceAddr);
                    final InetSocketAddress finalServiceAddr = serviceAddr;
                    realProxy = newUgi.doAs(new PrivilegedExceptionAction<MRClientProtocol>() {
                        @Override
                        public MRClientProtocol run() throws IOException {
                            return instantiateAMProxy(finalServiceAddr);
                        }
                    });
                } else {
                    if (!amAclDisabledStatusLogged) {
                        LOG.info("Network ACL closed to AM for job " + jobId
                                + ". Not going to try to reach the AM.");
                        amAclDisabledStatusLogged = true;
                    }
                    return getNotRunningJob(null, JobState.RUNNING);
                }
                return realProxy;
            } catch (IOException e) {
                //possibly the AM has crashed
                //there may be some time before AM is restarted
                //keep retrying by getting the address from RM
                LOG.info("Could not connect to " + serviceAddr + ". Waiting to get the latest AM address...");
                try {
                    Thread.sleep(2000);
                } catch (InterruptedException e1) {
                    LOG.warn("getProxy() call interrupted", e1);
                    throw new YarnRuntimeException(e1);
                }
                try {
                    application = rm.getApplicationReport(appId);
                } catch (YarnException e1) {
                    throw new IOException(e1);
                }
                if (application == null) {
                    LOG.info("Could not get Job info from RM for job " + jobId
                            + ". Redirecting to job history server.");
                    return checkAndGetHSProxy(null, JobState.RUNNING);
                }
            } catch (InterruptedException e) {
                LOG.warn("getProxy() call interrupted", e);
                throw new YarnRuntimeException(e);
            } catch (YarnException e) {
                throw new IOException(e);
            }
        }

        /*
         * We just want to return if it's allocating, so that we don't block
         * on it. This makes it possible to return job status for an
         * application that is still allocating.
         */
        String user = application.getUser();
        if (user == null) {
            throw new IOException("User is not set in the application report");
        }
        if (application.getYarnApplicationState() == YarnApplicationState.NEW
                || application.getYarnApplicationState() == YarnApplicationState.NEW_SAVING
                || application.getYarnApplicationState() == YarnApplicationState.SUBMITTED
                || application.getYarnApplicationState() == YarnApplicationState.ACCEPTED) {
            realProxy = null;
            return getNotRunningJob(application, JobState.NEW);
        }

        if (application.getYarnApplicationState() == YarnApplicationState.FAILED) {
            realProxy = null;
            return getNotRunningJob(application, JobState.FAILED);
        }

        if (application.getYarnApplicationState() == YarnApplicationState.KILLED) {
            realProxy = null;
            return getNotRunningJob(application, JobState.KILLED);
        }

        // The history server can serve a job only if the application
        // succeeded.
        if (application.getYarnApplicationState() == YarnApplicationState.FINISHED) {
            LOG.info("Application state is completed. FinalApplicationStatus="
                    + application.getFinalApplicationStatus().toString() + ". Redirecting to job history server");
            realProxy = checkAndGetHSProxy(application, JobState.SUCCEEDED);
        }
        return realProxy;
    }

    private MRClientProtocol checkAndGetHSProxy(ApplicationReport applicationReport, JobState state) {
        if (null == historyServerProxy) {
            LOG.warn("Job History Server is not configured.");
            return getNotRunningJob(applicationReport, state);
        }
        return historyServerProxy;
    }

    MRClientProtocol instantiateAMProxy(final InetSocketAddress serviceAddr) throws IOException {
        LOG.trace("Connecting to ApplicationMaster at: " + serviceAddr);
        YarnRPC rpc = YarnRPC.create(conf);
        MRClientProtocol proxy = (MRClientProtocol) rpc.getProxy(MRClientProtocol.class, serviceAddr, conf);
        usingAMProxy.set(true);
        LOG.trace("Connected to ApplicationMaster at: " + serviceAddr);
        return proxy;
    }

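    // Reflectively dispatch a single MRClientProtocol call, re-resolving the
    // proxy and retrying (up to maxClientRetry times) whenever the AM or the
    // history server becomes unreachable mid-call.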
    private synchronized Object invoke(String method, Class<?> argClass, Object args) throws IOException {
        Method methodOb = null;
        try {
            methodOb = MRClientProtocol.class.getMethod(method, argClass);
        } catch (SecurityException e) {
            throw new YarnRuntimeException(e);
        } catch (NoSuchMethodException e) {
            throw new YarnRuntimeException("Method name mismatch", e);
        }
        maxClientRetry = this.conf.getInt(MRJobConfig.MR_CLIENT_MAX_RETRIES,
                MRJobConfig.DEFAULT_MR_CLIENT_MAX_RETRIES);
        IOException lastException = null;
        while (maxClientRetry > 0) {
            MRClientProtocol proxy = null;
            try {
                proxy = getProxy();
                return methodOb.invoke(proxy, args);
            } catch (InvocationTargetException e) {
                // Will not throw out YarnException anymore
                LOG.debug("Failed to contact AM/History for job " + jobId + " retrying..", e.getTargetException());
                // Force reconnection by setting the proxy to null.
                realProxy = null;
                // HS/AMS shut down

                if (e.getCause() instanceof AuthorizationException) {
                    throw new IOException(e.getTargetException());
                }

                // If it is the AM that shut down, do not decrement maxClientRetry
                // while we wait for the AM to be restarted.
                if (!usingAMProxy.get()) {
                    maxClientRetry--;
                }
                usingAMProxy.set(false);
                lastException = new IOException(e.getTargetException());
                try {
                    Thread.sleep(100);
                } catch (InterruptedException ie) {
                    LOG.warn("ClientServiceDelegate invoke call interrupted", ie);
                    throw new YarnRuntimeException(ie);
                }
            } catch (Exception e) {
                LOG.debug("Failed to contact AM/History for job " + jobId + ". Will retry..", e);
                // Force reconnection by setting the proxy to null.
                realProxy = null;
                // RM shutdown
                maxClientRetry--;
                lastException = new IOException(e.getMessage());
                try {
                    Thread.sleep(100);
                } catch (InterruptedException ie) {
                    LOG.warn("ClientServiceDelegate invoke call interrupted", ie);
                    throw new YarnRuntimeException(ie);
                }
            }
        }
        throw lastException;
    }

    // Only for testing
    @VisibleForTesting
    public int getMaxClientRetry() {
        return this.maxClientRetry;
    }

    public org.apache.hadoop.mapreduce.Counters getJobCounters(JobID oldJobID)
            throws IOException, InterruptedException {
        org.apache.hadoop.mapreduce.v2.api.records.JobId jobID = TypeConverter.toYarn(oldJobID);
        GetCountersRequest request = recordFactory.newRecordInstance(GetCountersRequest.class);
        request.setJobId(jobID);
        Counters cnt = ((GetCountersResponse) invoke("getCounters", GetCountersRequest.class, request))
                .getCounters();
        return TypeConverter.fromYarn(cnt);
    }

    public TaskCompletionEvent[] getTaskCompletionEvents(JobID oldJobID, int fromEventId, int maxEvents)
            throws IOException, InterruptedException {
        org.apache.hadoop.mapreduce.v2.api.records.JobId jobID = TypeConverter.toYarn(oldJobID);
        GetTaskAttemptCompletionEventsRequest request = recordFactory
                .newRecordInstance(GetTaskAttemptCompletionEventsRequest.class);
        request.setJobId(jobID);
        request.setFromEventId(fromEventId);
        request.setMaxEvents(maxEvents);
        List<org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent> list = ((GetTaskAttemptCompletionEventsResponse) invoke(
                "getTaskAttemptCompletionEvents", GetTaskAttemptCompletionEventsRequest.class, request))
                        .getCompletionEventList();
        return TypeConverter.fromYarn(
                list.toArray(new org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptCompletionEvent[0]));
    }

    public String[] getTaskDiagnostics(org.apache.hadoop.mapreduce.TaskAttemptID taskAttemptID)
            throws IOException, InterruptedException {

        org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID = TypeConverter.toYarn(taskAttemptID);
        GetDiagnosticsRequest request = recordFactory.newRecordInstance(GetDiagnosticsRequest.class);
        request.setTaskAttemptId(attemptID);
        List<String> list = ((GetDiagnosticsResponse) invoke("getDiagnostics", GetDiagnosticsRequest.class,
                request)).getDiagnosticsList();
        return list.toArray(new String[0]);
    }

    public JobStatus getJobStatus(JobID oldJobID) throws IOException {
        org.apache.hadoop.mapreduce.v2.api.records.JobId jobId = TypeConverter.toYarn(oldJobID);
        GetJobReportRequest request = recordFactory.newRecordInstance(GetJobReportRequest.class);
        request.setJobId(jobId);
        JobReport report = ((GetJobReportResponse) invoke("getJobReport", GetJobReportRequest.class, request))
                .getJobReport();
        JobStatus jobStatus = null;
        if (report != null) {
            if (StringUtils.isEmpty(report.getJobFile())) {
                String jobFile = MRApps.getJobFile(conf, report.getUser(), oldJobID);
                report.setJobFile(jobFile);
            }
            String historyTrackingUrl = report.getTrackingUrl();
            String url = StringUtils.isNotEmpty(historyTrackingUrl) ? historyTrackingUrl : trackingUrl;
            jobStatus = TypeConverter.fromYarn(report, url);
        }
        return jobStatus;
    }

    public org.apache.hadoop.mapreduce.TaskReport[] getTaskReports(JobID oldJobID, TaskType taskType)
            throws IOException {
        org.apache.hadoop.mapreduce.v2.api.records.JobId jobId = TypeConverter.toYarn(oldJobID);
        GetTaskReportsRequest request = recordFactory.newRecordInstance(GetTaskReportsRequest.class);
        request.setJobId(jobId);
        request.setTaskType(TypeConverter.toYarn(taskType));

        List<org.apache.hadoop.mapreduce.v2.api.records.TaskReport> taskReports = ((GetTaskReportsResponse) invoke(
                "getTaskReports", GetTaskReportsRequest.class, request)).getTaskReportList();

        return TypeConverter.fromYarn(taskReports).toArray(new org.apache.hadoop.mapreduce.TaskReport[0]);
    }

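    // Depending on the fail flag, ask the AM either to fail the attempt
    // (which counts toward the task's allowed attempts) or to kill it
    // (which does not).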
    public boolean killTask(TaskAttemptID taskAttemptID, boolean fail) throws IOException {
        org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId attemptID = TypeConverter.toYarn(taskAttemptID);
        if (fail) {
            FailTaskAttemptRequest failRequest = recordFactory.newRecordInstance(FailTaskAttemptRequest.class);
            failRequest.setTaskAttemptId(attemptID);
            invoke("failTaskAttempt", FailTaskAttemptRequest.class, failRequest);
        } else {
            KillTaskAttemptRequest killRequest = recordFactory.newRecordInstance(KillTaskAttemptRequest.class);
            killRequest.setTaskAttemptId(attemptID);
            invoke("killTaskAttempt", KillTaskAttemptRequest.class, killRequest);
        }
        return true;
    }

    public boolean killJob(JobID oldJobID) throws IOException {
        org.apache.hadoop.mapreduce.v2.api.records.JobId jobId = TypeConverter.toYarn(oldJobID);
        KillJobRequest killRequest = recordFactory.newRecordInstance(KillJobRequest.class);
        killRequest.setJobId(jobId);
        invoke("killJob", KillJobRequest.class, killRequest);
        return true;
    }

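    // Locate the logs for a finished job: from the task attempt report when
    // an attempt id is supplied, otherwise from the job's last AM attempt.
    // Only meaningful once the job has reached a terminal state.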
    public LogParams getLogFilePath(JobID oldJobID, TaskAttemptID oldTaskAttemptID) throws IOException {
        org.apache.hadoop.mapreduce.v2.api.records.JobId jobId = TypeConverter.toYarn(oldJobID);
        GetJobReportRequest request = recordFactory.newRecordInstance(GetJobReportRequest.class);
        request.setJobId(jobId);

        JobReport report = ((GetJobReportResponse) invoke("getJobReport", GetJobReportRequest.class, request))
                .getJobReport();
        if (EnumSet.of(JobState.SUCCEEDED, JobState.FAILED, JobState.KILLED, JobState.ERROR)
                .contains(report.getJobState())) {
            if (oldTaskAttemptID != null) {
                GetTaskAttemptReportRequest taRequest = recordFactory
                        .newRecordInstance(GetTaskAttemptReportRequest.class);
                taRequest.setTaskAttemptId(TypeConverter.toYarn(oldTaskAttemptID));
                TaskAttemptReport taReport = ((GetTaskAttemptReportResponse) invoke("getTaskAttemptReport",
                        GetTaskAttemptReportRequest.class, taRequest)).getTaskAttemptReport();
                if (taReport.getContainerId() == null || taReport.getNodeManagerHost() == null) {
                    throw new IOException("Unable to get log information for task: " + oldTaskAttemptID);
                }
                return new LogParams(taReport.getContainerId().toString(),
                        taReport.getContainerId().getApplicationAttemptId().getApplicationId().toString(),
                        NodeId.newInstance(taReport.getNodeManagerHost(), taReport.getNodeManagerPort()).toString(),
                        report.getUser());
            } else {
                if (report.getAMInfos() == null || report.getAMInfos().size() == 0) {
                    throw new IOException("Unable to get log information for job: " + oldJobID);
                }
                AMInfo amInfo = report.getAMInfos().get(report.getAMInfos().size() - 1);
                return new LogParams(amInfo.getContainerId().toString(),
                        amInfo.getAppAttemptId().getApplicationId().toString(),
                        NodeId.newInstance(amInfo.getNodeManagerHost(), amInfo.getNodeManagerPort()).toString(),
                        report.getUser());
            }
        } else {
            throw new IOException("Cannot get log path for an in-progress job");
        }
    }

    public void close() throws IOException {
        if (rm != null) {
            rm.close();
        }

        if (historyServerProxy != null) {
            RPC.stopProxy(historyServerProxy);
        }

        if (realProxy != null) {
            RPC.stopProxy(realProxy);
            realProxy = null;
        }
    }
}