org.apache.flink.yarn.YarnClusterClient.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.flink.yarn.YarnClusterClient.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.yarn;

import akka.actor.ActorRef;

import akka.actor.Props;
import akka.pattern.Patterns;
import akka.util.Timeout;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobSubmissionResult;
import org.apache.flink.client.program.ClusterClient;
import org.apache.flink.client.program.ProgramInvocationException;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.messages.GetClusterStatus;
import org.apache.flink.runtime.clusterframework.messages.GetClusterStatusResponse;
import org.apache.flink.runtime.clusterframework.messages.InfoMessage;
import org.apache.flink.runtime.clusterframework.messages.ShutdownClusterAfterJob;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.util.LeaderRetrievalUtils;
import org.apache.flink.util.Preconditions;
import org.apache.flink.yarn.cli.FlinkYarnSessionCli;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.service.Service;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Java representation of a running Flink cluster within YARN.
 */
public class YarnClusterClient extends ClusterClient {

    private static final Logger LOG = LoggerFactory.getLogger(YarnClusterClient.class);

    /** Pause between two application report requests of the polling thread, in milliseconds. */
    private static final int POLLING_THREAD_INTERVAL_MS = 1000;

    /** Client used to talk to YARN; stopped and reset to {@code null} when the cluster is shut down. */
    private YarnClient yarnClient;

    /** JVM shutdown hook for this client; never reassigned, hence final. */
    private final Thread clientShutdownHook = new ClientShutdownHook();

    /** Thread that periodically fetches the YARN application report; assigned once in the constructor. */
    private final PollingThread pollingRunner;

    /** Hadoop configuration obtained from the YARN client. */
    private final Configuration hadoopConfig;

    // (HDFS) location of the files required to run on YARN. Needed here to delete them on shutdown.
    private final Path sessionFilesDir;

    //---------- Class internal fields -------------------

    private final AbstractYarnClusterDescriptor clusterDescriptor;

    /** Lazily creates the local ApplicationClient actor on first use. */
    private final LazApplicationClientLoader applicationClient;

    /** Timeout applied to all ask/await interactions in this class. */
    private final FiniteDuration akkaDuration;

    private final ApplicationReport appReport;
    private final ApplicationId appId;
    private final String trackingURL;

    /** Set to {@code false} once {@code disconnect()} has completed. */
    private boolean isConnected = true;

    /** Indicator whether this cluster has just been created. */
    private final boolean newlyCreatedCluster;

    /**
     * Create a new Flink on YARN cluster.
     *
     * @param clusterDescriptor The descriptor used at cluster creation
     * @param yarnClient Client to talk to YARN
     * @param appReport the YARN application ID
     * @param flinkConfig Flink configuration
     * @param sessionFilesDir Location of files required for YARN session
     * @param newlyCreatedCluster Indicator whether this cluster has just been created
     * @throws IOException
     * @throws YarnException
     */
    public YarnClusterClient(final AbstractYarnClusterDescriptor clusterDescriptor, final YarnClient yarnClient,
            final ApplicationReport appReport, org.apache.flink.configuration.Configuration flinkConfig,
            Path sessionFilesDir, boolean newlyCreatedCluster) throws IOException, YarnException {

        super(flinkConfig);

        this.akkaDuration = AkkaUtils.getTimeout(flinkConfig);
        this.clusterDescriptor = clusterDescriptor;
        this.yarnClient = yarnClient;
        this.hadoopConfig = yarnClient.getConfig();
        this.sessionFilesDir = sessionFilesDir;
        this.appReport = appReport;
        this.appId = appReport.getApplicationId();
        this.trackingURL = appReport.getTrackingUrl();
        this.newlyCreatedCluster = newlyCreatedCluster;

        this.applicationClient = new LazApplicationClientLoader(flinkConfig, actorSystemLoader);

        this.pollingRunner = new PollingThread(yarnClient, appId);
        this.pollingRunner.setDaemon(true);
        this.pollingRunner.start();

        Runtime.getRuntime().addShutdownHook(clientShutdownHook);
    }

    /**
     * Detaches this client from the Yarn cluster without sending a shutdown request to it.
     *
     * <p>The call returns immediately if the shutdown flag was already set. Otherwise the JVM
     * shutdown hook is removed, the polling thread is asked to stop (waiting at most one second
     * for it to finish) and the client is marked as disconnected.
     *
     * @throws IllegalStateException if the client is not connected
     */
    public void disconnect() {

        // flip the flag first; whoever finds it already set has nothing left to do
        final boolean alreadyShutDown = hasBeenShutDown.getAndSet(true);
        if (alreadyShutDown) {
            return;
        }

        if (!isConnected) {
            throw new IllegalStateException("Can not disconnect from an unconnected cluster.");
        }

        LOG.info("Disconnecting YarnClusterClient from ApplicationMaster");

        try {
            Runtime.getRuntime().removeShutdownHook(clientShutdownHook);
        } catch (IllegalStateException ignored) {
            // we are already in the shutdown hook
        }

        pollingRunner.stopRunner();
        try {
            pollingRunner.join(1000);
        } catch (InterruptedException interrupted) {
            LOG.warn("Shutdown of the polling runner was interrupted", interrupted);
            Thread.currentThread().interrupt();
        }

        isConnected = false;
    }

    // -------------------------- Interaction with the cluster ------------------------

    /**
     * Tells the cluster to monitor the status of the given job and to stop itself once that job
     * has finished. Blocks until the request has been answered or the Akka timeout has elapsed.
     *
     * @param jobID id of the job after which the cluster should shut down; must not be null
     * @throws RuntimeException if the request could not be sent or was not answered in time
     */
    private void stopAfterJob(JobID jobID) {
        Preconditions.checkNotNull(jobID, "The job id must not be null");
        try {
            Future<Object> replyFuture = getJobManagerGateway().ask(new ShutdownClusterAfterJob(jobID),
                    akkaDuration);
            Await.ready(replyFuture, akkaDuration);
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // keep the interruption visible to callers even though it is wrapped below
                Thread.currentThread().interrupt();
            }
            throw new RuntimeException(
                    "Unable to tell application master to stop once the specified job has been finished", e);
        }
    }

    /**
     * Returns the Flink configuration this client holds.
     *
     * @return the Flink configuration
     */
    @Override
    public org.apache.flink.configuration.Configuration getFlinkConfiguration() {
        return this.flinkConfig;
    }

    /**
     * Computes the total number of slots as TaskManager count times slots per TaskManager,
     * both taken from the cluster descriptor.
     *
     * @return the product if it is positive, otherwise -1
     */
    @Override
    public int getMaxSlots() {
        final int taskManagers = clusterDescriptor.getTaskManagerCount();
        final int slotsPerTaskManager = clusterDescriptor.getTaskManagerSlots();
        final int totalSlots = taskManagers * slotsPerTaskManager;

        if (totalSlots > 0) {
            return totalSlots;
        }
        return -1;
    }

    /**
     * Delegates to the cluster descriptor to decide whether it has the given user jar files.
     *
     * @param userJarFiles URLs of the user jar files to check
     * @return the answer of the cluster descriptor
     */
    @Override
    public boolean hasUserJarsInClassPath(List<URL> userJarFiles) {
        final boolean knownToDescriptor = clusterDescriptor.hasUserJarFiles(userJarFiles);
        return knownToDescriptor;
    }

    /**
     * Submits the job either attached or detached, depending on {@link #isDetached()}.
     *
     * <p>In detached mode on a newly created cluster, the cluster is first told to stop itself
     * once the job has finished.
     *
     * @param jobGraph the job to submit
     * @param classLoader class loader passed on to the superclass submission methods
     * @return the result of the submission
     * @throws ProgramInvocationException declared by the signature; thrown by the superclass submission
     */
    @Override
    protected JobSubmissionResult submitJob(JobGraph jobGraph, ClassLoader classLoader)
            throws ProgramInvocationException {
        // attached execution: nothing YARN specific to prepare
        if (!isDetached()) {
            return super.run(jobGraph, classLoader);
        }

        if (newlyCreatedCluster) {
            stopAfterJob(jobGraph.getJobID());
        }
        return super.runDetached(jobGraph, classLoader);
    }

    /**
     * Returns the tracking URL of the YARN application with a scheme in front.
     *
     * <p>The tracking URL is returned unchanged if it already starts with {@code http://} or
     * {@code https://}; otherwise {@code http://} is prepended.
     *
     * @return the URL of the web interface
     */
    @Override
    public String getWebInterfaceURL() {
        // there seems to be a difference between HD 2.2.0 and 2.6.0
        if (trackingURL.startsWith("http://") || trackingURL.startsWith("https://")) {
            // already absolute; prefixing again would yield "http://https://..."
            return trackingURL;
        }
        return "http://" + trackingURL;
    }

    /**
     * Builds a human readable identifier from the application id found in the application report.
     *
     * @return a description of this cluster containing its YARN application id
     */
    @Override
    public String getClusterIdentifier() {
        final ApplicationId applicationId = appReport.getApplicationId();
        return "Yarn cluster with application id " + applicationId;
    }

    /**
     * Asks the JobManager for the current cluster status and waits for the answer.
     *
     * <p>This method is only available if the cluster hasn't been started in detached mode.
     *
     * @return the status response received from the JobManager gateway
     * @throws IllegalStateException if this client has been disconnected or shut down
     * @throws RuntimeException if the request fails or is not answered within the Akka timeout
     */
    @Override
    public GetClusterStatusResponse getClusterStatus() {
        if (!isConnected) {
            throw new IllegalStateException("The client is not connected to the cluster.");
        }
        if (hasBeenShutdown()) {
            throw new IllegalStateException("The cluster has already been shutdown.");
        }

        try {
            final Future<Object> clusterStatusOption = getJobManagerGateway().ask(GetClusterStatus.getInstance(),
                    akkaDuration);
            return (GetClusterStatusResponse) Await.result(clusterStatusOption, akkaDuration);
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // keep the interruption visible to callers even though it is wrapped below
                Thread.currentThread().interrupt();
            }
            throw new RuntimeException("Unable to get ClusterClient status from Application Client", e);
        }
    }

    /**
     * Derives the application status from the last report fetched by the polling thread.
     *
     * <p>YARN states {@code FAILED} and {@code KILLED} map to {@link ApplicationStatus#FAILED};
     * every other state maps to {@link ApplicationStatus#SUCCEEDED}. If no report is available
     * yet, {@link ApplicationStatus#UNKNOWN} is returned.
     *
     * @return the status derived from the last YARN application report
     * @throws IllegalStateException if the client is not connected
     */
    public ApplicationStatus getApplicationStatus() {
        if (!isConnected) {
            throw new IllegalStateException("The cluster has not been connected to the ApplicationMaster.");
        }

        ApplicationReport lastReport = null;
        if (pollingRunner == null) {
            LOG.warn("YarnClusterClient.getApplicationStatus() has been called on an uninitialized cluster. "
                    + "The system might be in an erroneous state");
        } else {
            lastReport = pollingRunner.getLastReport();
        }

        if (lastReport == null) {
            LOG.warn(
                    "YarnClusterClient.getApplicationStatus() has been called on a cluster that didn't receive a status so far. "
                            + "The system might be in an erroneous state");
            return ApplicationStatus.UNKNOWN;
        }

        final YarnApplicationState appState = lastReport.getYarnApplicationState();
        final boolean failed = appState == YarnApplicationState.FAILED
                || appState == YarnApplicationState.KILLED;
        if (!failed) {
            return ApplicationStatus.SUCCEEDED;
        }

        LOG.warn("YARN reported application state {}", appState);
        LOG.warn("Diagnostics: {}", lastReport.getDiagnostics());
        return ApplicationStatus.FAILED;
    }

    /**
     * Drains the messages currently held by the local ApplicationClient actor.
     *
     * <p>The actor is asked repeatedly until it answers with an empty option or a request fails;
     * a failed request is logged and ends the loop, returning what was collected so far.
     *
     * @return the collected messages, each formatted as {@code "[date] message"}; possibly empty
     * @throws RuntimeException if the client has been shut down or the actor replies with
     *         something that is not a scala {@code Option}
     * @throws IllegalStateException if the client is not connected
     */
    @Override
    public List<String> getNewMessages() {

        if (hasBeenShutdown()) {
            throw new RuntimeException("The YarnClusterClient has already been stopped");
        }

        if (!isConnected) {
            throw new IllegalStateException("The cluster has not been connected to the ApplicationMaster.");
        }

        List<String> ret = new ArrayList<String>();
        // get messages from ApplicationClient (locally)
        while (true) {
            Object result;
            try {
                Future<Object> response = Patterns.ask(applicationClient.get(),
                        YarnMessages.getLocalGetYarnMessage(), new Timeout(akkaDuration));
                result = Await.result(response, akkaDuration);
            } catch (Exception ioe) {
                LOG.warn("Error retrieving the YARN messages locally", ioe);
                break;
            }

            if (!(result instanceof Option)) {
                // a null reply also ends up here; avoid dereferencing it for the message
                throw new RuntimeException("LocalGetYarnMessage requires a response of type "
                        + "Option. Instead the response is of type "
                        + (result == null ? null : result.getClass()) + ".");
            }

            Option<?> messageOption = (Option<?>) result;
            LOG.debug("Received message option {}", messageOption);
            if (messageOption.isEmpty()) {
                break;
            }

            Object obj = messageOption.get();
            if (obj instanceof InfoMessage) {
                InfoMessage msg = (InfoMessage) obj;
                ret.add("[" + msg.date() + "] " + msg.message());
            } else {
                LOG.warn("LocalGetYarnMessage returned unexpected type: {}", messageOption);
            }
        }
        return ret;
    }

    // -------------------------- Shutdown handling ------------------------

    /** Set by the first call to {@code disconnect()} or {@code shutdownCluster()}; later calls return early. */
    private final AtomicBoolean hasBeenShutDown = new AtomicBoolean(false);

    /**
     * Shuts down or disconnects from the YARN cluster.
     *
     * <p>The cluster is shut down only if this client is attached and created the cluster itself;
     * in every other case the client merely disconnects.
     */
    @Override
    public void finalizeCluster() {
        final boolean ownsAttachedCluster = !isDetached() && newlyCreatedCluster;
        if (ownsAttachedCluster) {
            shutdownCluster();
        } else {
            disconnect();
        }
    }

    /**
     * Shuts down the Yarn application.
     *
     * <p>The call returns immediately if the shutdown flag was already set. Otherwise it removes
     * the JVM shutdown hook, sends a stop request to the local ApplicationClient actor, deletes
     * the Yarn properties file and the session files, stops the polling thread, logs the final
     * application report and finally stops the YARN client. Failures of the individual steps
     * are logged and do not abort the remaining steps.
     *
     * @throws IllegalStateException if the client is not connected
     */
    public void shutdownCluster() {

        if (hasBeenShutDown.getAndSet(true)) {
            return;
        }

        if (!isConnected) {
            throw new IllegalStateException("The cluster has not been connected to the ApplicationMaster.");
        }

        try {
            Runtime.getRuntime().removeShutdownHook(clientShutdownHook);
        } catch (IllegalStateException ignored) {
            // we are already in the shutdown hook
        }

        LOG.info("Sending shutdown request to the Application Master");
        try {
            Future<Object> response = Patterns.ask(applicationClient.get(), new YarnMessages.LocalStopYarnSession(
                    getApplicationStatus(), "Flink YARN Client requested shutdown"), new Timeout(akkaDuration));
            Await.ready(response, akkaDuration);
        } catch (Exception e) {
            LOG.warn("Error while stopping YARN cluster.", e);
        }

        deleteYarnPropertiesFile();
        deleteSessionFiles();

        try {
            pollingRunner.stopRunner();
            pollingRunner.join(1000);
        } catch (InterruptedException e) {
            LOG.warn("Shutdown of the polling runner was interrupted", e);
            Thread.currentThread().interrupt();
        }

        logFinalApplicationReport();

        LOG.info("YARN Client is shutting down");
        yarnClient.stop(); // actorRunner is using the yarnClient.
        yarnClient = null; // set null to clearly see if somebody wants to access it afterwards.
    }

    /** Deletes the local Yarn properties file if it exists; failures are only logged. */
    private void deleteYarnPropertiesFile() {
        try {
            File propertiesFile = FlinkYarnSessionCli.getYarnPropertiesLocation(flinkConfig);
            if (propertiesFile.isFile()) {
                if (propertiesFile.delete()) {
                    LOG.info("Deleted Yarn properties file at {}", propertiesFile.getAbsoluteFile().toString());
                } else {
                    LOG.warn("Couldn't delete Yarn properties file at {}",
                            propertiesFile.getAbsoluteFile().toString());
                }
            }
        } catch (Exception e) {
            LOG.warn("Exception while deleting the JobManager address file", e);
        }
    }

    /** Recursively deletes the session files directory, if one was set; failures are only logged. */
    private void deleteSessionFiles() {
        if (sessionFilesDir == null) {
            LOG.warn("Session file directory not set. Not deleting session files");
            return;
        }

        LOG.info("Deleting files in " + sessionFilesDir);
        // try-with-resources closes the file system even when the delete fails
        try (FileSystem shutFS = FileSystem.get(hadoopConfig)) {
            shutFS.delete(sessionFilesDir, true); // delete conf and jar file.
        } catch (IOException e) {
            LOG.error("Could not delete the Flink jar and configuration files in HDFS..", e);
        }
    }

    /** Fetches the application report once more and logs its final state; failures are only logged. */
    private void logFinalApplicationReport() {
        try {
            ApplicationReport finalReport = yarnClient.getApplicationReport(appId);

            LOG.info("Application " + appId + " finished with state " + finalReport.getYarnApplicationState()
                    + " and final state " + finalReport.getFinalApplicationStatus() + " at "
                    + finalReport.getFinishTime());

            if (finalReport.getYarnApplicationState() == YarnApplicationState.FAILED
                    || finalReport.getYarnApplicationState() == YarnApplicationState.KILLED) {
                LOG.warn("Application failed. Diagnostics " + finalReport.getDiagnostics());
                LOG.warn("If log aggregation is activated in the Hadoop cluster, we recommend to retrieve "
                        + "the full application log using this command:" + System.lineSeparator()
                        + "\tyarn logs -applicationId " + finalReport.getApplicationId() + System.lineSeparator()
                        + "(It sometimes takes a few seconds until the logs are aggregated)");
            }
        } catch (Exception e) {
            LOG.warn("Couldn't get final report", e);
        }
    }

    /**
     * Reports whether the shutdown flag has been set, which happens on the first call to
     * {@link #disconnect()} or {@link #shutdownCluster()}.
     *
     * @return the current value of the shutdown flag
     */
    public boolean hasBeenShutdown() {
        final boolean shutDown = hasBeenShutDown.get();
        return shutDown;
    }

    /**
     * Thread registered as JVM shutdown hook; it logs a message and then calls
     * {@code shutdown()} on the enclosing client.
     */
    private class ClientShutdownHook extends Thread {
        @Override
        public void run() {
            LOG.info("Shutting down YarnClusterClient from the client shutdown hook");
            // shutdown() is a member of the enclosing client, not of Thread
            YarnClusterClient.this.shutdown();
        }
    }

    // -------------------------- Polling ------------------------

    /**
     * Thread that fetches the application report from YARN in a loop, pausing
     * {@code POLLING_THREAD_INTERVAL_MS} milliseconds between requests, and keeps the most
     * recent report available via {@link #getLastReport()}.
     *
     * <p>The loop ends when {@link #stopRunner()} has been called, when the thread is
     * interrupted while sleeping, or when the YARN client is no longer in state STARTED.
     */
    private static class PollingThread extends Thread {

        private final AtomicBoolean running = new AtomicBoolean(true);
        private final YarnClient yarnClient;
        private final ApplicationId appId;

        // ------- status information stored in the polling thread
        private final Object lock = new Object();
        private ApplicationReport lastReport;

        public PollingThread(YarnClient yarnClient, ApplicationId appId) {
            this.yarnClient = yarnClient;
            this.appId = appId;
        }

        /**
         * Asks the polling loop to end after its current iteration. Logs a warning if the
         * thread had already been asked to stop.
         */
        public void stopRunner() {
            // single atomic step: two concurrent callers cannot both observe "still running"
            if (!running.getAndSet(false)) {
                LOG.warn("Polling thread was already stopped");
            }
        }

        /**
         * @return the most recently fetched application report, or {@code null} if none has
         *         been fetched yet
         */
        public ApplicationReport getLastReport() {
            synchronized (lock) {
                return lastReport;
            }
        }

        @Override
        public void run() {
            while (running.get() && yarnClient.isInState(Service.STATE.STARTED)) {
                try {
                    ApplicationReport report = yarnClient.getApplicationReport(appId);
                    synchronized (lock) {
                        lastReport = report;
                    }
                } catch (Exception e) {
                    // a failed request is not fatal; the next iteration simply tries again
                    LOG.warn("Error while getting application report", e);
                }
                try {
                    Thread.sleep(YarnClusterClient.POLLING_THREAD_INTERVAL_MS);
                } catch (InterruptedException e) {
                    LOG.error("Polling thread got interrupted", e);
                    Thread.currentThread().interrupt(); // pass interrupt.
                    stopRunner();
                }
            }
            if (running.get() && !yarnClient.isInState(Service.STATE.STARTED)) {
                // == if the polling thread is still running but the yarn client is stopped.
                LOG.warn("YARN client is unexpected in state " + yarnClient.getServiceState());
            }
        }
    }

    /**
     * Reports detached mode if either the superclass or the cluster descriptor says so.
     *
     * @return {@code true} if the client or the cluster descriptor is in detached mode
     */
    @Override
    public boolean isDetached() {
        if (super.isDetached()) {
            return true;
        }
        return clusterDescriptor.isDetachedMode();
    }

    /**
     * Blocks until all TaskManagers are connected to the JobManager.
     *
     * <p>The cluster status is requested every 250 milliseconds. Each time it differs from the
     * previous one, the number of registered TaskManagers is printed; the method returns once
     * that number reaches the TaskManager count of the cluster descriptor.
     *
     * @throws RuntimeException if the thread is interrupted while waiting; the interrupt flag
     *         is restored before the exception is thrown
     */
    @Override
    public void waitForClusterToBeReady() {
        logAndSysout("Waiting until all TaskManagers have connected");

        GetClusterStatusResponse lastStatus = null;
        while (true) {
            final GetClusterStatusResponse currentStatus = getClusterStatus();

            if (currentStatus != null && !currentStatus.equals(lastStatus)) {
                logAndSysout("TaskManager status (" + currentStatus.numRegisteredTaskManagers() + "/"
                        + clusterDescriptor.getTaskManagerCount() + ")");
                if (currentStatus.numRegisteredTaskManagers() >= clusterDescriptor.getTaskManagerCount()) {
                    logAndSysout("All TaskManagers are connected");
                    return;
                }
            } else if (lastStatus == null) {
                logAndSysout("No status updates from the YARN cluster received so far. Waiting ...");
            }

            try {
                Thread.sleep(250);
            } catch (InterruptedException e) {
                // wrapping must not hide the interruption from code further up the stack
                Thread.currentThread().interrupt();
                throw new RuntimeException("Interrupted while waiting for TaskManagers", e);
            }

            lastStatus = currentStatus;
        }
    }

    /**
     * Returns the application id read from the application report at construction time.
     *
     * @return the YARN application id of this cluster
     */
    public ApplicationId getApplicationId() {
        return this.appId;
    }

    /**
     * Holder that creates the ApplicationClient actor on the first call to {@link #get()} and
     * hands out the same reference on every later call.
     */
    private static class LazApplicationClientLoader {

        private final org.apache.flink.configuration.Configuration flinkConfig;
        private final LazyActorSystemLoader actorSystemLoader;

        /** Reference to the actor; {@code null} until {@link #get()} has created it. */
        private ActorRef applicationClient;

        private LazApplicationClientLoader(org.apache.flink.configuration.Configuration flinkConfig,
                LazyActorSystemLoader actorSystemLoader) {
            this.flinkConfig = flinkConfig;
            this.actorSystemLoader = actorSystemLoader;
        }

        /**
         * Creates a new ApplicationClient actor or returns an existing one. May start an ActorSystem.
         *
         * @return reference to the ApplicationClient actor
         * @throws RuntimeException if the leader retrieval service cannot be created
         */
        public ActorRef get() {
            // fast path: the actor has been created by an earlier call
            if (applicationClient != null) {
                return applicationClient;
            }

            // the leader retrieval service for connecting to the cluster and finding the active leader
            final LeaderRetrievalService retrievalService;
            try {
                retrievalService = LeaderRetrievalUtils.createLeaderRetrievalService(flinkConfig);
            } catch (Exception e) {
                throw new RuntimeException("Could not create the leader retrieval service.", e);
            }

            // start application client
            LOG.info("Start application client.");

            applicationClient = actorSystemLoader.get().actorOf(
                    Props.create(ApplicationClient.class, flinkConfig, retrievalService),
                    "applicationClient");
            return applicationClient;
        }
    }
}