org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherImpl.java

Source

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.mapreduce.v2.app.launcher;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import io.hops.security.HopsUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.mapred.ShuffleHandler;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
import org.apache.hadoop.mapreduce.v2.app.AppContext;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptContainerLaunchedEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptDiagnosticsUpdateEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.net.HopsSSLSocketFactory;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.ssl.JWTSecurityMaterial;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.protocolrecords.SignalContainerRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse;
import org.apache.hadoop.yarn.api.protocolrecords.StopContainersRequest;
import org.apache.hadoop.yarn.api.protocolrecords.StopContainersResponse;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.SignalContainerCommand;
import org.apache.hadoop.yarn.client.api.impl.ContainerManagementProtocolProxy;
import org.apache.hadoop.yarn.client.api.impl.ContainerManagementProtocolProxy.ContainerManagementProtocolProxyData;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;

import com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * This class is responsible for launching of containers.
 */
public class ContainerLauncherImpl extends AbstractService implements ContainerLauncher {

    static final Log LOG = LogFactory.getLog(ContainerLauncherImpl.class);

    private ConcurrentHashMap<ContainerId, Container> containers = new ConcurrentHashMap<ContainerId, Container>();
    private final AppContext context;
    protected ThreadPoolExecutor launcherPool;
    protected int initialPoolSize;
    private int limitOnPoolSize;
    private Thread eventHandlingThread;
    protected BlockingQueue<ContainerLauncherEvent> eventQueue = new LinkedBlockingQueue<ContainerLauncherEvent>();
    private final AtomicBoolean stopped;
    private ContainerManagementProtocolProxy cmProxy;

    private String certificatePassword = null;

    private Container getContainer(ContainerLauncherEvent event) {
        ContainerId id = event.getContainerID();
        Container c = containers.get(id);
        if (c == null) {
            c = new Container(event.getTaskAttemptID(), event.getContainerID(), event.getContainerMgrAddress());
            Container old = containers.putIfAbsent(id, c);
            if (old != null) {
                c = old;
            }
        }
        return c;
    }

    private void removeContainerIfDone(ContainerId id) {
        Container c = containers.get(id);
        if (c != null && c.isCompletelyDone()) {
            containers.remove(id);
        }
    }

    private static enum ContainerState {
        PREP, FAILED, RUNNING, DONE, KILLED_BEFORE_LAUNCH
    }

    private class Container {
        private ContainerState state;
        // store enough information to be able to cleanup the container
        private TaskAttemptId taskAttemptID;
        private ContainerId containerID;
        final private String containerMgrAddress;

        public Container(TaskAttemptId taId, ContainerId containerID, String containerMgrAddress) {
            this.state = ContainerState.PREP;
            this.taskAttemptID = taId;
            this.containerMgrAddress = containerMgrAddress;
            this.containerID = containerID;
        }

        public synchronized boolean isCompletelyDone() {
            return state == ContainerState.DONE || state == ContainerState.FAILED;
        }

        public synchronized void done() {
            state = ContainerState.DONE;
        }

        @SuppressWarnings("unchecked")
        public synchronized void launch(ContainerRemoteLaunchEvent event) {
            LOG.info("Launching " + taskAttemptID);
            if (this.state == ContainerState.KILLED_BEFORE_LAUNCH) {
                state = ContainerState.DONE;
                sendContainerLaunchFailedMsg(taskAttemptID, "Container was killed before it was launched");
                return;
            }

            ContainerManagementProtocolProxyData proxy = null;
            try {

                proxy = getCMProxy(containerMgrAddress, containerID);

                // Construct the actual Container
                ContainerLaunchContext containerLaunchContext = event.getContainerLaunchContext();

                // Now launch the actual container
                StartContainerRequest startRequest = StartContainerRequest.newInstance(containerLaunchContext,
                        event.getContainerToken());
                List<StartContainerRequest> list = new ArrayList<StartContainerRequest>();
                list.add(startRequest);
                StartContainersRequest requestList = StartContainersRequest.newInstance(list);

                setupCryptoMaterial(requestList);

                StartContainersResponse response = proxy.getContainerManagementProtocol()
                        .startContainers(requestList);
                if (response.getFailedRequests() != null && response.getFailedRequests().containsKey(containerID)) {
                    throw response.getFailedRequests().get(containerID).deSerialize();
                }
                ByteBuffer portInfo = response.getAllServicesMetaData()
                        .get(ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID);
                int port = -1;
                if (portInfo != null) {
                    port = ShuffleHandler.deserializeMetaData(portInfo);
                }
                LOG.info("Shuffle port returned by ContainerManager for " + taskAttemptID + " : " + port);

                if (port < 0) {
                    this.state = ContainerState.FAILED;
                    throw new IllegalStateException(
                            "Invalid shuffle port number " + port + " returned for " + taskAttemptID);
                }

                // after launching, send launched event to task attempt to move
                // it from ASSIGNED to RUNNING state
                context.getEventHandler().handle(new TaskAttemptContainerLaunchedEvent(taskAttemptID, port));
                this.state = ContainerState.RUNNING;
            } catch (Throwable t) {
                String message = "Container launch failed for " + containerID + " : "
                        + StringUtils.stringifyException(t);
                this.state = ContainerState.FAILED;
                sendContainerLaunchFailedMsg(taskAttemptID, message);
            } finally {
                if (proxy != null) {
                    cmProxy.mayBeCloseProxy(proxy);
                }
            }
        }

        public void kill() {
            kill(false);
        }

        private void setupCryptoMaterial(StartContainersRequest request) throws IOException {
            if (getConfig().getBoolean(CommonConfigurationKeys.IPC_SERVER_SSL_ENABLED,
                    CommonConfigurationKeys.IPC_SERVER_SSL_ENABLED_DEFAULT)) {
                Path kStorePath = Paths.get(HopsSSLSocketFactory.LOCALIZED_KEYSTORE_FILE_NAME);
                Path tStorePath = Paths.get(HopsSSLSocketFactory.LOCALIZED_TRUSTSTORE_FILE_NAME);
                Path passwdPath = Paths.get(HopsSSLSocketFactory.LOCALIZED_PASSWD_FILE_NAME);

                ByteBuffer kStore = ByteBuffer.wrap(Files.readAllBytes(kStorePath));
                ByteBuffer tStore = ByteBuffer.wrap(Files.readAllBytes(tStorePath));
                String password = readCryptoMaterialPassword(passwdPath.toFile());

                request.setKeyStore(kStore);
                request.setKeyStorePassword(password);
                request.setTrustStore(tStore);
                request.setTrustStorePassword(password);
            }

            if (getConfig().getBoolean(YarnConfiguration.RM_JWT_ENABLED,
                    YarnConfiguration.DEFAULT_RM_JWT_ENABLED)) {
                Path path2jwt = Paths.get(JWTSecurityMaterial.JWT_LOCAL_RESOURCE_FILE);
                String jwt = FileUtils.readFileToString(path2jwt.toFile()).trim();
                request.setJWT(jwt);
            }
        }

        private String readCryptoMaterialPassword(File passwdFile) throws IOException {
            if (null != certificatePassword) {
                return certificatePassword;
            }

            certificatePassword = HopsUtil.readCryptoMaterialPassword(passwdFile);
            return certificatePassword;
        }

        @SuppressWarnings("unchecked")
        public synchronized void kill(boolean dumpThreads) {

            if (this.state == ContainerState.PREP) {
                this.state = ContainerState.KILLED_BEFORE_LAUNCH;
            } else if (!isCompletelyDone()) {
                LOG.info("KILLING " + taskAttemptID);

                ContainerManagementProtocolProxyData proxy = null;
                try {
                    proxy = getCMProxy(this.containerMgrAddress, this.containerID);

                    if (dumpThreads) {
                        final SignalContainerRequest request = SignalContainerRequest.newInstance(containerID,
                                SignalContainerCommand.OUTPUT_THREAD_DUMP);
                        proxy.getContainerManagementProtocol().signalToContainer(request);
                    }

                    // kill the remote container if already launched
                    List<ContainerId> ids = new ArrayList<ContainerId>();
                    ids.add(this.containerID);
                    StopContainersRequest request = StopContainersRequest.newInstance(ids);
                    StopContainersResponse response = proxy.getContainerManagementProtocol()
                            .stopContainers(request);
                    if (response.getFailedRequests() != null
                            && response.getFailedRequests().containsKey(this.containerID)) {
                        throw response.getFailedRequests().get(this.containerID).deSerialize();
                    }
                } catch (Throwable t) {
                    // ignore the cleanup failure
                    String message = "cleanup failed for container " + this.containerID + " : "
                            + StringUtils.stringifyException(t);
                    context.getEventHandler()
                            .handle(new TaskAttemptDiagnosticsUpdateEvent(this.taskAttemptID, message));
                    LOG.warn(message);
                } finally {
                    if (proxy != null) {
                        cmProxy.mayBeCloseProxy(proxy);
                    }
                }
                this.state = ContainerState.DONE;
            }
            // after killing, send killed event to task attempt
            context.getEventHandler()
                    .handle(new TaskAttemptEvent(this.taskAttemptID, TaskAttemptEventType.TA_CONTAINER_CLEANED));
        }
    }

    public ContainerLauncherImpl(AppContext context) {
        super(ContainerLauncherImpl.class.getName());
        this.context = context;
        this.stopped = new AtomicBoolean(false);
    }

    @Override
    protected void serviceInit(Configuration conf) throws Exception {
        this.limitOnPoolSize = conf.getInt(MRJobConfig.MR_AM_CONTAINERLAUNCHER_THREAD_COUNT_LIMIT,
                MRJobConfig.DEFAULT_MR_AM_CONTAINERLAUNCHER_THREAD_COUNT_LIMIT);
        LOG.info("Upper limit on the thread pool size is " + this.limitOnPoolSize);

        this.initialPoolSize = conf.getInt(MRJobConfig.MR_AM_CONTAINERLAUNCHER_THREADPOOL_INITIAL_SIZE,
                MRJobConfig.DEFAULT_MR_AM_CONTAINERLAUNCHER_THREADPOOL_INITIAL_SIZE);
        LOG.info("The thread pool initial size is " + this.initialPoolSize);

        super.serviceInit(conf);
        cmProxy = new ContainerManagementProtocolProxy(conf);
    }

    protected void serviceStart() throws Exception {

        ThreadFactory tf = new ThreadFactoryBuilder().setNameFormat("ContainerLauncher #%d").setDaemon(true)
                .build();

        // Start with a default core-pool size of 10 and change it dynamically.
        launcherPool = new ThreadPoolExecutor(initialPoolSize, Integer.MAX_VALUE, 1, TimeUnit.HOURS,
                new LinkedBlockingQueue<Runnable>(), tf);
        eventHandlingThread = new Thread() {
            @Override
            public void run() {
                ContainerLauncherEvent event = null;
                Set<String> allNodes = new HashSet<String>();

                while (!stopped.get() && !Thread.currentThread().isInterrupted()) {
                    try {
                        event = eventQueue.take();
                    } catch (InterruptedException e) {
                        if (!stopped.get()) {
                            LOG.error("Returning, interrupted : " + e);
                        }
                        return;
                    }
                    allNodes.add(event.getContainerMgrAddress());

                    int poolSize = launcherPool.getCorePoolSize();

                    // See if we need up the pool size only if haven't reached the
                    // maximum limit yet.
                    if (poolSize != limitOnPoolSize) {

                        // nodes where containers will run at *this* point of time. This is
                        // *not* the cluster size and doesn't need to be.
                        int numNodes = allNodes.size();
                        int idealPoolSize = Math.min(limitOnPoolSize, numNodes);

                        if (poolSize < idealPoolSize) {
                            // Bump up the pool size to idealPoolSize+initialPoolSize, the
                            // later is just a buffer so we are not always increasing the
                            // pool-size
                            int newPoolSize = Math.min(limitOnPoolSize, idealPoolSize + initialPoolSize);
                            LOG.info("Setting ContainerLauncher pool size to " + newPoolSize
                                    + " as number-of-nodes to talk to is " + numNodes);
                            launcherPool.setCorePoolSize(newPoolSize);
                        }
                    }

                    // the events from the queue are handled in parallel
                    // using a thread pool
                    launcherPool.execute(createEventProcessor(event));

                    // TODO: Group launching of multiple containers to a single
                    // NodeManager into a single connection
                }
            }
        };
        eventHandlingThread.setName("ContainerLauncher Event Handler");
        eventHandlingThread.start();
        super.serviceStart();
    }

    private void shutdownAllContainers() {
        for (Container ct : this.containers.values()) {
            if (ct != null) {
                ct.kill();
            }
        }
    }

    protected void serviceStop() throws Exception {
        if (stopped.getAndSet(true)) {
            // return if already stopped
            return;
        }
        // shutdown any containers that might be left running
        shutdownAllContainers();
        if (eventHandlingThread != null) {
            eventHandlingThread.interrupt();
        }
        if (launcherPool != null) {
            launcherPool.shutdownNow();
        }
        super.serviceStop();
    }

    protected EventProcessor createEventProcessor(ContainerLauncherEvent event) {
        return new EventProcessor(event);
    }

    /**
     * Setup and start the container on remote nodemanager.
     */
    class EventProcessor implements Runnable {
        private ContainerLauncherEvent event;

        EventProcessor(ContainerLauncherEvent event) {
            this.event = event;
        }

        @Override
        public void run() {
            LOG.info("Processing the event " + event.toString());

            // Load ContainerManager tokens before creating a connection.
            // TODO: Do it only once per NodeManager.
            ContainerId containerID = event.getContainerID();

            Container c = getContainer(event);
            switch (event.getType()) {

            case CONTAINER_REMOTE_LAUNCH:
                ContainerRemoteLaunchEvent launchEvent = (ContainerRemoteLaunchEvent) event;
                c.launch(launchEvent);
                break;

            case CONTAINER_REMOTE_CLEANUP:
                c.kill(event.getDumpContainerThreads());
                break;

            case CONTAINER_COMPLETED:
                c.done();
                break;

            }
            removeContainerIfDone(containerID);
        }
    }

    @SuppressWarnings("unchecked")
    void sendContainerLaunchFailedMsg(TaskAttemptId taskAttemptID, String message) {
        LOG.error(message);
        context.getEventHandler().handle(new TaskAttemptDiagnosticsUpdateEvent(taskAttemptID, message));
        context.getEventHandler()
                .handle(new TaskAttemptEvent(taskAttemptID, TaskAttemptEventType.TA_CONTAINER_LAUNCH_FAILED));
    }

    @Override
    public void handle(ContainerLauncherEvent event) {
        try {
            eventQueue.put(event);
        } catch (InterruptedException e) {
            throw new YarnRuntimeException(e);
        }
    }

    public ContainerManagementProtocolProxy.ContainerManagementProtocolProxyData getCMProxy(
            String containerMgrBindAddr, ContainerId containerId) throws IOException {
        return cmProxy.getProxy(containerMgrBindAddr, containerId);
    }
}