// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.nodemanager.containermanager; import static org.apache.hadoop.service.Service.STATE.STARTED; import java.io.DataInputStream; import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; import java.net.URISyntaxException; import java.nio.ByteBuffer; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import 
org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.ipc.Server; import org.apache.hadoop.net.HopsSSLSocketFactory; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.SaslRpcServer; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.PolicyProvider; import org.apache.hadoop.security.ssl.JWTSecurityMaterial; import org.apache.hadoop.security.ssl.X509SecurityMaterial; import org.apache.hadoop.security.token.SecretManager.InvalidToken; import org.apache.hadoop.security.token.TokenIdentifier; import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.service.Service; import org.apache.hadoop.service.ServiceStateChangeListener; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.api.ContainerManagementProtocol; import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesRequest; import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusesResponse; import org.apache.hadoop.yarn.api.protocolrecords.IncreaseContainersResourceRequest; import org.apache.hadoop.yarn.api.protocolrecords.IncreaseContainersResourceResponse; import org.apache.hadoop.yarn.api.protocolrecords.SignalContainerRequest; import org.apache.hadoop.yarn.api.protocolrecords.SignalContainerResponse; import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest; import org.apache.hadoop.yarn.api.protocolrecords.StartContainersRequest; import org.apache.hadoop.yarn.api.protocolrecords.StartContainersResponse; import org.apache.hadoop.yarn.api.protocolrecords.StopContainersRequest; import org.apache.hadoop.yarn.api.protocolrecords.StopContainersResponse; import org.apache.hadoop.yarn.api.protocolrecords.impl.pb.SignalContainerResponsePBImpl; import org.apache.hadoop.yarn.api.records.ApplicationAccessType; import org.apache.hadoop.yarn.api.records.ApplicationId; import 
org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.LocalResource; import org.apache.hadoop.yarn.api.records.LocalResourceType; import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; import org.apache.hadoop.yarn.api.records.LogAggregationContext; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.SerializedException; import org.apache.hadoop.yarn.api.records.URL; import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationIdPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.LogAggregationContextPBImpl; import org.apache.hadoop.yarn.api.records.impl.pb.ProtoUtils; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.AsyncDispatcher; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.InvalidAuxServiceException; import org.apache.hadoop.yarn.exceptions.InvalidContainerException; import org.apache.hadoop.yarn.exceptions.NMNotYetReadyException; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.ipc.RPCUtil; import org.apache.hadoop.yarn.ipc.YarnRPC; import org.apache.hadoop.yarn.proto.YarnProtos.ApplicationACLMapProto; import org.apache.hadoop.yarn.proto.YarnServerNodemanagerRecoveryProtos.ContainerManagerApplicationProto; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.security.NMTokenIdentifier; import org.apache.hadoop.yarn.server.api.ContainerType; import org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedAppsEvent; import 
org.apache.hadoop.yarn.server.nodemanager.CMgrCompletedContainersEvent; import org.apache.hadoop.yarn.server.nodemanager.CMgrDecreaseContainersResourceEvent; import org.apache.hadoop.yarn.server.nodemanager.CMgrSignalContainersEvent; import org.apache.hadoop.yarn.server.nodemanager.CMgrUpdateJWTEvent; import org.apache.hadoop.yarn.server.nodemanager.CMgrUpdateX509Event; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; import org.apache.hadoop.yarn.server.nodemanager.ContainerManagerEvent; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.DeletionService; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger; import org.apache.hadoop.yarn.server.nodemanager.NMAuditLogger.AuditConstants; import org.apache.hadoop.yarn.server.nodemanager.NodeManager; import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import org.apache.hadoop.yarn.server.nodemanager.amrmproxy.AMRMProxyService; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationContainerInitEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationFinishEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationInitEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEvent; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncherEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.SignalContainersLauncherEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.event.LocalizationEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.sharedcache.SharedCacheUploadService; import org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.LogAggregationService; import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.LogHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.NonAggregatingLogHandler; import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ChangeMonitoringContainerResourceEvent; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorEventType; import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitorImpl; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import 
org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredApplicationsState; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerState; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredContainerStatus; import org.apache.hadoop.yarn.server.nodemanager.security.authorize.NMPolicyProvider; import org.apache.hadoop.yarn.server.security.CertificateLocalizationService; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.utils.YarnServerSecurityUtils; import com.google.common.annotations.VisibleForTesting; import com.google.protobuf.ByteString; import org.apache.hadoop.yarn.util.resource.Resources; public class ContainerManagerImpl extends CompositeService implements ServiceStateChangeListener, ContainerManagementProtocol, EventHandler<ContainerManagerEvent> { /** * Extra duration to wait for applications to be killed on shutdown. */ private static final int SHUTDOWN_CLEANUP_SLOP_MS = 1000; private static final Log LOG = LogFactory.getLog(ContainerManagerImpl.class); static final String INVALID_NMTOKEN_MSG = "Invalid NMToken"; static final String INVALID_CONTAINERTOKEN_MSG = "Invalid ContainerToken"; final Context context; private final ContainersMonitor containersMonitor; private Server server; private final ResourceLocalizationService rsrcLocalizationSrvc; private final ContainersLauncher containersLauncher; private final AuxServices auxiliaryServices; private final NodeManagerMetrics metrics; private final ContainerExecutor exec; private final NodeStatusUpdater nodeStatusUpdater; protected LocalDirsHandlerService dirsHandler; protected final AsyncDispatcher dispatcher; private final DeletionService deletionService; private AtomicBoolean blockNewContainerRequests = new AtomicBoolean(false); private boolean serviceStopped = false; private final ReadLock readLock; private final WriteLock writeLock; private AMRMProxyService 
amrmProxyService; protected boolean amrmProxyEnabled = false; private long waitForContainersOnShutdownMillis; private final ExecutorService cryptoMaterialUpdaterThreadPool; private final Map<ContainerId, Future> x509Updaters = new HashMap<>(); private final Map<ContainerId, Future> jwtUpdaters = new HashMap<>(); public ContainerManagerImpl(Context context, ContainerExecutor exec, DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater, NodeManagerMetrics metrics, LocalDirsHandlerService dirsHandler) { super(ContainerManagerImpl.class.getName()); this.context = context; this.dirsHandler = dirsHandler; // ContainerManager level dispatcher. dispatcher = new AsyncDispatcher(); this.deletionService = deletionContext; this.metrics = metrics; rsrcLocalizationSrvc = createResourceLocalizationService(exec, deletionContext, context); addService(rsrcLocalizationSrvc); containersLauncher = createContainersLauncher(context, exec); addService(containersLauncher); this.exec = exec; this.nodeStatusUpdater = nodeStatusUpdater; // Start configurable services auxiliaryServices = new AuxServices(); auxiliaryServices.registerServiceListener(this); addService(auxiliaryServices); this.containersMonitor = new ContainersMonitorImpl(exec, dispatcher, this.context); addService(this.containersMonitor); dispatcher.register(ContainerEventType.class, new ContainerEventDispatcher()); dispatcher.register(ApplicationEventType.class, new ApplicationEventDispatcher()); dispatcher.register(LocalizationEventType.class, rsrcLocalizationSrvc); dispatcher.register(AuxServicesEventType.class, auxiliaryServices); dispatcher.register(ContainersMonitorEventType.class, containersMonitor); dispatcher.register(ContainersLauncherEventType.class, containersLauncher); addService(dispatcher); ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); this.readLock = lock.readLock(); this.writeLock = lock.writeLock(); this.cryptoMaterialUpdaterThreadPool = Executors.newFixedThreadPool(3, new 
ThreadFactoryBuilder() .setDaemon(true).setNameFormat("Container crypto material updater thread #%d").build()); } @Override public void serviceInit(Configuration conf) throws Exception { LogHandler logHandler = createLogHandler(conf, this.context, this.deletionService); addIfService(logHandler); dispatcher.register(LogHandlerEventType.class, logHandler); // add the shared cache upload service (it will do nothing if the shared // cache is disabled) SharedCacheUploadService sharedCacheUploader = createSharedCacheUploaderService(); addService(sharedCacheUploader); dispatcher.register(SharedCacheUploadEventType.class, sharedCacheUploader); createAMRMProxyService(conf); waitForContainersOnShutdownMillis = conf.getLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, YarnConfiguration.DEFAULT_NM_SLEEP_DELAY_BEFORE_SIGKILL_MS) + conf.getLong(YarnConfiguration.NM_PROCESS_KILL_WAIT_MS, YarnConfiguration.DEFAULT_NM_PROCESS_KILL_WAIT_MS) + SHUTDOWN_CLEANUP_SLOP_MS; super.serviceInit(conf); recover(); } protected void createAMRMProxyService(Configuration conf) { this.amrmProxyEnabled = conf.getBoolean(YarnConfiguration.AMRM_PROXY_ENABLED, YarnConfiguration.DEFAULT_AMRM_PROXY_ENABLED); if (amrmProxyEnabled) { LOG.info("AMRMProxyService is enabled. 
" + "All the AM->RM requests will be intercepted by the proxy"); this.setAMRMProxyService(new AMRMProxyService(this.context, this.dispatcher)); addService(this.getAMRMProxyService()); } else { LOG.info("AMRMProxyService is disabled"); } } @SuppressWarnings("unchecked") private void recover() throws IOException, URISyntaxException { NMStateStoreService stateStore = context.getNMStateStore(); if (stateStore.canRecover()) { rsrcLocalizationSrvc.recoverLocalizedResources(stateStore.loadLocalizationState()); RecoveredApplicationsState appsState = stateStore.loadApplicationsState(); for (ContainerManagerApplicationProto proto : appsState.getApplications()) { recoverApplication(proto); } for (RecoveredContainerState rcs : stateStore.loadContainersState()) { recoverContainer(rcs); } } } private void recoverApplication(ContainerManagerApplicationProto p) throws IOException { ApplicationId appId = new ApplicationIdPBImpl(p.getId()); Credentials creds = new Credentials(); creds.readTokenStorageStream(new DataInputStream(p.getCredentials().newInput())); int cryptoMaterialVersion = -1; long jwtExpiration = -1L; if (isHopsTLSEnabled()) { materializeX509(appId, p.getUser(), p.getUserFolder(), ProtoUtils.convertFromProtoFormat(p.getKeyStore()), p.getKeyStorePassword(), ProtoUtils.convertFromProtoFormat(p.getTrustStore()), p.getTrustStorePassword()); cryptoMaterialVersion = p.getCryptoVersion(); } if (isJWTEnabled()) { materializeJWT(appId, p.getUser(), p.getUserFolder(), p.getJwt()); jwtExpiration = p.getJwtExpiration(); } List<ApplicationACLMapProto> aclProtoList = p.getAclsList(); Map<ApplicationAccessType, String> acls = new HashMap<ApplicationAccessType, String>(aclProtoList.size()); for (ApplicationACLMapProto aclProto : aclProtoList) { acls.put(ProtoUtils.convertFromProtoFormat(aclProto.getAccessType()), aclProto.getAcl()); } LogAggregationContext logAggregationContext = null; if (p.getLogAggregationContext() != null) { logAggregationContext = new 
LogAggregationContextPBImpl(p.getLogAggregationContext()); } LOG.info("Recovering application " + appId); ApplicationImpl app = null; if (isHopsTLSEnabled() || isJWTEnabled()) { app = new ApplicationImpl(dispatcher, p.getUser(), appId, creds, context, p.getUserFolder(), cryptoMaterialVersion, jwtExpiration); } else { app = new ApplicationImpl(dispatcher, p.getUser(), appId, creds, context, p.getUserFolder()); } context.getApplications().put(appId, app); app.handle(new ApplicationInitEvent(appId, acls, logAggregationContext)); } @SuppressWarnings("unchecked") private void recoverContainer(RecoveredContainerState rcs) throws IOException { StartContainerRequest req = rcs.getStartRequest(); ContainerLaunchContext launchContext = req.getContainerLaunchContext(); ContainerTokenIdentifier token = BuilderUtils.newContainerTokenIdentifier(req.getContainerToken()); ContainerId containerId = token.getContainerID(); ApplicationId appId = containerId.getApplicationAttemptId().getApplicationId(); LOG.info("Recovering " + containerId + " in state " + rcs.getStatus() + " with exit code " + rcs.getExitCode()); Application app = context.getApplications().get(appId); if (app != null) { Credentials credentials = YarnServerSecurityUtils.parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), dispatcher, req.getContainerLaunchContext(), credentials, metrics, token, context, rcs); context.getContainers().put(containerId, container); app.handle(new ApplicationContainerInitEvent(container)); } else { if (rcs.getStatus() != RecoveredContainerStatus.COMPLETED) { LOG.warn(containerId + " has no corresponding application!"); } LOG.info("Adding " + containerId + " to recently stopped containers"); nodeStatusUpdater.addCompletedContainer(containerId); } } private void waitForRecoveredContainers() throws InterruptedException { final int sleepMsec = 100; int waitIterations = 100; List<ContainerId> newContainers = new ArrayList<ContainerId>(); while (--waitIterations 
>= 0) { newContainers.clear(); for (Container container : context.getContainers().values()) { if (container .getContainerState() == org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.NEW) { newContainers.add(container.getContainerId()); } } if (newContainers.isEmpty()) { break; } LOG.info("Waiting for containers: " + newContainers); Thread.sleep(sleepMsec); } if (waitIterations < 0) { LOG.warn("Timeout waiting for recovered containers"); } } protected LogHandler createLogHandler(Configuration conf, Context context, DeletionService deletionService) { if (conf.getBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, YarnConfiguration.DEFAULT_LOG_AGGREGATION_ENABLED)) { return new LogAggregationService(this.dispatcher, context, deletionService, dirsHandler); } else { return new NonAggregatingLogHandler(this.dispatcher, deletionService, dirsHandler, context.getNMStateStore()); } } public ContainersMonitor getContainersMonitor() { return this.containersMonitor; } protected ResourceLocalizationService createResourceLocalizationService(ContainerExecutor exec, DeletionService deletionContext, Context context) { return new ResourceLocalizationService(this.dispatcher, exec, deletionContext, dirsHandler, context); } protected SharedCacheUploadService createSharedCacheUploaderService() { return new SharedCacheUploadService(); } protected ContainersLauncher createContainersLauncher(Context context, ContainerExecutor exec) { return new ContainersLauncher(context, this.dispatcher, exec, dirsHandler, this); } @Override protected void serviceStart() throws Exception { // Enqueue user dirs in deletion context Configuration conf = getConfig(); final InetSocketAddress initialAddress = conf.getSocketAddr(YarnConfiguration.NM_BIND_HOST, YarnConfiguration.NM_ADDRESS, YarnConfiguration.DEFAULT_NM_ADDRESS, YarnConfiguration.DEFAULT_NM_PORT); boolean usingEphemeralPort = (initialAddress.getPort() == 0); if (context.getNMStateStore().canRecover() && 
usingEphemeralPort) { throw new IllegalArgumentException("Cannot support recovery with an " + "ephemeral server port. Check the setting of " + YarnConfiguration.NM_ADDRESS); } // If recovering then delay opening the RPC service until the recovery // of resources and containers have completed, otherwise requests from // clients during recovery can interfere with the recovery process. final boolean delayedRpcServerStart = context.getNMStateStore().canRecover(); Configuration serverConf = new Configuration(conf); // always enforce it to be token-based. serverConf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, SaslRpcServer.AuthMethod.TOKEN.toString()); YarnRPC rpc = YarnRPC.create(conf); server = rpc.getServer(ContainerManagementProtocol.class, this, initialAddress, serverConf, this.context.getNMTokenSecretManager(), conf.getInt(YarnConfiguration.NM_CONTAINER_MGR_THREAD_COUNT, YarnConfiguration.DEFAULT_NM_CONTAINER_MGR_THREAD_COUNT)); // Enable service authorization? if (conf.getBoolean(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHORIZATION, false)) { refreshServiceAcls(conf, new NMPolicyProvider()); } LOG.info("Blocking new container-requests as container manager rpc" + " server is still starting."); this.setBlockNewContainerRequests(true); String bindHost = conf.get(YarnConfiguration.NM_BIND_HOST); String nmAddress = conf.getTrimmed(YarnConfiguration.NM_ADDRESS); String hostOverride = null; if (bindHost != null && !bindHost.isEmpty() && nmAddress != null && !nmAddress.isEmpty()) { //a bind-host case with an address, to support overriding the first //hostname found when querying for our hostname with the specified //address, combine the specified address with the actual port listened //on by the server hostOverride = nmAddress.split(":")[0]; } // setup node ID InetSocketAddress connectAddress; if (delayedRpcServerStart) { connectAddress = NetUtils.getConnectAddress(initialAddress); } else { server.start(); connectAddress = 
NetUtils.getConnectAddress(server); } NodeId nodeId = buildNodeId(connectAddress, hostOverride); ((NodeManager.NMContext) context).setNodeId(nodeId); this.context.getNMTokenSecretManager().setNodeId(nodeId); this.context.getContainerTokenSecretManager().setNodeId(nodeId); // start remaining services super.serviceStart(); if (delayedRpcServerStart) { waitForRecoveredContainers(); server.start(); // check that the node ID is as previously advertised connectAddress = NetUtils.getConnectAddress(server); NodeId serverNode = buildNodeId(connectAddress, hostOverride); if (!serverNode.equals(nodeId)) { throw new IOException("Node mismatch after server started, expected '" + nodeId + "' but found '" + serverNode + "'"); } } LOG.info("ContainerManager started at " + connectAddress); LOG.info("ContainerManager bound to " + initialAddress); } private NodeId buildNodeId(InetSocketAddress connectAddress, String hostOverride) { if (hostOverride != null) { connectAddress = NetUtils .getConnectAddress(new InetSocketAddress(hostOverride, connectAddress.getPort())); } return NodeId.newInstance(connectAddress.getAddress().getCanonicalHostName(), connectAddress.getPort()); } void refreshServiceAcls(Configuration configuration, PolicyProvider policyProvider) { this.server.refreshServiceAcl(configuration, policyProvider); } @Override public void serviceStop() throws Exception { setBlockNewContainerRequests(true); this.writeLock.lock(); try { serviceStopped = true; if (context != null) { cleanUpApplicationsOnNMShutDown(); } } finally { this.writeLock.unlock(); } if (auxiliaryServices.getServiceState() == STARTED) { auxiliaryServices.unregisterServiceListener(this); } if (server != null) { server.stop(); } if (cryptoMaterialUpdaterThreadPool != null) { cryptoMaterialUpdaterThreadPool.shutdownNow(); } super.serviceStop(); } public void cleanUpApplicationsOnNMShutDown() { Map<ApplicationId, Application> applications = this.context.getApplications(); if (applications.isEmpty()) { return; } 
LOG.info("Applications still running : " + applications.keySet()); if (this.context.getNMStateStore().canRecover() && !this.context.getDecommissioned()) { if (getConfig().getBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED, YarnConfiguration.DEFAULT_NM_RECOVERY_SUPERVISED)) { // do not cleanup apps as they can be recovered on restart return; } } List<ApplicationId> appIds = new ArrayList<ApplicationId>(applications.keySet()); this.handle(new CMgrCompletedAppsEvent(appIds, CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN)); LOG.info("Waiting for Applications to be Finished"); long waitStartTime = System.currentTimeMillis(); while (!applications.isEmpty() && System.currentTimeMillis() - waitStartTime < waitForContainersOnShutdownMillis) { try { Thread.sleep(1000); } catch (InterruptedException ex) { LOG.warn("Interrupted while sleeping on applications finish on shutdown", ex); } } // All applications Finished if (applications.isEmpty()) { LOG.info("All applications in FINISHED state"); } else { LOG.info("Done waiting for Applications to be Finished. Still alive: " + applications.keySet()); } } public void cleanupContainersOnNMResync() { Map<ContainerId, Container> containers = context.getContainers(); if (containers.isEmpty()) { return; } LOG.info("Containers still running on " + CMgrCompletedContainersEvent.Reason.ON_NODEMANAGER_RESYNC + " : " + containers.keySet()); List<ContainerId> containerIds = new ArrayList<ContainerId>(containers.keySet()); LOG.info("Waiting for containers to be killed"); this.handle(new CMgrCompletedContainersEvent(containerIds, CMgrCompletedContainersEvent.Reason.ON_NODEMANAGER_RESYNC)); /* * We will wait till all the containers change their state to COMPLETE. We * will not remove the container statuses from nm context because these * are used while re-registering node manager with resource manager. 
*/ boolean allContainersCompleted = false; while (!containers.isEmpty() && !allContainersCompleted) { allContainersCompleted = true; for (Entry<ContainerId, Container> container : containers.entrySet()) { if (((ContainerImpl) container.getValue()).getCurrentState() != ContainerState.COMPLETE) { allContainersCompleted = false; try { Thread.sleep(1000); } catch (InterruptedException ex) { LOG.warn("Interrupted while sleeping on container kill on resync", ex); } break; } } } // All containers killed if (allContainersCompleted) { LOG.info("All containers in DONE state"); } else { LOG.info("Done waiting for containers to be killed. Still alive: " + containers.keySet()); } } // Get the remoteUGI corresponding to the api call. protected UserGroupInformation getRemoteUgi() throws YarnException { UserGroupInformation remoteUgi; try { remoteUgi = UserGroupInformation.getCurrentUser(); } catch (IOException e) { String msg = "Cannot obtain the user-name. Got exception: " + StringUtils.stringifyException(e); LOG.warn(msg); throw RPCUtil.getRemoteException(msg); } return remoteUgi; } // Obtain the needed ContainerTokenIdentifier from the remote-UGI. RPC layer // currently sets only the required id, but iterate through anyways just to // be sure. 
@Private @VisibleForTesting protected NMTokenIdentifier selectNMTokenIdentifier(UserGroupInformation remoteUgi) { Set<TokenIdentifier> tokenIdentifiers = remoteUgi.getTokenIdentifiers(); NMTokenIdentifier resultId = null; for (TokenIdentifier id : tokenIdentifiers) { if (id instanceof NMTokenIdentifier) { resultId = (NMTokenIdentifier) id; break; } } return resultId; } protected void authorizeUser(UserGroupInformation remoteUgi, NMTokenIdentifier nmTokenIdentifier) throws YarnException { if (nmTokenIdentifier == null) { throw RPCUtil.getRemoteException(INVALID_NMTOKEN_MSG); } if (!remoteUgi.getUserName().equals(nmTokenIdentifier.getApplicationAttemptId().toString())) { throw RPCUtil.getRemoteException("Expected applicationAttemptId: " + remoteUgi.getUserName() + "Found: " + nmTokenIdentifier.getApplicationAttemptId()); } } /** * @param containerTokenIdentifier * of the container whose resource is to be started or increased * @throws YarnException */ @Private @VisibleForTesting protected void authorizeStartAndResourceIncreaseRequest(NMTokenIdentifier nmTokenIdentifier, ContainerTokenIdentifier containerTokenIdentifier, boolean startRequest) throws YarnException { if (nmTokenIdentifier == null) { throw RPCUtil.getRemoteException(INVALID_NMTOKEN_MSG); } if (containerTokenIdentifier == null) { throw RPCUtil.getRemoteException(INVALID_CONTAINERTOKEN_MSG); } /* * Check the following: * 1. The request comes from the same application attempt * 2. The request possess a container token that has not expired * 3. The request possess a container token that is granted by a known RM */ ContainerId containerId = containerTokenIdentifier.getContainerID(); String containerIDStr = containerId.toString(); boolean unauthorized = false; StringBuilder messageBuilder = new StringBuilder( "Unauthorized request to " + (startRequest ? "start container." 
: "increase container resource.")); if (!nmTokenIdentifier.getApplicationAttemptId().getApplicationId() .equals(containerId.getApplicationAttemptId().getApplicationId())) { unauthorized = true; messageBuilder.append("\nNMToken for application attempt : ") .append(nmTokenIdentifier.getApplicationAttemptId()) .append(" was used for " + (startRequest ? "starting " : "increasing resource of ") + "container with container token") .append(" issued for application attempt : ").append(containerId.getApplicationAttemptId()); } else if (startRequest && !this.context.getContainerTokenSecretManager() .isValidStartContainerRequest(containerTokenIdentifier)) { // Is the container being relaunched? Or RPC layer let startCall with // tokens generated off old-secret through? unauthorized = true; messageBuilder.append("\n Attempt to relaunch the same ").append("container with id ") .append(containerIDStr).append("."); } else if (containerTokenIdentifier.getExpiryTimeStamp() < System.currentTimeMillis()) { // Ensure the token is not expired. unauthorized = true; messageBuilder.append("\nThis token is expired. current time is ").append(System.currentTimeMillis()) .append(" found ").append(containerTokenIdentifier.getExpiryTimeStamp()); messageBuilder.append("\nNote: System times on machines may be out of sync.") .append(" Check system time and time zones."); } if (unauthorized) { String msg = messageBuilder.toString(); LOG.error(msg); throw RPCUtil.getRemoteException(msg); } if (containerTokenIdentifier.getRMIdentifier() != nodeStatusUpdater.getRMIdentifier()) { // Is the container coming from unknown RM StringBuilder sb = new StringBuilder("\nContainer "); sb.append(containerTokenIdentifier.getContainerID().toString()) .append(" rejected as it is allocated by a previous RM"); throw new InvalidContainerException(sb.toString()); } } /** * Start a list of containers on this NodeManager. 
*/ @Override public StartContainersResponse startContainers(StartContainersRequest requests) throws YarnException, IOException { if (blockNewContainerRequests.get()) { throw new NMNotYetReadyException( "Rejecting new containers as NodeManager has not" + " yet connected with ResourceManager"); } UserGroupInformation remoteUgi = getRemoteUgi(); NMTokenIdentifier nmTokenIdentifier = selectNMTokenIdentifier(remoteUgi); authorizeUser(remoteUgi, nmTokenIdentifier); materializeSecurityMaterial(requests); List<ContainerId> succeededContainers = new ArrayList<ContainerId>(); Map<ContainerId, SerializedException> failedContainers = new HashMap<ContainerId, SerializedException>(); // Synchronize with NodeStatusUpdaterImpl#registerWithRM // to avoid race condition during NM-RM resync (due to RM restart) while a // container is being started, in particular when the container has not yet // been added to the containers map in NMContext. synchronized (this.context) { for (StartContainerRequest request : requests.getStartContainerRequests()) { ContainerId containerId = null; try { if (request.getContainerToken() == null || request.getContainerToken().getIdentifier() == null) { throw new IOException(INVALID_CONTAINERTOKEN_MSG); } ContainerTokenIdentifier containerTokenIdentifier = BuilderUtils .newContainerTokenIdentifier(request.getContainerToken()); verifyAndGetContainerTokenIdentifier(request.getContainerToken(), containerTokenIdentifier); containerId = containerTokenIdentifier.getContainerID(); // Initialize the AMRMProxy service instance only if the container is of // type AM and if the AMRMProxy service is enabled if (amrmProxyEnabled && containerTokenIdentifier.getContainerType() .equals(ContainerType.APPLICATION_MASTER)) { this.getAMRMProxyService().processApplicationStartRequest(request); } startContainerInternal(nmTokenIdentifier, containerTokenIdentifier, request); succeededContainers.add(containerId); } catch (YarnException e) { failedContainers.put(containerId, 
SerializedException.newInstance(e)); } catch (InvalidToken ie) { failedContainers.put(containerId, SerializedException.newInstance(ie)); throw ie; } catch (IOException e) { throw RPCUtil.getRemoteException(e); } } return StartContainersResponse.newInstance(getAuxServiceMetaData(), succeededContainers, failedContainers); } } private boolean isHopsTLSEnabled() { return ((NodeManager.NMContext) context).isHopsTLSEnabled(); } private boolean isJWTEnabled() { return ((NodeManager.NMContext) context).isJWTEnabled(); } private void materializeSecurityMaterial(StartContainersRequest requests) throws YarnException, IOException { if (isHopsTLSEnabled() || isJWTEnabled()) { String user = null, userFolder = null; ApplicationId appId = null; // When launching AM container there is only one Container request if (!requests.getStartContainerRequests().isEmpty()) { StartContainerRequest request = requests.getStartContainerRequests().get(0); ContainerTokenIdentifier containerTokenIdentifier = BuilderUtils .newContainerTokenIdentifier(request.getContainerToken()); if (containerTokenIdentifier == null) { throw RPCUtil.getRemoteException(new IOException(INVALID_CONTAINERTOKEN_MSG)); } user = containerTokenIdentifier.getApplicationSubmitter(); userFolder = containerTokenIdentifier.getApplicationSubmitterFolder(); appId = containerTokenIdentifier.getContainerID().getApplicationAttemptId().getApplicationId(); } if (user == null || userFolder == null) { throw new IOException("User requested container or user folder is null"); } if (isHopsTLSEnabled()) { materializeX509(appId, user, userFolder, requests.getKeyStore(), requests.getKeyStorePassword(), requests.getTrustStore(), requests.getTrustStorePassword()); } if (isJWTEnabled()) { materializeJWT(appId, user, userFolder, requests.getJWT()); } } } private void materializeX509(ApplicationId appId, String user, String userFolder, ByteBuffer keyStore, String keyStorePass, ByteBuffer trustStore, String trustStorePass) throws IOException { if 
(context.getApplications().containsKey(appId)) { LOG.debug("Application reference exists, certificates should have " + "already been materialized"); return; } if (keyStore == null || trustStore == null || (keyStore.capacity() == 0) || (trustStore.capacity() == 0)) { throw new IOException( "RPC TLS is enabled but keyStore or trustStore " + "supplied is either null or empty"); } // ApplicationMasters will also call startContainers() through NMClient // In that case there will be no password set for keystore and truststore // Only RM will set these values when launching AM container through the // AMLauncher if (keyStorePass != null && !keyStorePass.isEmpty() && trustStorePass != null && !trustStorePass.isEmpty()) { try { context.getCertificateLocalizationService().materializeCertificates(user, appId.toString(), userFolder, keyStore, keyStorePass, trustStore, trustStorePass); } catch (InterruptedException ex) { LOG.error(ex, ex); throw new IOException(ex); } } } private void materializeJWT(ApplicationId appId, String user, String userFolder, String jwt) throws IOException { if (context.getApplications().containsKey(appId)) { LOG.debug("Application reference exists, JWT should have " + "already been materialized"); return; } if (jwt == null || jwt.isEmpty()) { throw new IOException("JWT is enabled but it either null or empty for application " + appId); } try { context.getCertificateLocalizationService().materializeJWT(user, appId.toString(), userFolder, jwt); } catch (InterruptedException ex) { LOG.error(ex, ex); throw new IOException(ex); } } private ContainerManagerApplicationProto buildAppProto(ApplicationId appId, String user, String userFolder, Credentials credentials, Map<ApplicationAccessType, String> appAcls, LogAggregationContext logAggregationContext, ByteBuffer keyStore, String keyStorePass, ByteBuffer trustStore, String trustStorePass, int cryptoVersion, String jwt, long jwtExpiration) { ContainerManagerApplicationProto.Builder builder = 
ContainerManagerApplicationProto.newBuilder(); builder.setId(((ApplicationIdPBImpl) appId).getProto()); builder.setUser(user); builder.setUserFolder(userFolder); if (keyStore != null) { builder.setKeyStore(ProtoUtils.convertToProtoFormat(keyStore)); builder.setKeyStorePassword(keyStorePass); } if (trustStore != null) { builder.setTrustStore(ProtoUtils.convertToProtoFormat(trustStore)); builder.setTrustStorePassword(trustStorePass); } builder.setCryptoVersion(cryptoVersion); if (jwt != null) { builder.setJwt(jwt); } if (jwtExpiration != -1L) { builder.setJwtExpiration(jwtExpiration); } if (logAggregationContext != null) { builder.setLogAggregationContext(((LogAggregationContextPBImpl) logAggregationContext).getProto()); } builder.clearCredentials(); if (credentials != null) { DataOutputBuffer dob = new DataOutputBuffer(); try { credentials.writeTokenStorageToStream(dob); builder.setCredentials(ByteString.copyFrom(dob.getData())); } catch (IOException e) { // should not occur LOG.error("Cannot serialize credentials", e); } } builder.clearAcls(); if (appAcls != null) { for (Map.Entry<ApplicationAccessType, String> acl : appAcls.entrySet()) { ApplicationACLMapProto p = ApplicationACLMapProto.newBuilder() .setAccessType(ProtoUtils.convertToProtoFormat(acl.getKey())).setAcl(acl.getValue()) .build(); builder.addAcls(p); } } return builder.build(); } @SuppressWarnings("unchecked") private void startContainerInternal(NMTokenIdentifier nmTokenIdentifier, ContainerTokenIdentifier containerTokenIdentifier, StartContainerRequest request) throws YarnException, IOException { /* * 1) It should save the NMToken into NMTokenSecretManager. This is done * here instead of RPC layer because at the time of opening/authenticating * the connection it doesn't know what all RPC calls user will make on it. * Also new NMToken is issued only at startContainer (once it gets renewed). * * 2) It should validate containerToken. Need to check below things. 
a) It * is signed by correct master key (part of retrieve password). b) It * belongs to correct Node Manager (part of retrieve password). c) It has * correct RMIdentifier. d) It is not expired. */ authorizeStartAndResourceIncreaseRequest(nmTokenIdentifier, containerTokenIdentifier, true); // update NMToken updateNMTokenIdentifier(nmTokenIdentifier); ContainerId containerId = containerTokenIdentifier.getContainerID(); String containerIdStr = containerId.toString(); String user = containerTokenIdentifier.getApplicationSubmitter(); String userFolder = containerTokenIdentifier.getApplicationSubmitterFolder(); LOG.info("Start request for " + containerIdStr + " by user " + user); ContainerLaunchContext launchContext = request.getContainerLaunchContext(); Map<String, ByteBuffer> serviceData = getAuxServiceMetaData(); if (launchContext.getServiceData() != null && !launchContext.getServiceData().isEmpty()) { for (Map.Entry<String, ByteBuffer> meta : launchContext.getServiceData().entrySet()) { if (null == serviceData.get(meta.getKey())) { throw new InvalidAuxServiceException("The auxService:" + meta.getKey() + " does not exist"); } } } injectCryptoMaterialAsLocalResources(user, containerId, launchContext); // Crypto version of this material might be greater than 0, but from the NM's perspective it's // the first time it receives it int cryptoMaterialVersion = isHopsTLSEnabled() ? 0 : -1; long jwtExpiration = isJWTEnabled() ? 
0L : -1L; // Sanity check for local resources for (Map.Entry<String, LocalResource> rsrc : launchContext.getLocalResources().entrySet()) { if (rsrc.getValue() == null || rsrc.getValue().getResource() == null) { throw new YarnException( "Null resource URL for local resource " + rsrc.getKey() + " : " + rsrc.getValue()); } } Credentials credentials = YarnServerSecurityUtils.parseCredentials(launchContext); Container container = new ContainerImpl(getConfig(), this.dispatcher, launchContext, credentials, metrics, containerTokenIdentifier, context); ApplicationId applicationID = containerId.getApplicationAttemptId().getApplicationId(); if (context.getContainers().putIfAbsent(containerId, container) != null) { NMAuditLogger.logFailure(user, AuditConstants.START_CONTAINER, "ContainerManagerImpl", "Container already running on this node!", applicationID, containerId); throw RPCUtil.getRemoteException("Container " + containerIdStr + " already is running on this node!!"); } this.readLock.lock(); try { if (!serviceStopped) { // Create the application Application application = new ApplicationImpl(dispatcher, user, applicationID, credentials, context, userFolder, cryptoMaterialVersion, jwtExpiration); if (null == context.getApplications().putIfAbsent(applicationID, application)) { LOG.info("Creating a new application reference for app " + applicationID); LogAggregationContext logAggregationContext = containerTokenIdentifier .getLogAggregationContext(); Map<ApplicationAccessType, String> appAcls = container.getLaunchContext().getApplicationACLs(); ByteBuffer keyStore = null, trustStore = null; String keyStorePass = null, trustStorePass = null; String jwt = null; CertificateLocalizationService certLocService = context.getCertificateLocalizationService(); if (certLocService != null) { if (isHopsTLSEnabled()) { try { X509SecurityMaterial x509Material = certLocService.getX509MaterialLocation(user, applicationID.toString()); keyStore = x509Material.getKeyStoreMem(); trustStore = 
x509Material.getTrustStoreMem(); keyStorePass = x509Material.getKeyStorePass(); trustStorePass = x509Material.getTrustStorePass(); } catch (InterruptedException ex) { throw new YarnException( "Interrupted while waiting to get X.509 material for " + applicationID, ex); } } if (isJWTEnabled()) { try { JWTSecurityMaterial jwtMaterial = certLocService.getJWTMaterialLocation(user, applicationID.toString()); jwt = jwtMaterial.getToken(); } catch (InterruptedException ex) { throw new YarnException( "Interrupted while waiting to get JWT material for " + applicationID, ex); } } } context.getNMStateStore().storeApplication(applicationID, buildAppProto(applicationID, user, userFolder, credentials, appAcls, logAggregationContext, keyStore, keyStorePass, trustStore, trustStorePass, cryptoMaterialVersion, jwt, jwtExpiration)); dispatcher.getEventHandler() .handle(new ApplicationInitEvent(applicationID, appAcls, logAggregationContext)); } this.context.getNMStateStore().storeContainer(containerId, containerTokenIdentifier.getVersion(), request); dispatcher.getEventHandler().handle(new ApplicationContainerInitEvent(container)); this.context.getContainerTokenSecretManager().startContainerSuccessful(containerTokenIdentifier); NMAuditLogger.logSuccess(user, AuditConstants.START_CONTAINER, "ContainerManageImpl", applicationID, containerId); // TODO launchedContainer misplaced -> doesn't necessarily mean a container // launch. A finished Application will not launch containers. 
metrics.launchedContainer(); metrics.allocateContainer(containerTokenIdentifier.getResource()); } else { throw new YarnException( "Container start failed as the NodeManager is " + "in the process of shutting down"); } } finally { this.readLock.unlock(); } } private void addAsLocalResource(Map<File, String> resources, ContainerId containerId, ContainerLaunchContext containerLaunchContext) throws IOException { for (Map.Entry<File, String> resource : resources.entrySet()) { File localFile = resource.getKey(); if (!localFile.exists() || !localFile.canRead()) { throw new IOException("Crypto material file " + localFile.getAbsolutePath() + " for container " + containerId.toString() + " does not exist or cannot be read"); } URL fileURL = URL.newInstance("file", null, -1, localFile.getAbsolutePath()); LocalResource localResource = LocalResource.newInstance(fileURL, LocalResourceType.FILE, LocalResourceVisibility.PRIVATE, localFile.length(), localFile.lastModified()); containerLaunchContext.getLocalResources().put(resource.getValue(), localResource); } } private void injectCryptoMaterialAsLocalResources(String applicationUser, ContainerId containerId, ContainerLaunchContext containerLaunchContext) throws YarnException, IOException { try { String applicationId = containerId.getApplicationAttemptId().getApplicationId().toString(); Map<File, String> resources = null; // Inject X.509 material if (isHopsTLSEnabled()) { resources = new HashMap<>(); X509SecurityMaterial cryptoMaterial = context.getCertificateLocalizationService() .getX509MaterialLocation(applicationUser, applicationId); Path keyStoreLocation = cryptoMaterial.getKeyStoreLocation(); Path trustStoreLocation = cryptoMaterial.getTrustStoreLocation(); Path passwdLocation = cryptoMaterial.getPasswdLocation(); if (keyStoreLocation == null || trustStoreLocation == null || passwdLocation == null) { throw new YarnException("One of the crypto materials for container " + containerId.toString() + " has not " + "been localized 
correctly and is null"); } resources.put(keyStoreLocation.toFile(), HopsSSLSocketFactory.LOCALIZED_KEYSTORE_FILE_NAME); resources.put(trustStoreLocation.toFile(), HopsSSLSocketFactory.LOCALIZED_TRUSTSTORE_FILE_NAME); resources.put(passwdLocation.toFile(), HopsSSLSocketFactory.LOCALIZED_PASSWD_FILE_NAME); } // Inject JWT material if (isJWTEnabled()) { JWTSecurityMaterial material = context.getCertificateLocalizationService() .getJWTMaterialLocation(applicationUser, applicationId); if (resources == null) { resources = new HashMap<>(1); } resources.put(material.getTokenLocation().toFile(), JWTSecurityMaterial.JWT_LOCAL_RESOURCE_FILE); } if (resources != null) { addAsLocalResource(resources, containerId, containerLaunchContext); } } catch (InterruptedException ex) { throw new YarnException(ex); } } protected ContainerTokenIdentifier verifyAndGetContainerTokenIdentifier( org.apache.hadoop.yarn.api.records.Token token, ContainerTokenIdentifier containerTokenIdentifier) throws YarnException, InvalidToken { byte[] password = context.getContainerTokenSecretManager().retrievePassword(containerTokenIdentifier); byte[] tokenPass = token.getPassword().array(); if (password == null || tokenPass == null || !Arrays.equals(password, tokenPass)) { throw new InvalidToken( "Invalid container token used for starting container on : " + context.getNodeId().toString()); } return containerTokenIdentifier; } /** * Increase resource of a list of containers on this NodeManager. 
*/ @Override public IncreaseContainersResourceResponse increaseContainersResource(IncreaseContainersResourceRequest requests) throws YarnException, IOException { if (blockNewContainerRequests.get()) { throw new NMNotYetReadyException("Rejecting container resource increase as NodeManager has not" + " yet connected with ResourceManager"); } UserGroupInformation remoteUgi = getRemoteUgi(); NMTokenIdentifier nmTokenIdentifier = selectNMTokenIdentifier(remoteUgi); authorizeUser(remoteUgi, nmTokenIdentifier); List<ContainerId> successfullyIncreasedContainers = new ArrayList<ContainerId>(); Map<ContainerId, SerializedException> failedContainers = new HashMap<ContainerId, SerializedException>(); // Synchronize with NodeStatusUpdaterImpl#registerWithRM // to avoid race condition during NM-RM resync (due to RM restart) while a // container resource is being increased in NM, in particular when the // increased container has not yet been added to the increasedContainers // map in NMContext. synchronized (this.context) { // Process container resource increase requests for (org.apache.hadoop.yarn.api.records.Token token : requests.getContainersToIncrease()) { ContainerId containerId = null; try { if (token.getIdentifier() == null) { throw new IOException(INVALID_CONTAINERTOKEN_MSG); } ContainerTokenIdentifier containerTokenIdentifier = BuilderUtils .newContainerTokenIdentifier(token); verifyAndGetContainerTokenIdentifier(token, containerTokenIdentifier); authorizeStartAndResourceIncreaseRequest(nmTokenIdentifier, containerTokenIdentifier, false); containerId = containerTokenIdentifier.getContainerID(); // Reuse the startContainer logic to update NMToken, // as container resource increase request will have come with // an updated NMToken. 
updateNMTokenIdentifier(nmTokenIdentifier); Resource resource = containerTokenIdentifier.getResource(); changeContainerResourceInternal(containerId, containerTokenIdentifier.getVersion(), resource, true); successfullyIncreasedContainers.add(containerId); } catch (YarnException | InvalidToken e) { failedContainers.put(containerId, SerializedException.newInstance(e)); } catch (IOException e) { throw RPCUtil.getRemoteException(e); } } } return IncreaseContainersResourceResponse.newInstance(successfullyIncreasedContainers, failedContainers); } @SuppressWarnings("unchecked") private void changeContainerResourceInternal(ContainerId containerId, int containerVersion, Resource targetResource, boolean increase) throws YarnException, IOException { Container container = context.getContainers().get(containerId); // Check container existence if (container == null) { if (nodeStatusUpdater.isContainerRecentlyStopped(containerId)) { throw RPCUtil.getRemoteException( "Container " + containerId.toString() + " was recently stopped on node manager."); } else { throw RPCUtil.getRemoteException( "Container " + containerId.toString() + " is not handled by this NodeManager"); } } // Check container state org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState currentState = container .getContainerState(); if (currentState != org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.RUNNING) { throw RPCUtil.getRemoteException("Container " + containerId.toString() + " is in " + currentState.name() + " state." + " Resource can only be changed when a container is in" + " RUNNING state"); } // Check validity of the target resource. Resource currentResource = container.getResource(); if (currentResource.equals(targetResource)) { LOG.warn("Unable to change resource for container " + containerId.toString() + ". 
The target resource " + targetResource.toString() + " is the same as the current resource"); return; } if (increase && !Resources.fitsIn(currentResource, targetResource)) { throw RPCUtil.getRemoteException("Unable to increase resource for " + "container " + containerId.toString() + ". The target resource " + targetResource.toString() + " is smaller than the current resource " + currentResource.toString()); } if (!increase && (!Resources.fitsIn(Resources.none(), targetResource) || !Resources.fitsIn(targetResource, currentResource))) { throw RPCUtil.getRemoteException("Unable to decrease resource for " + "container " + containerId.toString() + ". The target resource " + targetResource.toString() + " is not smaller than the current resource " + currentResource.toString()); } if (increase) { org.apache.hadoop.yarn.api.records.Container increasedContainer = org.apache.hadoop.yarn.api.records.Container .newInstance(containerId, null, null, targetResource, null, null); if (context.getIncreasedContainers().putIfAbsent(containerId, increasedContainer) != null) { throw RPCUtil.getRemoteException( "Container " + containerId.toString() + " resource is being increased."); } } this.readLock.lock(); try { if (!serviceStopped) { // Persist container resource change for recovery this.context.getNMStateStore().storeContainerResourceChanged(containerId, containerVersion, targetResource); getContainersMonitor() .handle(new ChangeMonitoringContainerResourceEvent(containerId, targetResource)); } else { throw new YarnException("Unable to change container resource as the NodeManager is " + "in the process of shutting down"); } } finally { this.readLock.unlock(); } } @Private @VisibleForTesting protected void updateNMTokenIdentifier(NMTokenIdentifier nmTokenIdentifier) throws InvalidToken { context.getNMTokenSecretManager().appAttemptStartContainer(nmTokenIdentifier); } /** * Stop a list of containers running on this NodeManager. 
*/ @Override public StopContainersResponse stopContainers(StopContainersRequest requests) throws YarnException, IOException { List<ContainerId> succeededRequests = new ArrayList<ContainerId>(); Map<ContainerId, SerializedException> failedRequests = new HashMap<ContainerId, SerializedException>(); UserGroupInformation remoteUgi = getRemoteUgi(); NMTokenIdentifier identifier = selectNMTokenIdentifier(remoteUgi); if (identifier == null) { throw RPCUtil.getRemoteException(INVALID_NMTOKEN_MSG); } for (ContainerId id : requests.getContainerIds()) { try { stopContainerInternal(identifier, id); succeededRequests.add(id); } catch (YarnException e) { failedRequests.put(id, SerializedException.newInstance(e)); } } return StopContainersResponse.newInstance(succeededRequests, failedRequests); } @SuppressWarnings("unchecked") private void stopContainerInternal(NMTokenIdentifier nmTokenIdentifier, ContainerId containerID) throws YarnException, IOException { String containerIDStr = containerID.toString(); Container container = this.context.getContainers().get(containerID); LOG.info("Stopping container with container Id: " + containerIDStr); authorizeGetAndStopContainerRequest(containerID, container, true, nmTokenIdentifier); if (container == null) { if (!nodeStatusUpdater.isContainerRecentlyStopped(containerID)) { throw RPCUtil .getRemoteException("Container " + containerIDStr + " is not handled by this NodeManager"); } } else { if (container.isRecovering()) { throw new NMNotYetReadyException("Container " + containerIDStr + " is recovering, try later"); } context.getNMStateStore().storeContainerKilled(containerID); dispatcher.getEventHandler().handle(new ContainerKillEvent(containerID, ContainerExitStatus.KILLED_BY_APPMASTER, "Container killed by the ApplicationMaster.")); NMAuditLogger.logSuccess(container.getUser(), AuditConstants.STOP_CONTAINER, "ContainerManageImpl", containerID.getApplicationAttemptId().getApplicationId(), containerID); } } /** * Get a list of container 
statuses running on this NodeManager */ @Override public GetContainerStatusesResponse getContainerStatuses(GetContainerStatusesRequest request) throws YarnException, IOException { List<ContainerStatus> succeededRequests = new ArrayList<ContainerStatus>(); Map<ContainerId, SerializedException> failedRequests = new HashMap<ContainerId, SerializedException>(); UserGroupInformation remoteUgi = getRemoteUgi(); NMTokenIdentifier identifier = selectNMTokenIdentifier(remoteUgi); if (identifier == null) { throw RPCUtil.getRemoteException(INVALID_NMTOKEN_MSG); } for (ContainerId id : request.getContainerIds()) { try { ContainerStatus status = getContainerStatusInternal(id, identifier); succeededRequests.add(status); } catch (YarnException e) { failedRequests.put(id, SerializedException.newInstance(e)); } } return GetContainerStatusesResponse.newInstance(succeededRequests, failedRequests); } private ContainerStatus getContainerStatusInternal(ContainerId containerID, NMTokenIdentifier nmTokenIdentifier) throws YarnException { String containerIDStr = containerID.toString(); Container container = this.context.getContainers().get(containerID); LOG.info("Getting container-status for " + containerIDStr); authorizeGetAndStopContainerRequest(containerID, container, false, nmTokenIdentifier); if (container == null) { if (nodeStatusUpdater.isContainerRecentlyStopped(containerID)) { throw RPCUtil.getRemoteException( "Container " + containerIDStr + " was recently stopped on node manager."); } else { throw RPCUtil .getRemoteException("Container " + containerIDStr + " is not handled by this NodeManager"); } } ContainerStatus containerStatus = container.cloneAndGetContainerStatus(); LOG.info("Returning " + containerStatus); return containerStatus; } @Private @VisibleForTesting protected void authorizeGetAndStopContainerRequest(ContainerId containerId, Container container, boolean stopRequest, NMTokenIdentifier identifier) throws YarnException { if (identifier == null) { throw 
RPCUtil.getRemoteException(INVALID_NMTOKEN_MSG); } /* * For get/stop container status; we need to verify that 1) User (NMToken) * application attempt only has started container. 2) Requested containerId * belongs to the same application attempt (NMToken) which was used. (Note:- * This will prevent user in knowing another application's containers). */ ApplicationId nmTokenAppId = identifier.getApplicationAttemptId().getApplicationId(); if ((!nmTokenAppId.equals(containerId.getApplicationAttemptId().getApplicationId())) || (container != null && !nmTokenAppId.equals(container.getContainerId().getApplicationAttemptId().getApplicationId()))) { String msg; if (stopRequest) { msg = identifier.getApplicationAttemptId() + " attempted to stop non-application container : " + containerId; NMAuditLogger.logFailure("UnknownUser", AuditConstants.STOP_CONTAINER, "ContainerManagerImpl", "Trying to stop unknown container!", nmTokenAppId, containerId); } else { msg = identifier.getApplicationAttemptId() + " attempted to get status for non-application container : " + containerId; } LOG.warn(msg); throw RPCUtil.getRemoteException(msg); } } private Future removeX509UpdaterTask(ContainerId cid) { Future task = null; synchronized (x509Updaters) { task = x509Updaters.remove(cid); } return task; } private Future removeJWTUpdaterTask(ContainerId cid) { Future task = null; synchronized (jwtUpdaters) { task = jwtUpdaters.remove(cid); } return task; } private void scheduleSecurityUpdaterForContainer(ContainerManagerEvent event) { if (event instanceof CMgrUpdateX509Event) { scheduleX509Updater((CMgrUpdateX509Event) event); } else if (event instanceof CMgrUpdateJWTEvent) { scheduleJWTUpdater((CMgrUpdateJWTEvent) event); } } private void scheduleX509Updater(CMgrUpdateX509Event event) { LOG.debug("Scheduling X.509 updater for container " + event.getContainerId()); Future previousTask = removeX509UpdaterTask(event.getContainerId()); if (previousTask != null) { previousTask.cancel(true); } 
ContainerImpl container = (ContainerImpl) context.getContainers().get(event.getContainerId()); if (container != null) { ContainerX509UpdaterTask updaterTask = new ContainerX509UpdaterTask(container, event.getKeyStore(), event.getKeyStorePassword(), event.getTrustStore(), event.getTrustStorePassword(), event.getVersion()); scheduleX509UpdaterTaskInternal(updaterTask, container.getContainerId()); } } private void scheduleJWTUpdater(CMgrUpdateJWTEvent event) { LOG.debug("Scheduling JWT updater for container " + event.getContainerId()); Future previousTask = removeJWTUpdaterTask(event.getContainerId()); if (previousTask != null) { previousTask.cancel(true); } ContainerImpl container = (ContainerImpl) context.getContainers().get(event.getContainerId()); if (container != null) { ContainerJWTUpdaterTask updaterTask = new ContainerJWTUpdaterTask(container, event.getJwt(), event.getJwtExpiration()); scheduleJWTUpdaterTaskInternal(updaterTask, container.getContainerId()); } } private void scheduleX509UpdaterTaskInternal(ContainerX509UpdaterTask updater, ContainerId cid) { // Make sure we put the task to the Map before the worker tries to remove itself from the Map synchronized (x509Updaters) { Future task = cryptoMaterialUpdaterThreadPool.submit(updater); x509Updaters.put(cid, task); } } private void scheduleJWTUpdaterTaskInternal(ContainerJWTUpdaterTask updater, ContainerId cid) { // Make sure we put the task to the Map before the worker tries to remove itself from the Map synchronized (jwtUpdaters) { Future task = cryptoMaterialUpdaterThreadPool.submit(updater); jwtUpdaters.put(cid, task); } } private class ContainerJWTUpdaterTask extends ContainerSecurityUpdaterTask { private final String jwt; private final long jwtExpiration; private ContainerJWTUpdaterTask(ContainerImpl container, String jwt, long jwtExpiration) { super(container); this.jwt = jwt; this.jwtExpiration = jwtExpiration; } @Override protected void removeSecurityUpdaterTask() { 
removeJWTUpdaterTask(container.getContainerId()); } @Override protected void scheduleSecurityUpdaterTask() { scheduleJWTUpdaterTaskInternal(this, container.getContainerId()); } @Override protected void execute() throws IOException { container.identifyCryptoMaterialLocation(); File jwtFile = container.getJWTLocalizedPath(); if (jwtFile == null) { throw new IOException( "Could not identify localized JWT file for container " + container.getContainerId()); } writeStringToFile(jwtFile, jwt); } @Override protected void updateStateStore() throws IOException { ApplicationId applicationId = container.getContainerId().getApplicationAttemptId().getApplicationId(); Application app = context.getApplications().get(applicationId); app.setJWTExpiration(jwtExpiration); try { ContainerManagerApplicationProto appProto; if (isHopsTLSEnabled()) { X509SecurityMaterial x509SecurityMaterial = context.getCertificateLocalizationService() .getX509MaterialLocation(container.getUser(), applicationId.toString()); appProto = buildAppProto(applicationId, container.getUser(), container.getUserFolder(), container.getCredentials(), container.getLaunchContext().getApplicationACLs(), container.getContainerTokenIdentifier().getLogAggregationContext(), x509SecurityMaterial.getKeyStoreMem(), String.valueOf(x509SecurityMaterial.getKeyStorePass()), x509SecurityMaterial.getTrustStoreMem(), String.valueOf(x509SecurityMaterial.getTrustStorePass()), app.getX509Version(), jwt, jwtExpiration); } else { appProto = buildAppProto(applicationId, container.getUser(), container.getUserFolder(), container.getCredentials(), container.getLaunchContext().getApplicationACLs(), container.getContainerTokenIdentifier().getLogAggregationContext(), null, null, null, null, -1, jwt, jwtExpiration); } context.getNMStateStore().storeApplication(applicationId, appProto); } catch (InterruptedException ex) { throw new IOException(ex); } } } private class ContainerX509UpdaterTask extends ContainerSecurityUpdaterTask { private final 
ByteBuffer keyStore; private final char[] keyStorePassword; private final ByteBuffer trustStore; private final char[] trustStorePassword; private final int cryptoVersion; private ContainerX509UpdaterTask(ContainerImpl container, ByteBuffer keyStore, char[] keyStorePassword, ByteBuffer trustStore, char[] trustStorePassword, int cryptoVersion) { super(container); this.keyStore = keyStore; this.keyStorePassword = keyStorePassword; this.trustStore = trustStore; this.trustStorePassword = trustStorePassword; this.cryptoVersion = cryptoVersion; } @Override protected void removeSecurityUpdaterTask() { removeX509UpdaterTask(container.getContainerId()); } @Override protected void scheduleSecurityUpdaterTask() { scheduleX509UpdaterTaskInternal(this, container.getContainerId()); } @Override protected void execute() throws IOException { container.identifyCryptoMaterialLocation(); File keyStorePath = container.getKeyStoreLocalizedPath(); File trustStorePath = container.getTrustStoreLocalizedPath(); File passwordFilePath = container.getPasswordFileLocalizedPath(); if (keyStorePath == null || trustStorePath == null || passwordFilePath == null) { throw new IOException( "Could not identify localized X.509 cryptographic material location for container " + container.getContainerId()); } writeByteBufferToFile(keyStorePath, keyStore); writeByteBufferToFile(trustStorePath, trustStore); // Assume key store password is the same for the trust store and for the key itself writeStringToFile(passwordFilePath, String.valueOf(keyStorePassword)); } @Override protected void updateStateStore() throws IOException { ApplicationId applicationId = container.getContainerId().getApplicationAttemptId().getApplicationId(); Application app = context.getApplications().get(applicationId); app.setX509Version(cryptoVersion); try { ContainerManagerApplicationProto appProto; if (isJWTEnabled()) { JWTSecurityMaterial jwtSecurityMaterial = context.getCertificateLocalizationService() 
.getJWTMaterialLocation(container.getUser(), applicationId.toString()); appProto = buildAppProto(applicationId, container.getUser(), container.getUserFolder(), container.getCredentials(), container.getLaunchContext().getApplicationACLs(), container.getContainerTokenIdentifier().getLogAggregationContext(), keyStore, String.valueOf(keyStorePassword), trustStore, String.valueOf(trustStorePassword), cryptoVersion, jwtSecurityMaterial.getToken(), app.getJWTExpiration()); } else { appProto = buildAppProto(applicationId, container.getUser(), container.getUserFolder(), container.getCredentials(), container.getLaunchContext().getApplicationACLs(), container.getContainerTokenIdentifier().getLogAggregationContext(), keyStore, String.valueOf(keyStorePassword), trustStore, String.valueOf(trustStorePassword), cryptoVersion, null, -1L); } context.getNMStateStore().storeApplication(applicationId, appProto); } catch (InterruptedException ex) { throw new IOException(ex); } } } class ContainerEventDispatcher implements EventHandler<ContainerEvent> { @Override public void handle(ContainerEvent event) { Map<ContainerId, Container> containers = ContainerManagerImpl.this.context.getContainers(); Container c = containers.get(event.getContainerID()); if (c != null) { c.handle(event); } else { LOG.warn("Event " + event + " sent to absent container " + event.getContainerID()); } } } class ApplicationEventDispatcher implements EventHandler<ApplicationEvent> { @Override public void handle(ApplicationEvent event) { Application app = ContainerManagerImpl.this.context.getApplications().get(event.getApplicationID()); if (app != null) { app.handle(event); } else { LOG.warn("Event " + event + " sent to absent application " + event.getApplicationID()); } } } @SuppressWarnings("unchecked") @Override public void handle(ContainerManagerEvent event) { switch (event.getType()) { case FINISH_APPS: CMgrCompletedAppsEvent appsFinishedEvent = (CMgrCompletedAppsEvent) event; for (ApplicationId appID : 
appsFinishedEvent.getAppsToCleanup()) { Application app = this.context.getApplications().get(appID); if (app == null) { LOG.warn("couldn't find application " + appID + " while processing" + " FINISH_APPS event"); continue; } boolean shouldDropEvent = false; for (Container container : app.getContainers().values()) { if (container.isRecovering()) { LOG.info("drop FINISH_APPS event to " + appID + " because " + "container " + container.getContainerId() + " is recovering"); shouldDropEvent = true; break; } } if (shouldDropEvent) { continue; } String diagnostic = ""; if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.ON_SHUTDOWN) { diagnostic = "Application killed on shutdown"; } else if (appsFinishedEvent.getReason() == CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) { diagnostic = "Application killed by ResourceManager"; } this.dispatcher.getEventHandler().handle(new ApplicationFinishEvent(appID, diagnostic)); } break; case FINISH_CONTAINERS: CMgrCompletedContainersEvent containersFinishedEvent = (CMgrCompletedContainersEvent) event; for (ContainerId containerId : containersFinishedEvent.getContainersToCleanup()) { ApplicationId appId = containerId.getApplicationAttemptId().getApplicationId(); Application app = this.context.getApplications().get(appId); if (app == null) { LOG.warn("couldn't find app " + appId + " while processing" + " FINISH_CONTAINERS event"); continue; } Container container = app.getContainers().get(containerId); if (container == null) { LOG.warn( "couldn't find container " + containerId + " while processing FINISH_CONTAINERS event"); continue; } if (container.isRecovering()) { LOG.info("drop FINISH_CONTAINERS event to " + containerId + " because container is recovering"); continue; } this.dispatcher.getEventHandler().handle(new ContainerKillEvent(containerId, ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, "Container Killed by ResourceManager")); } break; case DECREASE_CONTAINERS_RESOURCE: CMgrDecreaseContainersResourceEvent 
containersDecreasedEvent = (CMgrDecreaseContainersResourceEvent) event; for (org.apache.hadoop.yarn.api.records.Container container : containersDecreasedEvent .getContainersToDecrease()) { try { changeContainerResourceInternal(container.getId(), container.getVersion(), container.getResource(), false); } catch (YarnException e) { LOG.error("Unable to decrease container resource", e); } catch (IOException e) { LOG.error("Unable to update container resource in store", e); } } break; case SIGNAL_CONTAINERS: CMgrSignalContainersEvent containersSignalEvent = (CMgrSignalContainersEvent) event; for (SignalContainerRequest request : containersSignalEvent.getContainersToSignal()) { internalSignalToContainer(request, "ResourceManager"); } break; case UPDATE_CRYPTO_MATERIAL: scheduleSecurityUpdaterForContainer(event); break; default: throw new YarnRuntimeException("Got an unknown ContainerManagerEvent type: " + event.getType()); } } public void setBlockNewContainerRequests(boolean blockNewContainerRequests) { this.blockNewContainerRequests.set(blockNewContainerRequests); } @Private @VisibleForTesting public boolean getBlockNewContainerRequestsStatus() { return this.blockNewContainerRequests.get(); } @Override public void stateChanged(Service service) { // TODO Auto-generated method stub } public Context getContext() { return this.context; } @VisibleForTesting public Map<ContainerId, Future> getX509Updaters() { return x509Updaters; } @VisibleForTesting public Map<ContainerId, Future> getJWTUpdaters() { return jwtUpdaters; } public Map<String, ByteBuffer> getAuxServiceMetaData() { return this.auxiliaryServices.getMetaData(); } @Private public AMRMProxyService getAMRMProxyService() { return this.amrmProxyService; } @Private protected void setAMRMProxyService(AMRMProxyService amrmProxyService) { this.amrmProxyService = amrmProxyService; } @SuppressWarnings("unchecked") @Override public SignalContainerResponse signalToContainer(SignalContainerRequest request) throws YarnException, 
IOException { internalSignalToContainer(request, "Application Master"); return new SignalContainerResponsePBImpl(); } @SuppressWarnings("unchecked") private void internalSignalToContainer(SignalContainerRequest request, String sentBy) { ContainerId containerId = request.getContainerId(); Container container = this.context.getContainers().get(containerId); if (container != null) { LOG.info(containerId + " signal request " + request.getCommand() + " by " + sentBy); this.dispatcher.getEventHandler() .handle(new SignalContainersLauncherEvent(container, request.getCommand())); } else { LOG.info("Container " + containerId + " no longer exists"); } } }