// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.nodemanager; import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import java.io.EOFException; import java.io.File; import java.io.IOException; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.UnknownHostException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileContext; import org.apache.hadoop.fs.Path; import 
org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.io.retry.RetryProxy; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.ServerSocketUtil; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.token.delegation.web.DelegationTokenIdentifier; import org.apache.hadoop.service.Service.STATE; import org.apache.hadoop.service.ServiceOperations; import org.apache.hadoop.yarn.api.protocolrecords.SignalContainerRequest; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.SignalContainerCommand; import org.apache.hadoop.yarn.api.records.Token; import org.apache.hadoop.yarn.client.RMProxy; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.ResourceTracker; import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest; import 
org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse; import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse; import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest; import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse; import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl; import org.apache.hadoop.yarn.server.api.records.MasterKey; import org.apache.hadoop.yarn.server.api.records.NodeAction; import org.apache.hadoop.yarn.server.api.records.NodeStatus; import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl; import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.server.utils.BuilderUtils; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import org.junit.After; import 
org.junit.Assert; import org.junit.Before; import org.junit.Test; @SuppressWarnings("rawtypes") public class TestNodeStatusUpdater { // temp fix until metrics system can auto-detect itself running in unit test: static { DefaultMetricsSystem.setMiniClusterMode(true); } static final Log LOG = LogFactory.getLog(TestNodeStatusUpdater.class); static final File basedir = new File("target", TestNodeStatusUpdater.class.getName()); static final File nmLocalDir = new File(basedir, "nm0"); static final File tmpDir = new File(basedir, "tmpDir"); static final File remoteLogsDir = new File(basedir, "remotelogs"); static final File logsDir = new File(basedir, "logs"); private static final RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null); volatile int heartBeatID = 0; volatile Throwable nmStartError = null; private final List<NodeId> registeredNodes = new ArrayList<NodeId>(); private boolean triggered = false; private Configuration conf; private NodeManager nm; private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false); @Before public void setUp() throws IOException { nmLocalDir.mkdirs(); tmpDir.mkdirs(); logsDir.mkdirs(); remoteLogsDir.mkdirs(); conf = createNMConfig(); } @After public void tearDown() { this.registeredNodes.clear(); heartBeatID = 0; ServiceOperations.stop(nm); assertionFailedInThread.set(false); DefaultMetricsSystem.shutdown(); } public static MasterKey createMasterKey() { MasterKey masterKey = new MasterKeyPBImpl(); masterKey.setKeyId(123); masterKey.setBytes(ByteBuffer.wrap(new byte[] { new Integer(123).byteValue() })); return masterKey; } private class MyResourceTracker implements ResourceTracker { private final Context context; private boolean signalContainer; public MyResourceTracker(Context context, boolean signalContainer) { this.context = context; this.signalContainer = signalContainer; } @Override public RegisterNodeManagerResponse registerNodeManager(RegisterNodeManagerRequest request) throws YarnException, 
IOException { NodeId nodeId = request.getNodeId(); Resource resource = request.getResource(); LOG.info("Registering " + nodeId.toString()); // NOTE: this really should be checking against the config value InetSocketAddress expected = NetUtils .getConnectAddress(conf.getSocketAddr(YarnConfiguration.NM_ADDRESS, null, -1)); Assert.assertEquals(NetUtils.getHostPortString(expected), nodeId.toString()); Assert.assertEquals(5 * 1024, resource.getMemorySize()); registeredNodes.add(nodeId); RegisterNodeManagerResponse response = recordFactory .newRecordInstance(RegisterNodeManagerResponse.class); response.setContainerTokenMasterKey(createMasterKey()); response.setNMTokenMasterKey(createMasterKey()); return response; } private Map<ApplicationId, List<ContainerStatus>> getAppToContainerStatusMap( List<ContainerStatus> containers) { Map<ApplicationId, List<ContainerStatus>> map = new HashMap<ApplicationId, List<ContainerStatus>>(); for (ContainerStatus cs : containers) { ApplicationId applicationId = cs.getContainerId().getApplicationAttemptId().getApplicationId(); List<ContainerStatus> appContainers = map.get(applicationId); if (appContainers == null) { appContainers = new ArrayList<ContainerStatus>(); map.put(applicationId, appContainers); } appContainers.add(cs); } return map; } @Override public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) throws YarnException, IOException { NodeStatus nodeStatus = request.getNodeStatus(); LOG.info("Got heartbeat number " + heartBeatID); NodeManagerMetrics mockMetrics = mock(NodeManagerMetrics.class); Dispatcher mockDispatcher = mock(Dispatcher.class); EventHandler mockEventHandler = mock(EventHandler.class); when(mockDispatcher.getEventHandler()).thenReturn(mockEventHandler); NMStateStoreService stateStore = new NMNullStateStoreService(); nodeStatus.setResponseId(heartBeatID++); Map<ApplicationId, List<ContainerStatus>> appToContainers = getAppToContainerStatusMap( nodeStatus.getContainersStatuses()); 
List<SignalContainerRequest> containersToSignal = null; ApplicationId appId1 = ApplicationId.newInstance(0, 1); ApplicationId appId2 = ApplicationId.newInstance(0, 2); ContainerId firstContainerID = null; if (heartBeatID == 1) { Assert.assertEquals(0, nodeStatus.getContainersStatuses().size()); // Give a container to the NM. ApplicationAttemptId appAttemptID = ApplicationAttemptId.newInstance(appId1, 0); firstContainerID = ContainerId.newContainerId(appAttemptID, heartBeatID); ContainerLaunchContext launchContext = recordFactory .newRecordInstance(ContainerLaunchContext.class); Resource resource = BuilderUtils.newResource(2, 1, 1); long currentTime = System.currentTimeMillis(); String user = "testUser"; ContainerTokenIdentifier containerToken = BuilderUtils .newContainerTokenIdentifier(BuilderUtils.newContainerToken(firstContainerID, 0, InetAddress.getByName("localhost").getCanonicalHostName(), 1234, user, resource, currentTime + 10000, 123, "password".getBytes(), currentTime)); Context context = mock(Context.class); when(context.getNMStateStore()).thenReturn(stateStore); Container container = new ContainerImpl(conf, mockDispatcher, launchContext, null, mockMetrics, containerToken, context); this.context.getContainers().put(firstContainerID, container); } else if (heartBeatID == 2) { // Checks on the RM end Assert.assertEquals("Number of applications should only be one!", 1, nodeStatus.getContainersStatuses().size()); Assert.assertEquals("Number of container for the app should be one!", 1, appToContainers.get(appId1).size()); // Checks on the NM end ConcurrentMap<ContainerId, Container> activeContainers = this.context.getContainers(); Assert.assertEquals(1, activeContainers.size()); if (this.signalContainer) { containersToSignal = new ArrayList<SignalContainerRequest>(); SignalContainerRequest signalReq = recordFactory .newRecordInstance(SignalContainerRequest.class); signalReq.setContainerId(firstContainerID); 
signalReq.setCommand(SignalContainerCommand.OUTPUT_THREAD_DUMP); containersToSignal.add(signalReq); } // Give another container to the NM. ApplicationAttemptId appAttemptID = ApplicationAttemptId.newInstance(appId2, 0); ContainerId secondContainerID = ContainerId.newContainerId(appAttemptID, heartBeatID); ContainerLaunchContext launchContext = recordFactory .newRecordInstance(ContainerLaunchContext.class); long currentTime = System.currentTimeMillis(); String user = "testUser"; Resource resource = BuilderUtils.newResource(3, 1, 2); ContainerTokenIdentifier containerToken = BuilderUtils .newContainerTokenIdentifier(BuilderUtils.newContainerToken(secondContainerID, 0, InetAddress.getByName("localhost").getCanonicalHostName(), 1234, user, resource, currentTime + 10000, 123, "password".getBytes(), currentTime)); Context context = mock(Context.class); when(context.getNMStateStore()).thenReturn(stateStore); Container container = new ContainerImpl(conf, mockDispatcher, launchContext, null, mockMetrics, containerToken, context); this.context.getContainers().put(secondContainerID, container); } else if (heartBeatID == 3) { // Checks on the RM end Assert.assertEquals("Number of applications should have two!", 2, appToContainers.size()); Assert.assertEquals("Number of container for the app-1 should be only one!", 1, appToContainers.get(appId1).size()); Assert.assertEquals("Number of container for the app-2 should be only one!", 1, appToContainers.get(appId2).size()); // Checks on the NM end ConcurrentMap<ContainerId, Container> activeContainers = this.context.getContainers(); Assert.assertEquals(2, activeContainers.size()); } NodeHeartbeatResponse nhResponse = YarnServerBuilderUtils.newNodeHeartbeatResponse(heartBeatID, null, null, null, null, null, 1000L); if (containersToSignal != null) { nhResponse.addAllContainersToSignal(containersToSignal); } return nhResponse; } @Override public UnRegisterNodeManagerResponse unRegisterNodeManager(UnRegisterNodeManagerRequest request) 
throws YarnException, IOException { return recordFactory.newRecordInstance(UnRegisterNodeManagerResponse.class); } } private class MyContainerManager extends ContainerManagerImpl { public boolean signaled = false; public MyContainerManager(Context context, ContainerExecutor exec, DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater, NodeManagerMetrics metrics, LocalDirsHandlerService dirsHandler) { super(context, exec, deletionContext, nodeStatusUpdater, metrics, dirsHandler); } @Override public void handle(ContainerManagerEvent event) { if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) { signaled = true; } } } private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl { public ResourceTracker resourceTracker; private Context context; public MyNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) { this(context, dispatcher, healthChecker, metrics, false); } public MyNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics, boolean signalContainer) { super(context, dispatcher, healthChecker, metrics); this.context = context; resourceTracker = new MyResourceTracker(this.context, signalContainer); } @Override protected ResourceTracker getRMClient() { return resourceTracker; } @Override protected void stopRMProxy() { return; } } // Test NodeStatusUpdater sends the right container statuses each time it // heart beats. 
private class MyNodeStatusUpdater2 extends NodeStatusUpdaterImpl {

  public ResourceTracker resourceTracker;

  /** Node status updater whose RM client is a {@code MyResourceTracker4}. */
  public MyNodeStatusUpdater2(Context context, Dispatcher dispatcher,
      NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
    super(context, dispatcher, healthChecker, metrics);
    resourceTracker = new MyResourceTracker4(context);
  }

  @Override
  protected ResourceTracker getRMClient() {
    return resourceTracker;
  }

  @Override
  protected void stopRMProxy() {
    // The stub tracker is not a real proxy; there is nothing to stop.
  }
}

/**
 * Node status updater whose RM client is a {@code MyResourceTracker3} and
 * which always reports token keep-alive as enabled.
 */
private class MyNodeStatusUpdater3 extends NodeStatusUpdaterImpl {

  public ResourceTracker resourceTracker;
  private Context context;

  public MyNodeStatusUpdater3(Context context, Dispatcher dispatcher,
      NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
    super(context, dispatcher, healthChecker, metrics);
    this.context = context;
    this.resourceTracker = new MyResourceTracker3(this.context);
  }

  @Override
  protected ResourceTracker getRMClient() {
    return resourceTracker;
  }

  @Override
  protected void stopRMProxy() {
    // The stub tracker is not a real proxy; there is nothing to stop.
  }

  @Override
  protected boolean isTokenKeepAliveEnabled(Configuration conf) {
    return true;
  }
}

/**
 * Node status updater whose RM client is a retry proxy around a
 * {@code MyResourceTracker6}, built with the given start interval and
 * never-start flag.
 */
private class MyNodeStatusUpdater4 extends NodeStatusUpdaterImpl {

  private final long rmStartIntervalMS;
  private final boolean rmNeverStart;
  public ResourceTracker resourceTracker;

  public MyNodeStatusUpdater4(Context context, Dispatcher dispatcher,
      NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
      long rmStartIntervalMS, boolean rmNeverStart) {
    super(context, dispatcher, healthChecker, metrics);
    this.rmStartIntervalMS = rmStartIntervalMS;
    this.rmNeverStart = rmNeverStart;
  }

  @Override
  protected void serviceStart() throws Exception {
    // The original comment here read "record the startup time"; nothing is
    // recorded in this override, it only delegates to the superclass.
    super.serviceStart();
  }

  /** Creates a new retrying proxy on every call, using the enclosing test's configuration. */
  @Override
  protected ResourceTracker getRMClient() throws IOException {
    RetryPolicy retryPolicy = RMProxy.createRetryPolicy(conf);
    resourceTracker = (ResourceTracker) RetryProxy.create(ResourceTracker.class,
        new MyResourceTracker6(rmStartIntervalMS, rmNeverStart), retryPolicy);
    return resourceTracker;
  }

  /** Returns the enclosing test's {@code triggered} flag. */
  private boolean isTriggered() {
    return triggered;
  }

  @Override
  protected void stopRMProxy() {
    // Nothing to stop for the stub-backed proxy.
  }
}

/**
 * Node status updater whose RM client is a retry proxy around a
 * {@code MyResourceTracker5}, using the configuration passed to the
 * constructor.
 */
private class MyNodeStatusUpdater5 extends NodeStatusUpdaterImpl {

  private ResourceTracker resourceTracker;
  private Configuration conf;

  public MyNodeStatusUpdater5(Context context, Dispatcher dispatcher,
      NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
      Configuration conf) {
    super(context, dispatcher, healthChecker, metrics);
    resourceTracker = new MyResourceTracker5();
    this.conf = conf;
  }

  @Override
  protected ResourceTracker getRMClient() {
    RetryPolicy retryPolicy = RMProxy.createRetryPolicy(conf);
    return (ResourceTracker) RetryProxy.create(ResourceTracker.class, resourceTracker,
        retryPolicy);
  }

  @Override
  protected void stopRMProxy() {
    // Nothing to stop for the stub-backed proxy.
  }
}

/**
 * Node status updater that keeps the superclass's RM client (getRMClient is
 * not overridden) and only disables stopping of the RM proxy.
 */
private class MyNodeStatusUpdater6 extends NodeStatusUpdaterImpl {

  private final long rmStartIntervalMS;
  private final boolean rmNeverStart;
  public ResourceTracker resourceTracker;

  public MyNodeStatusUpdater6(Context context, Dispatcher dispatcher,
      NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
      long rmStartIntervalMS, boolean rmNeverStart) {
    super(context, dispatcher, healthChecker, metrics);
    this.rmStartIntervalMS = rmStartIntervalMS;
    this.rmNeverStart = rmNeverStart;
  }

  @Override
  protected void serviceStart() throws Exception {
    // The original comment here read "record the startup time"; nothing is
    // recorded in this override, it only delegates to the superclass.
    super.serviceStart();
  }

  /** Returns the enclosing test's {@code triggered} flag. */
  private boolean isTriggered() {
    return triggered;
  }

  @Override
  protected void stopRMProxy() {
    // Intentionally empty.
  }
}

/** NodeManager that installs a {@link MyNodeStatusUpdater3} and exposes it to the test. */
private class MyNodeManager extends NodeManager {

  private MyNodeStatusUpdater3 nodeStatusUpdater;

  @Override
  protected NodeStatusUpdater createNodeStatusUpdater(Context context,
      Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
    this.nodeStatusUpdater =
        new MyNodeStatusUpdater3(context, dispatcher, healthChecker, metrics);
    return this.nodeStatusUpdater;
  }

  public MyNodeStatusUpdater3 getNodeStatusUpdater() {
    return this.nodeStatusUpdater;
  }
}

private class
MyNodeManager2 extends NodeManager { public boolean isStopped = false; private NodeStatusUpdater nodeStatusUpdater; private CyclicBarrier syncBarrier; private Configuration conf; public MyNodeManager2(CyclicBarrier syncBarrier, Configuration conf) { this.syncBarrier = syncBarrier; this.conf = conf; } @Override protected NodeStatusUpdater createNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker) { nodeStatusUpdater = new MyNodeStatusUpdater5(context, dispatcher, healthChecker, metrics, conf); return nodeStatusUpdater; } @Override protected void serviceStop() throws Exception { // Make sure that all containers are started before starting shutdown syncBarrier.await(10000, TimeUnit.MILLISECONDS); System.out.println("Called stooppppp"); super.serviceStop(); isStopped = true; ConcurrentMap<ApplicationId, Application> applications = getNMContext().getApplications(); // ensure that applications are empty if (!applications.isEmpty()) { assertionFailedInThread.set(true); } syncBarrier.await(10000, TimeUnit.MILLISECONDS); } } // private class MyResourceTracker2 implements ResourceTracker { public NodeAction heartBeatNodeAction = NodeAction.NORMAL; public NodeAction registerNodeAction = NodeAction.NORMAL; public String shutDownMessage = ""; public String rmVersion = "3.0.1"; @Override public RegisterNodeManagerResponse registerNodeManager(RegisterNodeManagerRequest request) throws YarnException, IOException { RegisterNodeManagerResponse response = recordFactory .newRecordInstance(RegisterNodeManagerResponse.class); response.setNodeAction(registerNodeAction); response.setContainerTokenMasterKey(createMasterKey()); response.setNMTokenMasterKey(createMasterKey()); response.setDiagnosticsMessage(shutDownMessage); response.setRMVersion(rmVersion); return response; } @Override public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) throws YarnException, IOException { NodeStatus nodeStatus = request.getNodeStatus(); 
nodeStatus.setResponseId(heartBeatID++); NodeHeartbeatResponse nhResponse = YarnServerBuilderUtils.newNodeHeartbeatResponse(heartBeatID, heartBeatNodeAction, null, null, null, null, 1000L); nhResponse.setDiagnosticsMessage(shutDownMessage); return nhResponse; } @Override public UnRegisterNodeManagerResponse unRegisterNodeManager(UnRegisterNodeManagerRequest request) throws YarnException, IOException { return recordFactory.newRecordInstance(UnRegisterNodeManagerResponse.class); } } private class MyResourceTracker3 implements ResourceTracker { public NodeAction heartBeatNodeAction = NodeAction.NORMAL; public NodeAction registerNodeAction = NodeAction.NORMAL; private Map<ApplicationId, List<Long>> keepAliveRequests = new HashMap<ApplicationId, List<Long>>(); private ApplicationId appId = BuilderUtils.newApplicationId(1, 1); private final Context context; MyResourceTracker3(Context context) { this.context = context; } @Override public RegisterNodeManagerResponse registerNodeManager(RegisterNodeManagerRequest request) throws YarnException, IOException { RegisterNodeManagerResponse response = recordFactory .newRecordInstance(RegisterNodeManagerResponse.class); response.setNodeAction(registerNodeAction); response.setContainerTokenMasterKey(createMasterKey()); response.setNMTokenMasterKey(createMasterKey()); return response; } @Override public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) throws YarnException, IOException { LOG.info("Got heartBeatId: [" + heartBeatID + "]"); NodeStatus nodeStatus = request.getNodeStatus(); nodeStatus.setResponseId(heartBeatID++); NodeHeartbeatResponse nhResponse = YarnServerBuilderUtils.newNodeHeartbeatResponse(heartBeatID, heartBeatNodeAction, null, null, null, null, 1000L); if (nodeStatus.getKeepAliveApplications() != null && nodeStatus.getKeepAliveApplications().size() > 0) { for (ApplicationId appId : nodeStatus.getKeepAliveApplications()) { List<Long> list = keepAliveRequests.get(appId); if (list == null) { list = 
new LinkedList<Long>(); keepAliveRequests.put(appId, list); } list.add(System.currentTimeMillis()); } } if (heartBeatID == 2) { LOG.info("Sending FINISH_APP for application: [" + appId + "]"); this.context.getApplications().put(appId, mock(Application.class)); nhResponse.addAllApplicationsToCleanup(Collections.singletonList(appId)); } return nhResponse; } @Override public UnRegisterNodeManagerResponse unRegisterNodeManager(UnRegisterNodeManagerRequest request) throws YarnException, IOException { return recordFactory.newRecordInstance(UnRegisterNodeManagerResponse.class); } } // Test NodeStatusUpdater sends the right container statuses each time it // heart beats. private Credentials expectedCredentials = new Credentials(); private class MyResourceTracker4 implements ResourceTracker { public NodeAction registerNodeAction = NodeAction.NORMAL; public NodeAction heartBeatNodeAction = NodeAction.NORMAL; private Context context; private final ContainerStatus containerStatus2 = createContainerStatus(2, ContainerState.RUNNING); private final ContainerStatus containerStatus3 = createContainerStatus(3, ContainerState.COMPLETE); private final ContainerStatus containerStatus4 = createContainerStatus(4, ContainerState.RUNNING); private final ContainerStatus containerStatus5 = createContainerStatus(5, ContainerState.COMPLETE); public MyResourceTracker4(Context context) { // create app Credentials org.apache.hadoop.security.token.Token<DelegationTokenIdentifier> token1 = new org.apache.hadoop.security.token.Token<DelegationTokenIdentifier>(); token1.setKind(new Text("kind1")); expectedCredentials.addToken(new Text("token1"), token1); this.context = context; } @Override public RegisterNodeManagerResponse registerNodeManager(RegisterNodeManagerRequest request) throws YarnException, IOException { RegisterNodeManagerResponse response = recordFactory .newRecordInstance(RegisterNodeManagerResponse.class); response.setNodeAction(registerNodeAction); 
response.setContainerTokenMasterKey(createMasterKey()); response.setNMTokenMasterKey(createMasterKey()); return response; } @Override public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) throws YarnException, IOException { List<ContainerId> finishedContainersPulledByAM = new ArrayList<ContainerId>(); try { if (heartBeatID == 0) { Assert.assertEquals(0, request.getNodeStatus().getContainersStatuses().size()); Assert.assertEquals(0, context.getContainers().size()); } else if (heartBeatID == 1) { List<ContainerStatus> statuses = request.getNodeStatus().getContainersStatuses(); Assert.assertEquals(2, statuses.size()); Assert.assertEquals(2, context.getContainers().size()); boolean container2Exist = false, container3Exist = false; for (ContainerStatus status : statuses) { if (status.getContainerId().equals(containerStatus2.getContainerId())) { Assert.assertTrue(status.getState().equals(containerStatus2.getState())); container2Exist = true; } if (status.getContainerId().equals(containerStatus3.getContainerId())) { Assert.assertTrue(status.getState().equals(containerStatus3.getState())); container3Exist = true; } } Assert.assertTrue(container2Exist && container3Exist); // should throw exception that can be retried by the // nodeStatusUpdaterRunnable, otherwise nm just shuts down and the // test passes. throw new YarnRuntimeException("Lost the heartbeat response"); } else if (heartBeatID == 2 || heartBeatID == 3) { List<ContainerStatus> statuses = request.getNodeStatus().getContainersStatuses(); if (heartBeatID == 2) { // NM should send completed containers again, since the last // heartbeat is lost. Assert.assertEquals(4, statuses.size()); } else { // NM should not send completed containers again, since the last // heartbeat is successful. 
Assert.assertEquals(2, statuses.size()); } Assert.assertEquals(4, context.getContainers().size()); boolean container2Exist = false, container3Exist = false, container4Exist = false, container5Exist = false; for (ContainerStatus status : statuses) { if (status.getContainerId().equals(containerStatus2.getContainerId())) { Assert.assertTrue(status.getState().equals(containerStatus2.getState())); container2Exist = true; } if (status.getContainerId().equals(containerStatus3.getContainerId())) { Assert.assertTrue(status.getState().equals(containerStatus3.getState())); container3Exist = true; } if (status.getContainerId().equals(containerStatus4.getContainerId())) { Assert.assertTrue(status.getState().equals(containerStatus4.getState())); container4Exist = true; } if (status.getContainerId().equals(containerStatus5.getContainerId())) { Assert.assertTrue(status.getState().equals(containerStatus5.getState())); container5Exist = true; } } if (heartBeatID == 2) { Assert.assertTrue(container2Exist && container3Exist && container4Exist && container5Exist); } else { // NM do not send completed containers again Assert.assertTrue( container2Exist && !container3Exist && container4Exist && !container5Exist); } if (heartBeatID == 3) { finishedContainersPulledByAM.add(containerStatus3.getContainerId()); } } else if (heartBeatID == 4) { List<ContainerStatus> statuses = request.getNodeStatus().getContainersStatuses(); Assert.assertEquals(2, statuses.size()); // Container 3 is acked by AM, hence removed from context Assert.assertEquals(3, context.getContainers().size()); boolean container3Exist = false; for (ContainerStatus status : statuses) { if (status.getContainerId().equals(containerStatus3.getContainerId())) { container3Exist = true; } } Assert.assertFalse(container3Exist); } } catch (AssertionError error) { error.printStackTrace(); assertionFailedInThread.set(true); } finally { heartBeatID++; } NodeStatus nodeStatus = request.getNodeStatus(); nodeStatus.setResponseId(heartBeatID); 
NodeHeartbeatResponse nhResponse =
          YarnServerBuilderUtils.newNodeHeartbeatResponse(heartBeatID,
              heartBeatNodeAction, null, null, null, null, 1000L);
      nhResponse.addContainersToBeRemovedFromNM(finishedContainersPulledByAM);
      Map<ApplicationId, ByteBuffer> appCredentials =
          new HashMap<ApplicationId, ByteBuffer>();
      DataOutputBuffer dob = new DataOutputBuffer();
      expectedCredentials.writeTokenStorageToStream(dob);
      ByteBuffer byteBuffer1 = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
      appCredentials.put(ApplicationId.newInstance(1234, 1), byteBuffer1);
      nhResponse.setSystemCredentialsForApps(appCredentials);
      return nhResponse;
    }

    @Override
    public UnRegisterNodeManagerResponse unRegisterNodeManager(
        UnRegisterNodeManagerRequest request) throws YarnException, IOException {
      return recordFactory.newRecordInstance(UnRegisterNodeManagerResponse.class);
    }
  }

  /**
   * Resource tracker stub whose registration succeeds but whose heartbeat
   * always fails: the first heartbeat throws {@link EOFException}, every later
   * one throws {@link java.net.ConnectException}.
   */
  private class MyResourceTracker5 implements ResourceTracker {
    public NodeAction registerNodeAction = NodeAction.NORMAL;

    @Override
    public RegisterNodeManagerResponse registerNodeManager(
        RegisterNodeManagerRequest request) throws YarnException, IOException {
      RegisterNodeManagerResponse response =
          recordFactory.newRecordInstance(RegisterNodeManagerResponse.class);
      response.setNodeAction(registerNodeAction);
      response.setContainerTokenMasterKey(createMasterKey());
      response.setNMTokenMasterKey(createMasterKey());
      return response;
    }

    @Override
    public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
        throws YarnException, IOException {
      heartBeatID++;
      if (heartBeatID == 1) {
        // EOFException should be retried as well.
        throw new EOFException("NodeHeartbeat exception");
      } else {
        throw new java.net.ConnectException("NodeHeartbeat exception");
      }
    }

    @Override
    public UnRegisterNodeManagerResponse unRegisterNodeManager(
        UnRegisterNodeManagerRequest request) throws YarnException, IOException {
      return recordFactory.newRecordInstance(UnRegisterNodeManagerResponse.class);
    }
  }

  /**
   * Resource tracker stub that fakes a late-starting RM: registration throws
   * {@link java.net.ConnectException} until more than {@code rmStartIntervalMS}
   * milliseconds have passed since construction, or on every call when
   * {@code rmNeverStart} is set.
   */
  private class MyResourceTracker6 implements ResourceTracker {
    private final long rmStartIntervalMS;
    private final boolean rmNeverStart;
    private final long waitStartTime;

    public MyResourceTracker6(long rmStartIntervalMS, boolean rmNeverStart) {
      this.rmStartIntervalMS = rmStartIntervalMS;
      this.rmNeverStart = rmNeverStart;
      this.waitStartTime = System.currentTimeMillis();
    }

    @Override
    public RegisterNodeManagerResponse registerNodeManager(
        RegisterNodeManagerRequest request) throws YarnException, IOException {
      if (System.currentTimeMillis() - waitStartTime <= rmStartIntervalMS
          || rmNeverStart) {
        throw new java.net.ConnectException(
            "Faking RM start failure as start " + "delay timer has not expired.");
      } else {
        NodeId nodeId = request.getNodeId();
        Resource resource = request.getResource();
        LOG.info("Registering " + nodeId.toString());
        // NOTE: this really should be checking against the config value
        InetSocketAddress expected = NetUtils.getConnectAddress(
            conf.getSocketAddr(YarnConfiguration.NM_ADDRESS, null, -1));
        Assert.assertEquals(NetUtils.getHostPortString(expected), nodeId.toString());
        Assert.assertEquals(5 * 1024, resource.getMemorySize());
        registeredNodes.add(nodeId);

        RegisterNodeManagerResponse response =
            recordFactory.newRecordInstance(RegisterNodeManagerResponse.class);
        triggered = true;
        return response;
      }
    }

    @Override
    public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request)
        throws YarnException, IOException {
      NodeStatus nodeStatus = request.getNodeStatus();
      nodeStatus.setResponseId(heartBeatID++);

      NodeHeartbeatResponse nhResponse =
          YarnServerBuilderUtils.newNodeHeartbeatResponse(heartBeatID,
              NodeAction.NORMAL, null, null, null, null, 1000L);
      return nhResponse;
    }

    @Override
    public UnRegisterNodeManagerResponse unRegisterNodeManager(
        UnRegisterNodeManagerRequest request) throws YarnException, IOException {
      return recordFactory.newRecordInstance(UnRegisterNodeManagerResponse.class);
    }
  }

  /** Resets the recorded NM start error before each test. */
  @Before
  public void clearError() {
    nmStartError = null;
  }

  /** Recursively deletes the test base directory after each test. */
  @After
  public void deleteBaseDir() throws IOException {
    FileContext lfs = FileContext.getLocalFSFileContext();
    lfs.delete(new Path(basedir.getPath()), true);
  }

  /**
   * Checks that a completed container is reported as recently stopped and is
   * dropped from that cache once the configured 10000 ms tracking duration
   * has passed.
   */
  @Test(timeout = 90000)
  public void testRecentlyFinishedContainers() throws Exception {
    NodeManager nm = new NodeManager();
    YarnConfiguration conf = new YarnConfiguration();
    conf.set(
        NodeStatusUpdaterImpl.YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS,
        "10000");
    nm.init(conf);
    NodeStatusUpdaterImpl nodeStatusUpdater =
        (NodeStatusUpdaterImpl) nm.getNodeStatusUpdater();
    ApplicationId appId = ApplicationId.newInstance(0, 0);
    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 0);
    ContainerId cId = ContainerId.newContainerId(appAttemptId, 0);
    nm.getNMContext().getApplications().putIfAbsent(appId, mock(Application.class));
    nm.getNMContext().getContainers().putIfAbsent(cId, mock(Container.class));

    nodeStatusUpdater.addCompletedContainer(cId);
    Assert.assertTrue(nodeStatusUpdater.isContainerRecentlyStopped(cId));

    nm.getNMContext().getContainers().remove(cId);
    long time1 = System.currentTimeMillis();
    int waitInterval = 15;
    while (waitInterval-- > 0
        && nodeStatusUpdater.isContainerRecentlyStopped(cId)) {
      nodeStatusUpdater.removeVeryOldStoppedContainersFromCache();
      Thread.sleep(1000);
    }
    long time2 = System.currentTimeMillis();
    // By this time the container will be removed from cache. need to verify.
    Assert.assertFalse(nodeStatusUpdater.isContainerRecentlyStopped(cId));
    Assert.assertTrue((time2 - time1) >= 10000 && (time2 - time1) <= 250000);
  }

  /**
   * Checks that acknowledging both a completed and a running container removes
   * only the completed one from the reported container statuses.
   */
  @Test(timeout = 90000)
  public void testRemovePreviousCompletedContainersFromContext() throws Exception {
    NodeManager nm = new NodeManager();
    YarnConfiguration conf = new YarnConfiguration();
    conf.set(
        NodeStatusUpdaterImpl.YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS,
        "10000");
    nm.init(conf);
    NodeStatusUpdaterImpl nodeStatusUpdater =
        (NodeStatusUpdaterImpl) nm.getNodeStatusUpdater();
    ApplicationId appId = ApplicationId.newInstance(0, 0);
    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 0);

    ContainerId cId = ContainerId.newContainerId(appAttemptId, 1);
    Token containerToken = BuilderUtils.newContainerToken(cId, 0, "anyHost", 1234,
        "anyUser", BuilderUtils.newResource(1024, 1), 0, 123,
        "password".getBytes(), 0);
    Container anyCompletedContainer = new ContainerImpl(conf, null, null, null, null,
        BuilderUtils.newContainerTokenIdentifier(containerToken),
        nm.getNMContext()) {
      @Override
      public ContainerState getCurrentState() {
        return ContainerState.COMPLETE;
      }

      @Override
      public org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState getContainerState() {
        return org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.DONE;
      }
    };

    ContainerId runningContainerId = ContainerId.newContainerId(appAttemptId, 3);
    Token runningContainerToken = BuilderUtils.newContainerToken(runningContainerId,
        0, "anyHost", 1234, "anyUser", BuilderUtils.newResource(1024, 1), 0, 123,
        "password".getBytes(), 0);
    Container runningContainer = new ContainerImpl(conf, null, null, null, null,
        BuilderUtils.newContainerTokenIdentifier(runningContainerToken),
        nm.getNMContext()) {
      @Override
      public ContainerState getCurrentState() {
        return ContainerState.RUNNING;
      }

      @Override
      public org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState getContainerState() {
        return org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.RUNNING;
      }
    };

    nm.getNMContext().getApplications().putIfAbsent(appId, mock(Application.class));
    nm.getNMContext().getContainers().put(cId, anyCompletedContainer);
    nm.getNMContext().getContainers().put(runningContainerId, runningContainer);

    Assert.assertEquals(2, nodeStatusUpdater.getContainerStatuses().size());

    List<ContainerId> ackedContainers = new ArrayList<ContainerId>();
    ackedContainers.add(cId);
    ackedContainers.add(runningContainerId);

    nodeStatusUpdater.removeOrTrackCompletedContainersFromContext(ackedContainers);

    Set<ContainerId> containerIdSet = new HashSet<ContainerId>();
    List<ContainerStatus> containerStatuses = nodeStatusUpdater.getContainerStatuses();
    for (ContainerStatus status : containerStatuses) {
      containerIdSet.add(status.getContainerId());
    }

    Assert.assertEquals(1, containerStatuses.size());
    // completed container is removed;
    Assert.assertFalse(containerIdSet.contains(cId));
    // running container is not removed;
    Assert.assertTrue(containerIdSet.contains(runningContainerId));
  }

  /**
   * Checks that a completed container belonging to a FINISHED application is
   * reported once and then counted as recently stopped.
   */
  @Test(timeout = 10000)
  public void testCompletedContainersIsRecentlyStopped() throws Exception {
    NodeManager nm = new NodeManager();
    nm.init(conf);
    NodeStatusUpdaterImpl nodeStatusUpdater =
        (NodeStatusUpdaterImpl) nm.getNodeStatusUpdater();
    ApplicationId appId = ApplicationId.newInstance(0, 0);
    Application completedApp = mock(Application.class);
    when(completedApp.getApplicationState()).thenReturn(ApplicationState.FINISHED);
    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 0);
    ContainerId containerId = ContainerId.newContainerId(appAttemptId, 1);
    Token containerToken = BuilderUtils.newContainerToken(containerId, 0, "host",
        1234, "user", BuilderUtils.newResource(1024, 1), 0, 123,
        "password".getBytes(), 0);
    Container completedContainer = new ContainerImpl(conf, null, null, null, null,
        BuilderUtils.newContainerTokenIdentifier(containerToken),
        nm.getNMContext()) {
      @Override
      public ContainerState getCurrentState() {
        return ContainerState.COMPLETE;
      }
    };

    nm.getNMContext().getApplications().putIfAbsent(appId, completedApp);
    nm.getNMContext().getContainers().put(containerId, completedContainer);

    Assert.assertEquals(1, nodeStatusUpdater.getContainerStatuses().size());
    Assert.assertTrue(nodeStatusUpdater.isContainerRecentlyStopped(containerId));
  }

  /**
   * Checks that a completed container keeps being reported (one status each
   * time) while its application moves to FINISHING_CONTAINERS_WAIT and after
   * the application is removed from the context.
   */
  @Test
  public void testCleanedupApplicationContainerCleanup() throws IOException {
    NodeManager nm = new NodeManager();
    YarnConfiguration conf = new YarnConfiguration();
    conf.set(
        NodeStatusUpdaterImpl.YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS,
        "1000000");
    nm.init(conf);

    NodeStatusUpdaterImpl nodeStatusUpdater =
        (NodeStatusUpdaterImpl) nm.getNodeStatusUpdater();
    ApplicationId appId = ApplicationId.newInstance(0, 0);
    ApplicationAttemptId appAttemptId = ApplicationAttemptId.newInstance(appId, 0);

    ContainerId cId = ContainerId.newContainerId(appAttemptId, 1);
    Token containerToken = BuilderUtils.newContainerToken(cId, 0, "anyHost", 1234,
        "anyUser", BuilderUtils.newResource(1024, 1), 0, 123,
        "password".getBytes(), 0);
    Container anyCompletedContainer = new ContainerImpl(conf, null, null, null, null,
        BuilderUtils.newContainerTokenIdentifier(containerToken),
        nm.getNMContext()) {
      @Override
      public ContainerState getCurrentState() {
        return ContainerState.COMPLETE;
      }
    };

    Application application = mock(Application.class);
    when(application.getApplicationState()).thenReturn(ApplicationState.RUNNING);
    nm.getNMContext().getApplications().putIfAbsent(appId, application);
    nm.getNMContext().getContainers().put(cId, anyCompletedContainer);

    Assert.assertEquals(1, nodeStatusUpdater.getContainerStatuses().size());

    when(application.getApplicationState())
        .thenReturn(ApplicationState.FINISHING_CONTAINERS_WAIT);
    // The completed container will be saved in case of lost heartbeat.
    Assert.assertEquals(1, nodeStatusUpdater.getContainerStatuses().size());
    Assert.assertEquals(1, nodeStatusUpdater.getContainerStatuses().size());

    nm.getNMContext().getContainers().put(cId, anyCompletedContainer);
    nm.getNMContext().getApplications().remove(appId);
    // The completed container will be saved in case of lost heartbeat.
    Assert.assertEquals(1, nodeStatusUpdater.getContainerStatuses().size());
    Assert.assertEquals(1, nodeStatusUpdater.getContainerStatuses().size());
  }

  /**
   * Starts an NM on a background thread and checks that the node status
   * updater is the last service, that the NM reaches STARTED, that more than
   * three heartbeats are seen and that exactly one node registered.
   */
  @Test
  public void testNMRegistration() throws InterruptedException, IOException {
    nm = new NodeManager() {
      @Override
      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        return new MyNodeStatusUpdater(context, dispatcher, healthChecker, metrics);
      }
    };

    YarnConfiguration conf = createNMConfig();
    nm.init(conf);

    // verify that the last service is the nodeStatusUpdater (ie registration
    // with RM)
    Object[] services = nm.getServices().toArray();
    Object lastService = services[services.length - 1];
    Assert.assertTrue("last service is NOT the node status updater",
        lastService instanceof NodeStatusUpdater);

    new Thread() {
      @Override
      public void run() {
        try {
          nm.start();
        } catch (Throwable e) {
          TestNodeStatusUpdater.this.nmStartError = e;
          throw new YarnRuntimeException(e);
        }
      }
    }.start();

    System.out.println(" ----- thread already started.." + nm.getServiceState());

    int waitCount = 0;
    while (nm.getServiceState() == STATE.INITED && waitCount++ != 50) {
      LOG.info("Waiting for NM to start..");
      Throwable startError = nmStartError;
      if (startError != null) {
        LOG.error("Error during startup. ", startError);
        // Fall back to the error itself when it has no cause, so the real
        // failure is reported instead of a NullPointerException.
        Throwable cause = startError.getCause();
        Assert.fail((cause != null ? cause : startError).getMessage());
      }
      Thread.sleep(2000);
    }
    if (nm.getServiceState() != STATE.STARTED) {
      // NM could have failed.
      Assert.fail("NodeManager failed to start");
    }

    waitCount = 0;
    while (heartBeatID <= 3 && waitCount++ != 200) {
      Thread.sleep(1000);
    }
    Assert.assertFalse(heartBeatID <= 3);
    Assert.assertEquals("Number of registered NMs is wrong!!", 1,
        this.registeredNodes.size());

    nm.stop();
  }

  /**
   * Checks that when the heartbeat response asks for SHUTDOWN and the test
   * also calls {@code stop()} directly, application cleanup runs exactly once.
   */
  @Test
  public void testStopReentrant() throws Exception {
    final AtomicInteger numCleanups = new AtomicInteger(0);
    nm = new NodeManager() {
      @Override
      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        MyNodeStatusUpdater myNodeStatusUpdater =
            new MyNodeStatusUpdater(context, dispatcher, healthChecker, metrics);
        MyResourceTracker2 myResourceTracker2 = new MyResourceTracker2();
        myResourceTracker2.heartBeatNodeAction = NodeAction.SHUTDOWN;
        myNodeStatusUpdater.resourceTracker = myResourceTracker2;
        return myNodeStatusUpdater;
      }

      @Override
      protected ContainerManagerImpl createContainerManager(Context context,
          ContainerExecutor exec, DeletionService del,
          NodeStatusUpdater nodeStatusUpdater, ApplicationACLsManager aclsManager,
          LocalDirsHandlerService dirsHandler) {
        return new ContainerManagerImpl(context, exec, del, nodeStatusUpdater,
            metrics, dirsHandler) {
          @Override
          public void cleanUpApplicationsOnNMShutDown() {
            super.cleanUpApplicationsOnNMShutDown();
            numCleanups.incrementAndGet();
          }
        };
      }
    };

    YarnConfiguration conf = createNMConfig();
    nm.init(conf);
    nm.start();

    int waitCount = 0;
    while (heartBeatID < 1 && waitCount++ != 200) {
      Thread.sleep(500);
    }
    Assert.assertFalse(heartBeatID < 1);

    // Meanwhile call stop directly as the shutdown hook would
    nm.stop();

    // NM takes a while to reach the STOPPED state.
    waitCount = 0;
    while (nm.getServiceState() != STATE.STOPPED && waitCount++ != 20) {
      LOG.info("Waiting for NM to stop..");
      Thread.sleep(1000);
    }
    Assert.assertEquals(STATE.STOPPED, nm.getServiceState());

    // It further takes a while after NM reached the STOPPED state.
    waitCount = 0;
    while (numCleanups.get() == 0 && waitCount++ != 20) {
      LOG.info("Waiting for NM shutdown..");
      Thread.sleep(1000);
    }
    Assert.assertEquals(1, numCleanups.get());
  }

  /**
   * Checks that a SHUTDOWN heartbeat action marks the NM context as
   * decommissioned and brings the NM to the STOPPED state.
   */
  @Test
  public void testNodeDecommision() throws Exception {
    nm = getNodeManager(NodeAction.SHUTDOWN);
    YarnConfiguration conf = createNMConfig();
    nm.init(conf);
    Assert.assertEquals(STATE.INITED, nm.getServiceState());
    nm.start();

    int waitCount = 0;
    while (heartBeatID < 1 && waitCount++ != 200) {
      Thread.sleep(500);
    }
    Assert.assertFalse(heartBeatID < 1);
    Assert.assertTrue(nm.getNMContext().getDecommissioned());

    // NM takes a while to reach the STOPPED state.
    waitCount = 0;
    while (nm.getServiceState() != STATE.STOPPED && waitCount++ != 20) {
      LOG.info("Waiting for NM to stop..");
      Thread.sleep(1000);
    }

    Assert.assertEquals(STATE.STOPPED, nm.getServiceState());
  }

  /**
   * NodeManager that delegates updater creation to {@link #createUpdater} and
   * remembers the created instance so tests can retrieve it.
   */
  private abstract class NodeManagerWithCustomNodeStatusUpdater extends NodeManager {
    private NodeStatusUpdater updater;

    private NodeManagerWithCustomNodeStatusUpdater() {
    }

    @Override
    protected NodeStatusUpdater createNodeStatusUpdater(Context context,
        Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
      updater = createUpdater(context, dispatcher, healthChecker);
      return updater;
    }

    public NodeStatusUpdater getUpdater() {
      return updater;
    }

    abstract NodeStatusUpdater createUpdater(Context context,
        Dispatcher dispatcher, NodeHealthCheckerService healthChecker);
  }

  /**
   * Checks that NM start fails with the expected message when the registration
   * response carries a SHUTDOWN action.
   */
  @Test
  public void testNMShutdownForRegistrationFailure() throws Exception {
    nm = new NodeManagerWithCustomNodeStatusUpdater() {
      @Override
      protected NodeStatusUpdater createUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        MyNodeStatusUpdater nodeStatusUpdater =
            new MyNodeStatusUpdater(context, dispatcher, healthChecker, metrics);
        MyResourceTracker2 myResourceTracker2 = new MyResourceTracker2();
        myResourceTracker2.registerNodeAction = NodeAction.SHUTDOWN;
        myResourceTracker2.shutDownMessage = "RM Shutting Down Node";
        nodeStatusUpdater.resourceTracker = myResourceTracker2;
        return nodeStatusUpdater;
      }
    };
    verifyNodeStartFailure("Recieved SHUTDOWN signal from Resourcemanager, "
        + "Registration of NodeManager failed, "
        + "Message from ResourceManager: RM Shutting Down Node");
  }

  /**
   * Checks that the NM-specific RM connection settings (100 ms max wait) take
   * effect even though the general RM max wait is set to -1: NM start must
   * fail no earlier than the NM-specific wait and within that wait plus
   * {@code delta}.
   */
  @Test(timeout = 100000)
  public void testNMRMConnectionConf() throws Exception {
    final long delta = 50000;
    final long nmRmConnectionWaitMs = 100;
    final long nmRmRetryInterval = 100;
    final long connectionWaitMs = -1;
    final long connectionRetryIntervalMs = 1000;
    //Waiting for rmStartIntervalMS, RM will be started
    final long rmStartIntervalMS = 2 * 1000;
    conf.setLong(YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
        nmRmConnectionWaitMs);
    conf.setLong(YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
        nmRmRetryInterval);
    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
        connectionWaitMs);
    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
        connectionRetryIntervalMs);
    conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 1);

    //Test NM try to connect to RM Several times, but finally fail
    nm = new NodeManagerWithCustomNodeStatusUpdater() {
      @Override
      protected NodeStatusUpdater createUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        NodeStatusUpdater nodeStatusUpdater = new MyNodeStatusUpdater6(context,
            dispatcher, healthChecker, metrics, rmStartIntervalMS, true);
        return nodeStatusUpdater;
      }
    };
    nm.init(conf);
    long waitStartTime = System.currentTimeMillis();
    try {
      nm.start();
      Assert.fail("NM should have failed to start due to RM connect failure");
    } catch (Exception e) {
      long t = System.currentTimeMillis();
      long duration = t - waitStartTime;
      // Both bounds use the NM-specific wait: the general wait is -1 here and
      // is not a meaningful limit.
      boolean waitTimeValid = (duration >= nmRmConnectionWaitMs)
          && (duration < (nmRmConnectionWaitMs + delta));

      if (!waitTimeValid) {
        // throw exception if NM doesn't retry long enough
        throw new Exception("NM should have tried re-connecting to RM during "
            + "period of at least " + nmRmConnectionWaitMs + " ms, but "
            + "stopped retrying within " + (nmRmConnectionWaitMs + delta)
            + " ms: " + e, e);
      }
    }
  }

  /**
   * Checks NM/RM connection retry in two phases: first with an RM that never
   * starts (NM start must fail within the configured wait window), then with
   * an RM that starts after {@code rmStartIntervalMS} (NM start must succeed
   * after that interval).
   */
  @Test(timeout = 150000)
  public void testNMConnectionToRM() throws Exception {
    final long delta = 50000;
    final long connectionWaitMs = 5000;
    final long connectionRetryIntervalMs = 1000;
    //Waiting for rmStartIntervalMS, RM will be started
    final long rmStartIntervalMS = 2 * 1000;
    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
        connectionWaitMs);
    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
        connectionRetryIntervalMs);

    //Test NM try to connect to RM Several times, but finally fail
    NodeManagerWithCustomNodeStatusUpdater nmWithUpdater;
    nm = nmWithUpdater = new NodeManagerWithCustomNodeStatusUpdater() {
      @Override
      protected NodeStatusUpdater createUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        NodeStatusUpdater nodeStatusUpdater = new MyNodeStatusUpdater4(context,
            dispatcher, healthChecker, metrics, rmStartIntervalMS, true);
        return nodeStatusUpdater;
      }
    };
    nm.init(conf);
    long waitStartTime = System.currentTimeMillis();
    try {
      nm.start();
      Assert.fail("NM should have failed to start due to RM connect failure");
    } catch (Exception e) {
      long t = System.currentTimeMillis();
      long duration = t - waitStartTime;
      boolean waitTimeValid = (duration >= connectionWaitMs)
          && (duration < (connectionWaitMs + delta));
      if (!waitTimeValid) {
        //either the exception was too early, or it had a different cause.
        //reject with the inner stack trace
        throw new Exception("NM should have tried re-connecting to RM during "
            + "period of at least " + connectionWaitMs + " ms, but "
            + "stopped retrying within " + (connectionWaitMs + delta)
            + " ms: " + e, e);
      }
    }

    //Test NM connect to RM, fail at first several attempts,
    //but finally success.
    nm = nmWithUpdater = new NodeManagerWithCustomNodeStatusUpdater() {
      @Override
      protected NodeStatusUpdater createUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        NodeStatusUpdater nodeStatusUpdater = new MyNodeStatusUpdater4(context,
            dispatcher, healthChecker, metrics, rmStartIntervalMS, false);
        return nodeStatusUpdater;
      }
    };
    nm.init(conf);
    NodeStatusUpdater updater = nmWithUpdater.getUpdater();
    Assert.assertNotNull("Updater not yet created ", updater);
    waitStartTime = System.currentTimeMillis();
    try {
      nm.start();
    } catch (Exception ex) {
      LOG.error("NM should have started successfully "
          + "after connecting to RM.", ex);
      throw ex;
    }
    long duration = System.currentTimeMillis() - waitStartTime;
    MyNodeStatusUpdater4 myUpdater = (MyNodeStatusUpdater4) updater;
    Assert.assertTrue("NM started before updater triggered",
        myUpdater.isTriggered());
    Assert.assertTrue("NM should have connected to RM after "
        + "the start interval of " + rmStartIntervalMS + ": actual " + duration
        + " " + myUpdater, (duration >= rmStartIntervalMS));
    Assert.assertTrue(
        "NM should have connected to RM less than " + (rmStartIntervalMS + delta)
            + " milliseconds of RM starting up: actual " + duration + " "
            + myUpdater,
        (duration < (rmStartIntervalMS + delta)));
  }

  /**
   * Verifies that if for some reason NM fails to start ContainerManager RPC
   * server, RM is oblivious to NM's presence. The behaviour is like this
   * because otherwise, NM will report to RM even if all its servers are not
   * started properly, RM will think that the NM is alive and will retire the NM
   * only after NM_EXPIRY interval. See MAPREDUCE-2749.
   */
  @Test
  public void testNoRegistrationWhenNMServicesFail() throws Exception {
    nm = new NodeManager() {
      @Override
      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        return new MyNodeStatusUpdater(context, dispatcher, healthChecker, metrics);
      }

      @Override
      protected ContainerManagerImpl createContainerManager(Context context,
          ContainerExecutor exec, DeletionService del,
          NodeStatusUpdater nodeStatusUpdater, ApplicationACLsManager aclsManager,
          LocalDirsHandlerService diskhandler) {
        return new ContainerManagerImpl(context, exec, del, nodeStatusUpdater,
            metrics, diskhandler) {
          @Override
          protected void serviceStart() {
            // Simulating failure of starting RPC server
            throw new YarnRuntimeException("Starting of RPC Server failed");
          }
        };
      }
    };

    verifyNodeStartFailure("Starting of RPC Server failed");
  }

  /**
   * Checks that keep-alive requests are recorded for exactly one application,
   * that two or three were sent by heartbeat 12, and that the count is
   * unchanged by heartbeat 20 after the application is removed.
   */
  @Test
  public void testApplicationKeepAlive() throws Exception {
    MyNodeManager nm = new MyNodeManager();
    try {
      YarnConfiguration conf = createNMConfig();
      conf.setBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED, true);
      conf.setLong(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS, 4000L);
      nm.init(conf);
      nm.start();
      // HB 2 -> app cancelled by RM.
      while (heartBeatID < 12) {
        Thread.sleep(1000L);
      }
      MyResourceTracker3 rt =
          (MyResourceTracker3) nm.getNodeStatusUpdater().getRMClient();
      rt.context.getApplications().remove(rt.appId);
      Assert.assertEquals(1, rt.keepAliveRequests.size());
      int numKeepAliveRequests = rt.keepAliveRequests.get(rt.appId).size();
      LOG.info("Number of Keep Alive Requests: [" + numKeepAliveRequests + "]");
      Assert.assertTrue(numKeepAliveRequests == 2 || numKeepAliveRequests == 3);
      while (heartBeatID < 20) {
        Thread.sleep(1000L);
      }
      int numKeepAliveRequests2 = rt.keepAliveRequests.get(rt.appId).size();
      Assert.assertEquals(numKeepAliveRequests, numKeepAliveRequests2);
    } finally {
      if (nm.getServiceState() == STATE.STARTED) {
        nm.stop();
      }
    }
  }

  /**
   * Test completed containerStatus get back up when heart beat lost, and will
   * be sent via next heart beat.
   */
  @Test(timeout = 200000)
  public void testCompletedContainerStatusBackup() throws Exception {
    nm = new NodeManager() {
      @Override
      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        MyNodeStatusUpdater2 myNodeStatusUpdater =
            new MyNodeStatusUpdater2(context, dispatcher, healthChecker, metrics);
        return myNodeStatusUpdater;
      }

      @Override
      protected NMContext createNMContext(
          NMContainerTokenSecretManager containerTokenSecretManager,
          NMTokenSecretManagerInNM nmTokenSecretManager,
          NMStateStoreService store) {
        return new MyNMContext(containerTokenSecretManager, nmTokenSecretManager);
      }
    };

    YarnConfiguration conf = createNMConfig();
    nm.init(conf);
    nm.start();

    int waitCount = 0;
    while (heartBeatID <= 4 && waitCount++ != 20) {
      Thread.sleep(500);
    }
    if (heartBeatID <= 4) {
      Assert.fail("Failed to get all heartbeats in time, " + "heartbeatID:"
          + heartBeatID);
    }
    if (assertionFailedInThread.get()) {
      Assert.fail("ContainerStatus Backup failed");
    }
    Assert.assertNotNull(nm.getNMContext().getSystemCredentialsForApps()
        .get(ApplicationId.newInstance(1234, 1)).getToken(new Text("token1")));
    nm.stop();
  }

  /**
   * Starts a container on an NM whose RM connection wait and retry interval
   * are both 1000 ms, waits on a two-party barrier around NM stop, and checks
   * that the NM stopped, no in-thread assertion failed, and exactly two
   * heartbeats were attempted.
   */
  @Test(timeout = 200000)
  public void testNodeStatusUpdaterRetryAndNMShutdown() throws Exception {
    // Despite the historical "Secs" wording in the assertion message below,
    // this value is passed to a *_MAX_WAIT_MS setting.
    final long connectionWaitMs = 1000;
    final long connectionRetryIntervalMs = 1000;
    int port = ServerSocketUtil.getPort(49156, 10);
    YarnConfiguration conf = createNMConfig(port);
    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
        connectionWaitMs);
    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
        connectionRetryIntervalMs);
    conf.setLong(YarnConfiguration.NM_SLEEP_DELAY_BEFORE_SIGKILL_MS, 5000);
    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
    CyclicBarrier syncBarrier = new CyclicBarrier(2);
    nm = new MyNodeManager2(syncBarrier, conf);
    nm.init(conf);
    nm.start();
    // start a container
    ContainerId cId = TestNodeManagerShutdown.createContainerId();
    FileContext localFS = FileContext.getLocalFSFileContext();
    TestNodeManagerShutdown.startContainer(nm, cId, localFS, nmLocalDir,
        new File("start_file.txt"), port);

    try {
      // Wait until we start stopping
      syncBarrier.await(10000, TimeUnit.MILLISECONDS);
      // Wait until we finish stopping
      syncBarrier.await(10000, TimeUnit.MILLISECONDS);
    } catch (Exception ignored) {
      // Deliberately best effort: the assertions below decide the outcome of
      // the test whether or not the barrier waits completed.
    }
    Assert.assertFalse("Containers not cleaned up when NM stopped",
        assertionFailedInThread.get());
    Assert.assertTrue(((MyNodeManager2) nm).isStopped);
    Assert.assertTrue("calculate heartBeatCount based on"
        + " connectionWaitSecs and RetryIntervalSecs", heartBeatID == 2);
  }

  /**
   * Checks that the NM reaches STARTED when the tracker reports RM version
   * "3.0.0" and the configured minimum RM version is also "3.0.0".
   */
  @Test
  public void testRMVersionLessThanMinimum() throws InterruptedException, IOException {
    final AtomicInteger numCleanups = new AtomicInteger(0);
    YarnConfiguration conf = createNMConfig();
    conf.set(YarnConfiguration.NM_RESOURCEMANAGER_MINIMUM_VERSION, "3.0.0");
    nm = new NodeManager() {
      @Override
      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        MyNodeStatusUpdater myNodeStatusUpdater =
            new MyNodeStatusUpdater(context, dispatcher, healthChecker, metrics);
        MyResourceTracker2 myResourceTracker2 = new MyResourceTracker2();
        myResourceTracker2.heartBeatNodeAction = NodeAction.NORMAL;
        myResourceTracker2.rmVersion = "3.0.0";
        myNodeStatusUpdater.resourceTracker = myResourceTracker2;
        return myNodeStatusUpdater;
      }

      @Override
      protected ContainerManagerImpl createContainerManager(Context context,
          ContainerExecutor exec, DeletionService del,
          NodeStatusUpdater nodeStatusUpdater, ApplicationACLsManager aclsManager,
          LocalDirsHandlerService dirsHandler) {
        return new ContainerManagerImpl(context, exec, del, nodeStatusUpdater,
            metrics, dirsHandler) {
          @Override
          public void cleanUpApplicationsOnNMShutDown() {
            super.cleanUpApplicationsOnNMShutDown();
            numCleanups.incrementAndGet();
          }
        };
      }
    };

    nm.init(conf);
    nm.start();

    // NM takes a while to reach the STARTED state.
    int waitCount = 0;
    while (nm.getServiceState() != STATE.STARTED && waitCount++ != 20) {
      LOG.info("Waiting for NM to start..");
      Thread.sleep(1000);
    }
    Assert.assertTrue(nm.getServiceState() == STATE.STARTED);
    nm.stop();
  }

  //Verify that signalContainer request can be dispatched from
  //NodeStatusUpdaterImpl to ContainerManagerImpl.
  @Test
  public void testSignalContainerToContainerManager() throws Exception {
    nm = new NodeManager() {
      @Override
      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        return new MyNodeStatusUpdater(context, dispatcher, healthChecker,
            metrics, true);
      }

      @Override
      protected ContainerManagerImpl createContainerManager(Context context,
          ContainerExecutor exec, DeletionService del,
          NodeStatusUpdater nodeStatusUpdater, ApplicationACLsManager aclsManager,
          LocalDirsHandlerService diskhandler) {
        return new MyContainerManager(context, exec, del, nodeStatusUpdater,
            metrics, diskhandler);
      }
    };

    YarnConfiguration conf = createNMConfig();
    nm.init(conf);
    nm.start();

    System.out.println(" ----- thread already started.." + nm.getServiceState());

    int waitCount = 0;
    while (nm.getServiceState() == STATE.INITED && waitCount++ != 20) {
      LOG.info("Waiting for NM to start..");
      Throwable startError = nmStartError;
      if (startError != null) {
        LOG.error("Error during startup. ", startError);
        // Fall back to the error itself when it has no cause, so the real
        // failure is reported instead of a NullPointerException.
        Throwable cause = startError.getCause();
        Assert.fail((cause != null ? cause : startError).getMessage());
      }
      Thread.sleep(1000);
    }
    if (nm.getServiceState() != STATE.STARTED) {
      // NM could have failed.
      Assert.fail("NodeManager failed to start");
    }

    waitCount = 0;
    while (heartBeatID <= 3 && waitCount++ != 20) {
      Thread.sleep(500);
    }
    Assert.assertFalse(heartBeatID <= 3);
    Assert.assertEquals("Number of registered NMs is wrong!!", 1,
        this.registeredNodes.size());

    MyContainerManager containerManager =
        (MyContainerManager) nm.getContainerManager();
    Assert.assertTrue(containerManager.signaled);
    nm.stop();
  }

  /**
   * Runs ten threads that each repeatedly build a heartbeat response sharing
   * the same credentials map and convert it to its proto form; any throwable
   * raised in a worker, or a two-second timeout, fails the test.
   */
  @Test
  public void testConcurrentAccessToSystemCredentials() {
    final Map<ApplicationId, ByteBuffer> testCredentials = new HashMap<>();
    ByteBuffer byteBuffer = ByteBuffer.wrap(new byte[300]);
    ApplicationId applicationId = ApplicationId.newInstance(123456, 120);
    testCredentials.put(applicationId, byteBuffer);

    final List<Throwable> exceptions =
        Collections.synchronizedList(new ArrayList<Throwable>());

    final int NUM_THREADS = 10;
    final CountDownLatch allDone = new CountDownLatch(NUM_THREADS);
    final ExecutorService threadPool = Executors.newFixedThreadPool(NUM_THREADS);

    final AtomicBoolean stop = new AtomicBoolean(false);

    try {
      for (int i = 0; i < NUM_THREADS; i++) {
        threadPool.submit(new Runnable() {
          @Override
          public void run() {
            try {
              for (int i = 0; i < 100 && !stop.get(); i++) {
                NodeHeartbeatResponse nodeHeartBeatResponse =
                    newNodeHeartbeatResponse(0, NodeAction.NORMAL, null, null,
                        null, null, 0);
                nodeHeartBeatResponse.setSystemCredentialsForApps(testCredentials);
                NodeHeartbeatResponseProto proto =
                    ((NodeHeartbeatResponsePBImpl) nodeHeartBeatResponse).getProto();
                Assert.assertNotNull(proto);
              }
            } catch (Throwable t) {
              exceptions.add(t);
              stop.set(true);
            } finally {
              allDone.countDown();
            }
          }
        });
      }

      int testTimeout = 2;
      Assert.assertTrue("Timeout waiting for more than " + testTimeout + " "
          + "seconds", allDone.await(testTimeout, TimeUnit.SECONDS));
    } catch (InterruptedException ie) {
      // Restore the interrupt flag so callers can still observe it.
      Thread.currentThread().interrupt();
      exceptions.add(ie);
    } finally {
      threadPool.shutdownNow();
    }
    Assert.assertTrue("Test failed with exception(s)" + exceptions,
        exceptions.isEmpty());
  }

  // Add new containers info into NM context each time node heart beats.
  private class MyNMContext extends NMContext {

    public MyNMContext(NMContainerTokenSecretManager containerTokenSecretManager,
        NMTokenSecretManagerInNM nmTokenSecretManager) {
      super(containerTokenSecretManager, nmTokenSecretManager, null, null,
          new NMNullStateStoreService());
    }

    /**
     * Returns the container map, adding mock containers 2/3 at heartbeat 1 and
     * 4/5 at heartbeat 2, leaving it untouched at heartbeats 0, 3 and 4, and
     * clearing it for any other heartbeat id.
     */
    @Override
    public ConcurrentMap<ContainerId, Container> getContainers() {
      if (heartBeatID == 0) {
        return containers;
      } else if (heartBeatID == 1) {
        ContainerStatus containerStatus2 =
            createContainerStatus(2, ContainerState.RUNNING);
        putMockContainer(containerStatus2);

        ContainerStatus containerStatus3 =
            createContainerStatus(3, ContainerState.COMPLETE);
        putMockContainer(containerStatus3);
        return containers;
      } else if (heartBeatID == 2) {
        ContainerStatus containerStatus4 =
            createContainerStatus(4, ContainerState.RUNNING);
        putMockContainer(containerStatus4);

        ContainerStatus containerStatus5 =
            createContainerStatus(5, ContainerState.COMPLETE);
        putMockContainer(containerStatus5);
        return containers;
      } else if (heartBeatID == 3 || heartBeatID == 4) {
        return containers;
      } else {
        containers.clear();
        return containers;
      }
    }

    /** Registers a mock container for the status and ensures its app exists. */
    private void putMockContainer(ContainerStatus containerStatus) {
      Container container = getMockContainer(containerStatus);
      containers.put(containerStatus.getContainerId(), container);
      applications.putIfAbsent(
          containerStatus.getContainerId().getApplicationAttemptId()
              .getApplicationId(),
          mock(Application.class));
    }
  }

  /**
   * Builds a container status for container {@code id} of attempt 1 of
   * application (0, 1), in the given state, with a 1024/1 resource.
   */
  public static ContainerStatus createContainerStatus(int id,
      ContainerState containerState) {
    ApplicationId applicationId = ApplicationId.newInstance(0, 1);
    ApplicationAttemptId applicationAttemptId =
        ApplicationAttemptId.newInstance(applicationId, 1);
    ContainerId containerId = ContainerId.newContainerId(applicationAttemptId, id);
    ContainerStatus containerStatus = BuilderUtils.newContainerStatus(containerId,
        containerState,
        "test_containerStatus: id=" + id + ", containerState: " + containerState,
        0, Resource.newInstance(1024, 1));
    return containerStatus;
  }

  /**
   * Creates a mocked {@code ContainerImpl} that reports the given status, its
   * state and container id; COMPLETE maps to the NM-side DONE state and
   * RUNNING to the NM-side RUNNING state.
   */
  public static Container getMockContainer(ContainerStatus containerStatus) {
    ContainerImpl container = mock(ContainerImpl.class);
    when(container.cloneAndGetContainerStatus()).thenReturn(containerStatus);
    when(container.getCurrentState()).thenReturn(containerStatus.getState());
    when(container.getContainerId()).thenReturn(containerStatus.getContainerId());
    if (containerStatus.getState().equals(ContainerState.COMPLETE)) {
      when(container.getContainerState()).thenReturn(
          org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.DONE);
    } else if (containerStatus.getState().equals(ContainerState.RUNNING)) {
      when(container.getContainerState()).thenReturn(
          org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.RUNNING);
    }
    return container;
  }

  /**
   * Initializes and starts {@code nm}, expecting start to fail with an
   * exception whose message contains {@code errMessage}; then checks that the
   * NM is STOPPED and that no node registered.
   *
   * @param errMessage text the failure message must contain
   * @throws Exception the start failure itself when its message is missing or
   *     does not contain {@code errMessage}
   */
  private void verifyNodeStartFailure(String errMessage) throws Exception {
    Assert.assertNotNull("nm is null", nm);
    YarnConfiguration conf = createNMConfig();
    nm.init(conf);
    try {
      nm.start();
      Assert.fail("NM should have failed to start. Didn't get exception!!");
    } catch (Exception e) {
      //the version in trunk looked in the cause for equality
      // and assumed failures were nested.
      //this version assumes that error strings propagate to the base and
      //use a contains() test only. It should be less brittle
      String message = e.getMessage();
      // A null message cannot match; rethrow the original failure rather
      // than tripping over a NullPointerException here.
      if (message == null || !message.contains(errMessage)) {
        throw e;
      }
    }

    // the service should be stopped
    Assert.assertEquals("NM state is wrong!", STATE.STOPPED, nm.getServiceState());

    Assert.assertEquals("Number of registered nodes is wrong!", 0,
        this.registeredNodes.size());
  }

  /**
   * Builds an NM configuration bound to the canonical localhost name on the
   * given port, with 5 GB of memory and the test log/local directories.
   *
   * @param port port used for the NM address
   */
  private YarnConfiguration createNMConfig(int port) throws IOException {
    YarnConfiguration conf = new YarnConfiguration();
    String localhostAddress = null;
    try {
      localhostAddress = InetAddress.getByName("localhost").getCanonicalHostName();
    } catch (UnknownHostException e) {
      Assert.fail("Unable to get localhost address: " + e.getMessage());
    }
    conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB
    conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port);
    conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS,
        localhostAddress + ":" + ServerSocketUtil.getPort(49160, 10));
    conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath());
    conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
        remoteLogsDir.getAbsolutePath());
    conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath());
    conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1);
    return conf;
  }

  /** Builds an NM configuration on a port obtained from ServerSocketUtil. */
  private YarnConfiguration createNMConfig() throws IOException {
    return createNMConfig(ServerSocketUtil.getPort(49170, 10));
  }

  /**
   * Creates a NodeManager whose status updater talks to a MyResourceTracker2
   * that answers heartbeats with the given action.
   */
  private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) {
    return new NodeManager() {
      @Override
      protected NodeStatusUpdater createNodeStatusUpdater(Context context,
          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
        MyNodeStatusUpdater myNodeStatusUpdater =
            new MyNodeStatusUpdater(context, dispatcher, healthChecker, metrics);
        MyResourceTracker2 myResourceTracker2 = new MyResourceTracker2();
        myResourceTracker2.heartBeatNodeAction = nodeHeartBeatAction;
        myNodeStatusUpdater.resourceTracker = myResourceTracker2;
        return myNodeStatusUpdater;
      }
    };
  }
}