Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.slider.server.appmaster; import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.health.HealthCheckRegistry; import com.codahale.metrics.jvm.GarbageCollectorMetricSet; import com.codahale.metrics.jvm.MemoryUsageGaugeSet; import com.codahale.metrics.jvm.ThreadStatesGaugeSet; import com.google.common.base.Preconditions; import com.google.protobuf.BlockingService; import org.apache.commons.collections.CollectionUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; import org.apache.hadoop.http.HttpConfig; import org.apache.hadoop.io.Text; import org.apache.hadoop.registry.client.binding.RegistryTypeUtils; import org.apache.hadoop.registry.client.binding.RegistryUtils; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.security.token.TokenIdentifier; import org.apache.hadoop.service.Service; import org.apache.hadoop.service.ServiceOperations; import org.apache.hadoop.service.ServiceStateChangeListener; import org.apache.hadoop.yarn.api.ApplicationConstants; import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse; import org.apache.hadoop.yarn.api.records.ApplicationAccessType; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationAttemptReport; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.Container; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.ContainerStatus; import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; import org.apache.hadoop.yarn.api.records.NodeReport; import org.apache.hadoop.yarn.api.records.NodeState; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.client.api.AMRMClient; import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync; import org.apache.hadoop.yarn.client.api.async.NMClientAsync; import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl; import org.apache.hadoop.yarn.conf.YarnConfiguration; import static org.apache.hadoop.yarn.conf.YarnConfiguration.*; import static org.apache.slider.common.Constants.HADOOP_JAAS_DEBUG; import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.ipc.YarnRPC; import org.apache.hadoop.registry.client.api.RegistryOperations; import org.apache.hadoop.registry.client.binding.RegistryPathUtils; import org.apache.hadoop.registry.client.types.yarn.PersistencePolicies; import org.apache.hadoop.registry.client.types.ServiceRecord; import org.apache.hadoop.registry.client.types.yarn.YarnRegistryAttributes; import org.apache.hadoop.registry.server.integration.RMRegistryOperationsService; import org.apache.hadoop.yarn.security.AMRMTokenIdentifier; import org.apache.hadoop.yarn.security.client.ClientToAMTokenSecretManager; import org.apache.hadoop.yarn.security.client.TimelineDelegationTokenIdentifier; import org.apache.hadoop.yarn.util.ConverterUtils; import org.apache.hadoop.yarn.webapp.WebAppException; import org.apache.hadoop.yarn.webapp.WebApps; import org.apache.hadoop.yarn.webapp.util.WebAppUtils; import org.apache.slider.api.ClusterDescription; import org.apache.slider.api.InternalKeys; import org.apache.slider.api.ResourceKeys; import org.apache.slider.api.RoleKeys; import org.apache.slider.api.StatusKeys; import org.apache.slider.api.proto.SliderClusterAPI; import org.apache.slider.client.SliderYarnClientImpl; import org.apache.slider.common.SliderExitCodes; import org.apache.slider.common.SliderKeys; import org.apache.slider.common.params.AbstractActionArgs; import org.apache.slider.common.params.SliderAMArgs; import org.apache.slider.common.params.SliderAMCreateAction; import org.apache.slider.common.params.SliderActions; import org.apache.slider.common.tools.ConfigHelper; import org.apache.slider.common.tools.PortScanner; import org.apache.slider.common.tools.SliderFileSystem; import org.apache.slider.common.tools.SliderUtils; import org.apache.slider.common.tools.SliderVersionInfo; import org.apache.slider.core.build.InstanceIO; import org.apache.slider.core.conf.AggregateConf; import org.apache.slider.core.conf.ConfTree; import org.apache.slider.core.conf.ConfTreeOperations; import org.apache.slider.core.conf.MapOperations; import org.apache.slider.core.exceptions.BadConfigException; import org.apache.slider.core.exceptions.SliderException; import org.apache.slider.core.exceptions.SliderInternalStateException; import org.apache.slider.core.exceptions.TriggerClusterTeardownException; import org.apache.slider.core.launch.CredentialUtils; import org.apache.slider.core.main.ExitCodeProvider; import org.apache.slider.core.main.LauncherExitCodes; import org.apache.slider.core.main.RunService; import org.apache.slider.core.main.ServiceLauncher; import org.apache.slider.core.registry.info.CustomRegistryConstants; import org.apache.slider.providers.ProviderCompleted; import org.apache.slider.providers.ProviderRole; import org.apache.slider.providers.ProviderService; import org.apache.slider.providers.SliderProviderFactory; import org.apache.slider.providers.agent.AgentKeys; import org.apache.slider.providers.agent.AgentProviderService; import org.apache.slider.providers.slideram.SliderAMClientProvider; import org.apache.slider.providers.slideram.SliderAMProviderService; import org.apache.slider.server.appmaster.actions.ActionRegisterServiceInstance; import org.apache.slider.server.appmaster.actions.EscalateOutstandingRequests; import org.apache.slider.server.appmaster.actions.RegisterComponentInstance; import org.apache.slider.server.appmaster.actions.QueueExecutor; import org.apache.slider.server.appmaster.actions.QueueService; import org.apache.slider.server.appmaster.actions.ActionStopSlider; import org.apache.slider.server.appmaster.actions.ActionUpgradeContainers; import org.apache.slider.server.appmaster.actions.AsyncAction; import org.apache.slider.server.appmaster.actions.RenewingAction; import org.apache.slider.server.appmaster.actions.ResetFailureWindow; import org.apache.slider.server.appmaster.actions.ReviewAndFlexApplicationSize; import org.apache.slider.server.appmaster.actions.UnregisterComponentInstance; import org.apache.slider.server.appmaster.management.MetricsAndMonitoring; import org.apache.slider.server.appmaster.management.YarnServiceHealthCheck; import org.apache.slider.server.appmaster.monkey.ChaosKillAM; import org.apache.slider.server.appmaster.monkey.ChaosKillContainer; import org.apache.slider.server.appmaster.monkey.ChaosMonkeyService; import org.apache.slider.server.appmaster.operations.AsyncRMOperationHandler; import org.apache.slider.server.appmaster.operations.ProviderNotifyingOperationHandler; import org.apache.slider.server.appmaster.rpc.RpcBinder; import org.apache.slider.server.appmaster.rpc.SliderAMPolicyProvider; import org.apache.slider.server.appmaster.rpc.SliderClusterProtocolPBImpl; import org.apache.slider.server.appmaster.operations.AbstractRMOperation; import org.apache.slider.server.appmaster.rpc.SliderIPCService; import org.apache.slider.server.appmaster.security.SecurityConfiguration; import org.apache.slider.server.appmaster.state.AppState; import org.apache.slider.server.appmaster.state.AppStateBindingInfo; import org.apache.slider.server.appmaster.state.ContainerAssignment; import org.apache.slider.server.appmaster.state.ProviderAppState; import org.apache.slider.server.appmaster.operations.RMOperationHandler; import org.apache.slider.server.appmaster.state.RoleInstance; import org.apache.slider.server.appmaster.web.AgentService; import org.apache.slider.server.appmaster.web.rest.InsecureAmFilterInitializer; import org.apache.slider.server.appmaster.web.rest.agent.AgentWebApp; import org.apache.slider.server.appmaster.web.SliderAMWebApp; import org.apache.slider.server.appmaster.web.WebAppApi; import org.apache.slider.server.appmaster.web.WebAppApiImpl; import org.apache.slider.server.appmaster.web.rest.RestPaths; import org.apache.slider.server.appmaster.web.rest.application.ApplicationResouceContentCacheFactory; import org.apache.slider.server.appmaster.web.rest.application.resources.ContentCache; import org.apache.slider.server.services.security.CertificateManager; import org.apache.slider.server.services.utility.AbstractSliderLaunchedService; import org.apache.slider.server.services.utility.WebAppService; import org.apache.slider.server.services.workflow.ServiceThreadFactory; import org.apache.slider.server.services.workflow.WorkflowExecutorService; import org.apache.slider.server.services.workflow.WorkflowRpcService; import org.apache.slider.server.services.yarnregistry.YarnRegistryViewForProviders; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; import java.net.URI; import java.net.URL; import java.net.URLClassLoader; import java.nio.ByteBuffer; import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; /** * This is the AM, which directly implements the callbacks from the AM and NM */ public class SliderAppMaster extends AbstractSliderLaunchedService implements AMRMClientAsync.CallbackHandler, NMClientAsync.CallbackHandler, RunService, SliderExitCodes, SliderKeys, ServiceStateChangeListener, RoleKeys, ProviderCompleted, AppMasterActionOperations { protected static final Logger log = LoggerFactory.getLogger(SliderAppMaster.class); /** * log for YARN events */ protected static final Logger LOG_YARN = log; public static final String SERVICE_CLASSNAME_SHORT = "SliderAppMaster"; public static final String SERVICE_CLASSNAME = "org.apache.slider.server.appmaster." + SERVICE_CLASSNAME_SHORT; public static final int HEARTBEAT_INTERVAL = 1000; public static final int NUM_RPC_HANDLERS = 5; /** * Metrics and monitoring services. * Deployed in {@link #serviceInit(Configuration)} */ private final MetricsAndMonitoring metricsAndMonitoring = new MetricsAndMonitoring(); /** * metrics registry */ public MetricRegistry metrics; /** Error string on chaos monkey launch failure action: {@value} */ public static final String E_TRIGGERED_LAUNCH_FAILURE = "Chaos monkey triggered launch failure"; /** YARN RPC to communicate with the Resource Manager or Node Manager */ private YarnRPC yarnRPC; /** Handle to communicate with the Resource Manager*/ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private AMRMClientAsync asyncRMClient; @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private RMOperationHandler rmOperationHandler; private RMOperationHandler providerRMOperationHandler; /** Handle to communicate with the Node Manager*/ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") public NMClientAsync nmClientAsync; /** * Credentials for propagating down to launched containers */ private Credentials containerCredentials; /** * Slider IPC: Real service handler */ private SliderIPCService sliderIPCService; /** * Slider IPC: binding */ private WorkflowRpcService rpcService; /** * Secret manager */ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private ClientToAMTokenSecretManager secretManager; /** Hostname of the container*/ private String appMasterHostname = ""; /* Port on which the app master listens for status updates from clients*/ private int appMasterRpcPort = 0; /** Tracking url to which app master publishes info for clients to monitor*/ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private String appMasterTrackingUrl = ""; /** Proxied app master URL (as retrieved from AM report at launch time) */ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private String appMasterProxiedUrl = ""; /** Application Attempt Id ( combination of attemptId and fail count )*/ private ApplicationAttemptId appAttemptID; /** * App ACLs */ protected Map<ApplicationAccessType, String> applicationACLs; /** * Ongoing state of the cluster: containers, nodes they * live on, etc. */ private final AppState appState = new AppState(new ProtobufClusterServices(), metricsAndMonitoring); /** * App state for external objects. This is almost entirely * a read-only view of the application state. To change the state, * Providers (or anything else) are expected to queue async changes. */ private final ProviderAppState stateForProviders = new ProviderAppState("undefined", appState); /** * model the state using locks and conditions */ private final ReentrantLock AMExecutionStateLock = new ReentrantLock(); private final Condition isAMCompleted = AMExecutionStateLock.newCondition(); /** * Flag set if the AM is to be shutdown */ private final AtomicBoolean amCompletionFlag = new AtomicBoolean(false); /** * Flag set during the init process */ private final AtomicBoolean initCompleted = new AtomicBoolean(false); /** * Flag to set if the process exit code was set before shutdown started */ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private boolean spawnedProcessExitedBeforeShutdownTriggered; /** Arguments passed in : raw*/ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private SliderAMArgs serviceArgs; /** * ID of the AM container */ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private ContainerId appMasterContainerID; /** * Monkey Service -may be null */ private ChaosMonkeyService monkey; /** * ProviderService of this cluster */ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private ProviderService providerService; /** * The YARN registry service */ @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private RegistryOperations registryOperations; /** * The stop request received...the exit details are extracted * from this */ private volatile ActionStopSlider stopAction; @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private RoleLaunchService launchService; //username -null if it is not known/not to be set @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private String hadoop_user_name; private String service_user_name; private SliderAMWebApp webApp; @SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized") private InetSocketAddress rpcServiceAddress; private SliderAMProviderService sliderAMProvider; private CertificateManager certificateManager; /** * Executor. * Assigned in {@link #serviceInit(Configuration)} */ private WorkflowExecutorService<ExecutorService> executorService; /** * Action queues. Created at instance creation, but * added as a child and inited in {@link #serviceInit(Configuration)} */ private final QueueService actionQueues = new QueueService(); private String agentOpsUrl; private String agentStatusUrl; private YarnRegistryViewForProviders yarnRegistryOperations; //private FsDelegationTokenManager fsDelegationTokenManager; private RegisterApplicationMasterResponse amRegistrationData; private PortScanner portScanner; private SecurityConfiguration securityConfiguration; /** * Is security enabled? * Set early on in the {@link #createAndRunCluster(String)} operation. */ private boolean securityEnabled; private ContentCache contentCache; /** * resource limits */ private Resource maximumResourceCapability; /** * Service Constructor */ public SliderAppMaster() { super(SERVICE_CLASSNAME_SHORT); new HdfsConfiguration(); new YarnConfiguration(); } /* =================================================================== */ /* service lifecycle methods */ /* =================================================================== */ @Override //AbstractService public synchronized void serviceInit(Configuration conf) throws Exception { // slider client if found Configuration customConf = SliderUtils.loadSliderClientXML(); // Load in the server configuration - if it is actually on the Classpath URL serverXmlUrl = ConfigHelper.getResourceUrl(SLIDER_SERVER_XML); if (serverXmlUrl != null) { log.info("Loading {} at {}", SLIDER_SERVER_XML, serverXmlUrl); Configuration serverConf = ConfigHelper.loadFromResource(SLIDER_SERVER_XML); ConfigHelper.mergeConfigurations(customConf, serverConf, SLIDER_SERVER_XML, true); } serviceArgs.applyDefinitions(customConf); serviceArgs.applyFileSystemBinding(customConf); // conf now contains all customizations AbstractActionArgs action = serviceArgs.getCoreAction(); SliderAMCreateAction createAction = (SliderAMCreateAction) action; // sort out the location of the AM String rmAddress = createAction.getRmAddress(); if (rmAddress != null) { log.debug("Setting RM address from the command line: {}", rmAddress); SliderUtils.setRmSchedulerAddress(customConf, rmAddress); } log.info("AM configuration:\n{}", ConfigHelper.dumpConfigToString(customConf)); for (Map.Entry<String, String> envs : System.getenv().entrySet()) { log.info("System env {}={}", envs.getKey(), envs.getValue()); } ConfigHelper.mergeConfigurations(conf, customConf, SLIDER_CLIENT_XML, true); //init security with our conf if (SliderUtils.isHadoopClusterSecure(conf)) { log.info("Secure mode with kerberos realm {}", SliderUtils.getKerberosRealm()); UserGroupInformation.setConfiguration(conf); UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); log.debug("Authenticating as {}", ugi); SliderUtils.verifyPrincipalSet(conf, DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY); } else { log.info("Cluster is insecure"); } log.info("Login user is {}", UserGroupInformation.getLoginUser()); //look at settings of Hadoop Auth, to pick up a problem seen once checkAndWarnForAuthTokenProblems(); // validate server env boolean dependencyChecks = !conf.getBoolean(KEY_SLIDER_AM_DEPENDENCY_CHECKS_DISABLED, false); SliderUtils.validateSliderServerEnvironment(log, dependencyChecks); // create and register monitoring services addService(metricsAndMonitoring); metrics = metricsAndMonitoring.getMetrics(); /* TODO: turn these one once the metrics testing is more under control metrics.registerAll(new ThreadStatesGaugeSet()); metrics.registerAll(new MemoryUsageGaugeSet()); metrics.registerAll(new GarbageCollectorMetricSet()); */ contentCache = ApplicationResouceContentCacheFactory.createContentCache(stateForProviders); executorService = new WorkflowExecutorService<>("AmExecutor", Executors.newFixedThreadPool(2, new ServiceThreadFactory("AmExecutor", true))); addService(executorService); addService(actionQueues); //init all child services super.serviceInit(conf); } @Override protected void serviceStart() throws Exception { super.serviceStart(); HealthCheckRegistry health = metricsAndMonitoring.getHealth(); health.register("AM Health", new YarnServiceHealthCheck(this)); } /** * Start the queue processing */ private void startQueueProcessing() { log.info("Queue Processing started"); executorService.execute(actionQueues); executorService.execute(new QueueExecutor(this, actionQueues)); } /* =================================================================== */ /* RunService methods called from ServiceLauncher */ /* =================================================================== */ /** * pick up the args from the service launcher * @param config configuration * @param args argument list */ @Override // RunService public Configuration bindArgs(Configuration config, String... args) throws Exception { // let the superclass process it Configuration superConf = super.bindArgs(config, args); // add the slider XML config ConfigHelper.injectSliderXMLResource(); //yarn-ify YarnConfiguration yarnConfiguration = new YarnConfiguration(superConf); serviceArgs = new SliderAMArgs(args); serviceArgs.parse(); return SliderUtils.patchConfiguration(yarnConfiguration); } /** * this is called by service launcher; when it returns the application finishes * @return the exit code to return by the app * @throws Throwable */ @Override public int runService() throws Throwable { SliderVersionInfo.loadAndPrintVersionInfo(log); //dump the system properties if in debug mode if (log.isDebugEnabled()) { log.debug("System properties:\n" + SliderUtils.propertiesToString(System.getProperties())); } //choose the action String action = serviceArgs.getAction(); List<String> actionArgs = serviceArgs.getActionArgs(); int exitCode; switch (action) { case SliderActions.ACTION_HELP: log.info("{}: {}", getName(), serviceArgs.usage()); exitCode = SliderExitCodes.EXIT_USAGE; break; case SliderActions.ACTION_CREATE: exitCode = createAndRunCluster(actionArgs.get(0)); break; default: throw new SliderException("Unimplemented: " + action); } log.info("Exiting AM; final exit code = {}", exitCode); return exitCode; } /** * Initialize a newly created service then add it. * Because the service is not started, this MUST be done before * the AM itself starts, or it is explicitly added after * @param service the service to init */ public Service initAndAddService(Service service) { service.init(getConfig()); addService(service); return service; } /* =================================================================== */ /** * Create and run the cluster. * @param clustername cluster name * @return exit code * @throws Throwable on a failure */ private int createAndRunCluster(String clustername) throws Throwable { //load the cluster description from the cd argument String sliderClusterDir = serviceArgs.getSliderClusterURI(); URI sliderClusterURI = new URI(sliderClusterDir); Path clusterDirPath = new Path(sliderClusterURI); log.info("Application defined at {}", sliderClusterURI); SliderFileSystem fs = getClusterFS(); // build up information about the running application -this // will be passed down to the cluster status MapOperations appInformation = new MapOperations(); AggregateConf instanceDefinition = InstanceIO.loadInstanceDefinitionUnresolved(fs, clusterDirPath); instanceDefinition.setName(clustername); log.info("Deploying cluster {}:", instanceDefinition); // and resolve it AggregateConf resolvedInstance = new AggregateConf(instanceDefinition); resolvedInstance.resolve(); stateForProviders.setApplicationName(clustername); Configuration serviceConf = getConfig(); // extend AM configuration with component resource MapOperations amConfiguration = resolvedInstance.getAppConfOperations().getComponent(COMPONENT_AM); // and patch configuration with prefix if (amConfiguration != null) { Map<String, String> sliderAppConfKeys = amConfiguration.prefixedWith("slider."); for (Map.Entry<String, String> entry : sliderAppConfKeys.entrySet()) { String k = entry.getKey(); String v = entry.getValue(); boolean exists = serviceConf.get(k) != null; log.info("{} {} to {}", (exists ? "Overwriting" : "Setting"), k, v); serviceConf.set(k, v); } } securityConfiguration = new SecurityConfiguration(serviceConf, resolvedInstance, clustername); // obtain security state securityEnabled = securityConfiguration.isSecurityEnabled(); // set the global security flag for the instance definition instanceDefinition.getAppConfOperations().set(KEY_SECURITY_ENABLED, securityEnabled); // triggers resolution and snapshotting for agent appState.setInitialInstanceDefinition(instanceDefinition); File confDir = getLocalConfDir(); if (!confDir.exists() || !confDir.isDirectory()) { log.info("Conf dir {} does not exist.", confDir); File parentFile = confDir.getParentFile(); log.info("Parent dir {}:\n{}", parentFile, SliderUtils.listDir(parentFile)); } //get our provider MapOperations globalInternalOptions = getGlobalInternalOptions(); String providerType = globalInternalOptions.getMandatoryOption(InternalKeys.INTERNAL_PROVIDER_NAME); log.info("Cluster provider type is {}", providerType); SliderProviderFactory factory = SliderProviderFactory.createSliderProviderFactory(providerType); providerService = factory.createServerProvider(); // init the provider BUT DO NOT START IT YET initAndAddService(providerService); providerRMOperationHandler = new ProviderNotifyingOperationHandler(providerService); // create a slider AM provider sliderAMProvider = new SliderAMProviderService(); initAndAddService(sliderAMProvider); InetSocketAddress rmSchedulerAddress = SliderUtils.getRmSchedulerAddress(serviceConf); log.info("RM is at {}", rmSchedulerAddress); yarnRPC = YarnRPC.create(serviceConf); // set up the YARN client. This may require patching in the RM client-API address if it // is (somehow) unset server-side. String clientRMaddr = serviceConf.get(YarnConfiguration.RM_ADDRESS); InetSocketAddress clientRpcAddress = SliderUtils.getRmAddress(serviceConf); if (!SliderUtils.isAddressDefined(clientRpcAddress)) { // client addr is being unset. We can lift it from the other RM APIs log.warn("Yarn RM address was unbound; attempting to fix up"); serviceConf.set(YarnConfiguration.RM_ADDRESS, String.format("%s:%d", rmSchedulerAddress.getHostString(), clientRpcAddress.getPort())); } /* * Extract the container ID. This is then * turned into an (incomplete) container */ appMasterContainerID = ConverterUtils.toContainerId( SliderUtils.mandatoryEnvVariable(ApplicationConstants.Environment.CONTAINER_ID.name())); appAttemptID = appMasterContainerID.getApplicationAttemptId(); ApplicationId appid = appAttemptID.getApplicationId(); log.info("AM for ID {}", appid.getId()); appInformation.put(StatusKeys.INFO_AM_CONTAINER_ID, appMasterContainerID.toString()); appInformation.put(StatusKeys.INFO_AM_APP_ID, appid.toString()); appInformation.put(StatusKeys.INFO_AM_ATTEMPT_ID, appAttemptID.toString()); Map<String, String> envVars; List<Container> liveContainers; /* * It is critical this section is synchronized, to stop async AM events * arriving while registering a restarting AM. */ synchronized (appState) { int heartbeatInterval = HEARTBEAT_INTERVAL; // add the RM client -this brings the callbacks in asyncRMClient = AMRMClientAsync.createAMRMClientAsync(heartbeatInterval, this); addService(asyncRMClient); //now bring it up deployChildService(asyncRMClient); // nmclient relays callbacks back to this class nmClientAsync = new NMClientAsyncImpl("nmclient", this); deployChildService(nmClientAsync); // set up secret manager secretManager = new ClientToAMTokenSecretManager(appAttemptID, null); if (securityEnabled) { // fix up the ACLs if they are not set String acls = serviceConf.get(KEY_PROTOCOL_ACL); if (acls == null) { getConfig().set(KEY_PROTOCOL_ACL, "*"); } } certificateManager = new CertificateManager(); //bring up the Slider RPC service buildPortScanner(instanceDefinition); startSliderRPCServer(instanceDefinition); rpcServiceAddress = rpcService.getConnectAddress(); appMasterHostname = rpcServiceAddress.getAddress().getCanonicalHostName(); appMasterRpcPort = rpcServiceAddress.getPort(); appMasterTrackingUrl = null; log.info("AM Server is listening at {}:{}", appMasterHostname, appMasterRpcPort); appInformation.put(StatusKeys.INFO_AM_HOSTNAME, appMasterHostname); appInformation.set(StatusKeys.INFO_AM_RPC_PORT, appMasterRpcPort); log.info("Starting Yarn registry"); registryOperations = startRegistryOperationsService(); log.info(registryOperations.toString()); //build the role map List<ProviderRole> providerRoles = new ArrayList<>(providerService.getRoles()); providerRoles.addAll(SliderAMClientProvider.ROLES); // Start up the WebApp and track the URL for it MapOperations component = instanceDefinition.getAppConfOperations() .getComponent(SliderKeys.COMPONENT_AM); certificateManager.initialize(component, appMasterHostname, appMasterContainerID.toString(), clustername); certificateManager.setPassphrase(instanceDefinition.getPassphrase()); if (component.getOptionBool(AgentKeys.KEY_AGENT_TWO_WAY_SSL_ENABLED, false)) { uploadServerCertForLocalization(clustername, fs); } // Web service endpoints: initialize WebAppApiImpl webAppApi = new WebAppApiImpl(stateForProviders, providerService, certificateManager, registryOperations, metricsAndMonitoring, actionQueues, this, contentCache); initAMFilterOptions(serviceConf); // start the agent web app startAgentWebApp(appInformation, serviceConf, webAppApi); int webAppPort = deployWebApplication(webAppApi); String scheme = WebAppUtils.HTTP_PREFIX; appMasterTrackingUrl = scheme + appMasterHostname + ":" + webAppPort; appInformation.put(StatusKeys.INFO_AM_WEB_URL, appMasterTrackingUrl + "/"); appInformation.set(StatusKeys.INFO_AM_WEB_PORT, webAppPort); // ***************************************************** // Register self with ResourceManager // This will start heartbeating to the RM // address = SliderUtils.getRmSchedulerAddress(asyncRMClient.getConfig()); // ***************************************************** log.info("Connecting to RM at {}; AM tracking URL={}", appMasterRpcPort, appMasterTrackingUrl); amRegistrationData = asyncRMClient.registerApplicationMaster(appMasterHostname, appMasterRpcPort, appMasterTrackingUrl); maximumResourceCapability = amRegistrationData.getMaximumResourceCapability(); int minMemory = serviceConf.getInt(RM_SCHEDULER_MINIMUM_ALLOCATION_MB, DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_MB); // validate scheduler vcores allocation setting int minCores = serviceConf.getInt(RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES, DEFAULT_RM_SCHEDULER_MINIMUM_ALLOCATION_VCORES); int maxMemory = maximumResourceCapability.getMemory(); int maxCores = maximumResourceCapability.getVirtualCores(); appState.setContainerLimits(minMemory, maxMemory, minCores, maxCores); // build the handler for RM request/release operations; this uses // the max value as part of its lookup rmOperationHandler = new AsyncRMOperationHandler(asyncRMClient, maximumResourceCapability); // set the RM-defined maximum cluster values appInformation.put(ResourceKeys.YARN_CORES, Integer.toString(maxCores)); appInformation.put(ResourceKeys.YARN_MEMORY, Integer.toString(maxMemory)); processAMCredentials(securityConfiguration); if (securityEnabled) { secretManager.setMasterKey(amRegistrationData.getClientToAMTokenMasterKey().array()); applicationACLs = amRegistrationData.getApplicationACLs(); //tell the server what the ACLs are rpcService.getServer().refreshServiceAcl(serviceConf, new SliderAMPolicyProvider()); if (securityConfiguration.isKeytabProvided()) { // perform keytab based login to establish kerberos authenticated // principal. Can do so now since AM registration with RM above required // tokens associated to principal String principal = securityConfiguration.getPrincipal(); File localKeytabFile = securityConfiguration.getKeytabFile(instanceDefinition); // Now log in... login(principal, localKeytabFile); // obtain new FS reference that should be kerberos based and different // than the previously cached reference fs = new SliderFileSystem(serviceConf); } } // YARN client. // Important: this is only valid at startup, and must be executed within // the right UGI context. Use with care. SliderYarnClientImpl yarnClient = null; List<NodeReport> nodeReports; try { yarnClient = new SliderYarnClientImpl(); yarnClient.init(getConfig()); yarnClient.start(); nodeReports = getNodeReports(yarnClient); log.info("Yarn node report count: {}", nodeReports.size()); // look up the application itself -this is needed to get the proxied // URL of the AM, for registering endpoints. // this call must be made after the AM has registered itself, obviously ApplicationAttemptReport report = getApplicationAttemptReport(yarnClient); appMasterProxiedUrl = report.getTrackingUrl(); if (SliderUtils.isUnset(appMasterProxiedUrl)) { log.warn("Proxied URL is not set in application report"); appMasterProxiedUrl = appMasterTrackingUrl; } } finally { // at this point yarnClient is no longer needed. // stop it immediately ServiceOperations.stop(yarnClient); yarnClient = null; } // extract container list liveContainers = amRegistrationData.getContainersFromPreviousAttempts(); //now validate the installation Configuration providerConf = providerService.loadProviderConfigurationInformation(confDir); providerService.initializeApplicationConfiguration(instanceDefinition, fs); providerService.validateApplicationConfiguration(instanceDefinition, confDir, securityEnabled); //determine the location for the role history data Path historyDir = new Path(clusterDirPath, HISTORY_DIR_NAME); //build the instance AppStateBindingInfo binding = new AppStateBindingInfo(); binding.instanceDefinition = instanceDefinition; binding.serviceConfig = serviceConf; binding.publishedProviderConf = providerConf; binding.roles = providerRoles; binding.fs = fs.getFileSystem(); binding.historyPath = historyDir; binding.liveContainers = liveContainers; binding.applicationInfo = appInformation; binding.releaseSelector = providerService.createContainerReleaseSelector(); binding.nodeReports = nodeReports; appState.buildInstance(binding); providerService.rebuildContainerDetails(liveContainers, instanceDefinition.getName(), appState.getRolePriorityMap()); // add the AM to the list of nodes in the cluster appState.buildAppMasterNode(appMasterContainerID, appMasterHostname, webAppPort, appMasterHostname + ":" + webAppPort); // build up environment variables that the AM wants set in every container // irrespective of provider and role. envVars = new HashMap<>(); if (hadoop_user_name != null) { envVars.put(HADOOP_USER_NAME, hadoop_user_name); } String debug_kerberos = System.getenv(HADOOP_JAAS_DEBUG); if (debug_kerberos != null) { envVars.put(HADOOP_JAAS_DEBUG, debug_kerberos); } } String rolesTmpSubdir = appMasterContainerID.toString() + "/roles"; String amTmpDir = globalInternalOptions.getMandatoryOption(InternalKeys.INTERNAL_AM_TMP_DIR); Path tmpDirPath = new Path(amTmpDir); Path launcherTmpDirPath = new Path(tmpDirPath, rolesTmpSubdir); fs.getFileSystem().mkdirs(launcherTmpDirPath); //launcher service launchService = new RoleLaunchService(actionQueues, providerService, fs, new Path(getGeneratedConfDir()), envVars, launcherTmpDirPath); deployChildService(launchService); appState.noteAMLaunched(); //Give the provider access to the state, and AM providerService.bind(stateForProviders, actionQueues, liveContainers); sliderAMProvider.bind(stateForProviders, actionQueues, liveContainers); // chaos monkey maybeStartMonkey(); // setup token renewal and expiry handling for long lived apps // if (!securityConfiguration.isKeytabProvided() && // SliderUtils.isHadoopClusterSecure(getConfig())) { // fsDelegationTokenManager = new FsDelegationTokenManager(actionQueues); // fsDelegationTokenManager.acquireDelegationToken(getConfig()); // } // if not a secure cluster, extract the username -it will be // propagated to workers if (!UserGroupInformation.isSecurityEnabled()) { hadoop_user_name = System.getenv(HADOOP_USER_NAME); log.info(HADOOP_USER_NAME + "='{}'", hadoop_user_name); } service_user_name = RegistryUtils.currentUser(); log.info("Registry service username ={}", service_user_name); // declare the cluster initialized log.info("Application Master Initialization Completed"); initCompleted.set(true); scheduleFailureWindowResets(instanceDefinition.getResources()); scheduleEscalation(instanceDefinition.getInternal()); try { // schedule YARN Registry registration queue(new ActionRegisterServiceInstance(clustername, appid)); // log the YARN and web UIs log.info("RM Webapp address {}", serviceConf.get(YarnConfiguration.RM_WEBAPP_ADDRESS)); log.info("Slider webapp address {} proxied at {}", appMasterTrackingUrl, appMasterProxiedUrl); // Start the Slider AM provider sliderAMProvider.start(); // launch the real provider; this is expected to trigger a callback that // starts the node review process launchProviderService(instanceDefinition, confDir); // start handling any scheduled events startQueueProcessing(); //now block waiting to be told to exit the process waitForAMCompletionSignal(); } catch (Exception e) { log.error("Exception : {}", e, e); // call the AM stop command as if it had been queued (but without // going via the queue, which may not have started onAMStop(new ActionStopSlider(e)); } //shutdown time return finish(); } /** * Get the YARN application Attempt report as the logged in user * @param yarnClient client to the RM * @return the application report * @throws YarnException * @throws IOException * @throws InterruptedException */ private ApplicationAttemptReport getApplicationAttemptReport(final SliderYarnClientImpl yarnClient) throws YarnException, IOException, InterruptedException { Preconditions.checkNotNull(yarnClient, "Null Yarn client"); ApplicationAttemptReport report; if (securityEnabled) { UserGroupInformation ugi = UserGroupInformation.getLoginUser(); report = ugi.doAs(new PrivilegedExceptionAction<ApplicationAttemptReport>() { @Override public ApplicationAttemptReport run() throws Exception { return yarnClient.getApplicationAttemptReport(appAttemptID); } }); } else { report = yarnClient.getApplicationAttemptReport(appAttemptID); } return report; } /** * List the node reports: uses {@link SliderYarnClientImpl} as the login user * @param yarnClient client to the RM * @return the node reports * @throws IOException * @throws YarnException * @throws InterruptedException */ private List<NodeReport> getNodeReports(final SliderYarnClientImpl yarnClient) throws IOException, YarnException, InterruptedException { Preconditions.checkNotNull(yarnClient, "Null Yarn client"); List<NodeReport> nodeReports; if (securityEnabled) { nodeReports = UserGroupInformation.getLoginUser() .doAs(new PrivilegedExceptionAction<List<NodeReport>>() { @Override public List<NodeReport> run() throws Exception { return yarnClient.getNodeReports(NodeState.RUNNING); } }); } else { nodeReports = yarnClient.getNodeReports(NodeState.RUNNING); } log.info("Yarn node report count: {}", nodeReports.size()); return nodeReports; } /** * Deploy the web application. * <p> * Creates and starts the web application, and adds a * <code>WebAppService</code> service under the AM, to ensure * a managed web application shutdown. * @param webAppApi web app API instance * @return port the web application is deployed on * @throws IOException general problems starting the webapp (network, etc) * @throws WebAppException other issues */ private int deployWebApplication(WebAppApiImpl webAppApi) throws IOException, SliderException { try { webApp = new SliderAMWebApp(webAppApi); HttpConfig.Policy policy = HttpConfig.Policy.HTTP_ONLY; int port = getPortToRequest(); log.info("Launching web application at port {} with policy {}", port, policy); WebApps.$for(SliderAMWebApp.BASE_PATH, WebAppApi.class, webAppApi, RestPaths.WS_CONTEXT) .withHttpPolicy(getConfig(), policy).at("0.0.0.0", port, true).inDevMode().start(webApp); WebAppService<SliderAMWebApp> webAppService = new WebAppService<>("slider", webApp); deployChildService(webAppService); return webApp.port(); } catch (WebAppException e) { if (e.getCause() instanceof IOException) { throw (IOException) e.getCause(); } else { throw e; } } } /** * Process the initial user to obtain the set of user * supplied credentials (tokens were passed in by client). * Removes the AM/RM token. * If a keytab has been provided, also strip the HDFS delegation token. * @param securityConfig slider security config * @throws IOException */ private void processAMCredentials(SecurityConfiguration securityConfig) throws IOException { List<Text> filteredTokens = new ArrayList<>(3); filteredTokens.add(AMRMTokenIdentifier.KIND_NAME); filteredTokens.add(TimelineDelegationTokenIdentifier.KIND_NAME); boolean keytabProvided = securityConfig.isKeytabProvided(); log.info("Slider AM Security Mode: {}", keytabProvided ? "KEYTAB" : "TOKEN"); if (keytabProvided) { filteredTokens.add(DelegationTokenIdentifier.HDFS_DELEGATION_KIND); } containerCredentials = CredentialUtils.filterTokens(UserGroupInformation.getCurrentUser().getCredentials(), filteredTokens); log.info(CredentialUtils.dumpTokens(containerCredentials, "\n")); } /** * Build up the port scanner. This may include setting a port range. */ private void buildPortScanner(AggregateConf instanceDefinition) throws BadConfigException { portScanner = new PortScanner(); String portRange = instanceDefinition.getAppConfOperations().getGlobalOptions() .getOption(SliderKeys.KEY_ALLOWED_PORT_RANGE, "0"); if (!"0".equals(portRange)) { portScanner.setPortRange(portRange); } } /** * Locate a port to request for a service such as RPC or web/REST. * This uses port range definitions in the <code>instanceDefinition</code> * to fix the port range if one is set. * <p> * The port returned is available at the time of the request; there are * no guarantees as to how long that situation will last. * @return the port to request. * @throws SliderException */ private int getPortToRequest() throws SliderException, IOException { return portScanner.getAvailablePort(); } private void uploadServerCertForLocalization(String clustername, SliderFileSystem fs) throws IOException { Path certsDir = fs.buildClusterSecurityDirPath(clustername); if (!fs.getFileSystem().exists(certsDir)) { fs.getFileSystem().mkdirs(certsDir, new FsPermission(FsAction.ALL, FsAction.NONE, FsAction.NONE)); } Path destPath = new Path(certsDir, SliderKeys.CRT_FILE_NAME); if (!fs.getFileSystem().exists(destPath)) { fs.getFileSystem().copyFromLocalFile( new Path(CertificateManager.getServerCertficateFilePath().getAbsolutePath()), destPath); log.info("Uploaded server cert to localization path {}", destPath); } fs.getFileSystem().setPermission(destPath, new FsPermission(FsAction.READ, FsAction.NONE, FsAction.NONE)); } protected void login(String principal, File localKeytabFile) throws IOException, SliderException { log.info("Logging in as {} with keytab {}", principal, localKeytabFile); UserGroupInformation.loginUserFromKeytab(principal, localKeytabFile.getAbsolutePath()); validateLoginUser(UserGroupInformation.getLoginUser()); } /** * Ensure that the user is generated from a keytab and has no HDFS delegation * tokens. * * @param user user to validate * @throws SliderException */ protected void validateLoginUser(UserGroupInformation user) throws SliderException { if (!user.isFromKeytab()) { log.error("User is not holding on a keytab in a secure deployment:" + " slider will fail as tokens expire"); } Credentials credentials = user.getCredentials(); Iterator<Token<? extends TokenIdentifier>> iter = credentials.getAllTokens().iterator(); while (iter.hasNext()) { Token<? extends TokenIdentifier> token = iter.next(); log.info("Token {}", token.getKind()); if (token.getKind().equals(DelegationTokenIdentifier.HDFS_DELEGATION_KIND)) { log.info("HDFS delegation token {}. Removing...", token); iter.remove(); } } } /** * Set up and start the agent web application * @param appInformation application information * @param serviceConf service configuration * @param webAppApi web app API instance to bind to * @throws IOException */ private void startAgentWebApp(MapOperations appInformation, Configuration serviceConf, WebAppApiImpl webAppApi) throws IOException, SliderException { URL[] urls = ((URLClassLoader) AgentWebApp.class.getClassLoader()).getURLs(); StringBuilder sb = new StringBuilder("AM classpath:"); for (URL url : urls) { sb.append("\n").append(url.toString()); } LOG_YARN.debug(sb.append("\n").toString()); initAMFilterOptions(serviceConf); // Start up the agent web app and track the URL for it MapOperations appMasterConfig = getInstanceDefinition().getAppConfOperations() .getComponent(SliderKeys.COMPONENT_AM); AgentWebApp agentWebApp = AgentWebApp.$for(AgentWebApp.BASE_PATH, webAppApi, RestPaths.AGENT_WS_CONTEXT) .withComponentConfig(appMasterConfig).withPort(getPortToRequest()) .withSecuredPort(getPortToRequest()).start(); agentOpsUrl = "https://" + appMasterHostname + ":" + agentWebApp.getSecuredPort(); agentStatusUrl = "https://" + appMasterHostname + ":" + agentWebApp.getPort(); AgentService agentService = new AgentService("slider-agent", agentWebApp); agentService.init(serviceConf); agentService.start(); addService(agentService); appInformation.put(StatusKeys.INFO_AM_AGENT_OPS_URL, agentOpsUrl + "/"); appInformation.put(StatusKeys.INFO_AM_AGENT_STATUS_URL, agentStatusUrl + "/"); appInformation.set(StatusKeys.INFO_AM_AGENT_STATUS_PORT, agentWebApp.getPort()); appInformation.set(StatusKeys.INFO_AM_AGENT_OPS_PORT, agentWebApp.getSecuredPort()); } /** * Set up the AM filter * @param serviceConf configuration to patch */ private void initAMFilterOptions(Configuration serviceConf) { // IP filtering String amFilterName = AM_FILTER_NAME; // This is here until YARN supports proxy & redirect operations // on verbs other than GET, and is only supported for testing if (X_DEV_INSECURE_REQUIRED && serviceConf.getBoolean(X_DEV_INSECURE_WS, X_DEV_INSECURE_DEFAULT)) { log.warn("Insecure filter enabled: REST operations are unauthenticated"); amFilterName = InsecureAmFilterInitializer.NAME; } serviceConf.set(HADOOP_HTTP_FILTER_INITIALIZERS, amFilterName); } /** * This registers the service instance and its external values * @param instanceName name of this instance * @param appId application ID * @throws IOException */ public void registerServiceInstance(String instanceName, ApplicationId appId) throws IOException { // the registry is running, so register services URL amWebURI = new URL(appMasterProxiedUrl); URL agentOpsURI = new URL(agentOpsUrl); URL agentStatusURI = new URL(agentStatusUrl); //Give the provider restricted access to the state, registry setupInitialRegistryPaths(); yarnRegistryOperations = new YarnRegistryViewForProviders(registryOperations, service_user_name, SliderKeys.APP_TYPE, instanceName, appAttemptID); providerService.bindToYarnRegistry(yarnRegistryOperations); sliderAMProvider.bindToYarnRegistry(yarnRegistryOperations); // Yarn registry ServiceRecord serviceRecord = new ServiceRecord(); serviceRecord.set(YarnRegistryAttributes.YARN_ID, appId.toString()); serviceRecord.set(YarnRegistryAttributes.YARN_PERSISTENCE, PersistencePolicies.APPLICATION); serviceRecord.description = "Slider Application Master"; serviceRecord.addExternalEndpoint( RegistryTypeUtils.ipcEndpoint(CustomRegistryConstants.AM_IPC_PROTOCOL, rpcServiceAddress)); // internal services sliderAMProvider.applyInitialRegistryDefinitions(amWebURI, agentOpsURI, agentStatusURI, serviceRecord); // provider service dynamic definitions. providerService.applyInitialRegistryDefinitions(amWebURI, agentOpsURI, agentStatusURI, serviceRecord); // set any provided attributes setProvidedServiceRecordAttributes( getInstanceDefinition().getAppConfOperations().getComponent(SliderKeys.COMPONENT_AM), serviceRecord); // register the service's entry log.info("Service Record \n{}", serviceRecord); yarnRegistryOperations.registerSelf(serviceRecord, true); log.info("Registered service under {}; absolute path {}", yarnRegistryOperations.getSelfRegistrationPath(), yarnRegistryOperations.getAbsoluteSelfRegistrationPath()); boolean isFirstAttempt = 1 == appAttemptID.getAttemptId(); // delete the children in case there are any and this is an AM startup. // just to make sure everything underneath is purged if (isFirstAttempt) { yarnRegistryOperations.deleteChildren(yarnRegistryOperations.getSelfRegistrationPath(), true); } } /** * TODO: purge this once RM is doing the work * @throws IOException */ protected void setupInitialRegistryPaths() throws IOException { if (registryOperations instanceof RMRegistryOperationsService) { RMRegistryOperationsService rmRegOperations = (RMRegistryOperationsService) registryOperations; rmRegOperations.initUserRegistryAsync(service_user_name); } } /** * Handler for {@link RegisterComponentInstance action} * Register/re-register an ephemeral container that is already in the app state * @param id the component * @param description component description * @param type component type * @return true if the component is registered */ public boolean registerComponent(ContainerId id, String description, String type) throws IOException { RoleInstance instance = appState.getOwnedContainer(id); if (instance == null) { return false; } // this is where component registrations go log.info("Registering component {}", id); String cid = RegistryPathUtils.encodeYarnID(id.toString()); ServiceRecord container = new ServiceRecord(); container.set(YarnRegistryAttributes.YARN_ID, cid); container.description = description; container.set(YarnRegistryAttributes.YARN_PERSISTENCE, PersistencePolicies.CONTAINER); MapOperations compOps = getInstanceDefinition().getAppConfOperations().getComponent(type); setProvidedServiceRecordAttributes(compOps, container); try { yarnRegistryOperations.putComponent(cid, container); } catch (IOException e) { log.warn("Failed to register container {}/{}: {}", id, description, e, e); return false; } return true; } protected void setProvidedServiceRecordAttributes(MapOperations ops, ServiceRecord record) { String prefix = RoleKeys.SERVICE_RECORD_ATTRIBUTE_PREFIX; for (Map.Entry<String, String> entry : ops.entrySet()) { if (entry.getKey().startsWith(prefix)) { String key = entry.getKey().substring(prefix.length() + 1); record.set(key, entry.getValue().trim()); } } } /** * Handler for {@link UnregisterComponentInstance} * * unregister a component. At the time this message is received, * the component may not have been registered * @param id the component */ public void unregisterComponent(ContainerId id) { log.info("Unregistering component {}", id); if (yarnRegistryOperations == null) { log.warn("Processing unregister component event before initialization " + "completed; init flag ={}", initCompleted); return; } String cid = RegistryPathUtils.encodeYarnID(id.toString()); try { yarnRegistryOperations.deleteComponent(cid); } catch (IOException e) { log.warn("Failed to delete container {} : {}", id, e, e); } } /** * looks for a specific case where a token file is provided as an environment * variable, yet the file is not there. * * This surfaced (once) in HBase, where its HDFS library was looking for this, * and somehow the token was missing. This is a check in the AM so that * if the problem re-occurs, the AM can fail with a more meaningful message. * */ private void checkAndWarnForAuthTokenProblems() { String fileLocation = System.getenv(UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION); if (fileLocation != null) { File tokenFile = new File(fileLocation); if (!tokenFile.exists()) { log.warn("Token file {} specified in {} not found", tokenFile, UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION); } } } /** * Build the configuration directory passed in or of the target FS * @return the file */ public File getLocalConfDir() { File confdir = new File(SliderKeys.PROPAGATED_CONF_DIR_NAME).getAbsoluteFile(); return confdir; } /** * Get the path to the DFS configuration that is defined in the cluster specification * @return the generated configuration dir */ public String getGeneratedConfDir() { return getGlobalInternalOptions().get(InternalKeys.INTERNAL_GENERATED_CONF_PATH); } /** * Get the global internal options for the AM * @return a map to access the internals */ public MapOperations getGlobalInternalOptions() { return getInstanceDefinition().getInternalOperations().getGlobalOptions(); } /** * Get the filesystem of this cluster * @return the FS of the config */ public SliderFileSystem getClusterFS() throws IOException { return new SliderFileSystem(getConfig()); } /** * Get the AM log * @return the log of the AM */ public static Logger getLog() { return log; } /** * Get the application state * @return the application state */ public AppState getAppState() { return appState; } /** * Block until it is signalled that the AM is done */ private void waitForAMCompletionSignal() { AMExecutionStateLock.lock(); try { if (!amCompletionFlag.get()) { log.debug("blocking until signalled to terminate"); isAMCompleted.awaitUninterruptibly(); } } finally { AMExecutionStateLock.unlock(); } } /** * Signal that the AM is complete .. queues it in a separate thread * * @param stopActionRequest request containing shutdown details */ public synchronized void signalAMComplete(ActionStopSlider stopActionRequest) { // this is a queued action: schedule it through the queues schedule(stopActionRequest); } /** * Signal that the AM is complete * * @param stopActionRequest request containing shutdown details */ public synchronized void onAMStop(ActionStopSlider stopActionRequest) { AMExecutionStateLock.lock(); try { if (amCompletionFlag.compareAndSet(false, true)) { // first stop request received this.stopAction = stopActionRequest; isAMCompleted.signal(); } } finally { AMExecutionStateLock.unlock(); } } /** * trigger the YARN cluster termination process * @return the exit code * @throws Exception if the stop action contained an Exception which implements * ExitCodeProvider */ private synchronized int finish() throws Exception { Preconditions.checkNotNull(stopAction, "null stop action"); FinalApplicationStatus appStatus; log.info("Triggering shutdown of the AM: {}", stopAction); String appMessage = stopAction.getMessage(); //stop the daemon & grab its exit code int exitCode = stopAction.getExitCode(); Exception exception = stopAction.getEx(); appStatus = stopAction.getFinalApplicationStatus(); if (!spawnedProcessExitedBeforeShutdownTriggered) { //stopped the forked process but don't worry about its exit code int forkedExitCode = stopForkedProcess(); log.debug("Stopped forked process: exit code={}", forkedExitCode); } // make sure the AM is actually registered. If not, there's no point // trying to unregister it if (amRegistrationData == null) { log.info("Application attempt not yet registered; skipping unregistration"); if (exception != null) { throw exception; } return exitCode; } //stop any launches in progress launchService.stop(); //now release all containers releaseAllContainers(); // When the application completes, it should send a finish application // signal to the RM log.info("Application completed. Signalling finish to RM"); try { log.info("Unregistering AM status={} message={}", appStatus, appMessage); asyncRMClient.unregisterApplicationMaster(appStatus, appMessage, null); } catch (InvalidApplicationMasterRequestException e) { log.info("Application not found in YARN application list;" + " it may have been terminated/YARN shutdown in progress: {}", e, e); } catch (YarnException | IOException e) { log.info("Failed to unregister application: " + e, e); } if (exception != null) { throw exception; } return exitCode; } /** * Get diagnostics info about containers */ private String getContainerDiagnosticInfo() { return appState.getContainerDiagnosticInfo(); } public Object getProxy(Class protocol, InetSocketAddress addr) { return yarnRPC.getProxy(protocol, addr, getConfig()); } /** * Start the slider RPC server */ private void startSliderRPCServer(AggregateConf instanceDefinition) throws IOException, SliderException { verifyIPCAccess(); sliderIPCService = new SliderIPCService(this, certificateManager, stateForProviders, actionQueues, metricsAndMonitoring, contentCache); deployChildService(sliderIPCService); SliderClusterProtocolPBImpl protobufRelay = new SliderClusterProtocolPBImpl(sliderIPCService); BlockingService blockingService = SliderClusterAPI.SliderClusterProtocolPB .newReflectiveBlockingService(protobufRelay); int port = getPortToRequest(); InetSocketAddress rpcAddress = new InetSocketAddress("0.0.0.0", port); rpcService = new WorkflowRpcService("SliderRPC", RpcBinder.createProtobufServer(rpcAddress, getConfig(), secretManager, NUM_RPC_HANDLERS, blockingService, null)); deployChildService(rpcService); } /** * verify that if the cluster is authed, the ACLs are set. * @throws BadConfigException if Authorization is set without any ACL */ private void verifyIPCAccess() throws BadConfigException { boolean authorization = getConfig().getBoolean(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHORIZATION, false); String acls = getConfig().get(KEY_PROTOCOL_ACL); if (authorization && SliderUtils.isUnset(acls)) { throw new BadConfigException("Application has IPC authorization enabled in " + CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHORIZATION + " but no ACLs in " + KEY_PROTOCOL_ACL); } } /* =================================================================== */ /* AMRMClientAsync callbacks */ /* =================================================================== */ /** * Callback event when a container is allocated. * * The app state is updated with the allocation, and builds up a list * of assignments and RM operations. The assignments are * handed off into the pool of service launchers to asynchronously schedule * container launch operations. * * The operations are run in sequence; they are expected to be 0 or more * release operations (to handle over-allocations) * * @param allocatedContainers list of containers that are now ready to be * given work. */ @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter") @Override //AMRMClientAsync public void onContainersAllocated(List<Container> allocatedContainers) { LOG_YARN.info("onContainersAllocated({})", allocatedContainers.size()); List<ContainerAssignment> assignments = new ArrayList<>(); List<AbstractRMOperation> operations = new ArrayList<>(); //app state makes all the decisions appState.onContainersAllocated(allocatedContainers, assignments, operations); //for each assignment: instantiate that role for (ContainerAssignment assignment : assignments) { try { launchService.launchRole(assignment, getInstanceDefinition(), buildContainerCredentials()); } catch (IOException e) { // Can be caused by failure to renew credentials with the remote // service. If so, don't launch the application. Container is retained, // though YARN will take it away after a timeout. log.error("Failed to build credentials to launch container: {}", e, e); } } //for all the operations, exec them execute(operations); log.info("Diagnostics: {}", getContainerDiagnosticInfo()); } @Override //AMRMClientAsync public synchronized void onContainersCompleted(List<ContainerStatus> completedContainers) { LOG_YARN.info("onContainersCompleted([{}]", completedContainers.size()); for (ContainerStatus status : completedContainers) { ContainerId containerId = status.getContainerId(); LOG_YARN.info( "Container Completion for" + " containerID={}," + " state={}," + " exitStatus={}," + " diagnostics={}", containerId, status.getState(), status.getExitStatus(), status.getDiagnostics()); // non complete containers should not be here assert (status.getState() == ContainerState.COMPLETE); AppState.NodeCompletionResult result = appState.onCompletedNode(status); if (result.containerFailed) { RoleInstance ri = result.roleInstance; log.error("Role instance {} failed ", ri); } // known nodes trigger notifications if (!result.unknownNode) { getProviderService().notifyContainerCompleted(containerId); queue(new UnregisterComponentInstance(containerId, 0, TimeUnit.MILLISECONDS)); } } reviewRequestAndReleaseNodes("onContainersCompleted"); } /** * Signal that containers are being upgraded. Containers specified with * --containers option and all containers of all roles specified with * --components option are merged and upgraded. * * @param upgradeContainersRequest * request containing upgrade details */ public synchronized void onUpgradeContainers(ActionUpgradeContainers upgradeContainersRequest) throws IOException, SliderException { LOG_YARN.info("onUpgradeContainers({})", upgradeContainersRequest.getMessage()); Set<String> containers = upgradeContainersRequest.getContainers() == null ? new HashSet<String>() : upgradeContainersRequest.getContainers(); LOG_YARN.info(" Container list provided (total {}) : {}", containers.size(), containers); Set<String> components = upgradeContainersRequest.getComponents() == null ? new HashSet<String>() : upgradeContainersRequest.getComponents(); LOG_YARN.info(" Component list provided (total {}) : {}", components.size(), components); // If components are specified as well, then grab all the containers of // each of the components (roles) if (CollectionUtils.isNotEmpty(components)) { Map<ContainerId, RoleInstance> liveContainers = appState.getLiveContainers(); if (CollectionUtils.isNotEmpty(liveContainers.keySet())) { Map<String, Set<String>> roleContainerMap = prepareRoleContainerMap(liveContainers); for (String component : components) { Set<String> roleContainers = roleContainerMap.get(component); if (roleContainers != null) { containers.addAll(roleContainers); } } } } LOG_YARN.info("Final list of containers to be upgraded (total {}) : {}", containers.size(), containers); if (providerService instanceof AgentProviderService) { AgentProviderService agentProviderService = (AgentProviderService) providerService; agentProviderService.setInUpgradeMode(true); agentProviderService.addUpgradeContainers(containers); } } // create a reverse map of roles -> set of all live containers private Map<String, Set<String>> prepareRoleContainerMap(Map<ContainerId, RoleInstance> liveContainers) { // liveContainers is ensured to be not empty Map<String, Set<String>> roleContainerMap = new HashMap<>(); for (Map.Entry<ContainerId, RoleInstance> liveContainer : liveContainers.entrySet()) { RoleInstance role = liveContainer.getValue(); if (roleContainerMap.containsKey(role.role)) { roleContainerMap.get(role.role).add(liveContainer.getKey().toString()); } else { Set<String> containers = new HashSet<String>(); containers.add(liveContainer.getKey().toString()); roleContainerMap.put(role.role, containers); } } return roleContainerMap; } /** * Implementation of cluster flexing. * It should be the only way that anything -even the AM itself on startup- * asks for nodes. * @param resources the resource tree * @throws SliderException slider problems, including invalid configs * @throws IOException IO problems */ public void flexCluster(ConfTree resources) throws IOException, SliderException { AggregateConf newConf = new AggregateConf(appState.getInstanceDefinitionSnapshot()); newConf.setResources(resources); // verify the new definition is valid sliderAMProvider.validateInstanceDefinition(newConf); providerService.validateInstanceDefinition(newConf); appState.updateResourceDefinitions(resources); // reset the scheduled windows...the values // may have changed appState.resetFailureCounts(); // ask for more containers if needed reviewRequestAndReleaseNodes("flexCluster"); } /** * Schedule the failure window * @param resources the resource tree * @throws BadConfigException if the window is out of range */ private void scheduleFailureWindowResets(ConfTree resources) throws BadConfigException { ResetFailureWindow reset = new ResetFailureWindow(); ConfTreeOperations ops = new ConfTreeOperations(resources); MapOperations globals = ops.getGlobalOptions(); long seconds = globals.getTimeRange(ResourceKeys.CONTAINER_FAILURE_WINDOW, ResourceKeys.DEFAULT_CONTAINER_FAILURE_WINDOW_DAYS, ResourceKeys.DEFAULT_CONTAINER_FAILURE_WINDOW_HOURS, ResourceKeys.DEFAULT_CONTAINER_FAILURE_WINDOW_MINUTES, 0); if (seconds > 0) { log.info("Scheduling the failure window reset interval to every {} seconds", seconds); RenewingAction<ResetFailureWindow> renew = new RenewingAction<>(reset, seconds, seconds, TimeUnit.SECONDS, 0); actionQueues.renewing("failures", renew); } else { log.info("Failure window reset interval is not set"); } } /** * Schedule the escalation action * @param internal * @throws BadConfigException */ private void scheduleEscalation(ConfTree internal) throws BadConfigException { EscalateOutstandingRequests escalate = new EscalateOutstandingRequests(); ConfTreeOperations ops = new ConfTreeOperations(internal); int seconds = ops.getGlobalOptions().getOptionInt(InternalKeys.ESCALATION_CHECK_INTERVAL, InternalKeys.DEFAULT_ESCALATION_CHECK_INTERVAL); RenewingAction<EscalateOutstandingRequests> renew = new RenewingAction<>(escalate, seconds, seconds, TimeUnit.SECONDS, 0); actionQueues.renewing("escalation", renew); } /** * Look at where the current node state is -and whether it should be changed * @param reason reason for operation */ private synchronized void reviewRequestAndReleaseNodes(String reason) { log.debug("reviewRequestAndReleaseNodes({})", reason); queue(new ReviewAndFlexApplicationSize(reason, 0, TimeUnit.SECONDS)); } /** * Handle the event requesting a review ... look at the queue and decide * whether to act or not * @param action action triggering the event. It may be put * back into the queue * @throws SliderInternalStateException */ public void handleReviewAndFlexApplicationSize(ReviewAndFlexApplicationSize action) throws SliderInternalStateException { if (actionQueues .hasQueuedActionWithAttribute(AsyncAction.ATTR_REVIEWS_APP_SIZE | AsyncAction.ATTR_HALTS_APP)) { // this operation isn't needed at all -existing duplicate or shutdown due return; } // if there is an action which changes cluster size, wait if (actionQueues.hasQueuedActionWithAttribute(AsyncAction.ATTR_CHANGES_APP_SIZE)) { // place the action at the back of the queue actionQueues.put(action); } executeNodeReview(action.name); } /** * Look at where the current node state is -and whether it should be changed */ public synchronized void executeNodeReview(String reason) throws SliderInternalStateException { log.debug("in executeNodeReview({})", reason); if (amCompletionFlag.get()) { log.info("Ignoring node review operation: shutdown in progress"); } try { List<AbstractRMOperation> allOperations = appState.reviewRequestAndReleaseNodes(); // tell the provider providerRMOperationHandler.execute(allOperations); //now apply the operations execute(allOperations); } catch (TriggerClusterTeardownException e) { //App state has decided that it is time to exit log.error("Cluster teardown triggered {}", e, e); queue(new ActionStopSlider(e)); } } /** * Escalate operation as triggered by external timer. * <p> * Get the list of new operations off the AM, then executest them. */ public void escalateOutstandingRequests() { List<AbstractRMOperation> operations = appState.escalateOutstandingRequests(); providerRMOperationHandler.execute(operations); execute(operations); } /** * Shutdown operation: release all containers */ private void releaseAllContainers() { if (providerService instanceof AgentProviderService) { log.info("Setting stopInitiated flag to true"); AgentProviderService agentProviderService = (AgentProviderService) providerService; agentProviderService.setAppStopInitiated(true); } // Add the sleep here (before releasing containers) so that applications get // time to perform graceful shutdown try { long timeout = getContainerReleaseTimeout(); if (timeout > 0) { Thread.sleep(timeout); } } catch (InterruptedException e) { log.info("Sleep for container release interrupted"); } finally { List<AbstractRMOperation> operations = appState.releaseAllContainers(); providerRMOperationHandler.execute(operations); // now apply the operations execute(operations); } } private long getContainerReleaseTimeout() { // Get container release timeout in millis or 0 if the property is not set. // If non-zero then add the agent heartbeat delay time, since it can take up // to that much time for agents to receive the stop command. int timeout = getInstanceDefinition().getAppConfOperations().getGlobalOptions() .getOptionInt(SliderKeys.APP_CONTAINER_RELEASE_TIMEOUT, 0); if (timeout > 0) { timeout += SliderKeys.APP_CONTAINER_HEARTBEAT_INTERVAL_SEC; } // convert to millis long timeoutInMillis = timeout * 1000l; log.info("Container release timeout in millis = {}", timeoutInMillis); return timeoutInMillis; } /** * RM wants to shut down the AM */ @Override //AMRMClientAsync public void onShutdownRequest() { LOG_YARN.info("Shutdown Request received"); signalAMComplete(new ActionStopSlider("stop", EXIT_SUCCESS, FinalApplicationStatus.SUCCEEDED, "Shutdown requested from RM")); } /** * Monitored nodes have been changed * @param updatedNodes list of updated nodes */ @Override //AMRMClientAsync public void onNodesUpdated(List<NodeReport> updatedNodes) { LOG_YARN.info("onNodesUpdated({})", updatedNodes.size()); log.info("Updated nodes {}", updatedNodes); // Check if any nodes are lost or revived and update state accordingly AppState.NodeUpdatedOutcome outcome = appState.onNodesUpdated(updatedNodes); if (!outcome.operations.isEmpty()) { execute(outcome.operations); } // trigger a review if the cluster changed if (outcome.clusterChanged) { reviewRequestAndReleaseNodes("nodes updated"); } } /** * heartbeat operation; return the ratio of requested * to actual * @return progress */ @Override //AMRMClientAsync public float getProgress() { return appState.getApplicationProgressPercentage(); } @Override //AMRMClientAsync public void onError(Throwable e) { //callback says it's time to finish LOG_YARN.error("AMRMClientAsync.onError() received {}", e, e); signalAMComplete(new ActionStopSlider("stop", EXIT_EXCEPTION_THROWN, FinalApplicationStatus.FAILED, "AMRMClientAsync.onError() received " + e)); } /* =================================================================== */ /* RMOperationHandlerActions */ /* =================================================================== */ @Override public void execute(List<AbstractRMOperation> operations) { rmOperationHandler.execute(operations); } @Override public void releaseAssignedContainer(ContainerId containerId) { rmOperationHandler.releaseAssignedContainer(containerId); } @Override public void addContainerRequest(AMRMClient.ContainerRequest req) { rmOperationHandler.addContainerRequest(req); } @Override public int cancelContainerRequests(Priority priority1, Priority priority2, int count) { return rmOperationHandler.cancelContainerRequests(priority1, priority2, count); } @Override public void cancelSingleRequest(AMRMClient.ContainerRequest request) { rmOperationHandler.cancelSingleRequest(request); } /* =================================================================== */ /* END */ /* =================================================================== */ /** * Launch the provider service * * @param instanceDefinition definition of the service * @param confDir directory of config data * @throws IOException * @throws SliderException */ protected synchronized void launchProviderService(AggregateConf instanceDefinition, File confDir) throws IOException, SliderException { Map<String, String> env = new HashMap<>(); boolean execStarted = providerService.exec(instanceDefinition, confDir, env, this); if (execStarted) { providerService.registerServiceListener(this); providerService.start(); } else { // didn't start, so don't register providerService.start(); // and send the started event ourselves eventCallbackEvent(null); } } /* =================================================================== */ /* EventCallback from the child or ourselves directly */ /* =================================================================== */ @Override // ProviderCompleted public void eventCallbackEvent(Object parameter) { // signalled that the child process is up. appState.noteAMLive(); // now ask for the cluster nodes try { flexCluster(getInstanceDefinition().getResources()); } catch (Exception e) { // cluster flex failure: log log.error("Failed to flex cluster nodes: {}", e, e); // then what? exit queue(new ActionStopSlider(e)); } } /** * report container loss. If this isn't already known about, react * * @param containerId id of the container which has failed * @throws SliderException */ public synchronized void providerLostContainer(ContainerId containerId) throws SliderException { log.info("containerLostContactWithProvider: container {} lost", containerId); RoleInstance activeContainer = appState.getOwnedContainer(containerId); if (activeContainer != null) { execute(appState.releaseContainer(containerId)); // ask for more containers if needed log.info("Container released; triggering review"); reviewRequestAndReleaseNodes("Loss of container"); } else { log.info("Container not in active set - ignoring"); } } /* =================================================================== */ /* ServiceStateChangeListener */ /* =================================================================== */ /** * Received on listening service termination. * @param service the service that has changed. */ @Override //ServiceStateChangeListener public void stateChanged(Service service) { if (service == providerService && service.isInState(STATE.STOPPED)) { //its the current master process in play int exitCode = providerService.getExitCode(); int mappedProcessExitCode = exitCode; boolean shouldTriggerFailure = !amCompletionFlag.get() && (mappedProcessExitCode != 0); if (shouldTriggerFailure) { String reason = "Spawned process failed with raw " + exitCode + " mapped to " + mappedProcessExitCode; ActionStopSlider stop = new ActionStopSlider("stop", mappedProcessExitCode, FinalApplicationStatus.FAILED, reason); //this wasn't expected: the process finished early spawnedProcessExitedBeforeShutdownTriggered = true; log.info("Process has exited with exit code {} mapped to {} -triggering termination", exitCode, mappedProcessExitCode); //tell the AM the cluster is complete signalAMComplete(stop); } else { //we don't care log.info("Process has exited with exit code {} mapped to {} -ignoring", exitCode, mappedProcessExitCode); } } else { super.stateChanged(service); } } /** * stop forked process if it the running process var is not null * @return the process exit code */ protected synchronized Integer stopForkedProcess() { providerService.stop(); return providerService.getExitCode(); } /** * Async start container request * @param container container * @param ctx context * @param instance node details */ public void startContainer(Container container, ContainerLaunchContext ctx, RoleInstance instance) throws IOException { appState.containerStartSubmitted(container, instance); nmClientAsync.startContainerAsync(container, ctx); } /** * Build the credentials needed for containers. This will include * getting new delegation tokens for HDFS if the AM is running * with a keytab. * @return a buffer of credentials * @throws IOException */ private Credentials buildContainerCredentials() throws IOException { Credentials credentials = new Credentials(containerCredentials); if (securityConfiguration.isKeytabProvided()) { CredentialUtils.addSelfRenewableFSDelegationTokens(getClusterFS().getFileSystem(), credentials); } return credentials; } @Override // NMClientAsync.CallbackHandler public void onContainerStopped(ContainerId containerId) { // do nothing but log: container events from the AM // are the source of container halt details to react to log.info("onContainerStopped {} ", containerId); } @Override // NMClientAsync.CallbackHandler public void onContainerStarted(ContainerId containerId, Map<String, ByteBuffer> allServiceResponse) { LOG_YARN.info("Started Container {} ", containerId); RoleInstance cinfo = appState.onNodeManagerContainerStarted(containerId); if (cinfo != null) { LOG_YARN.info("Deployed instance of role {} onto {}", cinfo.role, containerId); //trigger an async container status nmClientAsync.getContainerStatusAsync(containerId, cinfo.container.getNodeId()); // push out a registration queue(new RegisterComponentInstance(containerId, cinfo.role, cinfo.group, 0, TimeUnit.MILLISECONDS)); } else { //this is a hypothetical path not seen. We react by warning log.error("Notified of started container that isn't pending {} - releasing", containerId); //then release it asyncRMClient.releaseAssignedContainer(containerId); } } @Override // NMClientAsync.CallbackHandler public void onStartContainerError(ContainerId containerId, Throwable t) { LOG_YARN.error("Failed to start Container {}", containerId, t); appState.onNodeManagerContainerStartFailed(containerId, t); } @Override // NMClientAsync.CallbackHandler public void onContainerStatusReceived(ContainerId containerId, ContainerStatus containerStatus) { LOG_YARN.debug("Container Status: id={}, status={}", containerId, containerStatus); } @Override // NMClientAsync.CallbackHandler public void onGetContainerStatusError(ContainerId containerId, Throwable t) { LOG_YARN.error("Failed to query the status of Container {}", containerId); } @Override // NMClientAsync.CallbackHandler public void onStopContainerError(ContainerId containerId, Throwable t) { LOG_YARN.warn("Failed to stop Container {}", containerId); } public AggregateConf getInstanceDefinition() { return appState.getInstanceDefinition(); } /** * This is the status, the live model */ public ClusterDescription getClusterDescription() { return appState.getClusterStatus(); } public ProviderService getProviderService() { return providerService; } /** * Queue an action for immediate execution in the executor thread * @param action action to execute */ public void queue(AsyncAction action) { actionQueues.put(action); } /** * Schedule an action * @param action for delayed execution */ public void schedule(AsyncAction action) { actionQueues.schedule(action); } /** * Handle any exception in a thread. If the exception provides an exit * code, that is the one that will be used * @param thread thread throwing the exception * @param exception exception */ public void onExceptionInThread(Thread thread, Throwable exception) { log.error("Exception in {}: {}", thread.getName(), exception, exception); // if there is a teardown in progress, ignore it if (amCompletionFlag.get()) { log.info("Ignoring exception: shutdown in progress"); } else { int exitCode = EXIT_EXCEPTION_THROWN; if (exception instanceof ExitCodeProvider) { exitCode = ((ExitCodeProvider) exception).getExitCode(); } signalAMComplete( new ActionStopSlider("stop", exitCode, FinalApplicationStatus.FAILED, exception.toString())); } } /** * Start the chaos monkey * @return true if it started */ private boolean maybeStartMonkey() { MapOperations internals = getGlobalInternalOptions(); Boolean enabled = internals.getOptionBool(InternalKeys.CHAOS_MONKEY_ENABLED, InternalKeys.DEFAULT_CHAOS_MONKEY_ENABLED); if (!enabled) { log.debug("Chaos monkey disabled"); return false; } long monkeyInterval = internals.getTimeRange(InternalKeys.CHAOS_MONKEY_INTERVAL, InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_DAYS, InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_HOURS, InternalKeys.DEFAULT_CHAOS_MONKEY_INTERVAL_MINUTES, 0); if (monkeyInterval == 0) { log.debug("Chaos monkey not configured with a time interval...not enabling"); return false; } long monkeyDelay = internals.getTimeRange(InternalKeys.CHAOS_MONKEY_DELAY, 0, 0, 0, (int) monkeyInterval); log.info("Adding Chaos Monkey scheduled every {} seconds ({} hours -delay {}", monkeyInterval, monkeyInterval / (60 * 60), monkeyDelay); monkey = new ChaosMonkeyService(metrics, actionQueues); initAndAddService(monkey); // configure the targets // launch failure: special case with explicit failure triggered now int amLaunchFailProbability = internals .getOptionInt(InternalKeys.CHAOS_MONKEY_PROBABILITY_AM_LAUNCH_FAILURE, 0); if (amLaunchFailProbability > 0 && monkey.chaosCheck(amLaunchFailProbability)) { log.info("Chaos Monkey has triggered AM Launch failure"); // trigger a failure ActionStopSlider stop = new ActionStopSlider("stop", 0, TimeUnit.SECONDS, LauncherExitCodes.EXIT_FALSE, FinalApplicationStatus.FAILED, E_TRIGGERED_LAUNCH_FAILURE); queue(stop); } int amKillProbability = internals.getOptionInt(InternalKeys.CHAOS_MONKEY_PROBABILITY_AM_FAILURE, InternalKeys.DEFAULT_CHAOS_MONKEY_PROBABILITY_AM_FAILURE); monkey.addTarget("AM killer", new ChaosKillAM(actionQueues, -1), amKillProbability); int containerKillProbability = internals.getOptionInt( InternalKeys.CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE, InternalKeys.DEFAULT_CHAOS_MONKEY_PROBABILITY_CONTAINER_FAILURE); monkey.addTarget("Container killer", new ChaosKillContainer(appState, actionQueues, rmOperationHandler), containerKillProbability); // and schedule it if (monkey.schedule(monkeyDelay, monkeyInterval, TimeUnit.SECONDS)) { log.info("Chaos Monkey is running"); return true; } else { log.info("Chaos monkey not started"); return false; } } /** * This is the main entry point for the service launcher. * @param args command line arguments. */ public static void main(String[] args) { //turn the args to a list List<String> argsList = Arrays.asList(args); //create a new list, as the ArrayList type doesn't push() on an insert List<String> extendedArgs = new ArrayList<String>(argsList); //insert the service name extendedArgs.add(0, SERVICE_CLASSNAME); //now have the service launcher do its work ServiceLauncher.serviceMain(extendedArgs); } }