Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.storm.daemon.supervisor;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.Config;
import org.apache.storm.container.cgroup.CgroupManager;
import org.apache.storm.daemon.supervisor.workermanager.IWorkerManager;
import org.apache.storm.generated.ExecutorInfo;
import org.apache.storm.generated.LSWorkerHeartbeat;
import org.apache.storm.generated.LocalAssignment;
import org.apache.storm.generated.WorkerResources;
import org.apache.storm.utils.ConfigUtils;
import org.apache.storm.utils.LocalState;
import org.apache.storm.utils.Time;
import org.apache.storm.utils.Utils;
import org.eclipse.jetty.util.ConcurrentHashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.yaml.snakeyaml.Yaml;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

/**
 * 1. to kill are those in allocated that are dead or disallowed
 * 2. kill the ones that should be dead
 *    - read pids, kill -9 and individually remove file
 *    - rmr heartbeat dir, rmdir pid dir, rmdir id dir (catch exception and log)
 * 3. remove any downloaded code that's no longer assigned to this supervisor
 * 4. of the rest, figure out what assignments aren't yet satisfied
 * 5. generate new worker ids, write new "approved workers" to LS
 * 6. create local dir for worker id
 * 7. launch new workers (give worker-id, port, and supervisor-id)
 * 8. wait for workers launch
 */
public class SyncProcessEvent implements Runnable {

    private static final Logger LOG = LoggerFactory.getLogger(SyncProcessEvent.class);

    private LocalState localState;
    private SupervisorData supervisorData;

    public static final ExecutorInfo SYSTEM_EXECUTOR_INFO = new ExecutorInfo(-1, -1);

    private class ProcessExitCallback implements Utils.ExitCodeCallable {
        private final String logPrefix;
        private final String workerId;

        public ProcessExitCallback(String logPrefix, String workerId) {
            this.logPrefix = logPrefix;
            this.workerId = workerId;
        }

        @Override
        public Object call() throws Exception {
            return null;
        }

        @Override
        public Object call(int exitCode) {
            LOG.info("{} exited with code: {}", logPrefix, exitCode);
            supervisorData.getDeadWorkers().add(workerId);
            return null;
        }
    }

    public SyncProcessEvent() {
    }

    public SyncProcessEvent(SupervisorData supervisorData) {
        init(supervisorData);
    }

    public void init(SupervisorData supervisorData) {
        this.supervisorData = supervisorData;
        this.localState = supervisorData.getLocalState();
    }

    @Override
    public void run() {
        LOG.debug("Syncing processes");
        try {
            Map conf = supervisorData.getConf();
            Map<Integer, LocalAssignment> assignedExecutors = localState.getLocalAssignmentsMap();
            if (assignedExecutors == null) {
                assignedExecutors = new HashMap<>();
            }
            Set<String> assignedStormIds = new HashSet<>();
            for (Map.Entry<Integer, LocalAssignment> entry : assignedExecutors.entrySet()) {
                assignedStormIds.add(entry.getValue().get_topology_id());
            }
            int now = Time.currentTimeSecs();
            Map<String, StateHeartbeat> localWorkerStats = getLocalWorkerStats(supervisorData, assignedExecutors, now);

            Set<String> keeperWorkerIds = new HashSet<>();
            Set<Integer> keepPorts = new HashSet<>();
            for (Map.Entry<String, StateHeartbeat> entry : localWorkerStats.entrySet()) {
                StateHeartbeat stateHeartbeat = entry.getValue();
                if (stateHeartbeat.getState() == State.VALID) {
                    keeperWorkerIds.add(entry.getKey());
                    keepPorts.add(stateHeartbeat.getHeartbeat().get_port());
                }
            }
            Map<Integer, LocalAssignment> reassignExecutors = getReassignExecutors(assignedExecutors, keepPorts);
            Map<Integer, String> newWorkerIds = new HashMap<>();
            for (Integer port : reassignExecutors.keySet()) {
                newWorkerIds.put(port, Utils.uuid());
            }
            Set<String> allDownloadedTopologyIds = SupervisorUtils.readDownLoadedStormIds(conf);

            LOG.debug("Assigned executors: {}", assignedExecutors);
            LOG.debug("Allocated: {}", localWorkerStats);

            for (Map.Entry<String, StateHeartbeat> entry : localWorkerStats.entrySet()) {
                StateHeartbeat stateHeartbeat = entry.getValue();
                if (stateHeartbeat.getState() != State.VALID) {
                    LOG.info("Shutting down and clearing state for id {}, Current supervisor time: {}, State: {}, Heartbeat: {}",
                            entry.getKey(), now, stateHeartbeat.getState(), stateHeartbeat.getHeartbeat());
                    killWorker(supervisorData, supervisorData.getWorkerManager(), entry.getKey());
                }
            }

            // remove any downloaded code that's no longer assigned or active
            for (String downloadedTopologyId : allDownloadedTopologyIds) {
                if (!assignedStormIds.contains(downloadedTopologyId)) {
                    LOG.info("Removing code for storm id {}.", downloadedTopologyId);
                    SupervisorUtils.rmTopoFiles(conf, downloadedTopologyId, supervisorData.getLocalizer(), true);
                }
            }

            // start new workers
            Map<String, Integer> newWorkerPortToIds = startNewWorkers(newWorkerIds, reassignExecutors);

            Map<String, Integer> allWorkerPortToIds = new HashMap<>();
            Map<String, Integer> approvedWorkers = localState.getApprovedWorkers();
            for (String keeper : keeperWorkerIds) {
                allWorkerPortToIds.put(keeper, approvedWorkers.get(keeper));
            }
            allWorkerPortToIds.putAll(newWorkerPortToIds);
            localState.setApprovedWorkers(allWorkerPortToIds);
            waitForWorkersLaunch(conf, newWorkerPortToIds.keySet());
        } catch (Exception e) {
            LOG.error("Failed Sync Process", e);
            throw Utils.wrapInRuntime(e);
        }
    }
    protected void waitForWorkersLaunch(Map conf, Set<String> workerIds) throws Exception {
        int startTime = Time.currentTimeSecs();
        int timeOut = (int) conf.get(Config.NIMBUS_SUPERVISOR_TIMEOUT_SECS);
        for (String workerId : workerIds) {
            LocalState localState = ConfigUtils.workerState(conf, workerId);
            while (true) {
                LSWorkerHeartbeat hb = localState.getWorkerHeartBeat();
                if (hb != null || (Time.currentTimeSecs() - startTime) > timeOut)
                    break;
                LOG.info("{} still hasn't started", workerId);
                Time.sleep(500);
            }
            if (localState.getWorkerHeartBeat() == null) {
                LOG.info("Worker {} failed to start", workerId);
            }
        }
    }

    protected Map<Integer, LocalAssignment> getReassignExecutors(Map<Integer, LocalAssignment> assignExecutors, Set<Integer> keepPorts) {
        Map<Integer, LocalAssignment> reassignExecutors = new HashMap<>();
        reassignExecutors.putAll(assignExecutors);
        for (Integer port : keepPorts) {
            reassignExecutors.remove(port);
        }
        return reassignExecutors;
    }

    /**
     * Returns a map from worker id to worker heartbeat. If the heartbeat is null, then the worker is dead.
     *
     * @param assignedExecutors
     * @return
     * @throws Exception
     */
    public Map<String, StateHeartbeat> getLocalWorkerStats(SupervisorData supervisorData, Map<Integer, LocalAssignment> assignedExecutors, int now)
            throws Exception {
        Map<String, StateHeartbeat> workerIdHbstate = new HashMap<>();
        Map conf = supervisorData.getConf();
        LocalState localState = supervisorData.getLocalState();
        Map<String, LSWorkerHeartbeat> idToHeartbeat = SupervisorUtils.readWorkerHeartbeats(conf);
        Map<String, Integer> approvedWorkers = localState.getApprovedWorkers();
        Set<String> approvedIds = new HashSet<>();
        if (approvedWorkers != null) {
            approvedIds.addAll(approvedWorkers.keySet());
        }
        for (Map.Entry<String, LSWorkerHeartbeat> entry : idToHeartbeat.entrySet()) {
            String workerId = entry.getKey();
            LSWorkerHeartbeat whb = entry.getValue();
            State state;
            if (whb == null) {
                state = State.NOT_STARTED;
            } else if (!approvedIds.contains(workerId) || !matchesAssignment(whb, assignedExecutors)) {
                state = State.DISALLOWED;
            } else if (supervisorData.getDeadWorkers().contains(workerId)) {
                LOG.info("Worker Process {} has died", workerId);
                state = State.TIMED_OUT;
            } else if (SupervisorUtils.isWorkerHbTimedOut(now, whb, conf)) {
                state = State.TIMED_OUT;
            } else {
                state = State.VALID;
            }
            LOG.debug("Worker:{} state:{} WorkerHeartbeat:{} at supervisor time-secs {}", workerId, state, whb, now);
            workerIdHbstate.put(workerId, new StateHeartbeat(state, whb));
        }
        return workerIdHbstate;
    }

    protected boolean matchesAssignment(LSWorkerHeartbeat whb, Map<Integer, LocalAssignment> assignedExecutors) {
        LocalAssignment localAssignment = assignedExecutors.get(whb.get_port());
        if (localAssignment == null || !localAssignment.get_topology_id().equals(whb.get_topology_id())) {
            return false;
        }
        List<ExecutorInfo> executorInfos = new ArrayList<>();
        executorInfos.addAll(whb.get_executors());
        // remove SYSTEM_EXECUTOR_INFO
        executorInfos.remove(SYSTEM_EXECUTOR_INFO);
        List<ExecutorInfo> localExecutorInfos = localAssignment.get_executors();
        if (localExecutorInfos.size() != executorInfos.size())
            return false;
        // every locally assigned executor must also appear in the heartbeat
        for (ExecutorInfo executorInfo : localExecutorInfos) {
            if (!executorInfos.contains(executorInfo))
                return false;
        }
        return true;
    }
    /**
     * launch a worker in local mode.
     */
    protected void launchLocalWorker(SupervisorData supervisorData, String stormId, Long port, String workerId, WorkerResources resources)
            throws IOException {
        // port this function after porting worker to java
    }

    protected void launchDistributedWorker(IWorkerManager workerManager, Map conf, String supervisorId, String assignmentId, String stormId, Long port,
            String workerId, WorkerResources resources, ConcurrentHashSet deadWorkers) throws IOException {
        Map stormConf = ConfigUtils.readSupervisorStormConf(conf, stormId);
        String user = (String) stormConf.get(Config.TOPOLOGY_SUBMITTER_USER);
        writeLogMetadata(stormConf, user, workerId, stormId, port, conf);
        ConfigUtils.setWorkerUserWSE(conf, workerId, user);
        createArtifactsLink(conf, stormId, port, workerId);
        String logPrefix = "Worker Process " + workerId;
        if (deadWorkers != null)
            deadWorkers.remove(workerId);
        createBlobstoreLinks(conf, stormId, workerId);
        ProcessExitCallback processExitCallback = new ProcessExitCallback(logPrefix, workerId);
        workerManager.launchWorker(supervisorId, assignmentId, stormId, port, workerId, resources, processExitCallback);
    }

    protected Map<String, Integer> startNewWorkers(Map<Integer, String> newWorkerIds, Map<Integer, LocalAssignment> reassignExecutors) throws IOException {
        Map<String, Integer> newValidWorkerIds = new HashMap<>();
        Map conf = supervisorData.getConf();
        String supervisorId = supervisorData.getSupervisorId();
        String clusterMode = ConfigUtils.clusterMode(conf);
        for (Map.Entry<Integer, LocalAssignment> entry : reassignExecutors.entrySet()) {
            Integer port = entry.getKey();
            LocalAssignment assignment = entry.getValue();
            String workerId = newWorkerIds.get(port);
            String stormId = assignment.get_topology_id();
            WorkerResources resources = assignment.get_resources();
            // This condition checks that the required files exist before launching the worker
            if (SupervisorUtils.doRequiredTopoFilesExist(conf, stormId)) {
                String pidsPath = ConfigUtils.workerPidsRoot(conf, workerId);
                String hbPath = ConfigUtils.workerHeartbeatsRoot(conf, workerId);
                LOG.info("Launching worker with assignment {} for this supervisor {} on port {} with id {}", assignment, supervisorData.getSupervisorId(),
                        port, workerId);
                FileUtils.forceMkdir(new File(pidsPath));
                FileUtils.forceMkdir(new File(ConfigUtils.workerTmpRoot(conf, workerId)));
                FileUtils.forceMkdir(new File(hbPath));
                if (clusterMode.endsWith("distributed")) {
                    launchDistributedWorker(supervisorData.getWorkerManager(), conf, supervisorId, supervisorData.getAssignmentId(), stormId,
                            port.longValue(), workerId, resources, supervisorData.getDeadWorkers());
                } else if (clusterMode.endsWith("local")) {
                    launchLocalWorker(supervisorData, stormId, port.longValue(), workerId, resources);
                }
                newValidWorkerIds.put(workerId, port);
            } else {
                LOG.info("Missing topology storm code, so can't launch worker with assignment {} for this supervisor {} on port {} with id {}", assignment,
                        supervisorData.getSupervisorId(), port, workerId);
            }
        }
        return newValidWorkerIds;
    }

    public void writeLogMetadata(Map stormconf, String user, String workerId, String stormId, Long port, Map conf) throws IOException {
        Map data = new HashMap();
        data.put(Config.TOPOLOGY_SUBMITTER_USER, user);
        data.put("worker-id", workerId);

        Set<String> logsGroups = new HashSet<>();
        // for supervisor-test
        if (stormconf.get(Config.LOGS_GROUPS) != null) {
            List<String> groups = (List<String>) stormconf.get(Config.LOGS_GROUPS);
            for (String group : groups) {
                logsGroups.add(group);
            }
        }
        if (stormconf.get(Config.TOPOLOGY_GROUPS) != null) {
            List<String> topGroups = (List<String>) stormconf.get(Config.TOPOLOGY_GROUPS);
            logsGroups.addAll(topGroups);
        }
        data.put(Config.LOGS_GROUPS, logsGroups.toArray());

        Set<String> logsUsers = new HashSet<>();
        if (stormconf.get(Config.LOGS_USERS) != null) {
            List<String> logUsers = (List<String>) stormconf.get(Config.LOGS_USERS);
            for (String logUser : logUsers) {
                logsUsers.add(logUser);
            }
        }
        if (stormconf.get(Config.TOPOLOGY_USERS) != null) {
            List<String> topUsers = (List<String>) stormconf.get(Config.TOPOLOGY_USERS);
            for (String logUser : topUsers) {
                logsUsers.add(logUser);
            }
        }
        data.put(Config.LOGS_USERS, logsUsers.toArray());
        writeLogMetadataToYamlFile(stormId, port, data, conf);
    }

    /**
     * Running the worker as the submitting user requires this directory to have special permissions; otherwise it is insecure.
     *
     * @param stormId
     * @param port
     * @param data
     * @param conf
     * @throws IOException
     */
    public void writeLogMetadataToYamlFile(String stormId, Long port, Map data, Map conf) throws IOException {
        File file = ConfigUtils.getLogMetaDataFile(conf, stormId, port.intValue());
        if (!Utils.checkFileExists(file.getParent())) {
            if (Utils.getBoolean(conf.get(Config.SUPERVISOR_RUN_WORKER_AS_USER), false)) {
                FileUtils.forceMkdir(file.getParentFile());
                SupervisorUtils.setupStormCodeDir(conf, ConfigUtils.readSupervisorStormConf(conf, stormId), file.getParentFile().getCanonicalPath());
            } else {
                file.getParentFile().mkdirs();
            }
        }
        FileWriter writer = new FileWriter(file);
        Yaml yaml = new Yaml();
        try {
            yaml.dump(data, writer);
        } finally {
            writer.close();
        }
    }

    /**
     * Create a symlink from the worker directory to its port artifacts directory.
     *
     * @param conf
     * @param stormId
     * @param port
     * @param workerId
     */
    protected void createArtifactsLink(Map conf, String stormId, Long port, String workerId) throws IOException {
        String workerDir = ConfigUtils.workerRoot(conf, workerId);
        String topoDir = ConfigUtils.workerArtifactsRoot(conf, stormId);
        if (Utils.checkFileExists(workerDir)) {
            LOG.debug("Creating symlinks for worker-id: {} storm-id: {} to its port artifacts directory", workerId, stormId);
            Utils.createSymlink(workerDir, topoDir, "artifacts", String.valueOf(port));
        }
    }

    /**
     * Create symlinks in the worker launch directory for all blobs.
     *
     * @param conf
     * @param stormId
     * @param workerId
     * @throws IOException
     */
    protected void createBlobstoreLinks(Map conf, String stormId, String workerId) throws IOException {
        String stormRoot = ConfigUtils.supervisorStormDistRoot(conf, stormId);
        Map stormConf = ConfigUtils.readSupervisorStormConf(conf, stormId);
        String workerRoot = ConfigUtils.workerRoot(conf, workerId);
        Map<String, Map<String, Object>> blobstoreMap = (Map<String, Map<String, Object>>) stormConf.get(Config.TOPOLOGY_BLOBSTORE_MAP);
        List<String> blobFileNames = new ArrayList<>();
        if (blobstoreMap != null) {
            for (Map.Entry<String, Map<String, Object>> entry : blobstoreMap.entrySet()) {
                String key = entry.getKey();
                Map<String, Object> blobInfo = entry.getValue();
                String ret = null;
                if (blobInfo != null && blobInfo.containsKey("localname")) {
                    ret = (String) blobInfo.get("localname");
                } else {
                    ret = key;
                }
                blobFileNames.add(ret);
            }
        }
        List<String> resourceFileNames = new ArrayList<>();
        resourceFileNames.add(ConfigUtils.RESOURCES_SUBDIR);
        resourceFileNames.addAll(blobFileNames);
        LOG.info("Creating symlinks for worker-id: {} storm-id: {} for files({}): {}", workerId, stormId, resourceFileNames.size(), resourceFileNames);
        Utils.createSymlink(workerRoot, stormRoot, ConfigUtils.RESOURCES_SUBDIR);
        for (String fileName : blobFileNames) {
            Utils.createSymlink(workerRoot, stormRoot, fileName, fileName);
        }
    }

    public void killWorker(SupervisorData supervisorData, IWorkerManager workerManager, String workerId) throws IOException, InterruptedException {
        workerManager.shutdownWorker(supervisorData.getSupervisorId(), workerId, supervisorData.getWorkerThreadPids());
        boolean success = workerManager.cleanupWorker(workerId);
        if (success) {
            supervisorData.getDeadWorkers().remove(workerId);
        }
    }
}
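
SyncProcessEvent implements Runnable, so the supervisor is expected to invoke it repeatedly on a timer rather than call run() once. The snippet below is a minimal, hypothetical sketch of such wiring using a plain ScheduledExecutorService: the SupervisorData setup is elided, and the 3-second period is an assumed placeholder rather than a value taken from the class above.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

class SyncProcessSchedulingSketch {
    public static void main(String[] args) {
        // Hypothetical: building a real SupervisorData instance is elided here;
        // with one available this would be: new SyncProcessEvent(supervisorData).
        Runnable syncProcesses = () -> System.out.println("sync-processes tick");

        // Run the sync task on a fixed period. The 3-second period is an assumed
        // stand-in for the supervisor's configured monitor frequency.
        ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();
        timer.scheduleAtFixedRate(syncProcesses, 0, 3, TimeUnit.SECONDS);
    }
}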