Java tutorial
/* * Copyright 2016 Pinterest, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.pinterest.teletraan.worker; import com.google.common.base.Joiner; import com.mysql.management.util.Str; import com.pinterest.arcee.autoscaling.AutoScalingManager; import com.pinterest.arcee.dao.HostInfoDAO; import com.pinterest.clusterservice.bean.AwsVmBean; import com.pinterest.clusterservice.bean.ClusterBean; import com.pinterest.clusterservice.bean.ClusterState; import com.pinterest.clusterservice.bean.ClusterUpgradeEventBean; import com.pinterest.clusterservice.bean.ClusterUpgradeEventState; import com.pinterest.clusterservice.bean.ClusterUpgradeEventStatus; import com.pinterest.clusterservice.cm.AwsVmManager; import com.pinterest.clusterservice.cm.ClusterManager; import com.pinterest.clusterservice.dao.ClusterDAO; import com.pinterest.clusterservice.dao.ClusterUpgradeEventDAO; import com.pinterest.clusterservice.handler.ClusterHandler; import com.pinterest.deployservice.ServiceContext; import com.pinterest.deployservice.bean.AgentBean; import com.pinterest.deployservice.bean.AgentStatus; import com.pinterest.deployservice.bean.DeployStage; import com.pinterest.deployservice.bean.EnvironBean; import com.pinterest.deployservice.bean.HostBean; import com.pinterest.deployservice.common.NotificationJob; import com.pinterest.deployservice.dao.AgentDAO; import com.pinterest.deployservice.dao.EnvironDAO; import com.pinterest.deployservice.dao.HostDAO; import com.pinterest.deployservice.dao.UtilDAO; import com.pinterest.deployservice.handler.CommonHandler; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.Connection; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.concurrent.ExecutorService; public class ClusterReplacer implements Runnable { private static final Logger LOG = LoggerFactory.getLogger(ClusterReplacer.class); private static final long DEFAULT_CLUSTER_UPGRADE_EVENT_TIMEOUT = 1800; private static final int MAX_HOST_LAUNCH_SIZE = 50; private final AgentDAO agentDAO; private final ClusterDAO clusterDAO; private final ClusterUpgradeEventDAO clusterUpgradeEventDAO; private final EnvironDAO environDAO; private final HostDAO hostDAO; private final HostInfoDAO hostInfoDAO; private final UtilDAO utilDAO; private final AutoScalingManager autoScalingManager; private final ClusterManager clusterManager; private final ClusterHandler clusterHandler; private final CommonHandler commonHandler; private final ExecutorService jobPool; public ClusterReplacer(ServiceContext serviceContext) { agentDAO = serviceContext.getAgentDAO(); clusterDAO = serviceContext.getClusterDAO(); clusterUpgradeEventDAO = serviceContext.getClusterUpgradeEventDAO(); environDAO = serviceContext.getEnvironDAO(); hostDAO = serviceContext.getHostDAO(); hostInfoDAO = serviceContext.getHostInfoDAO(); utilDAO = serviceContext.getUtilDAO(); autoScalingManager = serviceContext.getAutoScalingManager(); clusterManager = serviceContext.getClusterManager(); clusterHandler = new ClusterHandler(serviceContext); commonHandler = new CommonHandler(serviceContext); jobPool = serviceContext.getJobPool(); } private void transitionState(String id, ClusterUpgradeEventBean updateBean) throws Exception { updateBean.setState_start_time(System.currentTimeMillis()); updateBean.setLast_worked_on(System.currentTimeMillis()); clusterUpgradeEventDAO.updateById(id, updateBean); } private void updateClusterState(String clusterName) throws Exception { ClusterBean updateBean = new ClusterBean(); updateBean.setState(ClusterState.NORMAL); updateBean.setLast_update(System.currentTimeMillis()); clusterDAO.update(clusterName, updateBean); } private void updateHostsInClusterEvent(String id, Collection<String> hostIds) throws Exception { ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setHost_ids(Joiner.on(",").join(hostIds)); updateBean.setLast_worked_on(System.currentTimeMillis()); clusterUpgradeEventDAO.updateById(id, updateBean); } private boolean shouldTimeoutClusterUpgradeEvent(ClusterUpgradeEventBean eventBean) throws Exception { String clusterName = eventBean.getCluster_name(); long lastStateElapsedTime = System.currentTimeMillis() - eventBean.getLast_worked_on(); if (lastStateElapsedTime >= DEFAULT_CLUSTER_UPGRADE_EVENT_TIMEOUT * 1000) { LOG.info(String.format("Timeout cluster upgrade event id %s for cluster %s", eventBean.getId(), clusterName)); ClusterUpgradeEventBean updateEventBean = new ClusterUpgradeEventBean(); if (eventBean.getState() == ClusterUpgradeEventState.COMPLETING) { updateEventBean.setState(ClusterUpgradeEventState.COMPLETED); } else { updateEventBean.setState(ClusterUpgradeEventState.COMPLETING); } updateEventBean.setStatus(ClusterUpgradeEventStatus.TIMEOUT); updateEventBean.setError_message( String.format("Cluster upgrade event timeout at state %s", eventBean.getState().toString())); transitionState(eventBean.getId(), updateEventBean); updateClusterState(clusterName); EnvironBean environBean = environDAO.getById(eventBean.getEnv_id()); String message = String.format("Cluster upgrade event timeout at state %s for cluster <%s>", eventBean.getState().toString(), clusterName); String subject = String.format("Cluster Upgrade Event Alert - Timeout for cluster <%s>", clusterName); jobPool.submit(new NotificationJob(message, subject, environBean.getEmail_recipients(), environBean.getChatroom(), commonHandler)); return true; } return false; } /** * Step 1. INIT state will launch hosts outside of the auto scaling group * The number of hosts to be launched should be max_parallel_rp * If launching failed, retry INIT state until timeout meets */ private void processInitState(ClusterUpgradeEventBean eventBean) throws Exception { String clusterName = eventBean.getCluster_name(); EnvironBean environBean = environDAO.getById(eventBean.getEnv_id()); int totToLaunch = environBean.getMax_parallel_rp() <= 0 ? 1 : environBean.getMax_parallel_rp(); if (!StringUtils.isEmpty(eventBean.getHost_ids())) { Collection<String> oldHostIds = Arrays.asList(eventBean.getHost_ids().split(",")); totToLaunch -= oldHostIds.size(); } LOG.info(String.format("Start to launch hosts (number to launch: %d)", totToLaunch)); boolean succeeded = true; while (totToLaunch > 0) { int numToLaunch = Math.min(totToLaunch, MAX_HOST_LAUNCH_SIZE); Collection<HostBean> newHosts = clusterManager.launchHosts(clusterName, numToLaunch, false); if (newHosts.isEmpty()) { LOG.error(String.format("Failed to launch %s hosts in INIT state", numToLaunch)); succeeded = false; break; } LOG.info(String.format("Successfully launched %d hosts: %s", newHosts.size(), newHosts.toString())); Collection<String> updateHostIds = new ArrayList<>(); for (HostBean host : newHosts) { updateHostIds.add(host.getHost_id()); hostDAO.insert(host); } if (!StringUtils.isEmpty(eventBean.getHost_ids())) { Collection<String> oldHostIds = Arrays.asList(eventBean.getHost_ids().split(",")); updateHostIds.addAll(oldHostIds); } updateHostsInClusterEvent(eventBean.getId(), updateHostIds); totToLaunch -= newHosts.size(); } if (succeeded) { LOG.info("Successfully completed INIT state, move to LAUNCHING state"); ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setState(ClusterUpgradeEventState.LAUNCHING); updateBean.setStatus(ClusterUpgradeEventStatus.SUCCEEDED); transitionState(eventBean.getId(), updateBean); } } /** * Step 2. LAUNCHING state should make sure all the host is in RUNNING state and serving builds * If some hosts are terminated or deploy failed, go back to INIT state to relaunch */ private void processLaunchingState(ClusterUpgradeEventBean eventBean) throws Exception { String clusterName = eventBean.getCluster_name(); Collection<String> hostIds = Arrays.asList(eventBean.getHost_ids().split(",")); // 1. make sure every host is running Set<String> failedIds = hostInfoDAO.getTerminatedHosts(new HashSet<>(hostIds)); List<String> runningIds = hostInfoDAO.getRunningInstances(new ArrayList<>(hostIds)); // 2. make sure hosts are serving builds boolean succeeded = true; for (String hostId : runningIds) { List<AgentBean> agents = agentDAO.getByHostId(hostId); if (agents.isEmpty()) { LOG.info(String.format("Host %s has not ping server yet", hostId)); succeeded = false; continue; } // Make sure every env on the host are serving build for (AgentBean agent : agents) { if (agent.getDeploy_stage() != DeployStage.SERVING_BUILD) { succeeded = false; if (agent.getStatus() != AgentStatus.SUCCEEDED && agent.getStatus() != AgentStatus.UNKNOWN && agent.getStatus() != AgentStatus.SCRIPT_FAILED) { LOG.info(String.format("Deploy failed on host %s", hostId)); failedIds.add(hostId); } } } } // 3. if found failed hosts, terminate them and go back to INIT state to relaunch hosts if (!failedIds.isEmpty()) { Collection<String> updateHostIds = Arrays.asList(eventBean.getHost_ids().split(",")); updateHostIds.removeAll(failedIds); clusterManager.terminateHosts(clusterName, failedIds, true); LOG.info(String.format("Successfully terminate failed hosts %s, go back to INIT state", failedIds.toString())); ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setHost_ids(Joiner.on(",").join(updateHostIds)); updateBean.setState(ClusterUpgradeEventState.INIT); updateBean.setStatus(ClusterUpgradeEventStatus.SUCCEEDED); transitionState(eventBean.getId(), updateBean); return; } if (succeeded) { LOG.info("Successfully completed LAUNCHING state, move to REPLACING state"); ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setState(ClusterUpgradeEventState.REPLACING); updateBean.setStatus(ClusterUpgradeEventStatus.SUCCEEDED); transitionState(eventBean.getId(), updateBean); } } /** * Step 3. REPLACING state will guarantee that total agent count must be larger than number of hosts * in auto scaling group. If yes, stop the failed and can_retire host first and then stop the can_retire host * until there is no more can_retire host */ private void processReplacingState(ClusterUpgradeEventBean eventBean) throws Exception { String clusterName = eventBean.getCluster_name(); EnvironBean envBean = environDAO.getById(eventBean.getEnv_id()); String envName = envBean.getEnv_name(); String stageName = envBean.getStage_name(); List<String> retiredHosts = new ArrayList<>(hostDAO.getRetiredHostIdsByGroup(clusterName)); if (retiredHosts.isEmpty()) { LOG.info("Successfully completed REPLACING state, move to COMPLETING state"); ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setState(ClusterUpgradeEventState.COMPLETING); updateBean.setStatus(ClusterUpgradeEventStatus.SUCCEEDED); transitionState(eventBean.getId(), updateBean); } else { AwsVmBean awsVmBean = autoScalingManager.getAutoScalingGroupInfo(clusterName); int curAsgCapacity = awsVmBean.getCurSize(); // Total agent count = auto scaling capacity + non asg hosts count long servingAgentCnt = agentDAO.countServingTotal(envBean.getEnv_id()); if (servingAgentCnt <= curAsgCapacity) { LOG.debug(String.format("Wait for enough serving agents for %s/%s", envName, stageName)); return; } long stopCnt = servingAgentCnt - curAsgCapacity; // Stop failed and can_retire agents first List<String> retiredAndFailedHosts = new ArrayList<>( hostDAO.getRetiredAndFailedHostIdsByGroup(clusterName)); if (!retiredAndFailedHosts.isEmpty()) { int numToStop = Math.min(retiredAndFailedHosts.size(), (int) stopCnt); clusterHandler.stopHosts(envName, stageName, retiredAndFailedHosts.subList(0, numToStop)); LOG.info(String.format("Successfully stopped %d failed and retired hosts", numToStop)); stopCnt -= numToStop; } if (stopCnt <= 0) { ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setLast_worked_on(System.currentTimeMillis()); clusterUpgradeEventDAO.updateById(eventBean.getId(), updateBean); return; } // Stop can_retire agents retiredHosts.removeAll(retiredAndFailedHosts); if (!retiredHosts.isEmpty()) { int numToStop = Math.min(retiredHosts.size(), (int) stopCnt); clusterHandler.stopHosts(envName, stageName, retiredHosts.subList(0, numToStop)); LOG.info(String.format("Successfully stopped %d retired hosts", numToStop)); } ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setLast_worked_on(System.currentTimeMillis()); clusterUpgradeEventDAO.updateById(eventBean.getId(), updateBean); } } /** * Step 4. COMPLETING state should clean up hosts launched in INIT state * If found failed deploy asg hosts, use non-asg host to replace it. * No more launch activity in this state * If previous state failed/timeout/abort, should come to this state to do final clean up */ private void processCompletingState(ClusterUpgradeEventBean eventBean) throws Exception { String clusterName = eventBean.getCluster_name(); if (StringUtils.isEmpty(eventBean.getHost_ids())) { LOG.info("Successfully completed COMPLETING state, move to COMPLETED state"); ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); // Do not set status, leave it updateBean.setState(ClusterUpgradeEventState.COMPLETED); transitionState(eventBean.getId(), updateBean); updateClusterState(clusterName); return; } List<String> nonAsgHosts = Arrays.asList(eventBean.getHost_ids().split(",")); List<String> activeNonAsgHosts = hostInfoDAO.getRunningInstances(new ArrayList<>(nonAsgHosts)); if (activeNonAsgHosts.size() != activeNonAsgHosts.size()) { updateHostsInClusterEvent(eventBean.getId(), activeNonAsgHosts); } Collection<String> failedHosts = hostDAO.getFailedHostIdsByGroup(clusterName); List<String> failedDeployNonAsgHosts = new ArrayList<>(); List<String> failedDeployAsgHosts = new ArrayList<>(); for (String failedHost : failedHosts) { if (activeNonAsgHosts.contains(failedHost)) { failedDeployNonAsgHosts.add(failedHost); } else { failedDeployAsgHosts.add(failedHost); } } if (!failedDeployNonAsgHosts.isEmpty()) { activeNonAsgHosts.removeAll(failedDeployNonAsgHosts); clusterManager.terminateHosts(clusterName, failedDeployAsgHosts, true); LOG.info(String.format("Successfully terminated %d failed deploy non-asg hosts for cluster %s: %s", failedDeployNonAsgHosts.size(), clusterName, failedDeployAsgHosts.toString())); } if (!failedDeployAsgHosts.isEmpty() && !activeNonAsgHosts.isEmpty()) { int num = Math.min(failedDeployAsgHosts.size(), activeNonAsgHosts.size()); List<String> hostsToAttach = new ArrayList<>(activeNonAsgHosts.subList(0, num)); autoScalingManager.addInstancesToAutoScalingGroup(hostsToAttach, clusterName); activeNonAsgHosts.removeAll(hostsToAttach); LOG.info(String.format("Successfully attached %d hosts to cluster %s: %s", num, clusterName, hostsToAttach.toString())); List<String> hostsToTerminate = new ArrayList<>(failedDeployAsgHosts.subList(0, num)); clusterManager.terminateHosts(clusterName, hostsToTerminate, false); LOG.info(String.format("Successfully terminated %d failed deploy asg hosts in cluster %s: %s", num, clusterName, hostsToTerminate.toString())); } if (!activeNonAsgHosts.isEmpty()) { clusterManager.terminateHosts(clusterName, activeNonAsgHosts, true); LOG.info(String.format("Successfully terminated %d non asg hosts for cluster %s: %s", activeNonAsgHosts.size(), clusterName, activeNonAsgHosts.toString())); } LOG.info("Successfully completed COMPLETING state, move to COMPLETED state"); ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setHost_ids(""); // Do not set status, leave it updateBean.setState(ClusterUpgradeEventState.COMPLETED); updateBean.setLast_worked_on(System.currentTimeMillis()); transitionState(eventBean.getId(), updateBean); updateClusterState(clusterName); } private void processEvent(ClusterUpgradeEventBean eventBean) throws Exception { String clusterName = eventBean.getCluster_name(); ClusterBean clusterBean = clusterDAO.getByClusterName(clusterName); if (clusterBean.getState() == ClusterState.PAUSE) { LOG.info(String.format("Cluster upgrade event is paused for %s", clusterName)); ClusterUpgradeEventBean updateBean = new ClusterUpgradeEventBean(); updateBean.setLast_worked_on(System.currentTimeMillis()); clusterUpgradeEventDAO.updateById(eventBean.getId(), updateBean); return; } if (shouldTimeoutClusterUpgradeEvent(eventBean)) { return; } LOG.info(String.format("Start to process %s state for cluster %s: %s", eventBean.getState().toString(), clusterName, eventBean.toString())); if (eventBean.getState() == ClusterUpgradeEventState.INIT) { processInitState(eventBean); } else if (eventBean.getState() == ClusterUpgradeEventState.LAUNCHING) { processLaunchingState(eventBean); } else if (eventBean.getState() == ClusterUpgradeEventState.REPLACING) { processReplacingState(eventBean); } else if (eventBean.getState() == ClusterUpgradeEventState.COMPLETING) { processCompletingState(eventBean); } } private void processBatch() throws Exception { Collection<ClusterUpgradeEventBean> eventBeans = clusterUpgradeEventDAO.getOngoingEvents(); if (eventBeans.isEmpty()) { return; } for (ClusterUpgradeEventBean eventBean : eventBeans) { LOG.info(String.format("Start to process cluster upgrade event %s", eventBean.toString())); String lockName = String.format("CLUSTERREPLACER-%s", eventBean.getId()); Connection connection = utilDAO.getLock(lockName); if (connection != null) { try { processEvent(eventBean); } catch (Exception e) { LOG.error(String.format("Failed to process event %s", eventBean.toString()), e); } finally { utilDAO.releaseLock(lockName, connection); } } else { LOG.warn(String.format("Failed to get lock: %s", lockName)); } } } @Override public void run() { try { LOG.info("Start to run ClusterReplacer"); processBatch(); } catch (Throwable t) { LOG.error("Faile to run ClusterReplacer", t); } } }