Java tutorial
/* * Copyright (c) 2015 EMC Corporation * All Rights Reserved */ package com.emc.storageos.api.service.impl.resource; import java.net.InetAddress; import java.net.UnknownHostException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Properties; import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import javax.crypto.SecretKey; import javax.ws.rs.Consumes; import javax.ws.rs.DELETE; import javax.ws.rs.GET; import javax.ws.rs.HeaderParam; import javax.ws.rs.POST; import javax.ws.rs.PUT; import javax.ws.rs.Path; import javax.ws.rs.PathParam; import javax.ws.rs.Produces; import javax.ws.rs.QueryParam; import javax.ws.rs.core.MediaType; import javax.ws.rs.core.Response; import com.emc.storageos.coordinator.client.service.impl.DualInetAddress; import org.apache.commons.codec.binary.Base64; import org.apache.commons.lang.StringUtils; import org.apache.curator.framework.recipes.barriers.DistributedBarrier; import org.apache.curator.framework.recipes.leader.LeaderSelector; import org.apache.curator.framework.recipes.locks.InterProcessLock; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import com.emc.storageos.api.mapper.SiteMapper; import com.emc.storageos.api.service.impl.resource.utils.InternalSiteServiceClient; import com.emc.storageos.coordinator.client.model.Constants; import com.emc.storageos.coordinator.client.model.PropertyInfoExt; import com.emc.storageos.coordinator.client.model.RepositoryInfo; import com.emc.storageos.coordinator.client.model.Site; import com.emc.storageos.coordinator.client.model.SiteError; import com.emc.storageos.coordinator.client.model.SiteInfo; import com.emc.storageos.coordinator.client.model.SiteMonitorResult; import 
com.emc.storageos.coordinator.client.model.SiteState; import com.emc.storageos.coordinator.client.model.SoftwareVersion; import com.emc.storageos.coordinator.client.model.Site.NetworkHealth; import com.emc.storageos.coordinator.client.service.CoordinatorClient; import com.emc.storageos.coordinator.client.service.DrUtil; import com.emc.storageos.coordinator.client.service.impl.CoordinatorClientInetAddressMap; import com.emc.storageos.coordinator.client.service.impl.LeaderSelectorListenerImpl; import com.emc.storageos.coordinator.common.Configuration; import com.emc.storageos.coordinator.common.impl.ZkPath; import com.emc.storageos.coordinator.exceptions.CoordinatorException; import com.emc.storageos.coordinator.exceptions.RetryableCoordinatorException; import com.emc.storageos.db.client.DbClient; import com.emc.storageos.db.client.impl.DbClientImpl; import com.emc.storageos.db.client.model.StringMap; import com.emc.storageos.db.client.model.uimodels.InitialSetup; import com.emc.storageos.db.common.DbConfigConstants; import com.emc.storageos.model.dr.DRNatCheckParam; import com.emc.storageos.model.dr.DRNatCheckResponse; import com.emc.storageos.model.dr.FailoverPrecheckResponse; import com.emc.storageos.model.dr.SiteActive; import com.emc.storageos.model.dr.SiteAddParam; import com.emc.storageos.model.dr.SiteConfigParam; import com.emc.storageos.model.dr.SiteConfigRestRep; import com.emc.storageos.model.dr.SiteDetailRestRep; import com.emc.storageos.model.dr.SiteErrorResponse; import com.emc.storageos.model.dr.SiteIdListParam; import com.emc.storageos.model.dr.SiteList; import com.emc.storageos.model.dr.SiteParam; import com.emc.storageos.model.dr.SiteRestRep; import com.emc.storageos.model.dr.SiteUpdateParam; import com.emc.storageos.model.property.PropertyConstants; import com.emc.storageos.security.audit.AuditLogManager; import com.emc.storageos.security.authentication.InternalApiSignatureKeyGenerator; import 
com.emc.storageos.security.authentication.InternalApiSignatureKeyGenerator.SignatureKeyType; import com.emc.storageos.security.authorization.CheckPermission; import com.emc.storageos.security.authorization.DefaultPermissions; import com.emc.storageos.security.authorization.ExcludeLicenseCheck; import com.emc.storageos.security.authorization.Role; import com.emc.storageos.security.ipsec.IPsecConfig; import com.emc.storageos.services.OperationTypeEnum; import com.emc.storageos.services.util.SysUtils; import com.emc.storageos.svcs.errorhandling.resources.APIException; import com.emc.storageos.svcs.errorhandling.resources.InternalServerErrorException; import com.emc.vipr.client.ViPRCoreClient; import com.emc.vipr.client.ViPRSystemClient; import com.emc.vipr.model.sys.ClusterInfo; /** * APIs implementation to standby sites lifecycle management such as add-standby, remove-standby, failover, pause * resume replication etc. */ @Path("/site") @DefaultPermissions(readRoles = { Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_MONITOR }, writeRoles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }) public class DisasterRecoveryService { private static final Logger log = LoggerFactory.getLogger(DisasterRecoveryService.class); private static final String SHORTID_FMT = "site%d"; private static final int MAX_NUM_OF_STANDBY = 10; private static final String EVENT_SERVICE_TYPE = "DisasterRecovery"; private static final String NTPSERVERS = "network_ntpservers"; private static final int SITE_NAME_LENGTH_LIMIT = 64; private static final int SITE_CONNECT_TEST_TIMEOUT = 10 * 1000; private static final int SITE_CONNECTION_TEST_PORT = 443; private InternalApiSignatureKeyGenerator apiSignatureGenerator; private SiteMapper siteMapper; private SysUtils sysUtils; private CoordinatorClient coordinator; private DbClient dbClient; private IPsecConfig ipsecConfig; private Properties dbCommonInfo; private DrUtil drUtil; 
@Autowired private AuditLogManager auditMgr; /** * Record audit log for DisasterRecoveryService * * @param auditType * @param operationalStatus * @param operationStage * @param descparams */ protected void auditDisasterRecoveryOps(OperationTypeEnum auditType, String operationalStatus, String operationStage, Object... descparams) { auditMgr.recordAuditLog(null, null, EVENT_SERVICE_TYPE, auditType, System.currentTimeMillis(), operationalStatus, operationStage, descparams); } /** * init method, this will be called by Spring framework after create bean successfully */ public void init() { siteMapper = new SiteMapper(); startLeaderSelector(); } /** * Attach one fresh install site to this acitve site as standby * Or attach a acitve site for the local standby site when it's first being added. * * @param param site detail information * @return site response information */ @POST @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) public SiteRestRep addStandby(SiteAddParam param) { log.info("Adding standby site: {}", param.getVip()); precheckForGeo(); List<Site> existingSites = drUtil.listStandbySites(); // parameter validation and precheck validateAddParam(param, existingSites); // check the version before using the ViPR client, otherwise there might be compatibility issues. 
precheckStandbyVersion(param); ViPRCoreClient viprCoreClient; SiteConfigRestRep standbyConfig; try { viprCoreClient = createViPRCoreClient(param.getVip(), param.getUsername(), param.getPassword()); standbyConfig = viprCoreClient.site().getStandbyConfig(); } catch (Exception e) { log.error("Unexpected error when retrieving standby config", e); throw APIException.internalServerErrors .addStandbyPrecheckFailed("Cannot retrieve config from standby site"); } String siteId = standbyConfig.getUuid(); precheckForStandbyAdd(standbyConfig); InterProcessLock lock = drUtil.getDROperationLock(); Site standbySite = null; try { standbySite = new Site(); standbySite.setCreationTime((new Date()).getTime()); standbySite.setName(param.getName()); standbySite.setVdcShortId(drUtil.getLocalVdcShortId()); standbySite.setVip(standbyConfig.getVip()); standbySite.setVip6(standbyConfig.getVip6()); standbySite.getHostIPv4AddressMap().putAll(new StringMap(standbyConfig.getHostIPv4AddressMap())); standbySite.getHostIPv6AddressMap().putAll(new StringMap(standbyConfig.getHostIPv6AddressMap())); standbySite.setNodeCount(standbyConfig.getNodeCount()); standbySite.setUuid(standbyConfig.getUuid()); String shortId = generateShortId(drUtil.listSites()); standbySite.setSiteShortId(shortId); standbySite.setDescription(param.getDescription()); standbySite.setState(SiteState.STANDBY_ADDING); if (log.isDebugEnabled()) { log.debug(standbySite.toString()); } // Do this before tx get started which might write key to zk. 
SecretKey secretKey = apiSignatureGenerator.getSignatureKey(SignatureKeyType.INTERVDC_API); coordinator.startTransaction(); coordinator.addSite(standbyConfig.getUuid()); log.info("Persist standby site to ZK {}", shortId); // coordinator.setTargetInfo(standbySite); coordinator.persistServiceConfiguration(standbySite.toConfiguration()); drUtil.recordDrOperationStatus(standbySite); // wake up syssvc to regenerate configurations long vdcConfigVersion = DrUtil.newVdcConfigVersion(); drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_ADD_STANDBY, vdcConfigVersion); for (Site site : existingSites) { drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_ADD_STANDBY, vdcConfigVersion); } // sync site related info with to be added standby site long dataRevision = System.currentTimeMillis(); List<Site> allStandbySites = new ArrayList<>(); allStandbySites.add(standbySite); allStandbySites.addAll(existingSites); SiteConfigParam configParam = prepareSiteConfigParam(allStandbySites, ipsecConfig.getPreSharedKey(), standbyConfig.getUuid(), dataRevision, vdcConfigVersion, secretKey); viprCoreClient.site().syncSite(standbyConfig.getUuid(), configParam); drUtil.updateVdcTargetVersion(siteId, SiteInfo.DR_OP_CHANGE_DATA_REVISION, vdcConfigVersion, dataRevision); coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.ADD_STANDBY, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, standbySite.toBriefString()); return siteMapper.map(standbySite); } catch (Exception e) { log.error("Internal error for updating coordinator on standby", e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.ADD_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null, standbySite.toBriefString()); InternalServerErrorException addStandbyFailedException = APIException.internalServerErrors .addStandbyFailed(e.getMessage()); throw addStandbyFailedException; } finally { try { lock.release(); } catch (Exception ignore) { 
log.error(String.format("Lock release failed when adding standby %s", siteId)); } } } /** * Prepare all sites related info for synchronizing them from master to be added or resumed standby site * * @param standbySites All standby sites * @param ipsecKey The cluster ipsec key * @param targetStandbyUUID The uuid of the target standby * @param targetStandbyDataRevision The data revision of the target standby * @return SiteConfigParam all the sites configuration */ private SiteConfigParam prepareSiteConfigParam(List<Site> standbySites, String ipsecKey, String targetStandbyUUID, long targetStandbyDataRevision, long vdcConfigVersion, SecretKey secretKey) { log.info("Preparing to sync sites info among to be added/resumed standby site..."); Site active = drUtil.getActiveSite(); SiteConfigParam configParam = new SiteConfigParam(); SiteParam activeSite = new SiteParam(); siteMapper.map(active, activeSite); activeSite.setIpsecKey(ipsecKey); log.info(" active site info:{}", activeSite.toString()); configParam.setActiveSite(activeSite); List<SiteParam> standbySitesParam = new ArrayList<>(); for (Site standby : standbySites) { SiteParam standbyParam = new SiteParam(); siteMapper.map(standby, standbyParam); standbyParam.setSecretKey( new String(Base64.encodeBase64(secretKey.getEncoded()), Charset.forName("UTF-8"))); if (standby.getUuid().equals(targetStandbyUUID)) { log.info("Set data revision for site {} to {}", standby.getUuid(), targetStandbyDataRevision); standbyParam.setDataRevision(targetStandbyDataRevision); } standbySitesParam.add(standbyParam); log.info(" standby site info:{}", standbyParam.toString()); } configParam.setStandbySites(standbySitesParam); configParam.setVdcConfigVersion(vdcConfigVersion); // Need set stanby's NTP same as primary, so standby time is consistent with primary after reboot // It's because time inconsistency between primary and standby will cause db rebuild issue: COP-17965 PropertyInfoExt targetPropInfo = 
coordinator.getTargetInfo(PropertyInfoExt.class); String ntpServers = targetPropInfo.getProperty(NTPSERVERS); log.info(" active site ntp servers: {}", ntpServers); configParam.setNtpServers(ntpServers); return configParam; } /** * Initialize a to be added target standby * The current site will be demoted from active to standby during the process * * @param configParam * @return */ @PUT @Path("/{uuid}/initstandby") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) @ExcludeLicenseCheck public Response syncSites(SiteConfigParam configParam) { log.info("sync sites from acitve site"); return initStandby(configParam); } /** * Initialize a to-be added/resumed target standby * a) re-set all the latest site related info (persisted in ZK) in the target standby * b) vdc properties would be changed accordingly * c) the target standby reboot * d) re-set zk/db data during the target standby reboot * e) the target standby would connect with active and sync all the latest ZK&DB data. * * Scenarios: * a) For adding standby site scenario (External API), the current site will be demoted from active to standby during the process * b) For resuming standby site scenario (Internal API), the current site's original data will be cleaned by setting new data revision. 
* It is now only used for resuming long paused (> 5 days) standby site * * @param configParam * @return */ @PUT @Path("/internal/initstandby") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public Response initStandby(SiteConfigParam configParam) { try { SiteParam activeSiteParam = configParam.getActiveSite(); ipsecConfig.setPreSharedKey(activeSiteParam.getIpsecKey()); coordinator.addSite(activeSiteParam.getUuid()); Site activeSite = new Site(); siteMapper.map(activeSiteParam, activeSite); activeSite.setVdcShortId(drUtil.getLocalVdcShortId()); coordinator.persistServiceConfiguration(activeSite.toConfiguration()); Long dataRevision = null; // Add other standby sites for (SiteParam standby : configParam.getStandbySites()) { Site site = new Site(); siteMapper.map(standby, site); site.setVdcShortId(drUtil.getLocalVdcShortId()); coordinator.persistServiceConfiguration(site.toConfiguration()); coordinator.addSite(standby.getUuid()); if (standby.getUuid().equals(coordinator.getSiteId())) { dataRevision = standby.getDataRevision(); log.info("Set data revision to {}", dataRevision); } log.info("Persist standby site {} to ZK", standby.getVip()); } if (dataRevision == null) { throw new IllegalStateException("Illegal request on standby site. 
No data revision in request"); } String ntpServers = configParam.getNtpServers(); PropertyInfoExt targetPropInfo = coordinator.getTargetInfo(PropertyInfoExt.class); if (ntpServers != null && !ntpServers.equals(targetPropInfo.getProperty(NTPSERVERS))) { targetPropInfo.addProperty(NTPSERVERS, ntpServers); coordinator.setTargetInfo(targetPropInfo); log.info("Set ntp servers to {}", ntpServers); } drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_CHANGE_DATA_REVISION, configParam.getVdcConfigVersion(), dataRevision); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Internal error for updating coordinator on standby", e); throw APIException.internalServerErrors.configStandbyFailed(e.getMessage()); } } /** * Get all sites including standby and acitve * * @return site list contains all sites with detail information */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) public SiteList getSites() { log.info("Begin to list all standby sites of local VDC"); SiteList standbyList = new SiteList(); for (Site site : drUtil.listSites()) { standbyList.getSites().add(siteMapper.map(site)); } return standbyList; } /** * Check if current site is acitve site * * @return SiteActive true if current site is acitve else false */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Path("/active") public SiteActive checkIsActive() { log.info("Begin to check if site Active or Standby"); SiteActive isActiveSite = new SiteActive(); try { Site localSite = drUtil.getLocalSite(); isActiveSite.setIsActive(localSite.getState() == SiteState.ACTIVE); isActiveSite.setLocalSiteName(localSite.getName()); return isActiveSite; } catch (Exception e) { log.error("Can't get site is Active or Standby"); throw APIException.badRequests.siteIdNotFound(); } 
}

    /**
     * Get specified site according site UUID
     *
     * Any failure while looking the site up is reported to the caller as "site id not found".
     *
     * @param uuid site UUID
     * @return site response with detail information
     */
    @GET
    @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
    @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN,
            Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR })
    @Path("/{uuid}")
    public SiteRestRep getSite(@PathParam("uuid") String uuid) {
        log.info("Begin to get standby site by uuid {}", uuid);

        try {
            Site site = drUtil.getSiteFromLocalVdc(uuid);
            return siteMapper.map(site);
        } catch (Exception e) {
            // keep the root cause in the log; the API error below carries no detail
            log.error("Can't find site with specified site ID {}", uuid, e);
            throw APIException.badRequests.siteIdNotFound();
        }
    }

    /**
     * Remove a standby. After successfully done, it stops data replication to this site
     *
     * Wraps the uuid into a single-element id list and delegates to {@link #remove(SiteIdListParam)}.
     *
     * @param uuid standby site uuid
     * @return the response produced by the list based remove
     */
    @DELETE
    @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
    @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true)
    @Path("/{uuid}")
    public Response remove(@PathParam("uuid") String uuid) {
        SiteIdListParam param = new SiteIdListParam();
        param.getIds().add(uuid);
        return remove(param);
    }

    /**
     * Remove multiple standby sites.
After successfully done, it stops data replication to those sites * * @param idList site uuid list to be removed * @return */ @POST @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) @Path("/remove") public Response remove(SiteIdListParam idList) { List<String> siteIdList = idList.getIds(); String siteIdStr = StringUtils.join(siteIdList, ","); log.info("Begin to remove standby site from local vdc by uuid: {}", siteIdStr); List<Site> toBeRemovedSites = new ArrayList<>(); for (String siteId : siteIdList) { Site site; try { site = drUtil.getSiteFromLocalVdc(siteId); } catch (Exception ex) { log.error("Can't load site {} from ZK", siteId); throw APIException.badRequests.siteIdNotFound(); } if (site.getState().equals(SiteState.ACTIVE)) { log.error("Unable to remove this site {}. It is acitve", siteId); throw APIException.badRequests.operationNotAllowedOnActiveSite(); } if (site.getState().isDROperationOngoing() && !site.getState().equals(SiteState.STANDBY_SYNCING)) { log.error( "Unable to remove this site {} in state {}. 
" + "DR operation other than STANDBY_SYNCING is ongoing", siteId, site.getState().name()); throw APIException.internalServerErrors.concurrentDROperationNotAllowed(site.getName(), site.getState().toString()); } toBeRemovedSites.add(site); } // Build a site names' string for more human-readable Exception error message StringBuilder siteNamesSb = new StringBuilder(); for (Site site : toBeRemovedSites) { if (siteNamesSb.length() != 0) { siteNamesSb.append(", "); } siteNamesSb.append(site.getName()); } String SiteNamesStr = siteNamesSb.toString(); try { commonPrecheck(siteIdList); } catch (IllegalStateException e) { throw APIException.internalServerErrors.removeStandbyPrecheckFailed(SiteNamesStr, e.getMessage()); } InterProcessLock lock = drUtil.getDROperationLock(false); List<String> sitesString = new ArrayList<>(); try { log.info("Removing sites"); coordinator.startTransaction(); for (Site site : toBeRemovedSites) { site.setState(SiteState.STANDBY_REMOVING); coordinator.persistServiceConfiguration(site.toConfiguration()); drUtil.recordDrOperationStatus(site); sitesString.add(site.toBriefString()); } log.info("Notify all sites for reconfig"); long vdcTargetVersion = DrUtil.newVdcConfigVersion(); for (Site standbySite : drUtil.listSites()) { drUtil.updateVdcTargetVersion(standbySite.getUuid(), SiteInfo.DR_OP_REMOVE_STANDBY, vdcTargetVersion); } coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.REMOVE_STANDBY, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, StringUtils.join(sitesString, ',')); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Failed to remove site {}", siteIdStr, e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.REMOVE_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null, StringUtils.join(sitesString, ',')); throw APIException.internalServerErrors.removeStandbyFailed(SiteNamesStr, e.getMessage()); } finally { try { lock.release(); } catch 
(Exception ignore) { log.error(String.format("Lock release failed when removing standby sites: %s", siteIdStr)); } } } /** * Get standby site configuration * * @return SiteConfigRestRep standby site configuration. */ @GET @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) @Path("/localconfig") public SiteConfigRestRep getStandbyConfig() { log.info("Begin to get standby config"); String siteId = coordinator.getSiteId(); SecretKey key = apiSignatureGenerator.getSignatureKey(SignatureKeyType.INTERVDC_API); Site site = drUtil.getSiteFromLocalVdc(siteId); SiteConfigRestRep siteConfigRestRep = new SiteConfigRestRep(); siteConfigRestRep.setUuid(siteId); siteConfigRestRep.setVip(site.getVip()); siteConfigRestRep.setVip6(site.getVip6()); siteConfigRestRep.setSecretKey(new String(Base64.encodeBase64(key.getEncoded()), Charset.forName("UTF-8"))); siteConfigRestRep.setHostIPv4AddressMap(site.getHostIPv4AddressMap()); siteConfigRestRep.setHostIPv6AddressMap(site.getHostIPv6AddressMap()); siteConfigRestRep.setDbSchemaVersion(coordinator.getCurrentDbSchemaVersion()); siteConfigRestRep.setFreshInstallation(isFreshInstallation()); siteConfigRestRep.setClusterStable(isClusterStable()); siteConfigRestRep.setNodeCount(site.getNodeCount()); siteConfigRestRep.setState(site.getState().toString()); try { siteConfigRestRep.setSoftwareVersion( coordinator.getTargetInfo(RepositoryInfo.class).getCurrentVersion().toString()); } catch (Exception e) { log.error("Fail to get software version {}", e); } log.info("Return result: {}", siteConfigRestRep); return siteConfigRestRep; } @POST @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) 
@Path("/natcheck")
    @ExcludeLicenseCheck
    public DRNatCheckResponse checkIfBehindNat(DRNatCheckParam checkParam,
            @HeaderParam("X-Forwarded-For") String clientIp) {
        // Compares the addresses the client reports for itself with the address seen in the
        // X-Forwarded-For header; the actual comparison is delegated to sysUtils.checkIfBehindNat.
        if (checkParam == null) {
            log.error("checkParam is null, X-Forwarded-For is {}", clientIp);
            throw APIException.internalServerErrors.invalidNatCheckCall("(null)", clientIp);
        }

        String ipv4Str = checkParam.getIPv4Address();
        String ipv6Str = checkParam.getIPv6Address();
        log.info(String.format(
                "Performing NAT check, client address connecting to VIP: %s. Client reports its IPv4 = %s, IPv6 = %s",
                clientIp, ipv4Str, ipv6Str));

        boolean isBehindNat = false;
        try {
            isBehindNat = sysUtils.checkIfBehindNat(ipv4Str, ipv6Str, clientIp);
        } catch (Exception e) {
            // pass the exception as the throwable argument (no placeholder) so the stack trace is logged
            log.error("Fail to check NAT", e);
            throw APIException.internalServerErrors.invalidNatCheckCall(e.getMessage(), clientIp);
        }

        DRNatCheckResponse resp = new DRNatCheckResponse();
        resp.setSeenIp(clientIp);
        resp.setBehindNAT(isBehindNat);

        return resp;
    }

    /**
     * Pause a standby site that is already sync'ed with the active
     *
     * Wraps the uuid into a single-element id list and delegates to {@link #pause(SiteIdListParam)}.
     *
     * @param uuid site UUID
     * @return the response produced by the list based pause
     */
    @POST
    @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
    @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN,
            Role.RESTRICTED_SYSTEM_ADMIN }, blockProxies = true)
    @Path("/{uuid}/pause")
    public Response pauseStandby(@PathParam("uuid") String uuid) {
        SiteIdListParam param = new SiteIdListParam();
        param.getIds().add(uuid);
        return pause(param);
    }

    /**
     * Pause data replication to multiple standby sites.
* * @param idList site uuid list to be removed * @return */ @POST @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN }, blockProxies = true) @Path("/pause") public Response pause(SiteIdListParam idList) { List<String> siteIdList = idList.getIds(); String siteIdStr = StringUtils.join(siteIdList, ","); log.info("Begin to pause standby site from local vdc by uuid: {}", siteIdStr); List<Site> toBePausedSites = new ArrayList<>(); List<String> siteNameList = new ArrayList<>(); for (String siteId : siteIdList) { Site site; try { site = drUtil.getSiteFromLocalVdc(siteId); } catch (Exception ex) { log.error("Can't load site {} from ZK", siteId); throw APIException.badRequests.siteIdNotFound(); } SiteState state = site.getState(); if (state.equals(SiteState.ACTIVE)) { log.error("Unable to pause this site {}. It is acitve", siteId); throw APIException.badRequests.operationNotAllowedOnActiveSite(); } if (!state.equals(SiteState.STANDBY_SYNCED)) { log.error("Unable to pause this site {}. It is in state {}", siteId, state); throw APIException.badRequests.operationOnlyAllowedOnSyncedSite(site.getName(), state.toString()); } toBePausedSites.add(site); siteNameList.add(site.getName()); } // This String is only used to output human readable message to user when Exception is thrown String siteNameStr = StringUtils.join(siteNameList, ','); try { commonPrecheck(siteIdList); } catch (IllegalStateException e) { throw APIException.internalServerErrors.pauseStandbyPrecheckFailed(siteNameStr, e.getMessage()); } InterProcessLock lock = drUtil.getDROperationLock(); // any error is not retry-able beyond this point. 
List<String> sitesString = new ArrayList<>(); try { log.info("Pausing sites"); long vdcTargetVersion = DrUtil.newVdcConfigVersion(); coordinator.startTransaction(); for (Site site : toBePausedSites) { site.setState(SiteState.STANDBY_PAUSING); site.setLastStateUpdateTime(System.currentTimeMillis()); coordinator.persistServiceConfiguration(site.toConfiguration()); drUtil.recordDrOperationStatus(site); sitesString.add(site.toBriefString()); // notify the to-be-paused sites before others. drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_PAUSE_STANDBY, vdcTargetVersion); } log.info("Notify all sites for reconfig"); for (Site site : drUtil.listSites()) { if (toBePausedSites.contains(site)) { // Site#equals only compares the site uuid // already notified continue; } drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_PAUSE_STANDBY, vdcTargetVersion); } coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.PAUSE_STANDBY, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, StringUtils.join(sitesString, ',')); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Failed to pause site {}", siteIdStr, e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.PAUSE_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null, StringUtils.join(sitesString, ',')); throw APIException.internalServerErrors.pauseStandbyFailed(siteNameStr, e.getMessage()); } finally { try { lock.release(); } catch (Exception ignore) { log.error(String.format("Lock release failed when pausing standby site: %s", siteIdStr)); } } } /** * Resume data replication for a paused standby site * * @param uuid site UUID * @return updated standby site representation */ @POST @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN }, blockProxies = true) 
@Path("/{uuid}/resume") public SiteRestRep resumeStandby(@PathParam("uuid") String uuid) { log.info("Begin to resume data sync to standby site identified by uuid: {}", uuid); Site standby = validateSiteConfig(uuid); if (!standby.getState().equals(SiteState.STANDBY_PAUSED)) { log.error("site {} is in state {}, should be STANDBY_PAUSED", uuid, standby.getState()); throw APIException.badRequests.operationOnlyAllowedOnPausedSite(standby.getName(), standby.getState().toString()); } try (InternalSiteServiceClient client = createInternalSiteServiceClient(standby)) { commonPrecheck(uuid); client.setCoordinatorClient(coordinator); client.setKeyGenerator(apiSignatureGenerator); client.resumePrecheck(); } catch (IllegalStateException e) { throw APIException.internalServerErrors.resumeStandbyPrecheckFailed(standby.getName(), e.getMessage()); } // Do this before tx get started which might write key to zk. SecretKey secretKey = apiSignatureGenerator.getSignatureKey(SignatureKeyType.INTERVDC_API); InterProcessLock lock = drUtil.getDROperationLock(); long vdcTargetVersion = DrUtil.newVdcConfigVersion(); try { coordinator.startTransaction(); for (Site site : drUtil.listStandbySites()) { long dataRevision = 0; if (site.getUuid().equals(uuid)) { int gcGracePeriod = DbConfigConstants.DEFAULT_GC_GRACE_PERIOD; String strVal = dbCommonInfo.getProperty(DbClientImpl.DB_CASSANDRA_INDEX_GC_GRACE_PERIOD); if (strVal != null) { gcGracePeriod = Integer.parseInt(strVal); } // last state update should be PAUSED if ((System.currentTimeMillis() - site.getLastStateUpdateTime()) / 1000 >= gcGracePeriod) { log.error("site {} has been paused for too long, we will re-init the target standby", uuid); // init the to-be resumed standby site dataRevision = System.currentTimeMillis(); List<Site> standbySites = drUtil.listStandbySites(); SiteConfigParam configParam = prepareSiteConfigParam(standbySites, ipsecConfig.getPreSharedKey(), uuid, dataRevision, vdcTargetVersion, secretKey); try 
(InternalSiteServiceClient internalSiteServiceClient = new InternalSiteServiceClient()) { internalSiteServiceClient.setCoordinatorClient(coordinator); internalSiteServiceClient.setServer(site.getVipEndPoint()); internalSiteServiceClient.initStandby(configParam); } } // update the site state AFTER checking the last state update time site.setState(SiteState.STANDBY_RESUMING); coordinator.persistServiceConfiguration(site.toConfiguration()); drUtil.recordDrOperationStatus(site); } if (dataRevision != 0) { drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_CHANGE_DATA_REVISION, dataRevision, vdcTargetVersion); } else { drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_RESUME_STANDBY, vdcTargetVersion); } } // update the local(acitve) site last drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_RESUME_STANDBY, vdcTargetVersion); coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.RESUME_STANDBY, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, standby.toBriefString()); return siteMapper.map(standby); } catch (Exception e) { log.error("Error resuming site {}", uuid, e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.RESUME_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null, standby.toBriefString()); InternalServerErrorException resumeStandbyFailedException = APIException.internalServerErrors .resumeStandbyFailed(standby.getName(), e.getMessage()); throw resumeStandbyFailedException; } finally { try { lock.release(); } catch (Exception ignore) { log.error(String.format("Lock release failed when resuming standby site: %s", uuid)); } } } /** * This is internal API to do precheck for resume */ @POST @Path("/internal/resumeprecheck") @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public SiteErrorResponse resumePrecheck() { log.info("Precheck for resume internally"); SiteErrorResponse response = new SiteErrorResponse(); try { 
precheckForResumeLocalStandby(); } catch (APIException e) { log.warn("Failed to precheck switchover", e); response.setErrorMessage(e.getMessage()); response.setServiceCode(e.getServiceCode().ordinal()); return response; } catch (Exception e) { log.error("Failed to precheck switchover", e); response.setErrorMessage(e.getMessage()); return response; } return response; } private void precheckForResumeLocalStandby() { Site localSite = drUtil.getLocalSite(); if (!isClusterStable()) { throw APIException.serviceUnavailable.siteClusterStateNotStable(localSite.getName(), Objects.toString(coordinator.getControlNodesState())); } if (SiteState.STANDBY_PAUSED != localSite.getState()) { throw APIException.internalServerErrors.resumeStandbyPrecheckFailed(localSite.getName(), "Standby site is not in paused state"); } } /** * Query the latest error message for specific standby site * * @param uuid site UUID * @return updated standby site representation */ @POST @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) @Path("/{uuid}/retry") public SiteRestRep retryOperation(@PathParam("uuid") String uuid) { log.info("Begin to get site error by uuid {}", uuid); Site standby; try { standby = drUtil.getSiteFromLocalVdc(uuid); } catch (CoordinatorException e) { log.error("Can't find site {} from ZK", uuid); throw APIException.badRequests.siteIdNotFound(); } if (!standby.getState().equals(SiteState.STANDBY_ERROR)) { log.error("site {} is in state {}, should be STANDBY_ERROR", uuid, standby.getState()); throw APIException.badRequests.operationOnlyAllowedOnErrorSite(standby.getName(), standby.getState().toString()); } if (!standby.getLastState().equals(SiteState.STANDBY_PAUSING) && !standby.getLastState().equals(SiteState.STANDBY_RESUMING) && !standby.getLastState().equals(SiteState.STANDBY_FAILING_OVER)) { log.error("site {} 
lastState was {}, retry is only supported for Pause, Resume and Failover", uuid, standby.getLastState()); throw APIException.badRequests.operationRetryOnlyAllowedOnLastState(standby.getName(), standby.getLastState().toString()); } InterProcessLock lock = drUtil.getDROperationLock(); try { coordinator.startTransaction(); standby.setState(standby.getLastState()); coordinator.persistServiceConfiguration(standby.toConfiguration()); log.info("Notify all sites for reconfig"); long vdcTargetVersion = DrUtil.newVdcConfigVersion(); //Reuse the current action required SiteInfo siteInfo = coordinator.getTargetInfo(standby.getUuid(), SiteInfo.class); String drOperation = siteInfo.getActionRequired(); for (Site standbySite : drUtil.listSites()) { drUtil.updateVdcTargetVersion(standbySite.getUuid(), drOperation, vdcTargetVersion); } coordinator.commitTransaction(); return siteMapper.map(standby); } catch (Exception e) { log.error("Error retrying site operation for site {}", uuid, e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.RETRY_STANDBY_OP, AuditLogManager.AUDITLOG_FAILURE, null, standby); InternalServerErrorException retryStandbyOpFailedException = APIException.internalServerErrors .retryStandbyOpFailed(standby.getName(), e.getMessage()); throw retryStandbyOpFailedException; } finally { try { lock.release(); } catch (Exception ignore) { log.error(String.format("Lock release failed when retrying standby site last op: %s", uuid)); } } } /** * Retry last operation when in STANDBY_ERROR * * @param uuid site UUID * @return site response with detail information */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) @Path("/{uuid}/error") public SiteErrorResponse getSiteError(@PathParam("uuid") String uuid) { log.info("Begin to get site error by uuid {}", uuid); try { Site 
standby = drUtil.getSiteFromLocalVdc(uuid); if (standby.getState().equals(SiteState.STANDBY_ERROR)) { return coordinator.getTargetInfo(uuid, SiteError.class).toResponse(); } } catch (CoordinatorException e) { log.error("Can't find site {} from ZK", uuid); throw APIException.badRequests.siteIdNotFound(); } catch (Exception e) { log.error("Find find site from ZK for UUID {} : {}" + uuid, e); } return SiteErrorResponse.noError(); } /** * This API will do switchover to target new acitve site according passed in site UUID. After failover, old acitve site will * work as normal standby site and target site will be promoted to acitve. All site will update properties to trigger reconfig. * * @param uuid target new acitve site UUID * @return return accepted response if operation is successful */ @POST @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Path("/{uuid}/switchover") @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) public Response doSwitchover(@PathParam("uuid") String uuid) { log.info("Begin to switchover for standby UUID {}", uuid); precheckForSwitchoverForActiveSite(uuid); List<Site> allStandbySites = drUtil.listStandbySites(); for (Site site : allStandbySites) { if (!site.getUuid().equals(uuid) && site.getState() == SiteState.STANDBY_PAUSED) { try (InternalSiteServiceClient client = new InternalSiteServiceClient(site)) { client.setCoordinatorClient(coordinator); client.setKeyGenerator(apiSignatureGenerator); client.switchoverPrecheck(); } } } String oldActiveUUID = drUtil.getActiveSite().getUuid(); InterProcessLock lock = drUtil.getDROperationLock(); Site newActiveSite = null; Site oldActiveSite = null; try { newActiveSite = drUtil.getSiteFromLocalVdc(uuid); // Set old active site's state, short id and key oldActiveSite = drUtil.getSiteFromLocalVdc(oldActiveUUID); if (StringUtils.isEmpty(oldActiveSite.getSiteShortId())) { oldActiveSite.setSiteShortId(newActiveSite.getVdcShortId()); } 
coordinator.startTransaction(); oldActiveSite.setState(SiteState.ACTIVE_SWITCHING_OVER); coordinator.persistServiceConfiguration(oldActiveSite.toConfiguration()); // this barrier is set when begin switchover and will be removed by new active site. Old active site will wait and reboot after // barrier is removed DistributedBarrier restartBarrier = coordinator.getDistributedBarrier(String.format("%s/%s/%s", ZkPath.SITES, oldActiveSite.getUuid(), Constants.SWITCHOVER_BARRIER_RESTART)); restartBarrier.setBarrier(); drUtil.recordDrOperationStatus(oldActiveSite); // trigger reconfig long vdcConfigVersion = System.currentTimeMillis(); // a version for all sites. for (Site eachSite : drUtil.listSites()) { if (!eachSite.getUuid().equals(uuid) && eachSite.getState() == SiteState.STANDBY_PAUSED) { try (InternalSiteServiceClient client = new InternalSiteServiceClient(eachSite)) { client.setCoordinatorClient(coordinator); client.setKeyGenerator(apiSignatureGenerator); client.switchover(newActiveSite.getUuid(), vdcConfigVersion); } } else { drUtil.updateVdcTargetVersion(eachSite.getUuid(), SiteInfo.DR_OP_SWITCHOVER, vdcConfigVersion, oldActiveSite.getUuid(), newActiveSite.getUuid()); } } coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.SWITCHOVER, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, oldActiveSite.toBriefString(), newActiveSite.toBriefString()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error(String.format("Error happened when switchover from site %s to site %s", oldActiveUUID, uuid), e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.SWITCHOVER, AuditLogManager.AUDITLOG_FAILURE, null, newActiveSite.getName(), newActiveSite.getVipEndPoint()); throw APIException.internalServerErrors.switchoverFailed(oldActiveSite.getName(), newActiveSite.getName(), e.getMessage()); } finally { try { lock.release(); } catch (Exception ignore) { 
log.error(String.format("Lock release failed when switchover from %s to %s", oldActiveUUID, uuid)); } } } /** * This is internal API to do precheck for switchover * * @return return response with error message and service code */ @POST @Path("/internal/switchoverprecheck") @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public SiteErrorResponse switchoverPrecheck() { log.info("Precheck for switchover internally"); SiteErrorResponse response = new SiteErrorResponse(); try { precheckForSwitchoverForLocalStandby(); } catch (InternalServerErrorException e) { log.warn("Failed to precheck switchover", e); response.setErrorMessage(e.getMessage()); response.setServiceCode(e.getServiceCode().ordinal()); return response; } catch (Exception e) { log.error("Failed to precheck switchover", e); response.setErrorMessage(e.getMessage()); return response; } return response; } /** * This is internal API to do switchover * * @return return response with error message and service code */ @POST @Path("/internal/switchover") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public Response switchover(@QueryParam("newActiveSiteUUid") String newActiveSiteUUID, @QueryParam("vdcVersion") String vdcTargetVersion) { log.info("Begin to switchover internally for standby UUID {}", newActiveSiteUUID); Site newActiveSite = null; Site oldActiveSite = null; try { newActiveSite = drUtil.getSiteFromLocalVdc(newActiveSiteUUID); oldActiveSite = drUtil.getSiteFromLocalVdc(drUtil.getActiveSite().getUuid()); if (StringUtils.isEmpty(oldActiveSite.getSiteShortId())) { oldActiveSite.setSiteShortId(newActiveSite.getVdcShortId()); } oldActiveSite.setState(SiteState.STANDBY_SYNCED); coordinator.persistServiceConfiguration(oldActiveSite.toConfiguration()); newActiveSite.setState(SiteState.ACTIVE); coordinator.persistServiceConfiguration(newActiveSite.toConfiguration()); 
drUtil.updateVdcTargetVersion(drUtil.getLocalSite().getUuid(), SiteInfo.DR_OP_SWITCHOVER, Long.parseLong(vdcTargetVersion), oldActiveSite.getUuid(), newActiveSite.getUuid()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error(String.format("Error happened when switchover to site %s", newActiveSiteUUID), e); throw APIException.internalServerErrors.switchoverFailed(oldActiveSite.getName(), newActiveSite.getName(), e.getMessage()); } } /** * This API will do failover from standby site. This operation is only allowed when acitve site is down. * After failover, this standby site will be promoted to acitve site. * * @param uuid target new acitve site UUID * @return return accepted response if operation is successful */ @POST @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Path("/{uuid}/failover") @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) public Response doFailover(@PathParam("uuid") String uuid) { log.info("Begin to failover for standby UUID {}", uuid); Site currentSite = drUtil.getSiteFromLocalVdc(uuid); precheckForFailoverLocally(uuid); List<Site> allStandbySites = drUtil.listStandbySites(); List<SiteRestRep> responseSiteFromRemote = new ArrayList<SiteRestRep>(allStandbySites.size()); for (Site site : allStandbySites) { if (!site.getUuid().equals(uuid)) { try (InternalSiteServiceClient client = new InternalSiteServiceClient(site)) { client.setCoordinatorClient(coordinator); client.setKeyGenerator(apiSignatureGenerator); FailoverPrecheckResponse precheckResponse = client.failoverPrecheck(); if (precheckResponse != null) { responseSiteFromRemote.add(precheckResponse.getSite()); } else { log.warn("Failed to do failover precheck for site {}, ignore it for failover", site.toBriefString()); } } catch (Exception e) { log.error("Failed to do failover precheck for site {}, ignore it for failover", site.toBriefString()); } } } SiteRestRep recommendSite = 
findRecommendFailoverSite(responseSiteFromRemote, currentSite); if (!recommendSite.getUuid().equals(currentSite.getUuid())) { throw APIException.internalServerErrors.failoverPrecheckFailed(currentSite.getName(), String.format("Another site %s state is %s with latest data. Please failover to site %s", recommendSite.getName(), recommendSite.getState(), recommendSite.getName())); } try { coordinator.startTransaction(); // set state String activeSiteId = drUtil.getActiveSite().getUuid(); Site oldActiveSite = new Site(); if (StringUtils.isEmpty(activeSiteId)) { log.info("Cant't find active site id, go on to do failover"); } else { oldActiveSite = drUtil.getSiteFromLocalVdc(activeSiteId); oldActiveSite.setState(SiteState.ACTIVE_FAILING_OVER); coordinator.persistServiceConfiguration(oldActiveSite.toConfiguration()); } currentSite.setState(SiteState.STANDBY_FAILING_OVER); coordinator.persistServiceConfiguration(currentSite.toConfiguration()); drUtil.recordDrOperationStatus(currentSite); long vdcTargetVersion = DrUtil.newVdcConfigVersion(); //reconfig other standby sites for (Site site : allStandbySites) { if (!site.getUuid().equals(uuid)) { try (InternalSiteServiceClient client = new InternalSiteServiceClient(site)) { client.setCoordinatorClient(coordinator); client.setKeyGenerator(apiSignatureGenerator); client.failover(uuid, oldActiveSite.getUuid(), vdcTargetVersion); } catch (Exception e) { log.error("Failed to do failover for site {}, ignore it for failover", site.toBriefString()); } // update the vdc config version on the new active site. 
drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_FAILOVER, vdcTargetVersion, oldActiveSite.getUuid(), currentSite.getUuid()); } } drUtil.updateVdcTargetVersion(uuid, SiteInfo.DR_OP_FAILOVER, vdcTargetVersion, oldActiveSite.getUuid(), currentSite.getUuid()); coordinator.commitTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.FAILOVER, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, oldActiveSite.toBriefString(), currentSite.toBriefString()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Error happened when failover at site %s", uuid, e); coordinator.discardTransaction(); auditDisasterRecoveryOps(OperationTypeEnum.FAILOVER, AuditLogManager.AUDITLOG_FAILURE, null, currentSite.getName(), currentSite.getVipEndPoint()); throw APIException.internalServerErrors.failoverFailed(currentSite.getName(), e.getMessage()); } } /** * This is internal API to do precheck for failover * * @return return response with error message and service code */ @POST @Path("/internal/failoverprecheck") @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) public FailoverPrecheckResponse failoverPrecheck() { log.info("Precheck for failover internally"); FailoverPrecheckResponse response = new FailoverPrecheckResponse(); response.setSite(this.siteMapper.map(drUtil.getLocalSite())); try { precheckForFailover(); } catch (InternalServerErrorException e) { log.warn("Failed to precheck failover", e); response.setErrorMessage(e.getMessage()); response.setServiceCode(e.getServiceCode().ordinal()); return response; } catch (Exception e) { log.error("Failed to precheck failover", e); response.setErrorMessage(e.getMessage()); return response; } return response; } /** * This is internal API to do failover * * @return return response with error message and service code */ @POST @Path("/internal/failover") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, 
MediaType.APPLICATION_JSON }) public Response failover(@QueryParam("newActiveSiteUUid") String newActiveSiteUUID, @QueryParam("oldActiveSiteUUid") String oldActiveSiteUUID, @QueryParam("vdcVersion") String vdcTargetVersion) { log.info("Begin to failover internally with newActiveSiteUUid {}, oldActiveSiteUUid {}", newActiveSiteUUID, oldActiveSiteUUID); Site currentSite = drUtil.getLocalSite(); String uuid = currentSite.getUuid(); try { // set state Site oldActiveSite = new Site(); if (StringUtils.isEmpty(oldActiveSiteUUID)) { log.info("Cant't find active site id, go on to do failover"); } else { oldActiveSite = drUtil.getSiteFromLocalVdc(oldActiveSiteUUID); drUtil.removeSite(oldActiveSite); } Site newActiveSite = drUtil.getSiteFromLocalVdc(newActiveSiteUUID); newActiveSite.setState(SiteState.STANDBY_FAILING_OVER); coordinator.persistServiceConfiguration(newActiveSite.toConfiguration()); drUtil.updateVdcTargetVersion(currentSite.getUuid(), SiteInfo.DR_OP_FAILOVER, Long.parseLong(vdcTargetVersion), oldActiveSite.getUuid(), currentSite.getUuid()); auditDisasterRecoveryOps(OperationTypeEnum.FAILOVER, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, oldActiveSite.toBriefString(), newActiveSite.toBriefString()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Error happened when failover at site %s", uuid, e); auditDisasterRecoveryOps(OperationTypeEnum.FAILOVER, AuditLogManager.AUDITLOG_FAILURE, null, uuid, currentSite.getVipEndPoint(), currentSite.getName()); throw APIException.internalServerErrors.failoverFailed(currentSite.getName(), e.getMessage()); } } /** * Update site information. Only name and description can be updated. 
* * @param uuid target site uuid * @param siteParam site information * @return */ @PUT @Path("/{uuid}") @Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true) public Response updateSite(@PathParam("uuid") String uuid, SiteUpdateParam siteParam) { log.info("Begin to update site information for {}", uuid); Site site = null; try { site = drUtil.getSiteFromLocalVdc(uuid); } catch (RetryableCoordinatorException e) { log.error("Can't find site with specified site UUID {}", uuid); throw APIException.badRequests.siteIdNotFound(); } if (!validSiteName(siteParam.getName())) { throw APIException.internalServerErrors.updateSiteFailed(site.getName(), String .format("Site name should not be empty or longer than %d characters.", SITE_NAME_LENGTH_LIMIT)); } for (Site eachSite : drUtil.listSites()) { if (eachSite.getUuid().equals(uuid)) { continue; } if (eachSite.getName().equals(siteParam.getName())) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Duplicate site name"); } } try { site.setName(siteParam.getName()); site.setDescription(siteParam.getDescription()); coordinator.persistServiceConfiguration(site.toConfiguration()); auditDisasterRecoveryOps(OperationTypeEnum.UPDATE_SITE, AuditLogManager.AUDITLOG_SUCCESS, null, site.getName(), site.getVipEndPoint(), site.getUuid()); return Response.status(Response.Status.ACCEPTED).build(); } catch (Exception e) { log.error("Error happened when update site %s", uuid, e); auditDisasterRecoveryOps(OperationTypeEnum.UPDATE_SITE, AuditLogManager.AUDITLOG_FAILURE, null, site.getName(), site.getVipEndPoint(), site.getUuid()); throw APIException.internalServerErrors.updateSiteFailed(site.getName(), e.getMessage()); } } private boolean validSiteName(String siteName) { if (!StringUtils.isBlank(siteName) && siteName.length() <= 
SITE_NAME_LENGTH_LIMIT) { return true; } return false; } private boolean isDataSynced(Site site) { if (site.getState().equals(SiteState.ACTIVE)) { return true; } else if (site.getState().equals(SiteState.STANDBY_SYNCED) && !Site.NetworkHealth.BROKEN.equals(site.getNetworkHealth())) { return true; } return false; } private Date getLastSyncTime(Site site) { if (site.getNetworkHealth() == NetworkHealth.BROKEN) { return null; } if (site.getState() == SiteState.STANDBY_PAUSED) { return new Date(site.getLastStateUpdateTime()); } else if (site.getState() == SiteState.STANDBY_DEGRADED) { return new Date(site.getLastLostQuorumTime()); } return null; } /** * Query the details, such as transition timings, for specific standby site * * @param uuid site UUID * @return SiteActionsTime with detail information */ @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN, Role.SYSTEM_MONITOR }) @Path("/{uuid}/details") public SiteDetailRestRep getSiteDetails(@PathParam("uuid") String uuid) { log.info("Begin to get site paused time by uuid {}", uuid); SiteDetailRestRep standbyDetails = new SiteDetailRestRep(); try { Site standby = drUtil.getSiteFromLocalVdc(uuid); standbyDetails.setCreationTime(new Date(standby.getCreationTime())); standbyDetails.setNetworkLatencyInMs(standby.getNetworkLatencyInMs()); Date lastSyncTime = getLastSyncTime(standby); if (lastSyncTime != null) { standbyDetails.setLastSyncTime(lastSyncTime); } standbyDetails.setDataSynced(isDataSynced(standby)); ClusterInfo.ClusterState clusterState = coordinator.getControlNodesState(standby.getUuid(), standby.getNodeCount()); if (clusterState != null) { standbyDetails.setClusterState(clusterState.toString()); } else { standbyDetails.setClusterState(ClusterInfo.ClusterState.UNKNOWN.toString()); } } catch (CoordinatorException e) { log.error("Can't find site {} from ZK", uuid); 
throw APIException.badRequests.siteIdNotFound(); } catch (Exception e) { log.error("Find find site from ZK for UUID {} : {}" + uuid, e); } return standbyDetails; } @GET @Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON }) @Path("/internal/list") public SiteList getSitesInternally() { return this.getSites(); } /** * Common precheck logic for DR operations. * * @param excludedSiteIds, site ids to exclude from the cluster state precheck */ private void commonPrecheck(List<String> excludedSiteIds) { if (drUtil.isStandby()) { throw new IllegalStateException("Operation is allowed on acitve site only"); } if (!isClusterStable()) { throw new IllegalStateException("Cluster is not stable"); } for (Site site : drUtil.listStandbySites()) { if (excludedSiteIds.contains(site.getUuid())) { continue; } // don't check node state for paused sites. if (site.getState().equals(SiteState.STANDBY_PAUSED)) { continue; } int nodeCount = site.getNodeCount(); ClusterInfo.ClusterState state = coordinator.getControlNodesState(site.getUuid(), nodeCount); if (state != ClusterInfo.ClusterState.STABLE) { log.error("Site {} is not stable {}", site.getUuid(), state); throw new IllegalStateException(String.format("Site %s is not stable", site.getName())); } } } /** * Wrapper for commonPrecheck that takes a single site instead of a list * * @param excludedSiteId, site id to be excluded from the cluster state precheck, check all if set to null. 
*/ private void commonPrecheck(String excludedSiteId) { List<String> excludedSiteIds = new ArrayList<>(); if (excludedSiteId != null) { excludedSiteIds.add(excludedSiteId); } commonPrecheck(excludedSiteIds); } private Site validateSiteConfig(String uuid) { if (!isClusterStable()) { log.error("Cluster is unstable"); throw APIException.serviceUnavailable.clusterStateNotStable(); } try { return drUtil.getSiteFromLocalVdc(uuid); } catch (CoordinatorException e) { log.error("Can't find site {} from ZK", uuid); throw APIException.badRequests.siteIdNotFound(); } } private void precheckForGeo() { Map<String, List<Site>> vdcSiteMap = drUtil.getVdcSiteMap(); int numOfVdcs = vdcSiteMap.keySet().size(); if (numOfVdcs > 1) { throw APIException.internalServerErrors .addStandbyPrecheckFailed("Not allowed to add standby site in multivdc configuration"); } } /* * Internal method to check whether standby can be attached to current active site */ protected void precheckForStandbyAdd(SiteConfigRestRep standby) { if (!isClusterStable()) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Current site is not stable"); } if (!standby.isClusterStable()) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Remote site is not stable"); } // standby should be refresh install if (!standby.isFreshInstallation() && !SiteState.ACTIVE_DEGRADED.toString().equalsIgnoreCase(standby.getState())) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Standby is not a fresh installation"); } // DB schema version should be same String currentDbSchemaVersion = coordinator.getCurrentDbSchemaVersion(); String standbyDbSchemaVersion = standby.getDbSchemaVersion(); if (!currentDbSchemaVersion.equalsIgnoreCase(standbyDbSchemaVersion)) { throw APIException.internalServerErrors.addStandbyPrecheckFailed( String.format("Standby db schema version %s is not same as active site %s", standbyDbSchemaVersion, currentDbSchemaVersion)); } // this site should not be standby site 
String activeId = drUtil.getActiveSite().getUuid(); if (activeId != null && !activeId.equals(coordinator.getSiteId())) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("This site is also a standby site"); } checkSupportedIPForAttachStandby(standby); } protected void checkSupportedIPForAttachStandby(SiteConfigRestRep standby) { Site site = drUtil.getLocalSite(); // active has IPv4 and standby has no IPv4 if (!isHostIPAddressMapEmpty(site.getHostIPv4AddressMap()) && isHostIPAddressMapEmpty(standby.getHostIPv4AddressMap())) { throw APIException.internalServerErrors.addStandbyPrecheckFailed( "Unsupported network configuration. Active site has IPv4. Standby site should be IPv4 or dual stack "); } // active has only IPv6 and standby has IPv4 if (isHostIPAddressMapEmpty(site.getHostIPv4AddressMap()) && !isHostIPAddressMapEmpty(standby.getHostIPv4AddressMap())) { throw APIException.internalServerErrors.addStandbyPrecheckFailed( "Unsupported network configuration. Active site only has IPv6, Standby site should not has IPv4 address"); } } private boolean isHostIPAddressMapEmpty(Map<String, String> map) { if (map == null) { return true; } for (String ip : map.values()) { if (!PropertyConstants.IPV4_ADDR_DEFAULT.equals(ip) && !PropertyConstants.IPV6_ADDR_DEFAULT.equals(ip)) { return false; } } return true; } protected void precheckStandbyVersion(SiteAddParam standby) { ViPRSystemClient viprSystemClient = createViPRSystemClient(standby.getVip(), standby.getUsername(), standby.getPassword()); // software version should be matched SoftwareVersion currentSoftwareVersion; SoftwareVersion standbySoftwareVersion; try { currentSoftwareVersion = coordinator.getTargetInfo(RepositoryInfo.class).getCurrentVersion(); standbySoftwareVersion = new SoftwareVersion( viprSystemClient.upgrade().getTargetVersion().getTargetVersion()); } catch (Exception e) { throw APIException.internalServerErrors .addStandbyPrecheckFailed(String.format("Fail to get software version %s", 
e.getMessage())); } // enforcing a strict match between active/standby software versions // otherwise the standby site will automatically upgrade/downgrade to the same version with the active site if (!currentSoftwareVersion.equals(standbySoftwareVersion)) { throw APIException.internalServerErrors.addStandbyPrecheckFailed( String.format("Standby site version %s does not equal to current version %s", standbySoftwareVersion, currentSoftwareVersion)); } } /* * Internal method to check whether failover from acitve to standby is allowed */ protected void precheckForSwitchover(String standbyUuid) { Site standby = null; if (drUtil.isStandby()) { throw new IllegalStateException("Operation is allowed on acitve site only"); } try { standby = drUtil.getSiteFromLocalVdc(standbyUuid); } catch (CoordinatorException e) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getUuid(), "Standby uuid is not valid, can't find it"); } if (standbyUuid.equals(drUtil.getActiveSite().getUuid())) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Can't switchover to an active site"); } if (!drUtil.isSiteUp(standbyUuid)) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not up"); } if (standby.getState() != SiteState.STANDBY_SYNCED) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not fully synced"); } List<Site> existingSites = drUtil.listSites(); for (Site site : existingSites) { ClusterInfo.ClusterState state = coordinator.getControlNodesState(site.getUuid(), site.getNodeCount()); if (state != ClusterInfo.ClusterState.STABLE) { log.info("Site {} is not stable {}", site.getUuid(), state); throw APIException.internalServerErrors.switchoverPrecheckFailed(site.getName(), String.format("Site %s is not stable", site.getName())); } } } /* * Internal method to check whether failover to standby is allowed */ protected void 
precheckForFailoverLocally(String standbyUuid) { Site standby = drUtil.getLocalSite(); SiteMonitorResult siteMonitorResult = coordinator.getTargetInfo(standby.getUuid(), SiteMonitorResult.class); if (siteMonitorResult == null || siteMonitorResult.isActiveSiteLeaderAlive() || siteMonitorResult.isActiveSiteStable()) { throw APIException.internalServerErrors.failoverPrecheckFailed(standby.getName(), "Active site is available now, can't do failover"); } // API should be only send to local site if (!standby.getUuid().equals(standbyUuid)) { throw APIException.internalServerErrors.failoverPrecheckFailed(standby.getName(), String.format( "Failover can only be executed in local site. Local site uuid %s is not matched with uuid %s", standby.getUuid(), standbyUuid)); } // should be SYNCED or PAUSED if (standby.getState() != SiteState.STANDBY_SYNCED && standby.getState() != SiteState.STANDBY_PAUSED) { throw APIException.internalServerErrors.failoverPrecheckFailed(standby.getName(), "Only paused or synced standby site can do failover"); } precheckForFailover(); } protected void precheckForFailover() { Site standby = drUtil.getLocalSite(); String standbyUuid = standby.getUuid(); String standbyName = standby.getName(); // show be only standby if (drUtil.isActiveSite()) { throw APIException.internalServerErrors.failoverPrecheckFailed(standbyName, "Failover can't be executed in acitve site"); } // Current site is stable ClusterInfo.ClusterState state = coordinator.getControlNodesState(standbyUuid, standby.getNodeCount()); if (state != ClusterInfo.ClusterState.STABLE) { log.info("Site {} is not stable {}", standby.getName(), state); throw APIException.internalServerErrors.failoverPrecheckFailed(standby.getName(), String.format("Site %s is not stable", standby.getName())); } // this is standby site and NOT in ZK read-only or observer mode, // it means acitve is down and local ZK has been reconfig to participant CoordinatorClientInetAddressMap addrLookupMap = 
coordinator.getInetAddessLookupMap(); String myNodeId = addrLookupMap.getNodeId(); String coordinatorMode = drUtil.getLocalCoordinatorMode(myNodeId); log.info("Local coordinator mode is {}", coordinatorMode); if (DrUtil.ZOOKEEPER_MODE_OBSERVER.equals(coordinatorMode) || DrUtil.ZOOKEEPER_MODE_READONLY.equals(coordinatorMode)) { log.info("Active site is available now, can't do failover"); throw APIException.internalServerErrors.failoverPrecheckFailed(standbyName, "Active site is available now, can't do failover"); } } protected SiteRestRep findRecommendFailoverSite(List<SiteRestRep> responseSiteFromRemote, Site currentSite) { if (currentSite.getState().equals(SiteState.STANDBY_SYNCED)) { return this.siteMapper.map(currentSite); } for (SiteRestRep site : responseSiteFromRemote) { if (site != null && SiteState.STANDBY_SYNCED.toString().equalsIgnoreCase(site.getState())) { return site; } } return this.siteMapper.map(currentSite); } protected void validateAddParam(SiteAddParam param, List<Site> existingSites) { String siteName = param.getName(); if (!validSiteName(siteName)) { throw APIException.internalServerErrors.addStandbyPrecheckFailed(String .format("Site name should not be empty or longer than %d characters.", SITE_NAME_LENGTH_LIMIT)); } String siteVip = param.getVip(); InetAddress address = null; try { address = InetAddress.getByName(siteVip); } catch (UnknownHostException e) { throw APIException.internalServerErrors.addStandbyPrecheckFailed( "Could not resolve target standby site virtual IP. 
Please check name service."); } if (address.getHostAddress().contains(":")) { param.setVip(DualInetAddress.normalizeInet6Address(address.getHostAddress())); } else { param.setVip(address.getHostAddress()); } log.info("Target standby site ip is {}", param.getVip()); for (Site site : existingSites) { if (site.getName().equals(siteName)) { throw APIException.internalServerErrors.addStandbyPrecheckFailed("Duplicate site name"); } // COP-18954 Skip stability check for paused sites if (site.getState().equals(SiteState.STANDBY_PAUSED)) { continue; } ClusterInfo.ClusterState state = coordinator.getControlNodesState(site.getUuid(), site.getNodeCount()); if (state != ClusterInfo.ClusterState.STABLE) { log.info("Site {} is not stable {}", site.getUuid(), state); throw APIException.internalServerErrors .addStandbyPrecheckFailed(String.format("Site %s is not stable", site.getName())); } } } private String generateShortId(List<Site> existingSites) throws Exception { Set<String> existingShortIds = new HashSet<String>(); for (Site site : existingSites) { existingShortIds.add(site.getSiteShortId()); } for (int i = 1; i < MAX_NUM_OF_STANDBY; i++) { String id = String.format(SHORTID_FMT, i); if (!existingShortIds.contains(id)) { return id; } } throw new Exception("Failed to generate standby short id"); } protected boolean isClusterStable() { return coordinator.getControlNodesState() == ClusterInfo.ClusterState.STABLE; } protected boolean isFreshInstallation() { Configuration setupConfig = coordinator.queryConfiguration(InitialSetup.CONFIG_KIND, InitialSetup.CONFIG_ID); boolean freshInstall = (setupConfig == null) || !Boolean.parseBoolean(setupConfig.getConfig(InitialSetup.COMPLETE)); log.info("Fresh installation {}", freshInstall); boolean hasDataInDB = dbClient.hasUsefulData(); log.info("Has useful data in DB {}", hasDataInDB); return freshInstall && !hasDataInDB; } // encapsulate the create ViPRCoreClient operation for easy UT writing because need to mock ViPRCoreClient protected 
ViPRCoreClient createViPRCoreClient(String vip, String username, String password) { try { return new ViPRCoreClient(vip, true).withLogin(username, password); } catch (Exception e) { log.error(String.format("Fail to create vipr client, vip: %s, username: %s", vip, username), e); throw APIException.internalServerErrors.failToCreateViPRClient(); } } // encapsulate the create ViPRSystemClient operation for easy UT writing because need to mock ViPRSystemClient protected ViPRSystemClient createViPRSystemClient(String vip, String username, String password) { try { return new ViPRSystemClient(vip, true).withLogin(username, password); } catch (Exception e) { log.error(String.format("Fail to create vipr client, vip: %s, username: %s", vip, username), e); throw APIException.internalServerErrors.failToCreateViPRClient(); } } // encapsulate the create InternalSiteServiceClient operation for easy UT writing because need to mock InternalSiteServiceClient protected InternalSiteServiceClient createInternalSiteServiceClient(Site site) { return new InternalSiteServiceClient(site); } public void setApiSignatureGenerator(InternalApiSignatureKeyGenerator apiSignatureGenerator) { this.apiSignatureGenerator = apiSignatureGenerator; } public void setSiteMapper(SiteMapper siteMapper) { this.siteMapper = siteMapper; } public void setSysUtils(SysUtils sysUtils) { this.sysUtils = sysUtils; } public void setDbClient(DbClient dbClient) { this.dbClient = dbClient; } public void setCoordinator(CoordinatorClient coordinator) { this.coordinator = coordinator; } public void setDrUtil(DrUtil drUtil) { this.drUtil = drUtil; } public void setIpsecConfig(IPsecConfig ipsecConfig) { this.ipsecConfig = ipsecConfig; } // DBSVC config parameters public void setDbCommonInfo(Properties dbCommonInfo) { this.dbCommonInfo = dbCommonInfo; } private void startLeaderSelector() { LeaderSelector leaderSelector = coordinator.getLeaderSelector(coordinator.getSiteId(), Constants.FAILBACK_DETECT_LEADER, new 
FailbackLeaderSelectorListener()); leaderSelector.autoRequeue(); leaderSelector.start(); } protected void precheckForSwitchoverForActiveSite(String standbyUuid) { Site standby = null; if (drUtil.isStandby()) { throw new IllegalStateException("Operation is allowed on acitve site only"); } try { standby = drUtil.getSiteFromLocalVdc(standbyUuid); } catch (CoordinatorException e) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getUuid(), "Standby uuid is not valid, can't find it"); } if (standbyUuid.equals(drUtil.getActiveSite().getUuid())) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Can't switchover to an active site"); } if (standby.getState() != SiteState.STANDBY_SYNCED) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not fully synced"); } if (!drUtil.isSiteUp(standbyUuid)) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not up"); } if (coordinator.getControlNodesState(standby.getUuid(), standby.getNodeCount()) != ClusterInfo.ClusterState.STABLE) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Standby site is not stable"); } if (!isClusterStable()) { throw APIException.internalServerErrors.switchoverPrecheckFailed(standby.getName(), "Active site is not stable"); } checkSiteConnectivity(standby); List<Site> existingSites = drUtil.listStandbySites(); for (Site site : existingSites) { if (site.getState() != SiteState.STANDBY_SYNCED && site.getState() != SiteState.STANDBY_PAUSED) { throw APIException.internalServerErrors.switchoverPrecheckFailed(site.getName(), "Standby site is not synced or paused"); } ClusterInfo.ClusterState state = coordinator.getControlNodesState(site.getUuid(), site.getNodeCount()); if (site.getState() != SiteState.STANDBY_PAUSED && state != ClusterInfo.ClusterState.STABLE) { log.info("Site {} is not stable {}", site.getUuid(), 
state); throw APIException.internalServerErrors.switchoverPrecheckFailed(site.getName(), String.format("Site %s is not stable", site.getName())); } } } private void precheckForSwitchoverForLocalStandby() { if (!isClusterStable()) { throw new IllegalStateException("Cluster is not stable"); } Site currentSite = drUtil.getLocalSite(); if (currentSite.getState() != SiteState.STANDBY_SYNCED && currentSite.getState() != SiteState.STANDBY_PAUSED) { throw APIException.internalServerErrors.switchoverPrecheckFailed(currentSite.getName(), "Standby site is not synced or paused state"); } } private void checkSiteConnectivity(Site site) { if (site.getNetworkHealth() == NetworkHealth.BROKEN) { throw APIException.internalServerErrors.siteConnectionBroken(site.getName(), "Network health state is broken."); } if (drUtil.testPing(site.getVip(), SITE_CONNECTION_TEST_PORT, SITE_CONNECT_TEST_TIMEOUT) == -1) { throw APIException.internalServerErrors.siteConnectionBroken(site.getName(), String.format("Can't connect to site by virtual IP: %s", site.getVip())); } } private class FailbackLeaderSelectorListener extends LeaderSelectorListenerImpl { private static final int FAILBACK_DETECT_INTERNVAL_SECONDS = 60; private ScheduledExecutorService service; @Override protected void startLeadership() throws Exception { log.info("This node is selected as failback detector"); service = Executors.newScheduledThreadPool(1); service.scheduleAtFixedRate(failbackDetectMonitor, 0, FAILBACK_DETECT_INTERNVAL_SECONDS, TimeUnit.SECONDS); } @Override protected void stopLeadership() { service.shutdown(); try { while (!service.awaitTermination(30, TimeUnit.SECONDS)) { log.info("Waiting scheduler thread pool to shutdown for another 30s"); } } catch (InterruptedException e) { log.error("Interrupted while waiting to shutdown scheduler thread pool.", e); Thread.currentThread().interrupt(); return; } } } private Runnable failbackDetectMonitor = new Runnable() { @Override public void run() { try { if 
(!needCheckFailback()) { return; } Site localSite = drUtil.getLocalSite(); for (Site site : drUtil.listStandbySites()) { if (drUtil.isSiteUp(site.getUuid())) { log.info("Site {} is up, ignore to check it", site.getUuid()); continue; } else { if (hasActiveSiteInRemote(site, localSite.getUuid())) { localSite.setState(SiteState.ACTIVE_DEGRADED); coordinator.persistServiceConfiguration(localSite.toConfiguration()); // At this moment this site is disconnected with others, so ok to have own vdc version. drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_FAILBACK_DEGRADE, DrUtil.newVdcConfigVersion()); return; } } } log.info("No another active site detect for failback"); } catch (Exception e) { log.error("Error occurs during failback detect monitor", e); } } private boolean needCheckFailback() { if (drUtil.getLocalSite().getState().equals(SiteState.ACTIVE)) { log.info("Current site is active site, need to detail failback"); return true; } Site localSite = drUtil.getLocalSite(); if (localSite.getState().equals(SiteState.ACTIVE_DEGRADED)) { log.info("Site is already ACTIVE_FAILBACK_DEGRADED"); if (!coordinator.locateAllServices(localSite.getUuid(), "controllersvc", "1", null, null) .isEmpty()) { log.info("there are some controller service alive, process to degrade"); return true; } if (!coordinator.locateAllServices(localSite.getUuid(), "sasvc", "1", null, null).isEmpty()) { log.info("there are some sa service alive, process to degrade"); return true; } if (!coordinator.locateAllServices(localSite.getUuid(), "vasasvc", "1", null, null).isEmpty()) { log.info("there are some vasa service alive, process to degrade"); return true; } } return false; } private boolean hasActiveSiteInRemote(Site site, String localActiveSiteUUID) { try (InternalSiteServiceClient client = new InternalSiteServiceClient(site)) { boolean hasActiveSite = false; client.setCoordinatorClient(coordinator); client.setKeyGenerator(apiSignatureGenerator); SiteList remoteSiteList = 
client.getSiteList(); for (SiteRestRep siteResp : remoteSiteList.getSites()) { if (SiteState.ACTIVE.toString().equalsIgnoreCase(siteResp.getState()) && !localActiveSiteUUID.equals(siteResp.getUuid())) { log.info("Remote site {} is active site, need to failback", siteResp); hasActiveSite = true; } //these codes will handle the case: //A as old active is down. B is up and C is down too. //Failover to B successfully. Power up A and C. B may query C and found there is another active site A and B failback if (localActiveSiteUUID.equals(siteResp.getUuid())) { log.info("Remote standby still reconganize me, no failback"); return false; } } return hasActiveSite; } catch (Exception e) { log.warn("Failed to check remote site information during failback detect", e); return false; } } }; }