Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseIOException; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.PleaseHoldException; import org.apache.hadoop.hbase.RegionException; import org.apache.hadoop.hbase.RegionStateListener; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.client.TableState; import 
org.apache.hadoop.hbase.exceptions.UnexpectedStateException; import org.apache.hadoop.hbase.master.balancer.FavoredStochasticBalancer; import org.apache.hadoop.hbase.favored.FavoredNodesManager; import org.apache.hadoop.hbase.favored.FavoredNodesPromoter; import org.apache.hadoop.hbase.master.AssignmentListener; import org.apache.hadoop.hbase.master.LoadBalancer; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.MetricsAssignmentManager; import org.apache.hadoop.hbase.master.NoSuchProcedureException; import org.apache.hadoop.hbase.master.RegionPlan; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.master.ServerListener; import org.apache.hadoop.hbase.master.TableStateManager; import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.RegionStates.ServerState; import org.apache.hadoop.hbase.master.assignment.RegionStates.ServerStateNode; // TODO: why are they here? 
import org.apache.hadoop.hbase.master.normalizer.RegionNormalizer; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler; import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureEvent; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureInMemoryChore; import org.apache.hadoop.hbase.procedure2.util.StringUtils; import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Threads; import org.apache.hadoop.hbase.util.VersionInfo; import org.apache.hadoop.hbase.shaded.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hbase.shaded.com.google.common.collect.Lists; /** * The AssignmentManager is the coordinator for region assign/unassign operations. * <ul> * <li>In-memory states of regions and servers are stored in {@link RegionStates}.</li> * <li>hbase:meta state updates are handled by {@link RegionStateStore}.</li> * </ul> * Regions are created by CreateTable, Split, Merge. * Regions are deleted by DeleteTable, Split, Merge. 
* Assigns are triggered by CreateTable, EnableTable, Split, Merge, ServerCrash. * Unassigns are triggered by DisableTable, Split, Merge */ @InterfaceAudience.Private public class AssignmentManager implements ServerListener { private static final Log LOG = LogFactory.getLog(AssignmentManager.class); // TODO: AMv2 // - handle region migration from hbase1 to hbase2. // - handle sys table assignment first (e.g. acl, namespace) // - handle table priorities // - If ServerBusyException trying to update hbase:meta, we abort the Master // See updateRegionLocation in RegionStateStore. // // See also // https://docs.google.com/document/d/1eVKa7FHdeoJ1-9o8yZcOTAQbv0u0bblBlCCzVSIn69g/edit#heading=h.ystjyrkbtoq5 // for other TODOs. public static final String BOOTSTRAP_THREAD_POOL_SIZE_CONF_KEY = "hbase.assignment.bootstrap.thread.pool.size"; public static final String ASSIGN_DISPATCH_WAIT_MSEC_CONF_KEY = "hbase.assignment.dispatch.wait.msec"; private static final int DEFAULT_ASSIGN_DISPATCH_WAIT_MSEC = 150; public static final String ASSIGN_DISPATCH_WAITQ_MAX_CONF_KEY = "hbase.assignment.dispatch.wait.queue.max.size"; private static final int DEFAULT_ASSIGN_DISPATCH_WAITQ_MAX = 100; public static final String RIT_CHORE_INTERVAL_MSEC_CONF_KEY = "hbase.assignment.rit.chore.interval.msec"; private static final int DEFAULT_RIT_CHORE_INTERVAL_MSEC = 5 * 1000; public static final String ASSIGN_MAX_ATTEMPTS = "hbase.assignment.maximum.attempts"; private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = 10; /** Region in Transition metrics threshold time */ public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD = "hbase.metrics.rit.stuck.warning.threshold"; private static final int DEFAULT_RIT_STUCK_WARNING_THRESHOLD = 60 * 1000; private final ProcedureEvent<?> metaInitializedEvent = new ProcedureEvent<>("meta initialized"); private final ProcedureEvent<?> metaLoadEvent = new ProcedureEvent<>("meta load"); /** * Indicator that AssignmentManager has recovered the region states so * 
that ServerCrashProcedure can be fully enabled and re-assign regions * of dead servers. So that when re-assignment happens, AssignmentManager * has proper region states. */ private final ProcedureEvent<?> failoverCleanupDone = new ProcedureEvent<>("failover cleanup"); /** Listeners that are called on assignment events. */ private final CopyOnWriteArrayList<AssignmentListener> listeners = new CopyOnWriteArrayList<AssignmentListener>(); // TODO: why is this different from the listeners (carried over from the old AM) private RegionStateListener regionStateListener; private RegionNormalizer regionNormalizer; private final MetricsAssignmentManager metrics; private final RegionInTransitionChore ritChore; private final MasterServices master; private final AtomicBoolean running = new AtomicBoolean(false); private final RegionStates regionStates = new RegionStates(); private final RegionStateStore regionStateStore; private final boolean shouldAssignRegionsWithFavoredNodes; private final int assignDispatchWaitQueueMaxSize; private final int assignDispatchWaitMillis; private final int assignMaxAttempts; private final Object checkIfShouldMoveSystemRegionLock = new Object(); private Thread assignThread; public AssignmentManager(final MasterServices master) { this(master, new RegionStateStore(master)); } public AssignmentManager(final MasterServices master, final RegionStateStore stateStore) { this.master = master; this.regionStateStore = stateStore; this.metrics = new MetricsAssignmentManager(); final Configuration conf = master.getConfiguration(); // Only read favored nodes if using the favored nodes load balancer. 
this.shouldAssignRegionsWithFavoredNodes = FavoredStochasticBalancer.class .isAssignableFrom(conf.getClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS, Object.class)); this.assignDispatchWaitMillis = conf.getInt(ASSIGN_DISPATCH_WAIT_MSEC_CONF_KEY, DEFAULT_ASSIGN_DISPATCH_WAIT_MSEC); this.assignDispatchWaitQueueMaxSize = conf.getInt(ASSIGN_DISPATCH_WAITQ_MAX_CONF_KEY, DEFAULT_ASSIGN_DISPATCH_WAITQ_MAX); this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS, DEFAULT_ASSIGN_MAX_ATTEMPTS)); int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY, DEFAULT_RIT_CHORE_INTERVAL_MSEC); this.ritChore = new RegionInTransitionChore(ritChoreInterval); // Used for region related procedure. setRegionNormalizer(master.getRegionNormalizer()); } public void start() throws IOException { if (!running.compareAndSet(false, true)) { return; } LOG.info("Starting assignment manager"); // Register Server Listener master.getServerManager().registerListener(this); // Start the RegionStateStore regionStateStore.start(); // Start the Assignment Thread startAssignmentThread(); } public void stop() { if (!running.compareAndSet(true, false)) { return; } LOG.info("Stopping assignment manager"); // The AM is started before the procedure executor, // but the actual work will be loaded/submitted only once we have the executor final boolean hasProcExecutor = master.getMasterProcedureExecutor() != null; // Remove the RIT chore if (hasProcExecutor) { master.getMasterProcedureExecutor().removeChore(this.ritChore); } // Stop the Assignment Thread stopAssignmentThread(); // Stop the RegionStateStore regionStates.clear(); regionStateStore.stop(); // Unregister Server Listener master.getServerManager().unregisterListener(this); // Update meta events (for testing) if (hasProcExecutor) { getProcedureScheduler().suspendEvent(metaLoadEvent); setFailoverCleanupDone(false); for (HRegionInfo hri : getMetaRegionSet()) { setMetaInitialized(hri, false); } } } public boolean isRunning() { return 
running.get(); } public Configuration getConfiguration() { return master.getConfiguration(); } public MetricsAssignmentManager getAssignmentManagerMetrics() { return metrics; } private LoadBalancer getBalancer() { return master.getLoadBalancer(); } private MasterProcedureEnv getProcedureEnvironment() { return master.getMasterProcedureExecutor().getEnvironment(); } private MasterProcedureScheduler getProcedureScheduler() { return getProcedureEnvironment().getProcedureScheduler(); } protected int getAssignMaxAttempts() { return assignMaxAttempts; } /** * Add the listener to the notification list. * @param listener The AssignmentListener to register */ public void registerListener(final AssignmentListener listener) { this.listeners.add(listener); } /** * Remove the listener from the notification list. * @param listener The AssignmentListener to unregister */ public boolean unregisterListener(final AssignmentListener listener) { return this.listeners.remove(listener); } public void setRegionStateListener(final RegionStateListener listener) { this.regionStateListener = listener; } public void setRegionNormalizer(final RegionNormalizer normalizer) { this.regionNormalizer = normalizer; } public RegionNormalizer getRegionNormalizer() { return regionNormalizer; } public RegionStates getRegionStates() { return regionStates; } public RegionStateStore getRegionStateStore() { return regionStateStore; } public List<ServerName> getFavoredNodes(final HRegionInfo regionInfo) { return this.shouldAssignRegionsWithFavoredNodes ? 
((FavoredStochasticBalancer) getBalancer()).getFavoredNodes(regionInfo) : ServerName.EMPTY_SERVER_LIST; } // ============================================================================================ // Table State Manager helpers // ============================================================================================ TableStateManager getTableStateManager() { return master.getTableStateManager(); } public boolean isTableEnabled(final TableName tableName) { return getTableStateManager().isTableState(tableName, TableState.State.ENABLED); } public boolean isTableDisabled(final TableName tableName) { return getTableStateManager().isTableState(tableName, TableState.State.DISABLED, TableState.State.DISABLING); } // ============================================================================================ // META Helpers // ============================================================================================ private boolean isMetaRegion(final HRegionInfo regionInfo) { return regionInfo.isMetaRegion(); } public boolean isMetaRegion(final byte[] regionName) { return getMetaRegionFromName(regionName) != null; } public HRegionInfo getMetaRegionFromName(final byte[] regionName) { for (HRegionInfo hri : getMetaRegionSet()) { if (Bytes.equals(hri.getRegionName(), regionName)) { return hri; } } return null; } public boolean isCarryingMeta(final ServerName serverName) { for (HRegionInfo hri : getMetaRegionSet()) { if (isCarryingRegion(serverName, hri)) { return true; } } return false; } private boolean isCarryingRegion(final ServerName serverName, final HRegionInfo regionInfo) { // TODO: check for state? final RegionStateNode node = regionStates.getRegionNode(regionInfo); return (node != null && serverName.equals(node.getRegionLocation())); } private HRegionInfo getMetaForRegion(final HRegionInfo regionInfo) { //if (regionInfo.isMetaRegion()) return regionInfo; // TODO: handle multiple meta. 
if the region provided is not meta lookup // which meta the region belongs to. return HRegionInfo.FIRST_META_REGIONINFO; } // TODO: handle multiple meta. private static final Set<HRegionInfo> META_REGION_SET = Collections .singleton(HRegionInfo.FIRST_META_REGIONINFO); public Set<HRegionInfo> getMetaRegionSet() { return META_REGION_SET; } // ============================================================================================ // META Event(s) helpers // ============================================================================================ public boolean isMetaInitialized() { return metaInitializedEvent.isReady(); } public boolean isMetaRegionInTransition() { return !isMetaInitialized(); } public boolean waitMetaInitialized(final Procedure proc) { // TODO: handle multiple meta. should this wait on all meta? // this is used by the ServerCrashProcedure... return waitMetaInitialized(proc, HRegionInfo.FIRST_META_REGIONINFO); } public boolean waitMetaInitialized(final Procedure proc, final HRegionInfo regionInfo) { return getProcedureScheduler().waitEvent(getMetaInitializedEvent(getMetaForRegion(regionInfo)), proc); } private void setMetaInitialized(final HRegionInfo metaRegionInfo, final boolean isInitialized) { assert isMetaRegion(metaRegionInfo) : "unexpected non-meta region " + metaRegionInfo; final ProcedureEvent metaInitEvent = getMetaInitializedEvent(metaRegionInfo); if (isInitialized) { getProcedureScheduler().wakeEvent(metaInitEvent); } else { getProcedureScheduler().suspendEvent(metaInitEvent); } } private ProcedureEvent getMetaInitializedEvent(final HRegionInfo metaRegionInfo) { assert isMetaRegion(metaRegionInfo) : "unexpected non-meta region " + metaRegionInfo; // TODO: handle multiple meta. 
return metaInitializedEvent; } public boolean waitMetaLoaded(final Procedure proc) { return getProcedureScheduler().waitEvent(metaLoadEvent, proc); } protected void wakeMetaLoadedEvent() { getProcedureScheduler().wakeEvent(metaLoadEvent); assert isMetaLoaded() : "expected meta to be loaded"; } public boolean isMetaLoaded() { return metaLoadEvent.isReady(); } // ============================================================================================ // TODO: Sync helpers // ============================================================================================ public void assignMeta(final HRegionInfo metaRegionInfo) throws IOException { assignMeta(metaRegionInfo, null); } public void assignMeta(final HRegionInfo metaRegionInfo, final ServerName serverName) throws IOException { assert isMetaRegion(metaRegionInfo) : "unexpected non-meta region " + metaRegionInfo; AssignProcedure proc; if (serverName != null) { LOG.debug("Try assigning Meta " + metaRegionInfo + " to " + serverName); proc = createAssignProcedure(metaRegionInfo, serverName); } else { LOG.debug("Assigning " + metaRegionInfo.getRegionNameAsString()); proc = createAssignProcedure(metaRegionInfo, false); } ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), proc); } /** * Start a new thread to check if there are region servers whose versions are higher than others. * If so, move all system table regions to RS with the highest version to keep compatibility. * The reason is, RS in new version may not be able to access RS in old version when there are * some incompatible changes. 
*/ public void checkIfShouldMoveSystemRegionAsync() { new Thread(() -> { try { synchronized (checkIfShouldMoveSystemRegionLock) { List<ServerName> serverList = master.getServerManager() .createDestinationServersList(getExcludedServersForSystemTable()); List<RegionPlan> plans = new ArrayList<>(); for (ServerName server : getExcludedServersForSystemTable()) { List<HRegionInfo> regionsShouldMove = getCarryingSystemTables(server); if (!regionsShouldMove.isEmpty()) { for (HRegionInfo regionInfo : regionsShouldMove) { RegionPlan plan = new RegionPlan(regionInfo, server, getBalancer().randomAssignment(regionInfo, serverList)); if (regionInfo.isMetaRegion()) { // Must move meta region first. moveAsync(plan); } else { plans.add(plan); } } } for (RegionPlan plan : plans) { moveAsync(plan); } } } } catch (Throwable t) { LOG.error(t); } }).start(); } private List<HRegionInfo> getCarryingSystemTables(ServerName serverName) { Set<RegionStateNode> regions = this.getRegionStates().getServerNode(serverName).getRegions(); if (regions == null) { return new ArrayList<>(); } return regions.stream().map(RegionStateNode::getRegionInfo).filter(HRegionInfo::isSystemTable) .collect(Collectors.toList()); } public void assign(final HRegionInfo regionInfo) throws IOException { assign(regionInfo, true); } public void assign(final HRegionInfo regionInfo, final boolean forceNewPlan) throws IOException { AssignProcedure proc = createAssignProcedure(regionInfo, forceNewPlan); ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), proc); } public void unassign(final HRegionInfo regionInfo) throws IOException { unassign(regionInfo, false); } public void unassign(final HRegionInfo regionInfo, final boolean forceNewPlan) throws IOException { // TODO: rename this reassign RegionStateNode node = this.regionStates.getRegionNode(regionInfo); ServerName destinationServer = node.getRegionLocation(); if (destinationServer == null) { throw new UnexpectedStateException("DestinationServer 
is null; Assigned? " + node.toString()); } assert destinationServer != null; node.toString(); UnassignProcedure proc = createUnassignProcedure(regionInfo, destinationServer, forceNewPlan); ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), proc); } public Future<byte[]> moveAsync(final RegionPlan regionPlan) { MoveRegionProcedure proc = createMoveRegionProcedure(regionPlan); return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc); } @VisibleForTesting public boolean waitForAssignment(final HRegionInfo regionInfo) throws IOException { return waitForAssignment(regionInfo, Long.MAX_VALUE); } @VisibleForTesting // TODO: Remove this? public boolean waitForAssignment(final HRegionInfo regionInfo, final long timeout) throws IOException { RegionStateNode node = null; // This method can be called before the regionInfo has made it into the regionStateMap // so wait around here a while. long startTime = System.currentTimeMillis(); // Something badly wrong if takes ten seconds to register a region. long endTime = startTime + 10000; while ((node = regionStates.getRegionNode(regionInfo)) == null && isRunning() && System.currentTimeMillis() < endTime) { // Presume it not yet added but will be added soon. Let it spew a lot so we can tell if // we are waiting here alot. 
LOG.debug("Waiting on " + regionInfo + " to be added to regionStateMap"); Threads.sleep(10); } if (node == null) { if (!isRunning()) return false; throw new RegionException(regionInfo.getRegionNameAsString() + " never registered with Assigment."); } RegionTransitionProcedure proc = node.getProcedure(); if (proc == null) { throw new NoSuchProcedureException(node.toString()); } ProcedureSyncWait.waitForProcedureToCompleteIOE(master.getMasterProcedureExecutor(), proc.getProcId(), timeout); return true; } // ============================================================================================ // RegionTransition procedures helpers // ============================================================================================ public AssignProcedure[] createAssignProcedures(final Collection<HRegionInfo> regionInfo) { return createAssignProcedures(regionInfo, false); } public AssignProcedure[] createAssignProcedures(final Collection<HRegionInfo> regionInfo, final boolean forceNewPlan) { if (regionInfo.isEmpty()) return null; final AssignProcedure[] procs = new AssignProcedure[regionInfo.size()]; int index = 0; for (HRegionInfo hri : regionInfo) { procs[index++] = createAssignProcedure(hri, forceNewPlan); } return procs; } // Needed for the following method so it can type the created Array we return private static final UnassignProcedure[] UNASSIGNED_PROCEDURE_FOR_TYPE_INFO = new UnassignProcedure[0]; UnassignProcedure[] createUnassignProcedures(final Collection<RegionStateNode> nodes) { if (nodes.isEmpty()) return null; final List<UnassignProcedure> procs = new ArrayList<UnassignProcedure>(nodes.size()); for (RegionStateNode node : nodes) { if (!this.regionStates.include(node, false)) continue; // Look for regions that are offline/closed; i.e. already unassigned. 
if (this.regionStates.isRegionOffline(node.getRegionInfo())) continue; assert node.getRegionLocation() != null : node.toString(); procs.add(createUnassignProcedure(node.getRegionInfo(), node.getRegionLocation(), false)); } return procs.toArray(UNASSIGNED_PROCEDURE_FOR_TYPE_INFO); } public MoveRegionProcedure[] createReopenProcedures(final Collection<HRegionInfo> regionInfo) { final MoveRegionProcedure[] procs = new MoveRegionProcedure[regionInfo.size()]; int index = 0; for (HRegionInfo hri : regionInfo) { final ServerName serverName = regionStates.getRegionServerOfRegion(hri); final RegionPlan plan = new RegionPlan(hri, serverName, serverName); procs[index++] = createMoveRegionProcedure(plan); } return procs; } /** * Called by things like EnableTableProcedure to get a list of AssignProcedure * to assign the regions of the table. */ public AssignProcedure[] createAssignProcedures(final TableName tableName) { return createAssignProcedures(regionStates.getRegionsOfTable(tableName)); } /** * Called by things like DisableTableProcedure to get a list of UnassignProcedure * to unassign the regions of the table. */ public UnassignProcedure[] createUnassignProcedures(final TableName tableName) { return createUnassignProcedures(regionStates.getTableRegionStateNodes(tableName)); } /** * Called by things like ModifyColumnFamilyProcedure to get a list of MoveRegionProcedure * to reopen the regions of the table. 
*/ public MoveRegionProcedure[] createReopenProcedures(final TableName tableName) { return createReopenProcedures(regionStates.getRegionsOfTable(tableName)); } public AssignProcedure createAssignProcedure(final HRegionInfo regionInfo, final boolean forceNewPlan) { AssignProcedure proc = new AssignProcedure(regionInfo, forceNewPlan); proc.setOwner(getProcedureEnvironment().getRequestUser().getShortName()); return proc; } public AssignProcedure createAssignProcedure(final HRegionInfo regionInfo, final ServerName targetServer) { AssignProcedure proc = new AssignProcedure(regionInfo, targetServer); proc.setOwner(getProcedureEnvironment().getRequestUser().getShortName()); return proc; } public UnassignProcedure createUnassignProcedure(final HRegionInfo regionInfo, final ServerName destinationServer, final boolean force) { // If destinationServer is null, figure it. ServerName sn = destinationServer != null ? destinationServer : getRegionStates().getRegionState(regionInfo).getServerName(); assert sn != null; UnassignProcedure proc = new UnassignProcedure(regionInfo, sn, force); proc.setOwner(getProcedureEnvironment().getRequestUser().getShortName()); return proc; } public MoveRegionProcedure createMoveRegionProcedure(final RegionPlan plan) { if (plan.getRegionInfo().isSystemTable()) { List<ServerName> exclude = getExcludedServersForSystemTable(); if (plan.getDestination() != null && exclude.contains(plan.getDestination())) { try { LOG.info("Can not move " + plan.getRegionInfo() + " to " + plan.getDestination() + " because the server is not with highest version"); plan.setDestination(getBalancer().randomAssignment(plan.getRegionInfo(), this.master.getServerManager().createDestinationServersList(exclude))); } catch (HBaseIOException e) { LOG.warn(e); } } } return new MoveRegionProcedure(getProcedureEnvironment(), plan); } public SplitTableRegionProcedure createSplitProcedure(final HRegionInfo regionToSplit, final byte[] splitKey) throws IOException { return new 
SplitTableRegionProcedure(getProcedureEnvironment(), regionToSplit, splitKey); } public MergeTableRegionsProcedure createMergeProcedure(final HRegionInfo regionToMergeA, final HRegionInfo regionToMergeB) throws IOException { return new MergeTableRegionsProcedure(getProcedureEnvironment(), regionToMergeA, regionToMergeB); } /** * Delete the region states. This is called by "DeleteTable" */ public void deleteTable(final TableName tableName) throws IOException { final ArrayList<HRegionInfo> regions = regionStates.getTableRegionsInfo(tableName); regionStateStore.deleteRegions(regions); for (int i = 0; i < regions.size(); ++i) { final HRegionInfo regionInfo = regions.get(i); // we expect the region to be offline regionStates.removeFromOfflineRegions(regionInfo); regionStates.deleteRegion(regionInfo); } } // ============================================================================================ // RS Region Transition Report helpers // ============================================================================================ // TODO: Move this code in MasterRpcServices and call on specific event? public ReportRegionStateTransitionResponse reportRegionStateTransition( final ReportRegionStateTransitionRequest req) throws PleaseHoldException { final ReportRegionStateTransitionResponse.Builder builder = ReportRegionStateTransitionResponse .newBuilder(); final ServerName serverName = ProtobufUtil.toServerName(req.getServer()); try { for (RegionStateTransition transition : req.getTransitionList()) { switch (transition.getTransitionCode()) { case OPENED: case FAILED_OPEN: case CLOSED: assert transition.getRegionInfoCount() == 1 : transition; final HRegionInfo hri = HRegionInfo.convert(transition.getRegionInfo(0)); updateRegionTransition(serverName, transition.getTransitionCode(), hri, transition.hasOpenSeqNum() ? 
transition.getOpenSeqNum() : HConstants.NO_SEQNUM); break; case READY_TO_SPLIT: case SPLIT_PONR: case SPLIT: case SPLIT_REVERTED: assert transition.getRegionInfoCount() == 3 : transition; final HRegionInfo parent = HRegionInfo.convert(transition.getRegionInfo(0)); final HRegionInfo splitA = HRegionInfo.convert(transition.getRegionInfo(1)); final HRegionInfo splitB = HRegionInfo.convert(transition.getRegionInfo(2)); updateRegionSplitTransition(serverName, transition.getTransitionCode(), parent, splitA, splitB); break; case READY_TO_MERGE: case MERGE_PONR: case MERGED: case MERGE_REVERTED: assert transition.getRegionInfoCount() == 3 : transition; final HRegionInfo merged = HRegionInfo.convert(transition.getRegionInfo(0)); final HRegionInfo mergeA = HRegionInfo.convert(transition.getRegionInfo(1)); final HRegionInfo mergeB = HRegionInfo.convert(transition.getRegionInfo(2)); updateRegionMergeTransition(serverName, transition.getTransitionCode(), merged, mergeA, mergeB); break; } } } catch (PleaseHoldException e) { if (LOG.isTraceEnabled()) LOG.trace("Failed transition " + e.getMessage()); throw e; } catch (UnsupportedOperationException | IOException e) { // TODO: at the moment we have a single error message and the RS will abort // if the master says that one of the region transitions failed. LOG.warn("Failed transition", e); builder.setErrorMessage("Failed transition " + e.getMessage()); } return builder.build(); } private void updateRegionTransition(final ServerName serverName, final TransitionCode state, final HRegionInfo regionInfo, final long seqId) throws PleaseHoldException, UnexpectedStateException { checkFailoverCleanupCompleted(regionInfo); final RegionStateNode regionNode = regionStates.getRegionNode(regionInfo); if (regionNode == null) { // the table/region is gone. maybe a delete, split, merge throw new UnexpectedStateException( String.format("Server %s was trying to transition region %s to %s. 
but the region was removed.", serverName, regionInfo, state)); } if (LOG.isTraceEnabled()) { LOG.trace(String.format("Update region transition serverName=%s region=%s state=%s", serverName, regionNode, state)); } final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName); if (!reportTransition(regionNode, serverNode, state, seqId)) { LOG.warn(String.format("No procedure for %s. server=%s to transition to %s", regionNode, serverName, state)); } } // FYI: regionNode is sometimes synchronized by the caller but not always. private boolean reportTransition(final RegionStateNode regionNode, final ServerStateNode serverNode, final TransitionCode state, final long seqId) throws UnexpectedStateException { final ServerName serverName = serverNode.getServerName(); synchronized (regionNode) { final RegionTransitionProcedure proc = regionNode.getProcedure(); if (proc == null) return false; // serverNode.getReportEvent().removeProcedure(proc); proc.reportTransition(master.getMasterProcedureExecutor().getEnvironment(), serverName, state, seqId); } return true; } private void updateRegionSplitTransition(final ServerName serverName, final TransitionCode state, final HRegionInfo parent, final HRegionInfo hriA, final HRegionInfo hriB) throws IOException { checkFailoverCleanupCompleted(parent); if (state != TransitionCode.READY_TO_SPLIT) { throw new UnexpectedStateException("unsupported split state=" + state + " for parent region " + parent + " maybe an old RS (< 2.0) had the operation in progress"); } // sanity check on the request if (!Bytes.equals(hriA.getEndKey(), hriB.getStartKey())) { throw new UnsupportedOperationException("unsupported split request with bad keys: parent=" + parent + " hriA=" + hriA + " hriB=" + hriB); } // Submit the Split procedure final byte[] splitKey = hriB.getStartKey(); if (LOG.isDebugEnabled()) { LOG.debug("Split request from " + serverName + ", parent=" + parent + " splitKey=" + Bytes.toStringBinary(splitKey)); } 
master.getMasterProcedureExecutor().submitProcedure(createSplitProcedure(parent, splitKey)); // If the RS is < 2.0 throw an exception to abort the operation, we are handling the split if (regionStates.getOrCreateServer(serverName).getVersionNumber() < 0x0200000) { throw new UnsupportedOperationException( String.format("Split handled by the master: parent=%s hriA=%s hriB=%s", parent.getShortNameToLog(), hriA, hriB)); } } private void updateRegionMergeTransition(final ServerName serverName, final TransitionCode state, final HRegionInfo merged, final HRegionInfo hriA, final HRegionInfo hriB) throws IOException { checkFailoverCleanupCompleted(merged); if (state != TransitionCode.READY_TO_MERGE) { throw new UnexpectedStateException( "Unsupported merge state=" + state + " for regionA=" + hriA + " regionB=" + hriB + " merged=" + merged + " maybe an old RS (< 2.0) had the operation in progress"); } // Submit the Merge procedure if (LOG.isDebugEnabled()) { LOG.debug("Handling merge request from RS=" + merged + ", merged=" + merged); } master.getMasterProcedureExecutor().submitProcedure(createMergeProcedure(hriA, hriB)); // If the RS is < 2.0 throw an exception to abort the operation, we are handling the merge if (regionStates.getOrCreateServer(serverName).getVersionNumber() < 0x0200000) { throw new UnsupportedOperationException(String.format( "Merge not handled yet: state=%s merged=%s hriA=%s hriB=%s", state, merged, hriA, hriB)); } } // ============================================================================================ // RS Status update (report online regions) helpers // ============================================================================================ /** * the master will call this method when the RS send the regionServerReport(). * the report will contains the "hbase version" and the "online regions". 
* this method will check the online regions against the in-memory state of the AM,
   * if there is a mismatch we will try to fence out the RS with the assumption
   * that something went wrong on the RS side.
   */
  public void reportOnlineRegions(final ServerName serverName,
      final int versionNumber, final Set<byte[]> regionNames) {
    if (!isRunning()) {
      return;
    }
    if (LOG.isTraceEnabled()) {
      LOG.trace("ReportOnlineRegions " + serverName + " regionCount=" + regionNames.size()
        + ", metaLoaded=" + isMetaLoaded() + " "
        + regionNames.stream().map(element -> Bytes.toStringBinary(element))
            .collect(Collectors.toList()));
    }

    final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);

    // update the server version number. This will be used for live upgrades.
    synchronized (serverNode) {
      serverNode.setVersionNumber(versionNumber);
      if (serverNode.isInState(ServerState.SPLITTING, ServerState.OFFLINE)) {
        LOG.warn("Got a report from a server result in state " + serverNode.getState());
        return;
      }
    }

    if (regionNames.isEmpty()) {
      // nothing to do if we don't have regions
      LOG.trace("no online region found on " + serverName);
    } else if (!isMetaLoaded()) {
      // if we are still on startup, discard the report unless is from someone holding meta
      checkOnlineRegionsReportForMeta(serverNode, regionNames);
    } else {
      // The Heartbeat updates us of what regions are only. check and verify the state.
      checkOnlineRegionsReport(serverNode, regionNames);
    }

    // wake report event
    wakeServerReportEvent(serverNode);
  }

  /**
   * Processes an online-regions report received before meta is loaded: only names that
   * resolve to a meta region are considered, everything else is skipped.
   * If reporting the OPENED transition raises an UnexpectedStateException the server is killed.
   *
   * @param serverNode server that sent the report
   * @param regionNames names of the regions the server reports as online
   */
  public void checkOnlineRegionsReportForMeta(final ServerStateNode serverNode,
      final Set<byte[]> regionNames) {
    try {
      for (byte[] regionName : regionNames) {
        final HRegionInfo hri = getMetaRegionFromName(regionName);
        if (hri == null) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Skip online report for region=" + Bytes.toStringBinary(regionName)
              + " while meta is loading");
          }
          continue;
        }

        final RegionStateNode regionNode = regionStates.getOrCreateRegionNode(hri);
        LOG.info("META REPORTED: " + regionNode);
        if (!reportTransition(regionNode, serverNode, TransitionCode.OPENED, 0)) {
          // No procedure is waiting on this region: just record where it is.
          LOG.warn("META REPORTED but no procedure found");
          regionNode.setRegionLocation(serverNode.getServerName());
        } else if (LOG.isTraceEnabled()) {
          LOG.trace("META REPORTED: " + regionNode);
        }
      }
    } catch (UnexpectedStateException e) {
      final ServerName serverName = serverNode.getServerName();
      LOG.warn("KILLING " + serverName + ": " + e.getMessage());
      killRegionServer(serverNode);
    }
  }

  /**
   * Verifies the regions a server reports as online against the in-memory region states and
   * kills the server when the two disagree.
   *
   * @param serverNode server that sent the report
   * @param regionNames names of the regions the server reports as online
   */
  void checkOnlineRegionsReport(final ServerStateNode serverNode, final Set<byte[]> regionNames) {
    final ServerName serverName = serverNode.getServerName();
    try {
      for (byte[] regionName : regionNames) {
        if (!isRunning()) {
          return;
        }
        final RegionStateNode regionNode = regionStates.getRegionNodeFromName(regionName);
        if (regionNode == null) {
          throw new UnexpectedStateException("Not online: " + Bytes.toStringBinary(regionName));
        }
        synchronized (regionNode) {
          if (regionNode.isInState(State.OPENING, State.OPEN)) {
            if (!regionNode.getRegionLocation().equals(serverName)) {
              throw new UnexpectedStateException(regionNode.toString()
                + " reported OPEN on server=" + serverName + " but state has otherwise.");
            } else if (regionNode.isInState(State.OPENING)) {
              try {
                if (!reportTransition(regionNode, serverNode, TransitionCode.OPENED, 0)) {
                  LOG.warn(regionNode.toString() + " reported OPEN on server=" + serverName
                    + " but state has otherwise AND NO procedure is running");
                }
              } catch (UnexpectedStateException e) {
                LOG.warn(regionNode.toString() + " reported unexpected OPEN: " + e.getMessage(),
                  e);
              }
            }
          } else if (!regionNode.isInState(State.CLOSING, State.SPLITTING)) {
            // Elapsed time since the last state update. The operands were previously swapped,
            // which made the value never positive and the check below unreachable.
            final long diff = EnvironmentEdgeManager.currentTime() - regionNode.getLastUpdate();
            if (diff > 1000/*One Second... make configurable if an issue*/) {
              // So, we can get report that a region is CLOSED or SPLIT because a heartbeat
              // came in at about same time as a region transition. Make sure there is some
              // elapsed time between killing remote server.
              throw new UnexpectedStateException(regionNode.toString()
                + " reported an unexpected OPEN; time since last update=" + diff);
            }
          }
        }
      }
    } catch (UnexpectedStateException e) {
      LOG.warn("Killing " + serverName + ": " + e.getMessage());
      killRegionServer(serverNode);
    }
  }

  /**
   * Makes the given procedure wait on the report event of a server.
   *
   * @return whatever the scheduler's waitEvent returns for the server's report event
   */
  protected boolean waitServerReportEvent(final ServerName serverName, final Procedure proc) {
    final ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
    return getProcedureScheduler().waitEvent(serverNode.getReportEvent(), proc);
  }

  /** Wakes the report event of the given server on the procedure scheduler. */
  protected void wakeServerReportEvent(final ServerStateNode serverNode) {
    getProcedureScheduler().wakeEvent(serverNode.getReportEvent());
  }

  // ============================================================================================
  //  RIT chore
  // ============================================================================================
  /**
   * Periodic chore that recomputes the regions-in-transition statistics, hands every region
   * over the warning threshold to the AssignmentManager and updates the RIT metrics.
   */
  private static class RegionInTransitionChore extends ProcedureInMemoryChore<MasterProcedureEnv> {
    public RegionInTransitionChore(final int timeoutMsec) {
      super(timeoutMsec);
    }

    @Override
    protected void periodicExecute(final MasterProcedureEnv env) {
      final AssignmentManager am = env.getAssignmentManager();

      final RegionInTransitionStat ritStat = am.computeRegionInTransitionStat();
      if (ritStat.hasRegionsOverThreshold()) {
        for (RegionState regionState : ritStat.getRegionOverThreshold()) {
          am.handleRegionOverStuckWarningThreshold(regionState.getRegion());
        }
      }

      // update metrics
      am.updateRegionsInTransitionMetrics(ritStat);
    }
  }

  /** @return a freshly computed snapshot of the regions-in-transition statistics */
  public RegionInTransitionStat computeRegionInTransitionStat() {
    final RegionInTransitionStat rit = new RegionInTransitionStat(getConfiguration());
    rit.update(this);
    return rit;
  }

  /**
   * Snapshot of the regions in transition: total count, oldest age, and the regions whose time
   * in transition exceeds the configured warning threshold.
   */
  public static class RegionInTransitionStat {
    private final int ritThreshold;

    // Lazily created; stays null while no region is over the threshold.
    private HashMap<String, RegionState> ritsOverThreshold = null;
    private long statTimestamp;
    private long oldestRITTime = 0;
    private int totalRITsTwiceThreshold = 0;
    private int totalRITs = 0;

    @VisibleForTesting
    public RegionInTransitionStat(final Configuration conf) {
      this.ritThreshold =
        conf.getInt(METRICS_RIT_STUCK_WARNING_THRESHOLD, DEFAULT_RIT_STUCK_WARNING_THRESHOLD);
    }

    public int getRITThreshold() {
      return ritThreshold;
    }

    public long getTimestamp() {
      return statTimestamp;
    }

    public int getTotalRITs() {
      return totalRITs;
    }

    public long getOldestRITTime() {
      return oldestRITTime;
    }

    public int getTotalRITsOverThreshold() {
      Map<String, RegionState> m = this.ritsOverThreshold;
      return m != null ? m.size() : 0;
    }

    public boolean hasRegionsTwiceOverThreshold() {
      return totalRITsTwiceThreshold > 0;
    }

    public boolean hasRegionsOverThreshold() {
      Map<String, RegionState> m = this.ritsOverThreshold;
      return m != null && !m.isEmpty();
    }

    /** @return the region states over the threshold, or an empty collection if there are none */
    public Collection<RegionState> getRegionOverThreshold() {
      Map<String, RegionState> m = this.ritsOverThreshold;
      // Typed empty set instead of the raw Collections.EMPTY_SET.
      return m != null ? m.values() : Collections.<RegionState>emptySet();
    }

    public boolean isRegionOverThreshold(final HRegionInfo regionInfo) {
      Map<String, RegionState> m = this.ritsOverThreshold;
      return m != null && m.containsKey(regionInfo.getEncodedName());
    }

    public boolean isRegionTwiceOverThreshold(final HRegionInfo regionInfo) {
      Map<String, RegionState> m = this.ritsOverThreshold;
      if (m == null) {
        return false;
      }
      final RegionState state = m.get(regionInfo.getEncodedName());
      if (state == null) {
        return false;
      }
      // long arithmetic so that doubling a large threshold cannot overflow
      return (statTimestamp - state.getStamp()) > (2L * ritThreshold);
    }

    /** Recomputes the statistics from the regions in transition and the failed-open regions. */
    protected void update(final AssignmentManager am) {
      final RegionStates regionStates = am.getRegionStates();
      this.statTimestamp = EnvironmentEdgeManager.currentTime();
      update(regionStates.getRegionsStateInTransition(), statTimestamp);
      update(regionStates.getRegionFailedOpen(), statTimestamp);
    }

    private void update(final Collection<RegionState> regions, final long currentTime) {
      for (RegionState state : regions) {
        totalRITs++;
        final long ritTime = currentTime - state.getStamp();
        if (ritTime > ritThreshold) {
          if (ritsOverThreshold == null) {
            ritsOverThreshold = new HashMap<String, RegionState>();
          }
          ritsOverThreshold.put(state.getRegion().getEncodedName(), state);
          if (ritTime > (2L * ritThreshold)) {
            totalRITsTwiceThreshold++;
          }
        }
        if (oldestRITTime < ritTime) {
          oldestRITTime = ritTime;
        }
      }
    }
  }

  private void updateRegionsInTransitionMetrics(final RegionInTransitionStat ritStat) {
    metrics.updateRITOldestAge(ritStat.getOldestRITTime());
    metrics.updateRITCount(ritStat.getTotalRITs());
    metrics.updateRITCountOverThreshold(ritStat.getTotalRITsOverThreshold());
  }

  private void handleRegionOverStuckWarningThreshold(final HRegionInfo regionInfo) {
    final RegionStateNode regionNode = regionStates.getRegionNode(regionInfo);
    //if (regionNode.isStuck()) {
    LOG.warn("TODO Handle stuck in transition: " + regionNode);
  }

  // ============================================================================================
  //  TODO: Master load/bootstrap
  // ============================================================================================
  /**
   * Loads hbase:meta, waits until at least one region server has joined, processes servers
   * that are offline but still hold online regions, and finally registers the RIT chore.
   *
   * @throws IOException if loading meta fails
   */
  public void joinCluster() throws IOException {
    final long startTime = System.currentTimeMillis();

    LOG.info("Joining the cluster...");

    // Scan hbase:meta to build list of existing regions, servers, and assignment
    loadMeta();

    while (master.getServerManager().countOfRegionServers() < 1) {
      LOG.info("waiting for RS to join");
      Threads.sleep(250);
    }
    LOG.info("RS joined " + master.getServerManager().countOfRegionServers());

    // This method will assign all user regions if a clean server startup or
    // it will reconstruct master state and cleanup any leftovers from previous master process.
    boolean failover = processofflineServersWithOnlineRegions();

    // Start the RIT chore
    master.getMasterProcedureExecutor().addChore(this.ritChore);

    LOG.info(String.format("Joined the cluster in %s, failover=%s",
      StringUtils.humanTimeDiff(System.currentTimeMillis() - startTime), failover));
  }

  /**
   * Visits every region state stored in meta and mirrors it into the in-memory region states,
   * then wakes the meta-loaded event. Regions that are already in transition are left alone.
   *
   * @throws IOException if visiting meta fails
   */
  private void loadMeta() throws IOException {
    // TODO: use a thread pool
    regionStateStore.visitMeta(new RegionStateStore.RegionStateVisitor() {
      @Override
      public void visitRegionState(final HRegionInfo regionInfo, final State state,
          final ServerName regionLocation, final ServerName lastHost, final long openSeqNum) {
        final RegionStateNode regionNode = regionStates.getOrCreateRegionNode(regionInfo);
        synchronized (regionNode) {
          if (!regionNode.isInTransition()) {
            regionNode.setState(state);
            regionNode.setLastHost(lastHost);
            regionNode.setRegionLocation(regionLocation);
            regionNode.setOpenSeqNum(openSeqNum);

            if (state == State.OPEN) {
              assert regionLocation != null : "found null region location for " + regionNode;
              regionStates.addRegionToServer(regionLocation, regionNode);
            } else if (state == State.OFFLINE || regionInfo.isOffline()) {
              regionStates.addToOfflineRegions(regionNode);
            } else {
              // These regions should have a procedure in replay
              regionStates.addRegionInTransition(regionNode, null);
            }
          }
        }
      }
    });

    // every assignment is blocked until meta is loaded.
    wakeMetaLoadedEvent();
  }

  // TODO: the assumption here is that if RSs are crashing while we are executing this
  // they will be handled by the SSH that are put in the ServerManager "queue".
  // we can integrate this a bit better.
private boolean processofflineServersWithOnlineRegions() { boolean failover = !master.getServerManager().getDeadServers().isEmpty(); final Set<ServerName> offlineServersWithOnlineRegions = new HashSet<ServerName>(); final ArrayList<HRegionInfo> regionsToAssign = new ArrayList<HRegionInfo>(); long st, et; st = System.currentTimeMillis(); for (RegionStateNode regionNode : regionStates.getRegionNodes()) { if (regionNode.getState() == State.OPEN) { final ServerName serverName = regionNode.getRegionLocation(); if (!master.getServerManager().isServerOnline(serverName)) { offlineServersWithOnlineRegions.add(serverName); } } else if (regionNode.getState() == State.OFFLINE) { if (isTableEnabled(regionNode.getTable())) { regionsToAssign.add(regionNode.getRegionInfo()); } } } et = System.currentTimeMillis(); LOG.info("[STEP-1] " + StringUtils.humanTimeDiff(et - st)); // kill servers with online regions st = System.currentTimeMillis(); for (ServerName serverName : offlineServersWithOnlineRegions) { if (!master.getServerManager().isServerOnline(serverName)) { LOG.info("KILL RS hosting regions but not online " + serverName + " (master=" + master.getServerName() + ")"); killRegionServer(serverName); } } et = System.currentTimeMillis(); LOG.info("[STEP-2] " + StringUtils.humanTimeDiff(et - st)); setFailoverCleanupDone(true); // assign offline regions st = System.currentTimeMillis(); for (HRegionInfo regionInfo : getOrderedRegions(regionsToAssign)) { master.getMasterProcedureExecutor().submitProcedure(createAssignProcedure(regionInfo, false)); } et = System.currentTimeMillis(); LOG.info("[STEP-3] " + StringUtils.humanTimeDiff(et - st)); return failover; } /** * Used by ServerCrashProcedure to make sure AssignmentManager has completed * the failover cleanup before re-assigning regions of dead servers. So that * when re-assignment happens, AssignmentManager has proper region states. 
*/ public boolean isFailoverCleanupDone() { return failoverCleanupDone.isReady(); } /** * Used by ServerCrashProcedure tests verify the ability to suspend the * execution of the ServerCrashProcedure. */ @VisibleForTesting public void setFailoverCleanupDone(final boolean b) { master.getMasterProcedureExecutor().getEnvironment().setEventReady(failoverCleanupDone, b); } public ProcedureEvent getFailoverCleanupEvent() { return failoverCleanupDone; } /** * Used to check if the failover cleanup is done. * if not we throw PleaseHoldException since we are rebuilding the RegionStates * @param hri region to check if it is already rebuild * @throws PleaseHoldException if the failover cleanup is not completed */ private void checkFailoverCleanupCompleted(final HRegionInfo hri) throws PleaseHoldException { if (!isRunning()) { throw new PleaseHoldException("AssignmentManager not running"); } // TODO: can we avoid throwing an exception if hri is already loaded? // at the moment we bypass only meta boolean meta = isMetaRegion(hri); boolean cleanup = isFailoverCleanupDone(); if (!isMetaRegion(hri) && !isFailoverCleanupDone()) { String msg = "Master not fully online; hbase:meta=" + meta + ", failoverCleanup=" + cleanup; throw new PleaseHoldException(msg); } } // ============================================================================================ // TODO: Metrics // ============================================================================================ public int getNumRegionsOpened() { // TODO: Used by TestRegionPlacement.java and assume monotonically increasing value return 0; } public void submitServerCrash(final ServerName serverName, final boolean shouldSplitWal) { boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(serverName); ProcedureExecutor<MasterProcedureEnv> procExec = this.master.getMasterProcedureExecutor(); procExec.submitProcedure( new ServerCrashProcedure(procExec.getEnvironment(), serverName, shouldSplitWal, carryingMeta)); 
LOG.debug("Added=" + serverName + " to dead servers, submitted shutdown handler to be executed meta=" + carryingMeta); } public void offlineRegion(final HRegionInfo regionInfo) { // TODO used by MasterRpcServices ServerCrashProcedure final RegionStateNode node = regionStates.getRegionNode(regionInfo); if (node != null) node.offline(); } public void onlineRegion(final HRegionInfo regionInfo, final ServerName serverName) { // TODO used by TestSplitTransactionOnCluster.java } public Map<ServerName, List<HRegionInfo>> getSnapShotOfAssignment(final Collection<HRegionInfo> regions) { return regionStates.getSnapShotOfAssignment(regions); } // ============================================================================================ // TODO: UTILS/HELPERS? // ============================================================================================ /** * Used by the client (via master) to identify if all regions have the schema updates * * @param tableName * @return Pair indicating the status of the alter command (pending/total) * @throws IOException */ public Pair<Integer, Integer> getReopenStatus(TableName tableName) { if (isTableDisabled(tableName)) return new Pair<Integer, Integer>(0, 0); final List<RegionState> states = regionStates.getTableRegionStates(tableName); int ritCount = 0; for (RegionState regionState : states) { if (!regionState.isOpened()) ritCount++; } return new Pair<Integer, Integer>(ritCount, states.size()); } /** * Used when assign regions, this method will put system regions in * front of user regions * @param regions * @return A list of regions with system regions at front */ public List<HRegionInfo> getOrderedRegions(final List<HRegionInfo> regions) { if (regions == null) return Collections.emptyList(); List<HRegionInfo> systemList = new ArrayList<>(); List<HRegionInfo> userList = new ArrayList<>(); for (HRegionInfo hri : regions) { if (hri.isSystemTable()) systemList.add(hri); else userList.add(hri); } // Append userList to systemList 
systemList.addAll(userList); return systemList; } // ============================================================================================ // TODO: Region State In Transition // ============================================================================================ protected boolean addRegionInTransition(final RegionStateNode regionNode, final RegionTransitionProcedure procedure) { return regionStates.addRegionInTransition(regionNode, procedure); } protected void removeRegionInTransition(final RegionStateNode regionNode, final RegionTransitionProcedure procedure) { regionStates.removeRegionInTransition(regionNode, procedure); } public boolean hasRegionsInTransition() { return regionStates.hasRegionsInTransition(); } public List<RegionStateNode> getRegionsInTransition() { return regionStates.getRegionsInTransition(); } public List<HRegionInfo> getAssignedRegions() { return regionStates.getAssignedRegions(); } public HRegionInfo getRegionInfo(final byte[] regionName) { final RegionStateNode regionState = regionStates.getRegionNodeFromName(regionName); return regionState != null ? 
regionState.getRegionInfo() : null; } // ============================================================================================ // TODO: Region Status update // ============================================================================================ private void sendRegionOpenedNotification(final HRegionInfo regionInfo, final ServerName serverName) { getBalancer().regionOnline(regionInfo, serverName); if (!this.listeners.isEmpty()) { for (AssignmentListener listener : this.listeners) { listener.regionOpened(regionInfo, serverName); } } } private void sendRegionClosedNotification(final HRegionInfo regionInfo) { getBalancer().regionOffline(regionInfo); if (!this.listeners.isEmpty()) { for (AssignmentListener listener : this.listeners) { listener.regionClosed(regionInfo); } } } public void markRegionAsOpening(final RegionStateNode regionNode) throws IOException { synchronized (regionNode) { State state = regionNode.transitionState(State.OPENING, RegionStates.STATES_EXPECTED_ON_OPEN); regionStates.addRegionToServer(regionNode.getRegionLocation(), regionNode); regionStateStore.updateRegionLocation(regionNode.getRegionInfo(), state, regionNode.getRegionLocation(), regionNode.getLastHost(), HConstants.NO_SEQNUM, regionNode.getProcedure().getProcId()); } // update the operation count metrics metrics.incrementOperationCounter(); } public void undoRegionAsOpening(final RegionStateNode regionNode) { boolean opening = false; synchronized (regionNode) { if (regionNode.isInState(State.OPENING)) { opening = true; regionStates.removeRegionFromServer(regionNode.getRegionLocation(), regionNode); } // Should we update hbase:meta? } if (opening) { // TODO: Metrics. 
Do opposite of metrics.incrementOperationCounter(); } } public void markRegionAsOpened(final RegionStateNode regionNode) throws IOException { final HRegionInfo hri = regionNode.getRegionInfo(); synchronized (regionNode) { State state = regionNode.transitionState(State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN); if (isMetaRegion(hri)) { setMetaInitialized(hri, true); } regionStates.addRegionToServer(regionNode.getRegionLocation(), regionNode); // TODO: OPENING Updates hbase:meta too... we need to do both here and there? // That is a lot of hbase:meta writing. regionStateStore.updateRegionLocation(regionNode.getRegionInfo(), state, regionNode.getRegionLocation(), regionNode.getLastHost(), regionNode.getOpenSeqNum(), regionNode.getProcedure().getProcId()); sendRegionOpenedNotification(hri, regionNode.getRegionLocation()); } } public void markRegionAsClosing(final RegionStateNode regionNode) throws IOException { final HRegionInfo hri = regionNode.getRegionInfo(); synchronized (regionNode) { State state = regionNode.transitionState(State.CLOSING, RegionStates.STATES_EXPECTED_ON_CLOSE); // Set meta has not initialized early. so people trying to create/edit tables will wait if (isMetaRegion(hri)) { setMetaInitialized(hri, false); } regionStates.addRegionToServer(regionNode.getRegionLocation(), regionNode); regionStateStore.updateRegionLocation(regionNode.getRegionInfo(), state, regionNode.getRegionLocation(), regionNode.getLastHost(), HConstants.NO_SEQNUM, regionNode.getProcedure().getProcId()); } // update the operation count metrics metrics.incrementOperationCounter(); } public void undoRegionAsClosing(final RegionStateNode regionNode) { // TODO: Metrics. Do opposite of metrics.incrementOperationCounter(); // There is nothing to undo? 
} public void markRegionAsClosed(final RegionStateNode regionNode) throws IOException { final HRegionInfo hri = regionNode.getRegionInfo(); synchronized (regionNode) { State state = regionNode.transitionState(State.CLOSED, RegionStates.STATES_EXPECTED_ON_CLOSE); regionStates.removeRegionFromServer(regionNode.getRegionLocation(), regionNode); regionNode.setLastHost(regionNode.getRegionLocation()); regionNode.setRegionLocation(null); regionStateStore.updateRegionLocation(regionNode.getRegionInfo(), state, regionNode.getRegionLocation()/*null*/, regionNode.getLastHost(), HConstants.NO_SEQNUM, regionNode.getProcedure().getProcId()); sendRegionClosedNotification(hri); } } public void markRegionAsSplit(final HRegionInfo parent, final ServerName serverName, final HRegionInfo daughterA, final HRegionInfo daughterB) throws IOException { // Update hbase:meta. Parent will be marked offline and split up in hbase:meta. // The parent stays in regionStates until cleared when removed by CatalogJanitor. // Update its state in regionStates to it shows as offline and split when read // later figuring what regions are in a table and what are not: see // regionStates#getRegionsOfTable final RegionStateNode node = regionStates.getOrCreateRegionNode(parent); node.setState(State.SPLIT); regionStateStore.splitRegion(parent, daughterA, daughterB, serverName); if (shouldAssignFavoredNodes(parent)) { List<ServerName> onlineServers = this.master.getServerManager().getOnlineServersList(); ((FavoredNodesPromoter) getBalancer()).generateFavoredNodesForDaughter(onlineServers, parent, daughterA, daughterB); } } /** * When called here, the merge has happened. The two merged regions have been * unassigned and the above markRegionClosed has been called on each so they have been * disassociated from a hosting Server. The merged region will be open after this call. 
The * merged regions are removed from hbase:meta below> Later they are deleted from the filesystem * by the catalog janitor running against hbase:meta. It notices when the merged region no * longer holds references to the old regions. */ public void markRegionAsMerged(final HRegionInfo child, final ServerName serverName, final HRegionInfo mother, final HRegionInfo father) throws IOException { final RegionStateNode node = regionStates.getOrCreateRegionNode(child); node.setState(State.MERGED); regionStates.deleteRegion(mother); regionStates.deleteRegion(father); regionStateStore.mergeRegions(child, mother, father, serverName); if (shouldAssignFavoredNodes(child)) { ((FavoredNodesPromoter) getBalancer()).generateFavoredNodesForMergedRegion(child, mother, father); } } /* * Favored nodes should be applied only when FavoredNodes balancer is configured and the region * belongs to a non-system table. */ private boolean shouldAssignFavoredNodes(HRegionInfo region) { return this.shouldAssignRegionsWithFavoredNodes && FavoredNodesManager.isFavoredNodeApplicable(region); } // ============================================================================================ // Assign Queue (Assign/Balance) // ============================================================================================ private final ArrayList<RegionStateNode> pendingAssignQueue = new ArrayList<RegionStateNode>(); private final ReentrantLock assignQueueLock = new ReentrantLock(); private final Condition assignQueueFullCond = assignQueueLock.newCondition(); /** * Add the assign operation to the assignment queue. * The pending assignment operation will be processed, * and each region will be assigned by a server using the balancer. */ protected void queueAssign(final RegionStateNode regionNode) { getProcedureScheduler().suspendEvent(regionNode.getProcedureEvent()); // TODO: quick-start for meta and the other sys-tables? 
assignQueueLock.lock(); try { pendingAssignQueue.add(regionNode); if (regionNode.isSystemTable() || pendingAssignQueue.size() == 1 || pendingAssignQueue.size() >= assignDispatchWaitQueueMaxSize) { assignQueueFullCond.signal(); } } finally { assignQueueLock.unlock(); } } private void startAssignmentThread() { assignThread = new Thread("AssignmentThread") { @Override public void run() { while (isRunning()) { processAssignQueue(); } pendingAssignQueue.clear(); } }; assignThread.start(); } private void stopAssignmentThread() { assignQueueSignal(); try { while (assignThread.isAlive()) { assignQueueSignal(); assignThread.join(250); } } catch (InterruptedException e) { LOG.warn("join interrupted", e); Thread.currentThread().interrupt(); } } private void assignQueueSignal() { assignQueueLock.lock(); try { assignQueueFullCond.signal(); } finally { assignQueueLock.unlock(); } } @edu.umd.cs.findbugs.annotations.SuppressWarnings("WA_AWAIT_NOT_IN_LOOP") private HashMap<HRegionInfo, RegionStateNode> waitOnAssignQueue() { HashMap<HRegionInfo, RegionStateNode> regions = null; assignQueueLock.lock(); try { if (pendingAssignQueue.isEmpty() && isRunning()) { assignQueueFullCond.await(); } if (!isRunning()) return null; assignQueueFullCond.await(assignDispatchWaitMillis, TimeUnit.MILLISECONDS); regions = new HashMap<HRegionInfo, RegionStateNode>(pendingAssignQueue.size()); for (RegionStateNode regionNode : pendingAssignQueue) { regions.put(regionNode.getRegionInfo(), regionNode); } pendingAssignQueue.clear(); } catch (InterruptedException e) { LOG.warn("got interrupted ", e); Thread.currentThread().interrupt(); } finally { assignQueueLock.unlock(); } return regions; } private void processAssignQueue() { final HashMap<HRegionInfo, RegionStateNode> regions = waitOnAssignQueue(); if (regions == null || regions.size() == 0 || !isRunning()) { return; } if (LOG.isTraceEnabled()) { LOG.trace("PROCESS ASSIGN QUEUE regionCount=" + regions.size()); } // TODO: Optimize balancer. 
pass a RegionPlan? final HashMap<HRegionInfo, ServerName> retainMap = new HashMap<HRegionInfo, ServerName>(); final List<HRegionInfo> rrList = new ArrayList<HRegionInfo>(); for (RegionStateNode regionNode : regions.values()) { if (regionNode.getRegionLocation() != null) { retainMap.put(regionNode.getRegionInfo(), regionNode.getRegionLocation()); } else { rrList.add(regionNode.getRegionInfo()); } } // TODO: connect with the listener to invalidate the cache final LoadBalancer balancer = getBalancer(); // TODO use events List<ServerName> servers = master.getServerManager().createDestinationServersList(); for (int i = 0; servers.size() < 1; ++i) { if (i % 4 == 0) { LOG.warn("no server available, unable to find a location for " + regions.size() + " unassigned regions. waiting"); } // the was AM killed if (!isRunning()) { LOG.debug("aborting assignment-queue with " + regions.size() + " not assigned"); return; } Threads.sleep(250); servers = master.getServerManager().createDestinationServersList(); } final boolean isTraceEnabled = LOG.isTraceEnabled(); if (isTraceEnabled) { LOG.trace("available servers count=" + servers.size() + ": " + servers); } // ask the balancer where to place regions if (!retainMap.isEmpty()) { if (isTraceEnabled) { LOG.trace("retain assign regions=" + retainMap); } try { acceptPlan(regions, balancer.retainAssignment(retainMap, servers)); } catch (HBaseIOException e) { LOG.warn("unable to retain assignment", e); addToPendingAssignment(regions, retainMap.keySet()); } } // TODO: Do we need to split retain and round-robin? // the retain seems to fallback to round-robin/random if the region is not in the map. 
if (!rrList.isEmpty()) { Collections.sort(rrList); if (isTraceEnabled) { LOG.trace("round robin regions=" + rrList); } try { acceptPlan(regions, balancer.roundRobinAssignment(rrList, servers)); } catch (HBaseIOException e) { LOG.warn("unable to round-robin assignment", e); addToPendingAssignment(regions, rrList); } } } private void acceptPlan(final HashMap<HRegionInfo, RegionStateNode> regions, final Map<ServerName, List<HRegionInfo>> plan) throws HBaseIOException { final ProcedureEvent[] events = new ProcedureEvent[regions.size()]; final long st = System.currentTimeMillis(); if (plan == null) { throw new HBaseIOException("unable to compute plans for regions=" + regions.size()); } if (plan.isEmpty()) return; int evcount = 0; for (Map.Entry<ServerName, List<HRegionInfo>> entry : plan.entrySet()) { final ServerName server = entry.getKey(); for (HRegionInfo hri : entry.getValue()) { final RegionStateNode regionNode = regions.get(hri); regionNode.setRegionLocation(server); events[evcount++] = regionNode.getProcedureEvent(); } } getProcedureScheduler().wakeEvents(evcount, events); final long et = System.currentTimeMillis(); if (LOG.isTraceEnabled()) { LOG.trace("ASSIGN ACCEPT " + events.length + " -> " + StringUtils.humanTimeDiff(et - st)); } } private void addToPendingAssignment(final HashMap<HRegionInfo, RegionStateNode> regions, final Collection<HRegionInfo> pendingRegions) { assignQueueLock.lock(); try { for (HRegionInfo hri : pendingRegions) { pendingAssignQueue.add(regions.get(hri)); } } finally { assignQueueLock.unlock(); } } /** * Get a list of servers that this region can not assign to. * For system table, we must assign them to a server with highest version. 
*/
  public List<ServerName> getExcludedServersForSystemTable() {
    List<Pair<ServerName, String>> serverList =
      master.getServerManager().getOnlineServersList().stream()
        .map((s) -> new Pair<>(s, master.getRegionServerVersion(s)))
        .collect(Collectors.toList());
    if (serverList.isEmpty()) {
      return new ArrayList<>();
    }
    String highestVersion = Collections
      .max(serverList, (o1, o2) -> VersionInfo.compareVersion(o1.getSecond(), o2.getSecond()))
      .getSecond();
    // Every online server that is not at the highest version is excluded.
    return serverList.stream()
      .filter((p) -> !p.getSecond().equals(highestVersion))
      .map(Pair::getFirst)
      .collect(Collectors.toList());
  }

  // ============================================================================================
  //  Server Helpers
  // ============================================================================================
  @Override
  public void serverAdded(final ServerName serverName) {
  }

  @Override
  public void serverRemoved(final ServerName serverName) {
    final ServerStateNode serverNode = regionStates.getServerNode(serverName);
    if (serverNode == null) {
      return;
    }

    // just in case, wake procedures waiting for this server report
    wakeServerReportEvent(serverNode);
  }

  /** @return the version number recorded for the server, or 0 if the server is unknown */
  public int getServerVersion(final ServerName serverName) {
    final ServerStateNode node = regionStates.getServerNode(serverName);
    return node != null ? node.getVersionNumber() : 0;
  }

  /**
   * Asks the ServerManager to expire the given server.
   *
   * @param serverName server to expire; it does not need to have a server state node
   */
  public void killRegionServer(final ServerName serverName) {
    final ServerStateNode serverNode = regionStates.getServerNode(serverName);
    if (serverNode == null) {
      // No in-memory node for this server: expire it by name instead of failing with a
      // NullPointerException.
      master.getServerManager().expireServer(serverName);
      return;
    }
    killRegionServer(serverNode);
  }

  /**
   * Asks the ServerManager to expire the server behind the given node.
   *
   * @param serverNode state node of the server to expire
   */
  public void killRegionServer(final ServerStateNode serverNode) {
    /* Don't do this. Messes up accounting. Let ServerCrashProcedure do this.
    for (RegionStateNode regionNode: serverNode.getRegions()) {
      regionNode.offline();
    }*/
    master.getServerManager().expireServer(serverNode.getServerName());
  }
}