// Java tutorial
/* This file is part of VoltDB. * Copyright (C) 2008-2015 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb.iv2; import java.io.File; import java.io.IOException; import java.util.ArrayDeque; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Queue; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import org.apache.zookeeper_voltpatches.CreateMode; import org.apache.zookeeper_voltpatches.KeeperException; import org.apache.zookeeper_voltpatches.WatchedEvent; import org.apache.zookeeper_voltpatches.Watcher; import org.apache.zookeeper_voltpatches.ZooDefs.Ids; import org.apache.zookeeper_voltpatches.ZooKeeper; import org.json_voltpatches.JSONArray; import org.json_voltpatches.JSONException; import org.json_voltpatches.JSONObject; import org.json_voltpatches.JSONStringer; import org.voltcore.logging.VoltLogger; import org.voltcore.messaging.HostMessenger; import org.voltcore.utils.CoreUtils; import org.voltcore.utils.Pair; import org.voltcore.zk.BabySitter; import org.voltcore.zk.LeaderElector; import 
org.voltcore.zk.ZKUtil; import org.voltdb.Promotable; import org.voltdb.SnapshotFormat; import org.voltdb.TheHashinator; import org.voltdb.VoltDB; import org.voltdb.VoltZK; import org.voltdb.catalog.SnapshotSchedule; import org.voltdb.client.ClientResponse; import org.voltdb.sysprocs.saverestore.SnapshotUtil; import org.voltdb.sysprocs.saverestore.SnapshotUtil.SnapshotResponseHandler; import com.google_voltpatches.common.collect.ImmutableMap; import com.google_voltpatches.common.collect.ImmutableSortedSet; import com.google_voltpatches.common.util.concurrent.SettableFuture; /** * LeaderAppointer handles centralized appointment of partition leaders across * the partition. This is primarily so that the leaders can be evenly * distributed throughout the cluster, reducing bottlenecks (at least at * startup). As a side-effect, this service also controls the initial startup * of the cluster, blocking operation until each partition has a k-safe set of * replicas, each partition has a leader, and the MPI has started. */ public class LeaderAppointer implements Promotable { private static final VoltLogger tmLog = new VoltLogger("TM"); private enum AppointerState { INIT, // Initial start state, used to inhibit ZK callback actions CLUSTER_START, // indicates that we're doing the initial cluster startup DONE // indicates normal running conditions, including repair } private final HostMessenger m_hostMessenger; private final ZooKeeper m_zk; // This should only be accessed through getInitialPartitionCount() on cluster startup. 
private final int m_initialPartitionCount; private final Map<Integer, BabySitter> m_partitionWatchers; private final LeaderCache m_iv2appointees; private final LeaderCache m_iv2masters; private final Map<Integer, PartitionCallback> m_callbacks; private final int m_kfactor; private final JSONObject m_topo; private final MpInitiator m_MPI; private final AtomicReference<AppointerState> m_state = new AtomicReference<AppointerState>( AppointerState.INIT); private SettableFuture<Object> m_startupLatch = null; private final boolean m_partitionDetectionEnabled; private boolean m_partitionDetected = false; private boolean m_usingCommandLog = false; private final AtomicBoolean m_replayComplete = new AtomicBoolean(false); private final boolean m_expectingDrSnapshot; private final AtomicBoolean m_snapshotSyncComplete = new AtomicBoolean(false); private final KSafetyStats m_stats; /* * Track partitions that are cleaned up during election/promotion etc. * This eliminates the race where the cleanup occurs while constructing babysitters * for partitions that end up being removed. */ private HashSet<Integer> m_removedPartitionsAtPromotionTime = null; // Provide a single single-threaded executor service to all the BabySitters for each partition. // This will guarantee that the ordering of events generated by ZooKeeper is preserved in the // handling of callbacks in LeaderAppointer. private final ExecutorService m_es = CoreUtils.getCachedSingleThreadExecutor("LeaderAppointer-Babysitters", 15000); private final SnapshotSchedule m_partSnapshotSchedule; private final SnapshotResponseHandler m_snapshotHandler = new SnapshotResponseHandler() { @Override public void handleResponse(ClientResponse resp) { if (resp == null) { VoltDB.crashLocalVoltDB("Received a null response to a snapshot initiation request. 
" + "This should be impossible.", true, null); } else if (resp.getStatus() != ClientResponse.SUCCESS) { tmLog.info("Failed to complete partition detection snapshot, status: " + resp.getStatus() + ", reason: " + resp.getStatusString()); tmLog.info("Retrying partition detection snapshot..."); SnapshotUtil.requestSnapshot(0L, m_partSnapshotSchedule.getPath(), m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(), true, SnapshotFormat.NATIVE, null, m_snapshotHandler, true); } else if (!SnapshotUtil.didSnapshotRequestSucceed(resp.getResults())) { VoltDB.crashGlobalVoltDB("Unable to complete partition detection snapshot: " + resp.getResults()[0], false, null); } else { VoltDB.crashGlobalVoltDB("Partition detection snapshot completed. Shutting down.", false, null); } } }; private class PartitionCallback extends BabySitter.Callback { final int m_partitionId; final Set<Long> m_replicas; long m_currentLeader; /** Constructor used when we know (or think we know) who the leader for this partition is */ PartitionCallback(int partitionId, long currentLeader) { this(partitionId); // Try to be clever for repair. Create ourselves with the current leader set to // whatever is in the LeaderCache, and claim that replica exists, then let the // first run() call fix the world. 
m_currentLeader = currentLeader; m_replicas.add(currentLeader); } /** Constructor used at startup when there is no leader */ PartitionCallback(int partitionId) { m_partitionId = partitionId; // A bit of a hack, but we should never end up with an HSID as Long.MAX_VALUE m_currentLeader = Long.MAX_VALUE; m_replicas = new HashSet<Long>(); } @Override public void run(List<String> children) { List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children); // compute previously unseen HSId set in the callback list Set<Long> newHSIds = new HashSet<Long>(updatedHSIds); newHSIds.removeAll(m_replicas); tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds)); // compute previously seen but now vanished from the callback list HSId set Set<Long> missingHSIds = new HashSet<Long>(m_replicas); missingHSIds.removeAll(updatedHSIds); tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds)); tmLog.debug("Handling babysitter callback for partition " + m_partitionId + ": children: " + CoreUtils.hsIdCollectionToString(updatedHSIds)); if (m_state.get() == AppointerState.CLUSTER_START) { // We can't yet tolerate a host failure during startup. Crash it all if (missingHSIds.size() > 0) { VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null); } // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor, // but for now we just look to see how many replicas of this partition we actually expect // and gate leader assignment on that many copies showing up. 
int replicaCount = m_kfactor + 1; JSONArray parts; try { parts = m_topo.getJSONArray("partitions"); for (int p = 0; p < parts.length(); p++) { JSONObject aPartition = parts.getJSONObject(p); int pid = aPartition.getInt("partition_id"); if (pid == m_partitionId) { replicaCount = aPartition.getJSONArray("replicas").length(); } } } catch (JSONException e) { // Ignore and just assume the normal number of replicas } if (children.size() == replicaCount) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } else { tmLog.info("Waiting on " + ((m_kfactor + 1) - children.size()) + " more nodes " + "for k-safety before startup"); } } else { Set<Integer> hostsOnRing = new HashSet<Integer>(); // Check for k-safety if (!isClusterKSafe(hostsOnRing)) { VoltDB.crashGlobalVoltDB("Some partitions have no replicas. Cluster has become unviable.", false, null); } // Check if replay has completed if (m_replayComplete.get() == false) { VoltDB.crashGlobalVoltDB( "Detected node failure during command log replay. Cluster will shut down.", false, null); } // If we are a DR replica and starting from a snapshot, check if that has completed if (m_expectingDrSnapshot && m_snapshotSyncComplete.get() == false) { VoltDB.crashGlobalVoltDB( "Detected node failure during DR snapshot sync. Cluster will shut down.", false, null); } // Check to see if there's been a possible network partition and we're not already handling it if (m_partitionDetectionEnabled && !m_partitionDetected) { doPartitionDetectionActivities(hostsOnRing); } // If we survived the above gauntlet of fail, appoint a new leader for this partition. if (missingHSIds.contains(m_currentLeader)) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } // If this partition doesn't have a leader yet, and we have new replicas added, // elect a leader. 
if (m_currentLeader == Long.MAX_VALUE && !updatedHSIds.isEmpty()) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } } m_replicas.clear(); m_replicas.addAll(updatedHSIds); } } /* We'll use this callback purely for startup so we can discover when all * the leaders we have appointed have completed their promotions and * published themselves to Zookeeper */ LeaderCache.Callback m_masterCallback = new LeaderCache.Callback() { @Override public void run(ImmutableMap<Integer, Long> cache) { Set<Long> currentLeaders = new HashSet<Long>(cache.values()); tmLog.debug("Updated leaders: " + currentLeaders); if (m_state.get() == AppointerState.CLUSTER_START) { try { if (currentLeaders.size() == getInitialPartitionCount()) { tmLog.debug("Leader appointment complete, promoting MPI and unblocking."); m_state.set(AppointerState.DONE); m_MPI.acceptPromotion(); m_startupLatch.set(null); } } catch (IllegalAccessException e) { // This should never happen VoltDB.crashLocalVoltDB("Failed to get partition count", true, e); } } } }; Watcher m_partitionCallback = new Watcher() { @Override public void process(WatchedEvent event) { m_es.submit(new Runnable() { @Override public void run() { try { List<String> children = m_zk.getChildren(VoltZK.leaders_initiators, m_partitionCallback); tmLog.info("Noticed partition change " + children + ", " + "currenctly watching " + m_partitionWatchers.keySet()); for (String child : children) { int pid = LeaderElector.getPartitionFromElectionDir(child); if (!m_partitionWatchers.containsKey(pid) && pid != MpInitiator.MP_INIT_PID) { watchPartition(pid, m_es, false); } } tmLog.info("Done " + m_partitionWatchers.keySet()); } catch (Exception e) { VoltDB.crashLocalVoltDB("Cannot read leader initiator directory", false, e); } } }); } }; public LeaderAppointer(HostMessenger hm, int numberOfPartitions, int kfactor, boolean partitionDetectionEnabled, SnapshotSchedule partitionSnapshotSchedule, boolean usingCommandLog, JSONObject topology, MpInitiator mpi, 
KSafetyStats stats, boolean expectingDrSnapshot) { m_hostMessenger = hm; m_zk = hm.getZK(); m_kfactor = kfactor; m_topo = topology; m_MPI = mpi; m_initialPartitionCount = numberOfPartitions; m_callbacks = new HashMap<Integer, PartitionCallback>(); m_partitionWatchers = new HashMap<Integer, BabySitter>(); m_iv2appointees = new LeaderCache(m_zk, VoltZK.iv2appointees); m_iv2masters = new LeaderCache(m_zk, VoltZK.iv2masters, m_masterCallback); m_partitionDetectionEnabled = partitionDetectionEnabled; m_partSnapshotSchedule = partitionSnapshotSchedule; m_usingCommandLog = usingCommandLog; m_stats = stats; m_expectingDrSnapshot = expectingDrSnapshot; if (m_partitionDetectionEnabled) { if (!testPartitionDetectionDirectory(m_partSnapshotSchedule)) { VoltDB.crashLocalVoltDB("Unable to create partition detection snapshot directory at " + m_partSnapshotSchedule.getPath(), false, null); } } } @Override public void acceptPromotion() throws InterruptedException, ExecutionException { final SettableFuture<Object> blocker = SettableFuture.create(); try { m_es.submit(new Runnable() { @Override public void run() { try { acceptPromotionImpl(blocker); } catch (Throwable t) { blocker.setException(t); } } }); blocker.get(); } catch (RejectedExecutionException e) { if (m_es.isShutdown()) return; throw new RejectedExecutionException(e); } } private void acceptPromotionImpl(final SettableFuture<Object> blocker) throws InterruptedException, ExecutionException, KeeperException { // Crank up the leader caches. Use blocking startup so that we'll have valid point-in-time caches later. m_iv2appointees.start(true); m_iv2masters.start(true); ImmutableMap<Integer, Long> appointees = m_iv2appointees.pointInTimeCache(); // Figure out what conditions we assumed leadership under. if (appointees.size() == 0) { tmLog.debug("LeaderAppointer in startup"); m_state.set(AppointerState.CLUSTER_START); } //INIT is the default before promotion at runtime. 
Don't do this startup check //Let the rest of the promotion run and determine k-safety which is the else block. else if (m_state.get() == AppointerState.INIT && !VoltDB.instance().isRunning()) { ImmutableMap<Integer, Long> masters = m_iv2masters.pointInTimeCache(); try { if ((appointees.size() < getInitialPartitionCount()) || (masters.size() < getInitialPartitionCount()) || (appointees.size() != masters.size())) { // If we are promoted and the appointees or masters set is partial, the previous appointer failed // during startup (at least for now, until we add remove a partition on the fly). VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null); } } catch (IllegalAccessException e) { // This should never happen VoltDB.crashLocalVoltDB("Failed to get partition count", true, e); } } else { tmLog.debug("LeaderAppointer in repair"); m_state.set(AppointerState.DONE); } if (m_state.get() == AppointerState.CLUSTER_START) { // Need to block the return of acceptPromotion until after the MPI is promoted. Wait for this latch // to countdown after appointing all the partition leaders. The // LeaderCache callback will count it down once it has seen all the // appointed leaders publish themselves as the actual leaders. m_startupLatch = SettableFuture.create(); writeKnownLiveNodes(new HashSet<Integer>(m_hostMessenger.getLiveHostIds())); // Theoretically, the whole try/catch block below can be removed because the leader // appointer now watches the parent dir for any new partitions. It doesn't have to // create the partition dirs all at once, it can pick them up one by one as they are // created. But I'm too afraid to remove this block just before the release, // so leaving it here till later. 
- ning try { final int initialPartitionCount = getInitialPartitionCount(); for (int i = 0; i < initialPartitionCount; i++) { LeaderElector.createRootIfNotExist(m_zk, LeaderElector.electionDirForPartition(VoltZK.leaders_initiators, i)); watchPartition(i, m_es, true); } } catch (IllegalAccessException e) { // This should never happen VoltDB.crashLocalVoltDB("Failed to get partition count on startup", true, e); } //Asynchronously wait for this to finish otherwise it deadlocks //on task that need to run on this thread m_startupLatch.addListener(new Runnable() { @Override public void run() { try { m_zk.getChildren(VoltZK.leaders_initiators, m_partitionCallback); blocker.set(null); } catch (Throwable t) { blocker.setException(t); } } }, m_es); } else { // If we're taking over for a failed LeaderAppointer, we know when // we get here that every partition had a leader at some point in // time. We'll seed each of the PartitionCallbacks for each // partition with the HSID of the last published leader. The // blocking startup of the BabySitter watching that partition will // call our callback, get the current full set of replicas, and // appoint a new leader if the seeded one has actually failed Map<Integer, Long> masters = m_iv2masters.pointInTimeCache(); tmLog.info("LeaderAppointer repairing with master set: " + CoreUtils.hsIdValueMapToString(masters)); //Setting the map to non-null causes the babysitters to populate it when cleaning up partitions //We are only racing with ourselves in that the creation of a babysitter can trigger callbacks //that result in partitions being cleaned up. We don't have to worry about some other leader appointer. 
//The iteration order of the partitions doesn't matter m_removedPartitionsAtPromotionTime = new HashSet<Integer>(); for (Entry<Integer, Long> master : masters.entrySet()) { //Skip processing the partition if it was cleaned up by a babysitter that was previously //instantiated if (m_removedPartitionsAtPromotionTime.contains(master.getKey())) { tmLog.info("During promotion partition " + master.getKey() + " was cleaned up. Skipping."); continue; } int partId = master.getKey(); String dir = LeaderElector.electionDirForPartition(VoltZK.leaders_initiators, partId); m_callbacks.put(partId, new PartitionCallback(partId, master.getValue())); Pair<BabySitter, List<String>> sitterstuff = BabySitter.blockingFactory(m_zk, dir, m_callbacks.get(partId), m_es); //We could get this far and then find out that creating this particular //babysitter triggered cleanup so we need to bail out here as well if (!m_removedPartitionsAtPromotionTime.contains(master.getKey())) { m_partitionWatchers.put(partId, sitterstuff.getFirst()); } } m_removedPartitionsAtPromotionTime = null; // just go ahead and promote our MPI m_MPI.acceptPromotion(); // set up a watcher on the partitions dir so that new partitions will be picked up m_zk.getChildren(VoltZK.leaders_initiators, m_partitionCallback); blocker.set(null); } } /** * Watch the partition ZK dir in the leader appointer. * * This should be called on the elected leader appointer only. m_callbacks and * m_partitionWatchers are only accessed on initialization, promotion, * or elastic add node. 
* * @param pid The partition ID * @param es The executor service to use to construct the baby sitter * @param shouldBlock Whether or not to wait for the initial read of children * @throws KeeperException * @throws InterruptedException * @throws ExecutionException */ void watchPartition(int pid, ExecutorService es, boolean shouldBlock) throws InterruptedException, ExecutionException { String dir = LeaderElector.electionDirForPartition(VoltZK.leaders_initiators, pid); m_callbacks.put(pid, new PartitionCallback(pid)); BabySitter babySitter; if (shouldBlock) { babySitter = BabySitter.blockingFactory(m_zk, dir, m_callbacks.get(pid), es).getFirst(); } else { babySitter = BabySitter.nonblockingFactory(m_zk, dir, m_callbacks.get(pid), es); } m_partitionWatchers.put(pid, babySitter); } private long assignLeader(int partitionId, List<Long> children) { // We used masterHostId = -1 as a way to force the leader choice to be // the first replica in the list, if we don't have some other mechanism // which has successfully overridden it. int masterHostId = -1; if (m_state.get() == AppointerState.CLUSTER_START) { try { // find master in topo JSONArray parts = m_topo.getJSONArray("partitions"); for (int p = 0; p < parts.length(); p++) { JSONObject aPartition = parts.getJSONObject(p); int pid = aPartition.getInt("partition_id"); if (pid == partitionId) { masterHostId = aPartition.getInt("master"); } } } catch (JSONException jse) { tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0"); jse.printStackTrace(); masterHostId = -1; // stupid default } } else { // For now, if we're appointing a new leader as a result of a // failure, just pick the first replica in the children list. // Could eventually do something more complex here to try to keep a // semi-balance, but it's unclear that this has much utility until // we add rebalancing on rejoin as well. 
masterHostId = -1; } long masterHSId = children.get(0); for (Long child : children) { if (CoreUtils.getHostIdFromHSId(child) == masterHostId) { masterHSId = child; break; } } tmLog.info("Appointing HSId " + CoreUtils.hsIdToString(masterHSId) + " as leader for partition " + partitionId); try { m_iv2appointees.put(partitionId, masterHSId); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e); } return masterHSId; } private void writeKnownLiveNodes(Set<Integer> liveNodes) { try { if (m_zk.exists(VoltZK.lastKnownLiveNodes, null) == null) { // VoltZK.createPersistentZKNodes should have done this m_zk.create(VoltZK.lastKnownLiveNodes, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } JSONStringer stringer = new JSONStringer(); stringer.object(); stringer.key("liveNodes").array(); for (Integer node : liveNodes) { stringer.value(node); } stringer.endArray(); stringer.endObject(); JSONObject obj = new JSONObject(stringer.toString()); tmLog.debug("Writing live nodes to ZK: " + obj.toString(4)); m_zk.setData(VoltZK.lastKnownLiveNodes, obj.toString(4).getBytes("UTF-8"), -1); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to update known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e); } } private Set<Integer> readPriorKnownLiveNodes() { Set<Integer> nodes = new HashSet<Integer>(); try { byte[] data = m_zk.getData(VoltZK.lastKnownLiveNodes, false, null); String jsonString = new String(data, "UTF-8"); tmLog.debug("Read prior known live nodes: " + jsonString); JSONObject jsObj = new JSONObject(jsonString); JSONArray jsonNodes = jsObj.getJSONArray("liveNodes"); for (int ii = 0; ii < jsonNodes.length(); ii++) { nodes.add(jsonNodes.getInt(ii)); } } catch (Exception e) { VoltDB.crashLocalVoltDB( "Unable to read prior known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e); } return nodes; } /* * Check if the directory specified for the snapshot on partition detection * exists, and 
has permissions set correctly. */ private boolean testPartitionDetectionDirectory(SnapshotSchedule schedule) { if (m_partitionDetectionEnabled) { File partitionPath = new File(schedule.getPath()); if (!partitionPath.exists()) { tmLog.error("Directory " + partitionPath + " for partition detection snapshots does not exist"); return false; } if (!partitionPath.isDirectory()) { tmLog.error("Directory " + partitionPath + " for partition detection snapshots is not a directory"); return false; } File partitionPathFile = new File(partitionPath, Long.toString(System.currentTimeMillis())); try { partitionPathFile.createNewFile(); partitionPathFile.delete(); } catch (IOException e) { tmLog.error( "Could not create a test file in " + partitionPath + " for partition detection snapshots"); e.printStackTrace(); return false; } return true; } else { return true; } } /** * Given a set of the known host IDs before a fault, and the known host IDs in the * post-fault cluster, determine whether or not we think a network partition may have happened. * NOTE: this assumes that we have already done the k-safety validation for every partition and already failed * if we weren't a viable cluster. * ALSO NOTE: not private so it may be unit-tested. */ static boolean makePPDDecision(Set<Integer> previousHosts, Set<Integer> currentHosts) { // Real partition detection stuff would go here // find the lowest hostId between the still-alive hosts and the // failed hosts. Which set contains the lowest hostId? int blessedHostId = Integer.MAX_VALUE; boolean blessedHostIdInFailedSet = true; // This should be all the pre-partition hosts IDs. Any new host IDs // (say, if this was triggered by rejoin), will be greater than any surviving // host ID, so don't worry about including it in this search. 
for (Integer hostId : previousHosts) { if (hostId < blessedHostId) { blessedHostId = hostId; } } for (Integer hostId : currentHosts) { if (hostId.equals(blessedHostId)) { blessedHostId = hostId; blessedHostIdInFailedSet = false; } } // Evaluate PPD triggers. boolean partitionDetectionTriggered = false; // Exact 50-50 splits. The set with the lowest survivor host doesn't trigger PPD // If the blessed host is in the failure set, this set is not blessed. if (currentHosts.size() * 2 == previousHosts.size()) { if (blessedHostIdInFailedSet) { tmLog.info("Partition detection triggered for 50/50 cluster failure. " + "This survivor set is shutting down."); partitionDetectionTriggered = true; } else { tmLog.info("Partition detected for 50/50 failure. " + "This survivor set is continuing execution."); } } // A strict, viable minority is always a partition. if (currentHosts.size() * 2 < previousHosts.size()) { tmLog.info("Partition detection triggered. " + "This minority survivor set is shutting down."); partitionDetectionTriggered = true; } return partitionDetectionTriggered; } private void doPartitionDetectionActivities(Set<Integer> currentNodes) { // We should never re-enter here once we've decided we're partitioned and doomed assert (!m_partitionDetected); Set<Integer> currentHosts = new HashSet<Integer>(currentNodes); Set<Integer> previousHosts = readPriorKnownLiveNodes(); boolean partitionDetectionTriggered = makePPDDecision(previousHosts, currentHosts); if (partitionDetectionTriggered) { m_partitionDetected = true; if (m_usingCommandLog) { // Just shut down immediately VoltDB.crashGlobalVoltDB("Use of command logging detected, no additional database snapshot will " + "be generated. 
Please use the 'recover' action to restore the database if necessary.", false, null); } else { SnapshotUtil.requestSnapshot(0L, m_partSnapshotSchedule.getPath(), m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(), true, SnapshotFormat.NATIVE, null, m_snapshotHandler, true); } } // If the cluster host set has changed, then write the new set to ZK // NOTE: we don't want to update the known live nodes if we've decided that our subcluster is // dying, otherwise a poorly timed subsequent failure might reverse this decision. Any future promoted // LeaderAppointer should make their partition detection decision based on the pre-partition cluster state. else if (!currentHosts.equals(previousHosts)) { writeKnownLiveNodes(currentNodes); } } private boolean isClusterKSafe(Set<Integer> hostsOnRing) { boolean retval = true; List<String> partitionDirs = null; ImmutableSortedSet.Builder<KSafetyStats.StatsPoint> lackingReplication = ImmutableSortedSet.naturalOrder(); try { partitionDirs = m_zk.getChildren(VoltZK.leaders_initiators, null); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to read partitions from ZK", true, e); } //Don't fetch the values serially do it asynchronously Queue<ZKUtil.ByteArrayCallback> dataCallbacks = new ArrayDeque<ZKUtil.ByteArrayCallback>(); Queue<ZKUtil.ChildrenCallback> childrenCallbacks = new ArrayDeque<ZKUtil.ChildrenCallback>(); for (String partitionDir : partitionDirs) { String dir = ZKUtil.joinZKPath(VoltZK.leaders_initiators, partitionDir); try { ZKUtil.ByteArrayCallback callback = new ZKUtil.ByteArrayCallback(); m_zk.getData(dir, false, callback, null); dataCallbacks.offer(callback); ZKUtil.ChildrenCallback childrenCallback = new ZKUtil.ChildrenCallback(); m_zk.getChildren(dir, false, childrenCallback, null); childrenCallbacks.offer(childrenCallback); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to read replicas in ZK dir: " + dir, true, e); } } final long statTs = System.currentTimeMillis(); for (String 
partitionDir : partitionDirs) { int pid = LeaderElector.getPartitionFromElectionDir(partitionDir); String dir = ZKUtil.joinZKPath(VoltZK.leaders_initiators, partitionDir); try { // The data of the partition dir indicates whether the partition has finished // initializing or not. If not, the replicas may still be in the process of // adding themselves to the dir. So don't check for k-safety if that's the case. byte[] partitionState = dataCallbacks.poll().getData(); boolean isInitializing = false; if (partitionState != null && partitionState.length == 1) { isInitializing = partitionState[0] == LeaderElector.INITIALIZING; } List<String> replicas = childrenCallbacks.poll().getChildren(); if (pid == MpInitiator.MP_INIT_PID) continue; final boolean partitionNotOnHashRing = partitionNotOnHashRing(pid); if (!isInitializing && replicas.isEmpty()) { //These partitions can fail, just cleanup and remove the partition from the system if (partitionNotOnHashRing) { removeAndCleanupPartition(pid); continue; } tmLog.fatal("K-Safety violation: No replicas found for partition: " + pid); retval = false; } else if (!partitionNotOnHashRing) { //Record host ids for all partitions that are on the ring //so they are considered for partition detection for (String replica : replicas) { final String split[] = replica.split("/"); final long hsId = Long.valueOf(split[split.length - 1].split("_")[0]); final int hostId = CoreUtils.getHostIdFromHSId(hsId); hostsOnRing.add(hostId); } } if (!isInitializing && !partitionNotOnHashRing) { lackingReplication .add(new KSafetyStats.StatsPoint(statTs, pid, m_kfactor + 1 - replicas.size())); } } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to read replicas in ZK dir: " + dir, true, e); } } m_stats.setSafetySet(lackingReplication.build()); return retval; } private void removeAndCleanupPartition(int pid) { tmLog.info("Removing and cleanup up partition info for partition " + pid); if (m_removedPartitionsAtPromotionTime != null) { 
m_removedPartitionsAtPromotionTime.add(pid); tmLog.info( "Partition " + pid + " was cleaned up during LeaderAppointer promotion and should be skipped"); } BabySitter sitter = m_partitionWatchers.remove(pid); if (sitter != null) { sitter.shutdown(); } m_callbacks.remove(pid); try { ZKUtil.asyncDeleteRecursively(m_zk, ZKUtil.joinZKPath(VoltZK.iv2masters, String.valueOf(pid))); ZKUtil.asyncDeleteRecursively(m_zk, ZKUtil.joinZKPath(VoltZK.iv2appointees, String.valueOf(pid))); ZKUtil.asyncDeleteRecursively(m_zk, ZKUtil.joinZKPath(VoltZK.leaders_initiators, "partition_" + String.valueOf(pid))); } catch (Exception e) { tmLog.error("Error removing partition info", e); } } private static boolean partitionNotOnHashRing(int pid) { if (TheHashinator.getConfiguredHashinatorType() == TheHashinator.HashinatorType.LEGACY) return false; return TheHashinator.getRanges(pid).isEmpty(); } /** * Gets the initial cluster partition count on startup. This can only be called during * initialization. Calling this after initialization throws, because the partition count may * not reflect the actual partition count in the cluster. 
* * @return */ private int getInitialPartitionCount() throws IllegalAccessException { AppointerState currentState = m_state.get(); if (currentState != AppointerState.INIT && currentState != AppointerState.CLUSTER_START) { throw new IllegalAccessException("Getting cached partition count after cluster " + "startup"); } return m_initialPartitionCount; } public void onReplayCompletion() { m_replayComplete.set(true); } public void onSyncSnapshotCompletion() { m_snapshotSyncComplete.set(true); } public void shutdown() { try { m_es.execute(new Runnable() { @Override public void run() { try { m_iv2appointees.shutdown(); m_iv2masters.shutdown(); for (BabySitter watcher : m_partitionWatchers.values()) { watcher.shutdown(); } } catch (Exception e) { // don't care, we're going down } } }); m_es.shutdown(); m_es.awaitTermination(356, TimeUnit.DAYS); } catch (InterruptedException e) { tmLog.warn("Unexpected interrupted exception", e); } } }