TestExecutionSite.java (org.voltdb) - VoltDB execution site failure and fuzz tests
/* This file is part of VoltDB.
 * Copyright (C) 2008-2012 VoltDB Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
package org.voltdb;

import java.io.IOException;
import java.io.StringWriter;
import java.nio.ByteBuffer;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;

import junit.framework.TestCase;

import org.apache.commons.lang3.ArrayUtils;
import org.voltcore.logging.Level;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.HeartbeatMessage;
import org.voltcore.messaging.HeartbeatResponseMessage;
import org.voltcore.messaging.LocalObjectMessage;
import org.voltcore.messaging.Mailbox;
import org.voltcore.messaging.Subject;
import org.voltcore.messaging.VoltMessage;
import org.voltcore.utils.CoreUtils;
import org.voltdb.VoltZK.MailboxType;
import org.voltdb.catalog.Procedure;
import org.voltdb.client.ClientResponse;
import org.voltdb.dtxn.DtxnConstants;
import org.voltdb.dtxn.MultiPartitionParticipantTxnState;
import org.voltdb.dtxn.RestrictedPriorityQueue;
import org.voltdb.dtxn.SinglePartitionTxnState;
import org.voltdb.exceptions.SerializableException;
import org.voltdb.executionsitefuzz.ExecutionSiteFuzzChecker;
import org.voltdb.fault.FaultDistributor;
import org.voltdb.fault.SiteFailureFault;
import org.voltdb.messaging.FastSerializer;
import org.voltdb.messaging.FragmentTaskMessage;
import org.voltdb.messaging.InitiateTaskMessage;
import org.voltdb.messaging.MultiPartitionParticipantMessage;

public class TestExecutionSite extends TestCase {

    private static final VoltLogger testLog = new VoltLogger("TEST");

    private static final int FAIL_RANGE = 80000;

    private static HashMap<Long, RussianRouletteMailbox> postoffice =
        new HashMap<Long, RussianRouletteMailbox>();

    private void registerMailbox(long siteId, RussianRouletteMailbox mbox) {
        postoffice.put(siteId, mbox);
    }

    private static final AtomicInteger m_nextMustDie = new AtomicInteger(-1);

    // A Mailbox implementation that will randomly simulate node failure by
    // killing a site instead of sending a message.
    // TODO: The failure decision is "tuned" to generate a decent number of
    // site failures but not so many as to kill every site in rapid succession.
    // Still fragile and could use some more work.
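    // (Added illustration, not part of the original test.) The mailbox below, and the
    // rest of this test, key everything off 64-bit "HSIds". Judging from how this file
    // uses CoreUtils (getHostIdFromHSId / getSiteIdFromHSId, and the intValue() vs.
    // ">> 32" split in testENG1617), the host id appears to occupy the low 32 bits and
    // the site id the high 32 bits. The unused helper below is only a sketch recording
    // that assumption; CoreUtils.getHSIdFromHostAndSite() is the real implementation.
    @SuppressWarnings("unused")
    private static long exampleHsIdPacking(int hostId, int siteId) {
        return ((long) siteId << 32) | (hostId & 0xffffffffL);
    }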
    class RussianRouletteMailbox implements Mailbox {
        int m_totalSends;
        int m_heartBeatSends;
        private final Long m_siteId;
        final ArrayList<Deque<VoltMessage>> m_messages = new ArrayList<Deque<VoltMessage>>();
        private int m_failureProb;

        public RussianRouletteMailbox(Long siteId) {
            for (Subject s : Subject.values()) {
                m_messages.add(s.getId(), new ArrayDeque<VoltMessage>());
            }
            m_failureProb = 0;
            m_siteId = siteId;
            m_totalSends = 0;
        }

        @Override
        public long getHSId() {
            return m_siteId;
        }

        @Override
        public void setHSId(long hsId) {
            throw new UnsupportedOperationException();
        }

        void setFailureLikelihood(int failChance) {
            m_failureProb = failChance;
        }

        // Synchronized so failure-state changes are thread-safe
        // across execution site accesses
        synchronized boolean shouldFail(boolean broadcast) {
            boolean fail = false;
            // if we don't want failure, don't give m_nextMustDie a chance to kill us
            if (m_failureProb == 0) {
                return false;
            }
            int failchance = m_failureProb;
            if (broadcast) {
                failchance *= 10;
            }
            int nextMustDie = m_nextMustDie.decrementAndGet();
            if (nextMustDie == 0 ||
                    (m_rand.nextInt(FAIL_RANGE) >= (FAIL_RANGE - failchance))) {
                fail = true;
                if (nextMustDie < 0) {
                    if (m_rand.nextInt(1) == 0) {
                        m_nextMustDie.set(100);
                    }
                }
            }
            return fail;
        }

        @Override
        public void send(long siteId, VoltMessage message) {
            message.m_sourceHSId = m_siteId;
            m_totalSends++;
            if (message instanceof HeartbeatResponseMessage) {
                m_heartBeatSends++;
            }
            if (shouldFail(false)) {
                killSite();
                return;
            }
            RussianRouletteMailbox dest = postoffice.get(siteId);
            if (dest != null) {
                dest.deliver(message);
            }
        }

        @Override
        public void send(long[] siteIds, VoltMessage message) {
            message.m_sourceHSId = m_siteId;
            if (message instanceof HeartbeatResponseMessage) {
                m_heartBeatSends += siteIds.length;
            }
            for (int i = 0; siteIds != null && i < siteIds.length; ++i) {
                m_totalSends++;
                // Single-message send()s greatly outnumber these broadcast
                // send()s, so increase the failure chance here, where the
                // behavior is more interesting.
                if (shouldFail(true)) {
                    //System.out.println("FAILING NODE MID-BROADCAST");
                    killSite();
                    return;
                }
                RussianRouletteMailbox dest = postoffice.get(siteIds[i]);
                if (dest != null) {
                    dest.deliver(message);
                }
            }
        }

        public int getWaitingCount() {
            throw new UnsupportedOperationException();
        }

        private final Subject m_defaultSubjects[] = new Subject[] { Subject.FAILURE, Subject.DEFAULT };

        @Override
        public VoltMessage recv() {
            return recv(m_defaultSubjects);
        }

        @Override
        public VoltMessage recvBlocking() {
            return recvBlocking(m_defaultSubjects);
        }

        @Override
        public VoltMessage recvBlocking(long timeout) {
            return recvBlocking(m_defaultSubjects, timeout);
        }

        @Override
        public synchronized VoltMessage recv(Subject subjects[]) {
            for (Subject s : subjects) {
                final Deque<VoltMessage> dq = m_messages.get(s.getId());
                assert (dq != null);
                VoltMessage m = dq.poll();
                if (m != null) {
                    return m;
                }
            }
            return null;
        }

        @Override
        public synchronized VoltMessage recvBlocking(Subject subjects[]) {
            VoltMessage message = null;
            while (message == null) {
                for (Subject s : subjects) {
                    final Deque<VoltMessage> dq = m_messages.get(s.getId());
                    message = dq.poll();
                    if (message != null) {
                        return message;
                    }
                }
                try {
                    this.wait();
                } catch (InterruptedException e) {
                    return null;
                }
            }
            return null;
        }

        @Override
        public synchronized VoltMessage recvBlocking(Subject subjects[], long timeout) {
            VoltMessage message = null;
            for (Subject s : subjects) {
                final Deque<VoltMessage> dq = m_messages.get(s.getId());
                message = dq.poll();
                if (message != null) {
                    return message;
                }
            }
            try {
                this.wait(timeout);
            } catch (InterruptedException e) {
                return null;
            }
            for (Subject s : subjects) {
                final Deque<VoltMessage> dq = m_messages.get(s.getId());
                message = dq.poll();
                if (message != null) {
                    return message;
                }
            }
            return null;
        }

        @Override
        public void deliver(VoltMessage message) {
            deliver(message, false);
        }

        @Override
        public void deliverFront(VoltMessage message) {
            deliver(message, true);
        }

        public void deliver(VoltMessage message, final boolean toFront) {
            final Deque<VoltMessage> dq = m_messages.get(message.getSubject());
            synchronized (this) {
                if (toFront) {
                    dq.push(message);
                } else {
                    dq.offer(message);
                }
                this.notify();
            }
        }

        // Unfortunately uses Thread.stop() to swiftly slay an execution site
        // like ninja...easiest way I found to kill the site and not allow it to
        // finish sending the messages for whatever transaction it was
        // involved in.
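        // (Added clarification.) In the version shown here, killSite() below does not
        // literally call Thread.stop(): it throws an Error out of the mailbox send path
        // so the site thread dies mid-transaction. The separate coordinator-death
        // simulation in MockMPProcedureRunner throws ThreadDeath, which is the exception
        // Thread.stop() would have delivered.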
void killSite() { // Log breadcrumbs for validator m_siteLogger.get(m_siteId).trace("FUZZTEST selfNodeFailure " + CoreUtils.getHostIdFromHSId(m_siteId)); // mark the site as down in the catalog //System.out.println("KILLING SITE: " + m_siteId); m_sites.get(m_siteId).shutdown(); m_voltdb.killSite(m_siteId); long initiatorId = getInitiatorIds()[CoreUtils.getSiteIdFromHSId(m_siteId)]; m_voltdb.killSite(initiatorId); /* * Fail the site, and its initiator */ m_voltdb.getFaultDistributor() .reportFault(new SiteFailureFault(Arrays.asList(new Long[] { m_siteId, initiatorId }))); // remove this site from the postoffice postoffice.remove(m_siteId); // stop/join this site's thread throw new Error(); } } // ExecutionSite's snapshot processor requires the shared library static { EELibraryLoader.loadExecutionEngineLibrary(true); } /* * If you change the topology parameters there is a chance you might * make the test for ENG1617 invalid */ // Topology parameters private static final int K_FACTOR = 2; private static final int PARTITION_COUNT = 3; private static final int SITE_COUNT = PARTITION_COUNT * (K_FACTOR + 1); MockVoltDB m_voltdb; ExecutionSiteFuzzChecker m_checker; Map<Long, RestrictedPriorityQueue> m_rpqs = new HashMap<Long, RestrictedPriorityQueue>(); Map<Long, ExecutionSite> m_sites = new HashMap<Long, ExecutionSite>(); Map<Long, RussianRouletteMailbox> m_mboxes = new HashMap<Long, RussianRouletteMailbox>(); Map<Long, Thread> m_siteThreads = new HashMap<Long, Thread>(); Map<Long, VoltLogger> m_siteLogger = new HashMap<Long, VoltLogger>(); Map<Long, StringWriter> m_siteResults = new HashMap<Long, StringWriter>(); Random m_rand; private void start(int siteCount, int partitionCount, int kFactor) throws Exception { long seed = System.currentTimeMillis(); m_rand = new Random(seed); m_checker = new ExecutionSiteFuzzChecker(); m_voltdb = new MockVoltDB(); m_voltdb.setFaultDistributor(new FaultDistributor(m_voltdb)); // one host and one initiator per site for (int ss = 0; ss < siteCount; ss++) { final long siteId = CoreUtils.getHSIdFromHostAndSite(getHostIdForSiteId(ss), getInitiatorIdForSiteId(ss)); m_voltdb.addSite(siteId, MailboxType.Initiator); // Configure log4j so that ExecutionSite generates FUZZTEST output String logname = ExecutionSite.class.getName() + "." 
+ ss; final VoltLogger siteLogger = new VoltLogger(logname); m_siteLogger.put(siteId, siteLogger); final StringWriter siteResults = new StringWriter(); m_siteResults.put(siteId, siteResults); siteLogger.addSimpleWriterAppender(siteResults); siteLogger.setLevel(Level.TRACE); } // create k+1 sites per partition int siteIndex = 0; for (int pp = 0; pp < partitionCount; pp++) { for (int kk = 0; kk < (kFactor + 1); kk++) { final long siteId = CoreUtils.getHSIdFromHostAndSite(getHostIdForSiteId(siteIndex), siteIndex); m_voltdb.addSite(siteId, pp); m_checker.addSite(siteId, pp, m_siteResults .get(CoreUtils.getHSIdFromHostAndSite(siteIndex, getInitiatorIdForSiteId(siteIndex)))); ++siteIndex; } } if (siteIndex != siteCount) { throw new RuntimeException("Invalid setup logic."); } Procedure proc = null; proc = m_voltdb.addProcedureForTest(MockSPVoltProcedure.class.getName()); proc.setReadonly(false); proc.setSinglepartition(true); proc = m_voltdb.addProcedureForTest(MockROSPVoltProcedure.class.getName()); proc.setReadonly(true); proc.setSinglepartition(true); proc = m_voltdb.addProcedureForTest(MockMPVoltProcedure.class.getName()); proc.setReadonly(false); proc.setSinglepartition(false); proc = m_voltdb.addProcedureForTest(MockMPVoltProcedureRollbackParticipant.class.getName()); proc.setReadonly(false); proc.setSinglepartition(false); // Done with the logical topology. VoltDB.replaceVoltDBInstanceForTest(m_voltdb); // Create the real objects for (int ss = 0; ss < siteCount; ++ss) { long siteId = CoreUtils.getHSIdFromHostAndSite(ss, ss); m_mboxes.put(siteId, new RussianRouletteMailbox(siteId)); m_siteLogger.put(siteId, m_siteLogger.get(CoreUtils.getHSIdFromHostAndSite(ss, getInitiatorIdForSiteId(ss)))); m_rpqs.put(siteId, new RestrictedPriorityARRR(getInitiatorIds(), ss, m_mboxes.get(siteId))); m_sites.put(siteId, new ExecutionSite(m_voltdb, m_mboxes.get(siteId), null, m_rpqs.get(siteId), new MockProcedureRunnerFactory(), false, false, 0, partitionCount, null)); registerMailbox(siteId, m_mboxes.get(siteId)); } } @Override protected void tearDown() throws Exception { System.out.println("Doing shutdown"); super.tearDown(); m_sites.clear(); m_mboxes.clear(); if (m_voltdb != null) { m_voltdb.shutdown(null); m_voltdb = null; } System.out.println("Shutdown ZK"); m_checker = null; } /* Partitions are assigned to sites in sequence: a,a,b,b,c,c.. 
*/ int getPartitionIdForSiteId(int siteId) { return (int) Math.floor(siteId / (K_FACTOR + 1)); } /* Initiator ids are site ids + 1,000 */ int getInitiatorIdForSiteId(int siteId) { return siteId + 1000; } /* Get a site on the same "host" as the initiator */ long getSiteIdForInitiatorId(long initiatorId) { int siteId = CoreUtils.getSiteIdFromHSId(initiatorId) - 1000; assert (CoreUtils.getHSIdFromHostAndSite(siteId, getInitiatorIdForSiteId(siteId)) == initiatorId); return CoreUtils.getHSIdFromHostAndSite(siteId, siteId); } long getHSIdForES(int siteId) { return CoreUtils.getHSIdFromHostAndSite(getHostIdForSiteId(siteId), siteId); } /* return a new array of initiator ids */ long[] getInitiatorIds() { long[] ids = new long[SITE_COUNT]; for (int ss = 0; ss < SITE_COUNT; ss++) { ids[ss] = CoreUtils.getHSIdFromHostAndSite(getHostIdForSiteId(ss), getInitiatorIdForSiteId(ss)); } return ids; } /* Random initiator */ private long selectRandomInitiator(Random rand) { int site = rand.nextInt(SITE_COUNT); return CoreUtils.getHSIdFromHostAndSite(getHostIdForSiteId(site), getInitiatorIdForSiteId(site)); } /* Given a partition, return a coordinator by value and the participants by out-param. */ private long selectCoordinatorAndParticipants(Random rand, int partition, long initiator, List<Long> participants) { // Failure detection relies on the assumption that coordinator and // initiator are co-located long coordinator = getSiteIdForInitiatorId(initiator); for (int i = 0; i < SITE_COUNT; i++) { long hsId = getHSIdForES(i); if (hsId == coordinator) continue; else participants.add(hsId); } return coordinator; } /* Host ids are site ids */ int getHostIdForSiteId(int siteId) { return siteId; } List<Long> getSiteIdsForPartitionId(int partitionId) { ArrayList<Long> result = new ArrayList<Long>(); for (int ss = 0; ss < SITE_COUNT; ++ss) { if (getPartitionIdForSiteId(ss) == partitionId) { result.add(getHSIdForES(ss)); } } return result; } /* Fake RestrictedPriorityQueue implementation */ public static class RestrictedPriorityARRR extends RestrictedPriorityQueue { private static final long serialVersionUID = 1L; /** * Initialize the RPQ with the set of initiators in the system and * the corresponding execution site's mailbox. Ugh. */ public RestrictedPriorityARRR(long[] initiatorSiteIds, long siteId, Mailbox mbox) { super(initiatorSiteIds, siteId, mbox, true); } } public static class MockProcedureRunnerFactory extends ProcedureRunnerFactory { @Override public ProcedureRunner create(VoltProcedure procedure, Procedure catProc, CatalogSpecificPlanner csp) { if (procedure instanceof MockROSPVoltProcedure) return new MockSPProcedureRunner((MockSPVoltProcedure) procedure, (ExecutionSite) super.m_site); else if (procedure instanceof MockSPVoltProcedure) return new MockSPProcedureRunner((MockSPVoltProcedure) procedure, (ExecutionSite) super.m_site); else if (procedure instanceof MockMPVoltProcedure) return new MockMPProcedureRunner((MockMPVoltProcedure) procedure, (ExecutionSite) super.m_site); else if (procedure instanceof VoltSystemProcedure) return super.create(procedure, catProc, csp); else assert (false); return null; } } public static class MockSPProcedureRunner extends ProcedureRunner { public static int m_called = 0; final MockSPVoltProcedure m_procedure; final ExecutionSite m_site; MockSPProcedureRunner(MockSPVoltProcedure procedure, ExecutionSite site) { super(procedure, site, null, null, null); m_procedure = procedure; m_site = site; } @Override public ClientResponseImpl call(long txnId, Object... 
paramList) { m_site.simulateExecutePlanFragments(m_txnState.txnId, m_procedure.testReadOnly()); final ClientResponseImpl response = new ClientResponseImpl(ClientResponseImpl.SUCCESS, new VoltTable[] {}, "MockSPVoltProcedure Response"); ++m_called; return response; } @Override protected void reflect() { } } /* Single partition write */ public static class MockSPVoltProcedure extends VoltProcedure { boolean testReadOnly() { return false; } } /* Single partition read */ public static class MockROSPVoltProcedure extends MockSPVoltProcedure { @Override boolean testReadOnly() { return true; } } public static class MockMPVoltProcedureRollbackParticipant extends MockMPVoltProcedure { @Override boolean rollbackParticipant() { return true; } } public static class MockMPVoltProcedure extends VoltProcedure { boolean rollbackParticipant() { return false; } } /* Multi-partition - mock VoltProcedure.slowPath() */ public static class MockMPProcedureRunner extends ProcedureRunner { public final MockMPVoltProcedure m_procedure; public final ExecutionSite m_site; int m_numberOfBatches = 1; // Enable these simulated faults before running the procedure by setting // one of these booleans to true. Allows testcases to simulate various // coordinator node failures. Faults are turned back off once simulated // by the procedure (since they're static...) public static boolean simulate_coordinator_dies_during_commit = false; // Counter for test cases that want to see if the procedure ran. public static int m_called = 0; // Some functions that can be overridden by subclasses to change behavior. int numberOfBatches() { return m_numberOfBatches; } int statementsPerBatch() { return 1; } boolean nonTransactional() { return true; } /* TODO: implement these. boolean rollbackCoordinator() { return false; } boolean userRollbackProcStart() { return false; } boolean userRollbackProcEnd() { return false; } */ public MockMPProcedureRunner(MockMPVoltProcedure procedure, ExecutionSite site) { super(procedure, site, null, null, null); m_procedure = procedure; m_site = site; } @Override protected void reflect() { } /** Helper to look for interesting params in the list and set * internal state based on it */ private void parseParamList(Object... paramList) { ArrayList<Object> params = new ArrayList<Object>(); for (Object param : paramList) { params.add(param); } // parse out number of batches int num_batches_index = params.indexOf("number_of_batches"); if (num_batches_index != -1) { m_numberOfBatches = (Integer) params.get(num_batches_index + 1); } } /** Helper to turn object list into parameter set buffer */ private ByteBuffer createParametersBuffer(Object... paramList) { ParameterSet paramSet = new ParameterSet(); paramSet.setParameters(paramList); FastSerializer fs = new FastSerializer(); try { fs.writeObject(paramSet); } catch (IOException e) { throw new RuntimeException(e); } ByteBuffer paramBuf = fs.getBuffer(); return paramBuf; } @Override public ClientResponseImpl call(long txnId, Object... paramList) { try { parseParamList(paramList); ByteBuffer paramBuf = createParametersBuffer(paramList); for (int i = 0; i < numberOfBatches(); i++) { boolean finalTask = (i == numberOfBatches() - 1); // XXX-IZZY these will turn into arrays for multi-statement batches // Build the aggregator and the distributed tasks. 
int localTask_startDep = m_txnState.getNextDependencyId() | DtxnConstants.MULTIPARTITION_DEPENDENCY; int localTask_outputDep = m_txnState.getNextDependencyId(); FragmentTaskMessage localTask = FragmentTaskMessage.createWithOneFragment( m_txnState.initiatorHSId, m_txnState.coordinatorSiteId, m_txnState.txnId, m_txnState.isReadOnly(), 1, localTask_outputDep, paramBuf, false); localTask.addInputDepId(0, localTask_startDep); FragmentTaskMessage distributedTask = FragmentTaskMessage.createWithOneFragment( m_txnState.initiatorHSId, m_txnState.coordinatorSiteId, m_txnState.txnId, m_txnState.isReadOnly(), 0, localTask_startDep, paramBuf, finalTask); m_txnState.createLocalFragmentWork(localTask, nonTransactional() && finalTask); m_txnState.createAllParticipatingFragmentWork(distributedTask); m_txnState.setupProcedureResume(finalTask, new int[] { localTask_outputDep }); final Map<Integer, List<VoltTable>> resultDeps = m_site.recursableRun(m_txnState); assertTrue(resultDeps != null); } ++m_called; // simulate node failure: no commit sent to participant if (simulate_coordinator_dies_during_commit) { // turn off the fault for the next time through simulate_coordinator_dies_during_commit = false; throw new ThreadDeath(); } // Return a made up table (no EE interaction anyway.. ) VoltTable[] vta = new VoltTable[1]; vta[0] = new VoltTable(new VoltTable.ColumnInfo("", VoltType.INTEGER)); vta[0].addRow(new Integer(1)); return new ClientResponseImpl(ClientResponse.SUCCESS, vta, null); } // VoltProcedure's call method converts invocation exceptions // to this error path. Do the same here. catch (SerializableException ex) { byte status = 0; return new ClientResponseImpl(ClientResponse.GRACEFUL_FAILURE, status, "", new VoltTable[0], "Intentional fuzz failure.", ex); } } } public void testFuzzedTransactions() throws Exception { for (int ii = 0; ii < 1; ii++) { tearDown(); System.gc(); start(SITE_COUNT, PARTITION_COUNT, K_FACTOR); final int totalTransactions = 20000; final long firstTxnId = 10000; for (int i = 0; i < SITE_COUNT; ++i) { m_mboxes.get(getHSIdForES(i)).setFailureLikelihood(1); } queueTransactions(firstTxnId, totalTransactions, m_rand); createAndRunSiteThreads(); // wait for all the sites to terminate runLoops for (int i = 0; i < SITE_COUNT; ++i) { boolean stopped = false; do { try { m_siteThreads.get(getHSIdForES(i)).join(); } catch (InterruptedException e) { } if (m_siteThreads.get(getHSIdForES(i)).isAlive() == false) { System.out.println("Joined site " + i); stopped = true; } } while (!stopped); } for (int i = 0; i < SITE_COUNT; ++i) { System.out.println("sends for mailbox: " + i + ": " + m_mboxes.get(getHSIdForES(i)).m_totalSends); } m_checker.dumpLogs(); assertTrue(m_checker.validateLogs()); } } /* * SinglePartition basecase. Show that recursableRun completes a * single partition transaction. */ public void testSinglePartitionTxn() throws Exception { start(SITE_COUNT, PARTITION_COUNT, K_FACTOR); final boolean readOnly = false; final boolean singlePartition = true; // That the full procedure name is necessary is a bug in the // mock objects - or perhaps an issue with a nested class? // Or maybe a difference in what ClientInterface does? 
final StoredProcedureInvocation tx1_spi = new StoredProcedureInvocation(); tx1_spi.setProcName("org.voltdb.TestExecutionSite$MockSPVoltProcedure"); tx1_spi.setParams("commit", new Integer(0)); final InitiateTaskMessage tx1_mn = new InitiateTaskMessage(getInitiatorIdForSiteId(0), 0, 1000, readOnly, singlePartition, tx1_spi, Long.MAX_VALUE); final long siteId0 = getHSIdForES(0); final SinglePartitionTxnState tx1 = new SinglePartitionTxnState(m_mboxes.get(siteId0), m_sites.get(siteId0), tx1_mn); int callcheck = MockSPProcedureRunner.m_called; assertFalse(tx1.isDone()); assertEquals(0, m_sites.get(siteId0).lastCommittedTxnId); assertEquals(0, m_sites.get(siteId0).lastKnownGloballyCommitedMultiPartTxnId); m_sites.get(siteId0).m_transactionsById.put(tx1.txnId, tx1); m_sites.get(siteId0).recursableRun(tx1); assertTrue(tx1.isDone()); assertEquals(null, m_sites.get(siteId0).m_transactionsById.get(tx1.txnId)); assertEquals((++callcheck), MockSPProcedureRunner.m_called); assertEquals(1000, m_sites.get(siteId0).lastCommittedTxnId); assertEquals(0, m_sites.get(siteId0).lastKnownGloballyCommitedMultiPartTxnId); } /* * Single partition read-only */ public void testROSinglePartitionTxn() throws Exception { start(SITE_COUNT, PARTITION_COUNT, K_FACTOR); final boolean readOnly = true; final boolean singlePartition = true; final StoredProcedureInvocation tx1_spi = new StoredProcedureInvocation(); tx1_spi.setProcName("org.voltdb.TestExecutionSite$MockROSPVoltProcedure"); tx1_spi.setParams("commit", new Integer(0)); final InitiateTaskMessage tx1_mn = new InitiateTaskMessage(getInitiatorIdForSiteId(0), 0, 1000, readOnly, singlePartition, tx1_spi, Long.MAX_VALUE); final long siteId0 = getHSIdForES(0); final SinglePartitionTxnState tx1 = new SinglePartitionTxnState(m_mboxes.get(siteId0), m_sites.get(siteId0), tx1_mn); int callcheck = MockSPProcedureRunner.m_called; assertFalse(tx1.isDone()); m_sites.get(siteId0).m_transactionsById.put(tx1.txnId, tx1); m_sites.get(siteId0).recursableRun(tx1); assertTrue(tx1.isDone()); assertEquals(null, m_sites.get(siteId0).m_transactionsById.get(tx1.txnId)); assertEquals((++callcheck), MockSPProcedureRunner.m_called); } /* * Multipartition basecase. Show that recursableRun completes a * multi partition transaction. 
*/ public void testMultiPartitionTxn() throws Exception { tearDown(); start(2, 2, 0); final boolean readOnly = false, singlePartition = false; Thread es1, es2; final StoredProcedureInvocation tx1_spi = new StoredProcedureInvocation(); tx1_spi.setProcName("org.voltdb.TestExecutionSite$MockMPVoltProcedure"); tx1_spi.setParams("commit", new Integer(0)); final long siteId0 = getHSIdForES(0); final long siteId1 = getHSIdForES(1); // site 1 is the coordinator final InitiateTaskMessage tx1_mn_1 = new InitiateTaskMessage(getInitiatorIdForSiteId(0), 0, 1000, readOnly, singlePartition, tx1_spi, Long.MAX_VALUE, new long[] { siteId1 }); final MultiPartitionParticipantTxnState tx1_1 = new MultiPartitionParticipantTxnState(m_mboxes.get(siteId0), m_sites.get(siteId0), tx1_mn_1); // site 2 is a participant final MultiPartitionParticipantMessage tx1_mn_2 = new MultiPartitionParticipantMessage( getInitiatorIdForSiteId(0), 0, 1000, readOnly); final MultiPartitionParticipantTxnState tx1_2 = new MultiPartitionParticipantTxnState(m_mboxes.get(siteId1), m_sites.get(siteId1), tx1_mn_2); // pre-conditions int callcheck = MockMPProcedureRunner.m_called; assertFalse(tx1_1.isDone()); assertFalse(tx1_2.isDone()); assertEquals(0, m_sites.get(siteId0).lastCommittedTxnId); assertEquals(0, m_sites.get(siteId0).lastKnownGloballyCommitedMultiPartTxnId); assertEquals(0, m_sites.get(siteId1).lastCommittedTxnId); assertEquals(0, m_sites.get(siteId1).lastKnownGloballyCommitedMultiPartTxnId); m_sites.get(siteId0).m_transactionsById.put(tx1_1.txnId, tx1_1); m_sites.get(siteId1).m_transactionsById.put(tx1_2.txnId, tx1_2); // execute transaction es1 = new Thread(new Runnable() { @Override public void run() { m_sites.get(siteId0).recursableRun(tx1_1); } }); es1.start(); es2 = new Thread(new Runnable() { @Override public void run() { m_sites.get(siteId1).recursableRun(tx1_2); } }); es2.start(); es1.join(); es2.join(); // post-conditions assertTrue(tx1_1.isDone()); assertTrue(tx1_2.isDone()); assertEquals(null, m_sites.get(siteId0).m_transactionsById.get(tx1_1.txnId)); assertEquals(null, m_sites.get(siteId1).m_transactionsById.get(tx1_2.txnId)); assertEquals(1000, m_sites.get(siteId1).lastCommittedTxnId); assertEquals(1000, m_sites.get(siteId0).lastKnownGloballyCommitedMultiPartTxnId); assertEquals(1000, m_sites.get(siteId1).lastCommittedTxnId); assertEquals(1000, m_sites.get(siteId1).lastKnownGloballyCommitedMultiPartTxnId); assertEquals((++callcheck), MockMPProcedureRunner.m_called); } public void testMultipartitionParticipantCommitsOnFailure() throws Exception { tearDown(); start(2, 2, 0); // cause the coordinator to die before committing. TestExecutionSite.MockMPProcedureRunner.simulate_coordinator_dies_during_commit = true; // The initiator's global commit point will be -1 because // the restricted priority queue is never fed by this testcase. // TxnIds in this testcase are chosen to make -1 a valid // global commit point. (Where -1 is DUMMY_LAST_SEEN...) final long siteId0 = getHSIdForES(0); final long siteId1 = getHSIdForES(1); // Want to commit this participant. Global commit pt must // be GT than the running txnid. 
m_sites.get(siteId0).lastKnownGloballyCommitedMultiPartTxnId = DtxnConstants.DUMMY_LAST_SEEN_TXN_ID + 1; m_sites.get(siteId1).lastKnownGloballyCommitedMultiPartTxnId = DtxnConstants.DUMMY_LAST_SEEN_TXN_ID + 1; boolean test_rollback = false; multipartitionNodeFailure(test_rollback, DtxnConstants.DUMMY_LAST_SEEN_TXN_ID); } public void testMultiPartitionParticipantRollsbackOnFailure() throws Exception { tearDown(); start(2, 2, 0); // cause the coordinator to die before committing. TestExecutionSite.MockMPProcedureRunner.simulate_coordinator_dies_during_commit = true; // The initiator's global commit point will be -1 because // the restricted priority queue is never fed by this testcase. // TxnIds in this testcase are chosen to make -1 a valid // global commit point. (Where -1 is DUMMY_LAST_SEEN...) final long siteId0 = getHSIdForES(0); final long siteId1 = getHSIdForES(1); // Want to NOT commit this participant. Global commit pt must // be LT than the running txnid. m_sites.get(siteId0).lastKnownGloballyCommitedMultiPartTxnId = DtxnConstants.DUMMY_LAST_SEEN_TXN_ID - 1; m_sites.get(siteId1).lastKnownGloballyCommitedMultiPartTxnId = DtxnConstants.DUMMY_LAST_SEEN_TXN_ID - 1; boolean test_rollback = true; multipartitionNodeFailure(test_rollback, DtxnConstants.DUMMY_LAST_SEEN_TXN_ID); } /* * Simulate a multipartition participant blocked because the coordinating * node failed; at least one other node in the cluster has completed * this transaction -- and therefore it must commit at this participant. */ private void multipartitionNodeFailure(boolean should_rollback, long txnid) throws InterruptedException { final boolean readOnly = false, singlePartition = false; Thread es1, es2; final StoredProcedureInvocation tx1_spi = new StoredProcedureInvocation(); tx1_spi.setProcName("org.voltdb.TestExecutionSite$MockMPVoltProcedure"); tx1_spi.setParams("commit", new Integer(0)); final long siteId0 = getHSIdForES(0); final long siteId1 = getHSIdForES(1); // site 1 is the coordinator final InitiateTaskMessage tx1_mn_1 = new InitiateTaskMessage(getInitiatorIdForSiteId(0), 0, txnid, readOnly, singlePartition, tx1_spi, Long.MAX_VALUE, new long[] { siteId1 }); final MultiPartitionParticipantTxnState tx1_1 = new MultiPartitionParticipantTxnState(m_mboxes.get(siteId0), m_sites.get(siteId0), tx1_mn_1); // site 2 is a participant final MultiPartitionParticipantMessage tx1_mn_2 = new MultiPartitionParticipantMessage( getInitiatorIdForSiteId(0), 0, txnid, readOnly); final MultiPartitionParticipantTxnState tx1_2 = new MultiPartitionParticipantTxnState(m_mboxes.get(siteId1), m_sites.get(siteId1), tx1_mn_2); // pre-conditions int callcheck = MockMPProcedureRunner.m_called; assertFalse(tx1_1.isDone()); assertFalse(tx1_2.isDone()); m_sites.get(siteId0).m_transactionsById.put(tx1_1.txnId, tx1_1); m_sites.get(siteId1).m_transactionsById.put(tx1_2.txnId, tx1_2); // execute transaction es1 = new Thread(new Runnable() { @Override public void run() { m_sites.get(siteId0).recursableRun(tx1_1); } }); es1.start(); es2 = new Thread(new Runnable() { @Override public void run() { m_sites.get(siteId1).recursableRun(tx1_2); } }); es2.start(); es1.join(); // coordinator is now dead. Update the survivor's catalog and // push a fault notice to the participant. Must supply the host id // corresponding to the coordinator site id. 
m_voltdb.killSite(0); m_voltdb.killSite(CoreUtils.getHSIdFromHostAndSite(0, getInitiatorIdForSiteId(0))); m_voltdb.getFaultDistributor().reportFault(new SiteFailureFault(Arrays.asList( new Long[] { CoreUtils.getHSIdFromHostAndSite(0, getInitiatorIdForSiteId(0)), getHSIdForES(0) }))); es2.join(); // post-conditions assertFalse(tx1_1.isDone()); // did not run to completion because of simulated fault assertTrue(tx1_2.isDone()); // did run to completion because of globalCommitPt. assertEquals(should_rollback, tx1_2.needsRollback()); // did not rollback because of globalCommitPt. assertEquals(null, m_sites.get(siteId1).m_transactionsById.get(tx1_2.txnId)); assertEquals((++callcheck), MockMPProcedureRunner.m_called); } /* * Create a multipartition work unit to test the removal of non-coordinator * site ids on failure. A little out of place in this file but the configured * ExecutionSite and Mailbox are necessary to construct a MP txn state. */ @SuppressWarnings("deprecation") public void testMultiPartitionParticipantTxnState_handleSiteFaults() throws Exception { start(SITE_COUNT, PARTITION_COUNT, K_FACTOR); StoredProcedureInvocation spi = new StoredProcedureInvocation(); spi.setClientHandle(25); spi.setProcName("johnisgreat"); spi.setParams("commit", 57, "gooniestoo"); long[] nonCoordinatorSites = new long[SITE_COUNT - 1]; for (int i = 0; i < SITE_COUNT - 1; i++) { nonCoordinatorSites[i] = getHSIdForES(i + 1); } InitiateTaskMessage mn = new InitiateTaskMessage(-1, 0, -1, false, false, spi, Long.MIN_VALUE, nonCoordinatorSites); final long siteId0 = getHSIdForES(0); Mailbox m0 = m_mboxes.get(siteId0); ExecutionSite es0 = m_sites.get(siteId0); MultiPartitionParticipantTxnState ts = new MultiPartitionParticipantTxnState(m0, es0, mn); // fail middle and last site HashSet<Long> failedSites = new HashSet<Long>(); failedSites.add(getHSIdForES(1)); failedSites.add(getHSIdForES(2)); failedSites.add(getHSIdForES(3)); failedSites.add(getHSIdForES(5)); ts.handleSiteFaults(failedSites); // peek at some internals long[] nonCoordinatingSites = ts.getNonCoordinatingSites(); assertEquals(4, nonCoordinatingSites.length); assertEquals(getHSIdForES(8), nonCoordinatingSites[3]); assertEquals(getHSIdForES(7), nonCoordinatingSites[2]); assertEquals(getHSIdForES(6), nonCoordinatingSites[1]); assertEquals(getHSIdForES(4), nonCoordinatingSites[0]); // fail first site ts = new MultiPartitionParticipantTxnState(m0, es0, mn); failedSites.clear(); failedSites.add(getHSIdForES(1)); ts.handleSiteFaults(failedSites); nonCoordinatingSites = ts.getNonCoordinatingSites(); assertEquals(7, nonCoordinatingSites.length); assertEquals(getHSIdForES(8), nonCoordinatingSites[6]); assertEquals(getHSIdForES(7), nonCoordinatingSites[5]); assertEquals(getHSIdForES(6), nonCoordinatingSites[4]); assertEquals(getHSIdForES(4), nonCoordinatingSites[2]); assertEquals(getHSIdForES(3), nonCoordinatingSites[1]); assertEquals(getHSIdForES(2), nonCoordinatingSites[0]); // fail site that isn't a non-coordinator site ts = new MultiPartitionParticipantTxnState(m0, es0, mn); failedSites.clear(); failedSites.add(getHSIdForES(9)); failedSites.add(getHSIdForES(10)); ts.handleSiteFaults(failedSites); nonCoordinatingSites = ts.getNonCoordinatingSites(); assertEquals(8, nonCoordinatingSites.length); assertEquals(getHSIdForES(8), nonCoordinatingSites[7]); assertEquals(getHSIdForES(7), nonCoordinatingSites[6]); assertEquals(getHSIdForES(6), nonCoordinatingSites[5]); assertEquals(getHSIdForES(5), nonCoordinatingSites[4]); assertEquals(getHSIdForES(3), 
nonCoordinatingSites[2]); assertEquals(getHSIdForES(2), nonCoordinatingSites[1]); assertEquals(getHSIdForES(1), nonCoordinatingSites[0]); } /* * Show that a multi-partition transaction proceeds if one of the participants * fails */ public void testFailedMultiPartitionParticipant() throws Exception { tearDown(); start(2, 2, 0); final boolean readOnly = false, singlePartition = false; Thread es1; final StoredProcedureInvocation tx1_spi = new StoredProcedureInvocation(); tx1_spi.setProcName("org.voltdb.TestExecutionSite$MockMPVoltProcedure"); tx1_spi.setParams("commit", new Integer(0)); // site 1 is the coordinator. Use the txn id (DUMMY...) that the R.P.Q. // thinks is a valid safe-to-run txnid. final InitiateTaskMessage tx1_mn_1 = new InitiateTaskMessage(getInitiatorIdForSiteId(0), 0, DtxnConstants.DUMMY_LAST_SEEN_TXN_ID, readOnly, singlePartition, tx1_spi, Long.MAX_VALUE, new long[] { getHSIdForES(1) }); final long siteId0 = getHSIdForES(0); final Mailbox m0 = m_mboxes.get(siteId0); final ExecutionSite es0 = m_sites.get(siteId0); es0.lastKnownGloballyCommitedMultiPartTxnId = DtxnConstants.DUMMY_LAST_SEEN_TXN_ID + 1; final MultiPartitionParticipantTxnState tx1_1 = new MultiPartitionParticipantTxnState(m0, es0, tx1_mn_1); // Site 2 won't exist; we'll claim it fails. // pre-conditions int callcheck = MockMPProcedureRunner.m_called; assertFalse(tx1_1.isDone()); es0.m_transactionsById.put(tx1_1.txnId, tx1_1); // execute transaction es1 = new Thread(new Runnable() { @Override public void run() { es0.recursableRun(tx1_1); } }); es1.start(); m_voltdb.killSite(getHSIdForES(1)); m_voltdb.killSite(getInitiatorIds()[1]); m_voltdb.getFaultDistributor().reportFault( new SiteFailureFault(Arrays.asList(new Long[] { getInitiatorIds()[1], getHSIdForES(1) }))); es1.join(); // post-conditions assertTrue(tx1_1.isDone()); assertFalse(tx1_1.needsRollback()); assertEquals(null, es0.m_transactionsById.get(tx1_1.txnId)); assertEquals((++callcheck), MockMPProcedureRunner.m_called); } /* * FUZZ TESTS FOLLOW * * Driven directly through the ExecutionSite mailboxes. * Mailboxes can terminate a sender (at random) instead of delivering a message. * Verification is performed using the execution site trace logger. */ /** * Create a single partition transaction. 
*/ private void createSPInitiation(boolean readOnly, long txn_id, long safe_txn_id, long initiator_id, int partition_id) { final StoredProcedureInvocation spi = new StoredProcedureInvocation(); spi.setProcName("org.voltdb.TestExecutionSite$MockSPVoltProcedure"); spi.setParams("commit", new Integer(partition_id)); List<Long> sitesForPartition = getSiteIdsForPartitionId(partition_id); for (long i : sitesForPartition) { final InitiateTaskMessage itm = new InitiateTaskMessage(initiator_id, i, // each site is its own coordinator txn_id, readOnly, true, // single partition spi, safe_txn_id); // last safe txnid itm.m_sourceHSId = initiator_id; m_mboxes.get(i).deliver(itm); } } /** * Create a multiple partition transaction */ private void createMPInitiation(boolean rollback, boolean rollback_all, boolean readOnly, int numberOfBatches, long txn_id, long safe_txn_id, long initiator_id, int partition_id, long coordinator_id, List<Long> participants) { ArrayList<Object> params = new ArrayList<Object>(); params.add("number_of_batches"); params.add(new Integer(numberOfBatches)); final StoredProcedureInvocation spi = new StoredProcedureInvocation(); if (!rollback) { spi.setProcName("org.voltdb.TestExecutionSite$MockMPVoltProcedure"); params.add("txn_outcome"); params.add("commit"); params.add(new Integer(partition_id)); spi.setParams(params.toArray()); } else { if (rollback_all) { spi.setProcName("org.voltdb.TestExecutionSite$MockMPVoltProcedureRollbackParticipant"); params.add("txn_outcome"); params.add("rollback_all"); params.add(new Integer(partition_id)); spi.setParams(params.toArray()); } else { spi.setProcName("org.voltdb.TestExecutionSite$MockMPVoltProcedureRollbackParticipant"); params.add("txn_outcome"); params.add("rollback_random"); params.add(new Integer(partition_id)); spi.setParams(params.toArray()); } } testLog.info("Creating MP proc, TXN ID: " + txn_id + ", participants: " + participants.toString()); assert (participants.size() == (SITE_COUNT - 1)); final InitiateTaskMessage itm = new InitiateTaskMessage(initiator_id, coordinator_id, txn_id, readOnly, false, // multi-partition spi, safe_txn_id, // last safe txnid ArrayUtils.toPrimitive(participants.toArray(new Long[0]))); itm.m_sourceHSId = initiator_id; m_mboxes.get(coordinator_id).deliver(itm); for (long participant : participants) { final MultiPartitionParticipantMessage mppm = new MultiPartitionParticipantMessage(initiator_id, coordinator_id, txn_id, readOnly); mppm.m_sourceHSId = initiator_id; m_mboxes.get(participant).deliver(mppm); } } /** * Create a heartbeat for initiator_id. * Currently sent to all up sites (also what simple dtxn does). * @param txn_id * @param safe_txn_id * @param initiator_id */ private void createHeartBeat(long txn_id, long safe_txn_id, long initiator_id) { HeartbeatMessage hbm = new HeartbeatMessage(initiator_id, txn_id, safe_txn_id); hbm.m_sourceHSId = initiator_id; for (Mailbox m : m_mboxes.values()) { m.deliver(hbm); } } /* * Pick a random thing to do. If doing the last transaction, * send a heartbeat to flush all the queues. 
*/ private void queueTransactions(long firstTxnId, int totalTransactions, Random rand) { for (int i = 0; i <= totalTransactions; ++i) { boolean rollback = rand.nextBoolean(); // Disabling this as it results in too many all-failures currently //boolean rollback_all = rand.nextBoolean(); boolean rollback_all = false; boolean readOnly = rand.nextBoolean(); long txnid = i + firstTxnId; long safe_txnid = txnid; long initiator = selectRandomInitiator(rand); int partition = i % PARTITION_COUNT; int wheelOfDestiny = rand.nextInt(100); if (i == totalTransactions) { testLog.info("Queueing final heartbeat."); int offset = 0; for (long inid : getInitiatorIds()) { createHeartBeat(txnid + offset, txnid + offset, inid); ++offset; } } else if (wheelOfDestiny < 50) { createSPInitiation(readOnly, txnid, safe_txnid, initiator, partition); } else if (wheelOfDestiny < 70) { int numberOfBatches = rand.nextInt(4) + 1; List<Long> participants = new ArrayList<Long>(); long coordinator = selectCoordinatorAndParticipants(rand, partition, initiator, participants); createMPInitiation(rollback, rollback_all, readOnly, numberOfBatches, txnid, safe_txnid, initiator, partition, coordinator, participants); } else { createHeartBeat(txnid, safe_txnid, initiator); } } } private void createAndRunSiteThreads() { createAndRunSiteThreads(false); } /* * Run the mailboxes / sites to completion. */ private void createAndRunSiteThreads(final boolean loopUntilPoison) { for (int i = 0; i < SITE_COUNT; ++i) { final long site_id = getHSIdForES(i); m_siteThreads.put(site_id, new Thread(new Runnable() { @Override public void run() { m_sites.get(site_id).runLoop(loopUntilPoison); } }, "Site: " + CoreUtils.hsIdToString(site_id))); } for (int i = 0; i < SITE_COUNT; ++i) { m_siteThreads.get(getHSIdForES(i)).start(); } } /* * Create one txn for each initiator. Both initiators will fail concurrently. * The first transaction will not be fully replicated (only exists at 1). * The second transaction from a different initiator * will be fully replicated, and will have a higher transaction id. * When failure agreement runs for both failures at once, * if the bug exists you should see the transaction from the second failed initiator cause the partially * initiated transaction from the first to be executed at a subset of surviving replicas sites. 
*/ public void testENG1617() throws Exception { System.out.println("Starting testENG1617"); start(SITE_COUNT, PARTITION_COUNT, K_FACTOR); for (RussianRouletteMailbox m : m_mboxes.values()) { m.setFailureLikelihood(0); } createAndRunSiteThreads(true); /* * These are the sites that will receive the txns from the two concurrently dieing initiators */ List<Long> involvedSites1 = getSiteIdsForPartitionId(0); /* * Will use these sites to find initiators to kill */ List<Long> involvedSites2 = getSiteIdsForPartitionId(1); //This initiator will initiate the txn with the lower id that is partially replicated to just one site int initiatorToDie1 = getInitiatorIdForSiteId(CoreUtils.getSiteIdFromHSId(involvedSites2.get(0))); StoredProcedureInvocation spi = new StoredProcedureInvocation(); spi.setProcName("org.voltdb.TestExecutionSite$MockSPVoltProcedure"); spi.setParams("commit", new Integer(0)); InitiateTaskMessage itm = new InitiateTaskMessage(getInitiatorIds()[initiatorToDie1 / 1000], involvedSites1.get(2), // each site is its own coordinator getHSIdForES(1), false, true, // single partition spi, 0); // last safe txnid itm.m_sourceHSId = getInitiatorIds()[initiatorToDie1 / 1000]; m_mboxes.get(involvedSites1.get(2)).deliver(itm); //This initiator will initiate the txn with the higher txn id that is fully replicated int initiatorToDie2 = getInitiatorIdForSiteId(CoreUtils.getSiteIdFromHSId(involvedSites2.get(1))); for (int ii = 0; ii < 3; ii++) { spi = new StoredProcedureInvocation(); spi.setProcName("org.voltdb.TestExecutionSite$MockSPVoltProcedure"); spi.setParams("commit", new Integer(0)); itm = new InitiateTaskMessage(getInitiatorIds()[initiatorToDie2 / 1000], involvedSites1.get(ii), // each site is its own coordinator getHSIdForES(3), false, true, // single partition spi, 2); // last safe txnid itm.m_sourceHSId = getInitiatorIds()[initiatorToDie2 / 1000]; m_mboxes.get(involvedSites1.get(ii)).deliver(itm); } LocalObjectMessage lom = new LocalObjectMessage(new Runnable() { @Override public void run() { throw new Error(); } }); lom.m_sourceHSId = involvedSites2.get(0); /* * Kill the two initiators */ m_mboxes.get(involvedSites2.get(0)).deliver(lom); lom = new LocalObjectMessage(new Runnable() { @Override public void run() { throw new Error(); } }); lom.m_sourceHSId = involvedSites2.get(1); m_mboxes.get(involvedSites2.get(1)).deliver(lom); m_siteThreads.get(involvedSites2.get(0)).join(); m_siteThreads.get(involvedSites2.get(1)).join(); m_sites.get(involvedSites2.get(0)).shutdown(); m_voltdb.killSite(involvedSites2.get(0)); m_voltdb.getFaultDistributor() .reportFault( new SiteFailureFault(Arrays.asList(new Long[] { CoreUtils.getHSIdFromHostAndSite(involvedSites2.get(0).intValue(), (int) (involvedSites2.get(0).longValue() >> 32) + 1000), involvedSites2.get(0) }))); // remove this site from the postoffice postoffice.remove(involvedSites2.get(0)); m_sites.get(involvedSites2.get(1)).shutdown(); m_voltdb.killSite(involvedSites2.get(1)); m_voltdb.getFaultDistributor() .reportFault( new SiteFailureFault(Arrays.asList(new Long[] { CoreUtils.getHSIdFromHostAndSite(involvedSites2.get(1).intValue(), (int) (involvedSites2.get(1).longValue() >> 32) + 1000), involvedSites2.get(1) }))); // remove this site from the postoffice postoffice.remove(involvedSites2.get(1)); Thread.sleep(200); /* * Spin for a while giving them a chance to process all the failures. 
         */
        long start = System.currentTimeMillis();
        while (System.currentTimeMillis() - start < 15000) {
            boolean containsBadValue = false;
            for (ExecutionSite site : m_sites.values()) {
                if (site != null) {
                    if (site.m_transactionsById.containsKey(1L) ||
                            site.m_transactionsById.containsKey(3L)) {
                        containsBadValue = true;
                    }
                }
            }
            if (!containsBadValue) {
                break;
            } else {
                Thread.sleep(100);
            }
        }

        for (RussianRouletteMailbox mailbox : m_mboxes.values()) {
            if (mailbox != null) {
                lom = new LocalObjectMessage(new Runnable() {
                    @Override
                    public void run() {
                        throw new Error();
                    }
                });
                lom.m_sourceHSId = mailbox.getHSId();
                mailbox.deliver(lom);
            }
        }

        for (Thread t : m_siteThreads.values()) {
            t.join();
        }

        /*
         * Txn 1 should have been dropped because it was partially replicated when the
         * initiator failed. In the old code it would have been marked as safely replicated
         * due to txn 2 from the other initiator that failed concurrently.
         */
        for (ExecutionSite es : m_sites.values()) {
            if (es != null) {
                if (es.m_transactionsById.containsKey(1L)) {
                    System.out.println("Site " + es.getCorrespondingSiteId() + " contains txn 1");
                }
                assertFalse(es.m_transactionsById.containsKey(1L));
                if (es.m_transactionsById.containsKey(3L)) {
                    System.out.println("Site " + es.getCorrespondingSiteId() + " contains txn 3");
                }
                assertFalse(es.m_transactionsById.containsKey(3L));
            }
        }
    }
}
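Since TestExecutionSite extends junit.framework.TestCase, any JUnit 3 runner can drive it once the VoltDB test classpath and the native execution engine library (loaded by the EELibraryLoader call in the static initializer above) are available. The launcher below is only a hypothetical sketch for running the class outside a build script; the class name RunExecutionSiteFuzz is invented for illustration and is not part of VoltDB.

    package org.voltdb;

    import junit.framework.TestSuite;
    import junit.textui.TestRunner;

    // Hypothetical standalone launcher: wraps the test class in a JUnit 3 TestSuite
    // and runs every test* method in it through the stock text runner.
    public class RunExecutionSiteFuzz {
        public static void main(String[] args) {
            TestRunner.run(new TestSuite(TestExecutionSite.class));
        }
    }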