Java tutorial
/** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * */ package org.apache.bookkeeper.replication; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; import java.net.URI; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Random; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import lombok.Cleanup; import org.apache.bookkeeper.bookie.Bookie; import org.apache.bookkeeper.client.AsyncCallback.AddCallback; import org.apache.bookkeeper.client.BKException; import org.apache.bookkeeper.client.BookKeeper.DigestType; import org.apache.bookkeeper.client.LedgerHandle; import org.apache.bookkeeper.client.LedgerMetadata; import 
org.apache.bookkeeper.common.util.OrderedScheduler; import org.apache.bookkeeper.conf.ServerConfiguration; import org.apache.bookkeeper.meta.LedgerManager; import org.apache.bookkeeper.meta.MetadataClientDriver; import org.apache.bookkeeper.meta.MetadataDrivers; import org.apache.bookkeeper.meta.ZkLedgerUnderreplicationManager; import org.apache.bookkeeper.meta.zk.ZKMetadataDriverBase; import org.apache.bookkeeper.net.BookieSocketAddress; import org.apache.bookkeeper.proto.BookieServer; import org.apache.bookkeeper.proto.DataFormats.UnderreplicatedLedgerFormat; import org.apache.bookkeeper.replication.ReplicationException.CompatibilityException; import org.apache.bookkeeper.replication.ReplicationException.UnavailableException; import org.apache.bookkeeper.stats.NullStatsLogger; import org.apache.bookkeeper.test.BookKeeperClusterTestCase; import org.apache.commons.lang.mutable.MutableInt; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Tests publishing of under replicated ledgers by the Auditor bookie node when the * corresponding bookies are identified as not running. 
*/
public class AuditorLedgerCheckerTest extends BookKeeperClusterTestCase {

    // Depending on the taste, select the amount of logging
    // by decommenting one of the two lines below
    // private static final Logger LOG = Logger.getRootLogger();
    private static final Logger LOG = LoggerFactory.getLogger(AuditorLedgerCheckerTest.class);

    // Explicit charset so the password bytes do not depend on the platform default.
    private static final byte[] ledgerPassword = "aaa".getBytes(java.nio.charset.StandardCharsets.UTF_8);

    private Random rng; // Random Number Generator
    private DigestType digestType;
    private String underreplicatedPath;
    // Auditor electors keyed by the string form of the owning bookie's local address.
    private Map<String, AuditorElector> auditorElectors = new ConcurrentHashMap<>();
    private ZkLedgerUnderreplicationManager urLedgerMgr;
    // Ledgers for which a watcher notification has been received (filled by ChildWatcher).
    private Set<Long> urLedgerList;
    private String electionPath;
    // Ledgers created by the running test whose under-replication znodes are watched.
    private List<Long> ledgerList;

    /**
     * Creates the test case using the hierarchical ledger manager factory.
     */
    public AuditorLedgerCheckerTest()
            throws IOException, KeeperException, InterruptedException, CompatibilityException {
        this("org.apache.bookkeeper.meta.HierarchicalLedgerManagerFactory");
    }

    /**
     * Creates the test case with a three-bookie cluster and the given ledger manager factory.
     *
     * @param ledgerManagerFactoryClass class name set on both the server and the client configuration
     */
    AuditorLedgerCheckerTest(String ledgerManagerFactoryClass)
            throws IOException, KeeperException, InterruptedException, CompatibilityException {
        super(3);
        LOG.info("Running test case using ledger manager : " + ledgerManagerFactoryClass);
        this.digestType = DigestType.CRC32;
        // set ledger manager name
        baseConf.setLedgerManagerFactoryClassName(ledgerManagerFactoryClass);
        baseClientConf.setLedgerManagerFactoryClassName(ledgerManagerFactoryClass);
    }

    /**
     * Runs the parent set-up, resolves the under-replication and auditor-election
     * znode paths, creates the under-replication manager, starts one auditor
     * elector per bookie and resets the per-test ledger bookkeeping.
     */
    @Before
    public void setUp() throws Exception {
        super.setUp();
        underreplicatedPath = ZKMetadataDriverBase.resolveZkLedgersRootPath(baseClientConf)
                + "/underreplication/ledgers";
        electionPath = ZKMetadataDriverBase.resolveZkLedgersRootPath(baseConf)
                + "/underreplication/auditorelection";

        urLedgerMgr = new ZkLedgerUnderreplicationManager(baseClientConf, zkc);
        startAuditorElectors();
        rng = new Random(System.currentTimeMillis()); // Initialize the Random
        // Written from the watcher callback and read from the test methods, so use a
        // concurrent set (assumes the callback runs on the ZooKeeper event thread).
        urLedgerList = ConcurrentHashMap.newKeySet();
        ledgerList = new ArrayList<>(2);
        baseClientConf.setMetadataServiceUri(zkUtil.getMetadataServiceUri());
baseConf.setMetadataServiceUri(zkUtil.getMetadataServiceUri()); } @Override public void tearDown() throws Exception { stopAuditorElectors(); super.tearDown(); } private void startAuditorElectors() throws Exception { for (BookieServer bserver : bs) { String addr = bserver.getLocalAddress().toString(); AuditorElector auditorElector = new AuditorElector(addr, baseConf, zkc); auditorElectors.put(addr, auditorElector); auditorElector.start(); LOG.debug("Starting Auditor Elector"); } } private void stopAuditorElectors() throws Exception { for (AuditorElector auditorElector : auditorElectors.values()) { auditorElector.shutdown(); LOG.debug("Stopping Auditor Elector!"); } } /** * Test publishing of under replicated ledgers by the auditor bookie. */ @Test public void testSimpleLedger() throws Exception { LedgerHandle lh1 = createAndAddEntriesToLedger(); Long ledgerId = lh1.getId(); LOG.debug("Created ledger : " + ledgerId); ledgerList.add(ledgerId); lh1.close(); final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size()); int bkShutdownIndex = bs.size() - 1; String shutdownBookie = shutdownBookie(bkShutdownIndex); // grace period for publishing the bk-ledger LOG.debug("Waiting for ledgers to be marked as under replicated"); waitForAuditToComplete(); underReplicaLatch.await(5, TimeUnit.SECONDS); Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList); assertEquals("Missed identifying under replicated ledgers", 1, urLedgerList.size()); /* * Sample data format present in the under replicated ledger path * * {4=replica: "10.18.89.153:5002"} */ assertTrue("Ledger is not marked as underreplicated:" + ledgerId, urLedgerList.contains(ledgerId)); String data = urLedgerData.get(ledgerId); assertTrue("Bookie " + shutdownBookie + "is not listed in the ledger as missing replica :" + data, data.contains(shutdownBookie)); } /** * Test once published under replicated ledger should exists even after * restarting respective bookie. 
*/
    @Test
    public void testRestartBookie() throws Exception {
        LedgerHandle lh1 = createAndAddEntriesToLedger();
        LedgerHandle lh2 = createAndAddEntriesToLedger();

        LOG.debug("Created following ledgers : {}, {}", lh1, lh2);

        int bkShutdownIndex = bs.size() - 1;
        ServerConfiguration bookieConf1 = bsConfs.get(bkShutdownIndex);
        String shutdownBookie = shutdownBookie(bkShutdownIndex);

        // restart the failed bookie
        bs.add(startBookie(bookieConf1));

        // The result used to be ignored, so this test could never fail.
        assertTrue("Ledger " + lh1.getId() + " should be missing the replica on " + shutdownBookie,
                waitForLedgerMissingReplicas(lh1.getId(), 10, shutdownBookie));
        assertTrue("Ledger " + lh2.getId() + " should be missing the replica on " + shutdownBookie,
                waitForLedgerMissingReplicas(lh2.getId(), 10, shutdownBookie));
    }

    /**
     * Test publishing of under replicated ledgers when multiple bookie failures
     * one after another.
     */
    @Test
    public void testMultipleBookieFailures() throws Exception {
        LedgerHandle lh1 = createAndAddEntriesToLedger();

        // failing first bookie
        shutdownBookie(bs.size() - 1);

        // simulate re-replication
        doLedgerRereplication(lh1.getId());

        // failing another bookie
        String shutdownBookie = shutdownBookie(bs.size() - 1);

        // grace period for publishing the bk-ledger
        LOG.debug("Waiting for ledgers to be marked as under replicated");
        assertTrue("Ledger should be missing second replica",
                waitForLedgerMissingReplicas(lh1.getId(), 10, shutdownBookie));
    }

    /**
     * Test that no ledger is reported as under replicated while ledger
     * replication is disabled, and that the report arrives once it is enabled again.
     */
    @Test
    public void testToggleLedgerReplication() throws Exception {
        LedgerHandle lh1 = createAndAddEntriesToLedger();
        ledgerList.add(lh1.getId());
        LOG.debug("Created following ledgers : " + ledgerList);

        // watch the under-replication znodes before any bookie fails
        CountDownLatch urReplicaLatch = registerUrLedgerWatcher(ledgerList.size());

        // disabling ledger replication
        urLedgerMgr.disableLedgerReplication();

        // fail two bookies while replication is disabled
        shutdownBookie(bs.size() - 1);
        shutdownBookie(bs.size() - 1);

        assertFalse("Ledger replication is not disabled!", urReplicaLatch.await(1, TimeUnit.SECONDS));

        // enabling ledger replication
        urLedgerMgr.enableLedgerReplication();
        assertTrue("Ledger replication is not enabled!",
urReplicaLatch.await(5, TimeUnit.SECONDS)); } @Test public void testDuplicateEnDisableAutoRecovery() throws Exception { urLedgerMgr.disableLedgerReplication(); try { urLedgerMgr.disableLedgerReplication(); fail("Must throw exception, since AutoRecovery is already disabled"); } catch (UnavailableException e) { assertTrue("AutoRecovery is not disabled previously!", e.getCause() instanceof KeeperException.NodeExistsException); } urLedgerMgr.enableLedgerReplication(); try { urLedgerMgr.enableLedgerReplication(); fail("Must throw exception, since AutoRecovery is already enabled"); } catch (UnavailableException e) { assertTrue("AutoRecovery is not enabled previously!", e.getCause() instanceof KeeperException.NoNodeException); } } /** * Test Auditor should consider Readonly bookie as available bookie. Should not publish ur ledgers for * readonly bookies. */ @Test public void testReadOnlyBookieExclusionFromURLedgersCheck() throws Exception { LedgerHandle lh = createAndAddEntriesToLedger(); ledgerList.add(lh.getId()); LOG.debug("Created following ledgers : " + ledgerList); int count = ledgerList.size(); final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(count); final int bkIndex = 2; ServerConfiguration bookieConf = bsConfs.get(bkIndex); BookieServer bk = bs.get(bkIndex); bookieConf.setReadOnlyModeEnabled(true); bk.getBookie().getStateManager().doTransitionToReadOnlyMode(); bkc.waitForReadOnlyBookie(Bookie.getBookieAddress(bsConfs.get(bkIndex))).get(30, TimeUnit.SECONDS); // grace period for publishing the bk-ledger LOG.debug("Waiting for Auditor to finish ledger check."); waitForAuditToComplete(); assertFalse("latch should not have completed", underReplicaLatch.await(5, TimeUnit.SECONDS)); } /** * Test Auditor should consider Readonly bookie fail and publish ur ledgers for readonly bookies. 
*/
    @Test
    public void testReadOnlyBookieShutdown() throws Exception {
        LedgerHandle lh = createAndAddEntriesToLedger();
        long ledgerId = lh.getId();
        ledgerList.add(ledgerId);
        LOG.debug("Created following ledgers : " + ledgerList);

        int count = ledgerList.size();
        final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(count);

        int bkIndex = bs.size() - 1;
        LOG.debug("Moving bookie {} {} to read only...", bkIndex, bs.get(bkIndex));
        ServerConfiguration bookieConf = bsConfs.get(bkIndex);
        BookieServer bk = bs.get(bkIndex);
        bookieConf.setReadOnlyModeEnabled(true);
        bk.getBookie().getStateManager().doTransitionToReadOnlyMode();
        // bookieConf is the same object as bsConfs.get(bkIndex)
        bkc.waitForReadOnlyBookie(Bookie.getBookieAddress(bookieConf)).get(30, TimeUnit.SECONDS);

        // grace period for publishing the bk-ledger
        LOG.debug("Waiting for Auditor to finish ledger check.");
        waitForAuditToComplete();
        assertFalse("latch should not have completed", underReplicaLatch.await(1, TimeUnit.SECONDS));

        String shutdownBookie = shutdownBookie(bkIndex);

        // grace period for publishing the bk-ledger
        LOG.debug("Waiting for ledgers to be marked as under replicated");
        waitForAuditToComplete();
        assertTrue("Ledger was not reported as under replicated in time",
                underReplicaLatch.await(5, TimeUnit.SECONDS));
        Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList);
        assertEquals("Missed identifying under replicated ledgers", 1, urLedgerList.size());

        /*
         * Sample data format present in the under replicated ledger path
         *
         * {4=replica: "10.18.89.153:5002"}
         */
        assertTrue("Ledger is not marked as underreplicated:" + ledgerId, urLedgerList.contains(ledgerId));
        String data = urLedgerData.get(ledgerId);
        assertTrue("Bookie " + shutdownBookie + "is not listed in the ledger as missing replica :" + data,
                data.contains(shutdownBookie));
    }

    /**
     * Shared body of the delayed-audit tests: sets a lost-bookie recovery delay of
     * 5, shuts down a non-auditor bookie and checks that the ledger is reported
     * as under replicated only after the delay. Not annotated with {@code @Test};
     * the visible callers are the delayed-audit test methods.
     */
    public void testInnerDelayedAuditOfLostBookies() throws Exception {
        LedgerHandle lh1 = createAndAddEntriesToLedger();
        Long ledgerId = lh1.getId();
        LOG.debug("Created ledger : " + ledgerId);
        ledgerList.add(ledgerId);
        lh1.close();

        final CountDownLatch underReplicaLatch =
registerUrLedgerWatcher(ledgerList.size()); // wait for 5 seconds before starting the recovery work when a bookie fails urLedgerMgr.setLostBookieRecoveryDelay(5); // shutdown a non auditor bookie; choosing non-auditor to avoid another election String shutdownBookie = shutDownNonAuditorBookie(); LOG.debug("Waiting for ledgers to be marked as under replicated"); assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(4, TimeUnit.SECONDS)); assertEquals("under replicated ledgers identified when it was not expected", 0, urLedgerList.size()); // wait for another 5 seconds for the ledger to get reported as under replicated assertTrue("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS)); assertTrue("Ledger is not marked as underreplicated:" + ledgerId, urLedgerList.contains(ledgerId)); Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList); String data = urLedgerData.get(ledgerId); assertTrue("Bookie " + shutdownBookie + "is not listed in the ledger as missing replica :" + data, data.contains(shutdownBookie)); } /** * Test publishing of under replicated ledgers by the auditor * bookie is delayed if LostBookieRecoveryDelay option is set. */ @Test public void testDelayedAuditOfLostBookies() throws Exception { // wait for a second so that the initial periodic check finishes Thread.sleep(1000); testInnerDelayedAuditOfLostBookies(); } /** * Test publishing of under replicated ledgers by the auditor * bookie is delayed if LostBookieRecoveryDelay option is set * and it continues to be delayed even when periodic bookie check * is set to run every 2 secs. I.e. periodic bookie check doesn't * override the delay */ @Test public void testDelayedAuditWithPeriodicBookieCheck() throws Exception { // enable periodic bookie check on a cadence of every 2 seconds. 
// this requires us to stop the auditor/auditorElectors, set the // periodic check interval and restart the auditorElectors stopAuditorElectors(); baseConf.setAuditorPeriodicBookieCheckInterval(2); startAuditorElectors(); // wait for a second so that the initial periodic check finishes Thread.sleep(1000); // the delaying of audit should just work despite the fact // we have enabled periodic bookie check testInnerDelayedAuditOfLostBookies(); } @Test public void testRescheduleOfDelayedAuditOfLostBookiesToStartImmediately() throws Exception { // wait for a second so that the initial periodic check finishes Thread.sleep(1000); LedgerHandle lh1 = createAndAddEntriesToLedger(); Long ledgerId = lh1.getId(); LOG.debug("Created ledger : " + ledgerId); ledgerList.add(ledgerId); lh1.close(); final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size()); // wait for 50 seconds before starting the recovery work when a bookie fails urLedgerMgr.setLostBookieRecoveryDelay(50); // shutdown a non auditor bookie; choosing non-auditor to avoid another election String shutdownBookie = shutDownNonAuditorBookie(); LOG.debug("Waiting for ledgers to be marked as under replicated"); assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(4, TimeUnit.SECONDS)); assertEquals("under replicated ledgers identified when it was not expected", 0, urLedgerList.size()); // set lostBookieRecoveryDelay to 0, so that it triggers AuditTask immediately urLedgerMgr.setLostBookieRecoveryDelay(0); // wait for 1 second for the ledger to get reported as under replicated assertTrue("audit of lost bookie isn't delayed", underReplicaLatch.await(1, TimeUnit.SECONDS)); assertTrue("Ledger is not marked as underreplicated:" + ledgerId, urLedgerList.contains(ledgerId)); Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList); String data = urLedgerData.get(ledgerId); assertTrue("Bookie " + shutdownBookie + "is not listed in the ledger as missing replica :" + data, 
data.contains(shutdownBookie));
    }

    /**
     * Test that changing the lost-bookie recovery delay while an audit is pending
     * reschedules the audit with the new delay instead of running it at the old time.
     */
    @Test
    public void testRescheduleOfDelayedAuditOfLostBookiesToStartLater() throws Exception {
        // wait for a second so that the initial periodic check finishes
        Thread.sleep(1000);

        LedgerHandle lh1 = createAndAddEntriesToLedger();
        Long ledgerId = lh1.getId();
        LOG.debug("Created ledger : " + ledgerId);
        ledgerList.add(ledgerId);
        lh1.close();

        final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size());

        // wait for 3 seconds before starting the recovery work when a bookie fails
        urLedgerMgr.setLostBookieRecoveryDelay(3);

        // shutdown a non auditor bookie; choosing non-auditor to avoid another election
        String shutdownBookie = shutDownNonAuditorBookie();

        LOG.debug("Waiting for ledgers to be marked as under replicated");
        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS));
        assertEquals("under replicated ledgers identified when it was not expected", 0,
                urLedgerList.size());

        // set lostBookieRecoveryDelay to 4, so the pending AuditTask is rescheduled
        urLedgerMgr.setLostBookieRecoveryDelay(4);

        // since we changed the BookieRecoveryDelay period to 4, the audittask shouldn't have been executed
        LOG.debug("Waiting for ledgers to be marked as under replicated");
        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS));
        assertEquals("under replicated ledgers identified when it was not expected", 0,
                urLedgerList.size());

        // wait for 3 seconds (since we already waited for 2 secs) for the ledger to get reported as under replicated
        assertTrue("audit of lost bookie isn't delayed", underReplicaLatch.await(3, TimeUnit.SECONDS));
        assertTrue("Ledger is not marked as underreplicated:" + ledgerId,
                urLedgerList.contains(ledgerId));
        Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList);
        String data = urLedgerData.get(ledgerId);
        assertTrue("Bookie " + shutdownBookie + "is not listed in the ledger as missing replica :" + data,
data.contains(shutdownBookie));
    }

    /**
     * Test that re-setting the lost-bookie recovery delay triggers an audit even
     * when no audit task is pending.
     */
    @Test
    public void testTriggerAuditorWithNoPendingAuditTask() throws Exception {
        // wait for a second so that the initial periodic check finishes
        Thread.sleep(1000);
        int lostBookieRecoveryDelayConfValue = baseConf.getLostBookieRecoveryDelay();
        Auditor auditorBookiesAuditor = getAuditorBookiesAuditor();
        Future<?> auditTask = auditorBookiesAuditor.getAuditTask();
        int lostBookieRecoveryDelayBeforeChange = auditorBookiesAuditor.getLostBookieRecoveryDelayBeforeChange();
        Assert.assertNull("auditTask is supposed to be null", auditTask);
        Assert.assertEquals(
                "lostBookieRecoveryDelayBeforeChange of Auditor should be equal to BaseConf's lostBookieRecoveryDelay",
                lostBookieRecoveryDelayConfValue, lostBookieRecoveryDelayBeforeChange);

        @Cleanup("shutdown") OrderedScheduler scheduler = OrderedScheduler.newSchedulerBuilder()
                .name("test-scheduler")
                .numThreads(1)
                .build();
        @Cleanup MetadataClientDriver driver = MetadataDrivers
                .getClientDriver(URI.create(baseClientConf.getMetadataServiceUri()));
        driver.initialize(baseClientConf, scheduler, NullStatsLogger.INSTANCE, Optional.of(zkc));

        // there is no easy way to validate if the Auditor has executed Audit process (Auditor.startAudit),
        // without shuttingdown Bookie. To test if by resetting LostBookieRecoveryDelay it does Auditing
        // even when there is no pending AuditTask, following approach is needed.

        // Here we are creating few ledgers ledgermetadata with non-existing bookies as its ensemble.
        // When Auditor does audit it recognizes these ledgers as underreplicated and mark them as
        // under-replicated, since these bookies are not available.
int numofledgers = 5; Random rand = new Random(); for (int i = 0; i < numofledgers; i++) { LedgerMetadata metadata = new LedgerMetadata(3, 2, 2, DigestType.CRC32, "passwd".getBytes()); ArrayList<BookieSocketAddress> ensemble = new ArrayList<BookieSocketAddress>(); ensemble.add(new BookieSocketAddress("99.99.99.99:9999")); ensemble.add(new BookieSocketAddress("11.11.11.11:1111")); ensemble.add(new BookieSocketAddress("88.88.88.88:8888")); metadata.addEnsemble(0, ensemble); MutableInt ledgerCreateRC = new MutableInt(-1); CountDownLatch latch = new CountDownLatch(1); long ledgerId = (Math.abs(rand.nextLong())) % 100000000; try (LedgerManager lm = driver.getLedgerManagerFactory().newLedgerManager()) { lm.createLedgerMetadata(ledgerId, metadata, (rc, result) -> { ledgerCreateRC.setValue(rc); latch.countDown(); }); } Assert.assertTrue("Ledger creation should complete within 2 secs", latch.await(2000, TimeUnit.MILLISECONDS)); Assert.assertEquals("LedgerCreate should succeed and return OK rc value", BKException.Code.OK, ledgerCreateRC.getValue()); ledgerList.add(ledgerId); } final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size()); urLedgerMgr.setLostBookieRecoveryDelay(lostBookieRecoveryDelayBeforeChange); assertTrue("Audit should be triggered and created ledgers should be marked as underreplicated", underReplicaLatch.await(2, TimeUnit.SECONDS)); assertEquals("All the ledgers should be marked as underreplicated", ledgerList.size(), urLedgerList.size()); auditTask = auditorBookiesAuditor.getAuditTask(); Assert.assertEquals("auditTask is supposed to be null", null, auditTask); Assert.assertEquals( "lostBookieRecoveryDelayBeforeChange of Auditor should be equal to BaseConf's lostBookieRecoveryDelay", lostBookieRecoveryDelayBeforeChange, auditorBookiesAuditor.getLostBookieRecoveryDelayBeforeChange()); } @Test public void testTriggerAuditorWithPendingAuditTask() throws Exception { // wait for a second so that the initial periodic check finishes 
Thread.sleep(1000);

        Auditor auditorBookiesAuditor = getAuditorBookiesAuditor();
        LedgerHandle lh1 = createAndAddEntriesToLedger();
        Long ledgerId = lh1.getId();
        LOG.debug("Created ledger : " + ledgerId);
        ledgerList.add(ledgerId);
        lh1.close();

        final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size());

        int lostBookieRecoveryDelay = 5;
        // wait for 5 seconds before starting the recovery work when a bookie fails
        urLedgerMgr.setLostBookieRecoveryDelay(lostBookieRecoveryDelay);

        // shutdown a non auditor bookie; choosing non-auditor to avoid another election
        shutDownNonAuditorBookie();

        LOG.debug("Waiting for ledgers to be marked as under replicated");
        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS));
        assertEquals("under replicated ledgers identified when it was not expected", 0,
                urLedgerList.size());

        Future<?> auditTask = auditorBookiesAuditor.getAuditTask();
        Assert.assertNotNull("auditTask is not supposed to be null", auditTask);
        Assert.assertEquals("lostBookieRecoveryDelayBeforeChange of Auditor should be equal to what we set",
                lostBookieRecoveryDelay, auditorBookiesAuditor.getLostBookieRecoveryDelayBeforeChange());

        // set lostBookieRecoveryDelay to 5 (previous value), so that Auditor is triggered immediately
        urLedgerMgr.setLostBookieRecoveryDelay(lostBookieRecoveryDelay);
        assertTrue("audit of lost bookie shouldn't be delayed", underReplicaLatch.await(2, TimeUnit.SECONDS));
        assertEquals("all under replicated ledgers should be identified", ledgerList.size(),
                urLedgerList.size());

        // short pause before re-reading the auditor's state
        Thread.sleep(100);
        auditTask = auditorBookiesAuditor.getAuditTask();
        Assert.assertNull("auditTask is supposed to be null", auditTask);
        Assert.assertEquals(
                "lostBookieRecoveryDelayBeforeChange of Auditor should be equal to previously set value",
                lostBookieRecoveryDelay, auditorBookiesAuditor.getLostBookieRecoveryDelayBeforeChange());
    }

    /**
     * Test that setting the lost-bookie recovery delay to 0 triggers the pending
     * audit task immediately.
     */
    @Test
    public void
testTriggerAuditorBySettingDelayToZeroWithPendingAuditTask() throws Exception { // wait for a second so that the initial periodic check finishes Thread.sleep(1000); Auditor auditorBookiesAuditor = getAuditorBookiesAuditor(); LedgerHandle lh1 = createAndAddEntriesToLedger(); Long ledgerId = lh1.getId(); LOG.debug("Created ledger : " + ledgerId); ledgerList.add(ledgerId); lh1.close(); final CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size()); int lostBookieRecoveryDelay = 5; // wait for 5 seconds before starting the recovery work when a bookie fails urLedgerMgr.setLostBookieRecoveryDelay(lostBookieRecoveryDelay); // shutdown a non auditor bookie; choosing non-auditor to avoid another election String shutdownBookie = shutDownNonAuditorBookie(); LOG.debug("Waiting for ledgers to be marked as under replicated"); assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS)); assertEquals("under replicated ledgers identified when it was not expected", 0, urLedgerList.size()); Future<?> auditTask = auditorBookiesAuditor.getAuditTask(); Assert.assertNotEquals("auditTask is not supposed to be null", null, auditTask); Assert.assertEquals("lostBookieRecoveryDelayBeforeChange of Auditor should be equal to what we set", lostBookieRecoveryDelay, auditorBookiesAuditor.getLostBookieRecoveryDelayBeforeChange()); // set lostBookieRecoveryDelay to 0, so that Auditor is triggered immediately urLedgerMgr.setLostBookieRecoveryDelay(0); assertTrue("audit of lost bookie shouldn't be delayed", underReplicaLatch.await(1, TimeUnit.SECONDS)); assertEquals("all under replicated ledgers should be identified", ledgerList.size(), urLedgerList.size()); Thread.sleep(100); auditTask = auditorBookiesAuditor.getAuditTask(); assertEquals("auditTask is supposed to be null", null, auditTask); assertEquals("lostBookieRecoveryDelayBeforeChange of Auditor should be equal to previously set value", 0, 
auditorBookiesAuditor.getLostBookieRecoveryDelayBeforeChange());
    }

    /**
     * Test audit of bookies is delayed when one bookie is down. But when
     * another one goes down, the audit is started immediately.
     */
    @Test
    public void testDelayedAuditWithMultipleBookieFailures() throws Exception {
        // wait for the periodic bookie check to finish
        Thread.sleep(1000);

        // create a ledger with a bunch of entries
        LedgerHandle lh1 = createAndAddEntriesToLedger();
        Long ledgerId = lh1.getId();
        LOG.debug("Created ledger : " + ledgerId);
        ledgerList.add(ledgerId);
        lh1.close();

        CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size());

        // wait for 10 seconds before starting the recovery work when a bookie fails
        urLedgerMgr.setLostBookieRecoveryDelay(10);

        // shutdown a non auditor bookie to avoid an election
        String shutdownBookie1 = shutDownNonAuditorBookie();

        // wait for 3 seconds and there shouldn't be any under replicated ledgers
        // because we have delayed the start of audit by 10 seconds
        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(3, TimeUnit.SECONDS));
        assertEquals("under replicated ledgers identified when it was not expected", 0,
                urLedgerList.size());

        // Now shutdown the second non auditor bookie; We want to make sure that
        // the history about having delayed recovery remains. Hence we make sure
        // we bring down a non auditor bookie. This should cause the audit to take
        // place immediately and not wait for the remaining 7 seconds to elapse
        String shutdownBookie2 = shutDownNonAuditorBookie();

        // 2 second grace period for the ledgers to get reported as under replicated
        Thread.sleep(2000);

        // If the following checks pass, it means that audit happened
        // within 2 seconds of second bookie going down and it didn't
        // wait for 7 more seconds.
Hence the second bookie failure doesn't // delay the audit assertTrue("Ledger is not marked as underreplicated:" + ledgerId, urLedgerList.contains(ledgerId)); Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList); String data = urLedgerData.get(ledgerId); assertTrue( "Bookie " + shutdownBookie1 + shutdownBookie2 + " are not listed in the ledger as missing replicas :" + data, data.contains(shutdownBookie1) && data.contains(shutdownBookie2)); } /** * Test audit of bookies is delayed during rolling upgrade scenario: * a bookies goes down and comes up, the next bookie go down and up and so on. * At any time only one bookie is down. */ @Test public void testDelayedAuditWithRollingUpgrade() throws Exception { // wait for the periodic bookie check to finish Thread.sleep(1000); // create a ledger with a bunch of entries LedgerHandle lh1 = createAndAddEntriesToLedger(); Long ledgerId = lh1.getId(); LOG.debug("Created ledger : " + ledgerId); ledgerList.add(ledgerId); lh1.close(); CountDownLatch underReplicaLatch = registerUrLedgerWatcher(ledgerList.size()); // wait for 5 seconds before starting the recovery work when a bookie fails urLedgerMgr.setLostBookieRecoveryDelay(5); // shutdown a non auditor bookie to avoid an election int idx1 = getShutDownNonAuditorBookieIdx(""); ServerConfiguration conf1 = bsConfs.get(idx1); String shutdownBookie1 = shutdownBookie(idx1); // wait for 2 seconds and there shouldn't be any under replicated ledgers // because we have delayed the start of audit by 5 seconds assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS)); assertEquals("under replicated ledgers identified when it was not expected", 0, urLedgerList.size()); // restart the bookie we shut down above bs.add(startBookie(conf1)); // Now to simulate the rolling upgrade, bring down a bookie different from // the one we brought down/up above. 
String shutdownBookie2 = shutDownNonAuditorBookie(shutdownBookie1);

        // since the first bookie that was brought down/up has come up, there is only
        // one bookie down at this time. Hence the lost bookie check shouldn't start
        // immediately; it will start 5 seconds after the second bookie went down
        assertFalse("audit of lost bookie isn't delayed", underReplicaLatch.await(2, TimeUnit.SECONDS));
        assertEquals("under replicated ledgers identified when it was not expected", 0, urLedgerList.size());

        // wait for a total of 6 seconds(2+4) for the ledgers to get reported as under replicated
        Thread.sleep(4000);

        // If the following checks pass, it means that auditing happened
        // after lostBookieRecoveryDelay during rolling upgrade as expected
        assertTrue("Ledger is not marked as underreplicated:" + ledgerId, urLedgerList.contains(ledgerId));
        Map<Long, String> urLedgerData = getUrLedgerData(urLedgerList);
        String data = urLedgerData.get(ledgerId);
        assertTrue("Bookie " + shutdownBookie1 + "wrongly listed as missing the ledger: " + data,
                !data.contains(shutdownBookie1));
        assertTrue("Bookie " + shutdownBookie2 + " is not listed in the ledger as missing replicas :" + data,
                data.contains(shutdownBookie2));
        LOG.info("*****************Test Complete");
    }

    /**
     * Polls (every 100 ms, for up to 5 seconds) until the current auditor bookie exposes an
     * {@link Auditor}, then submits an audit task to it and waits up to 5 seconds for that
     * task to finish.
     *
     * @throws TimeoutException if no auditor became available within the polling window, or
     *         (from {@code Future.get}) if the submitted task did not finish in time
     */
    private void waitForAuditToComplete() throws Exception {
        long endTime = System.currentTimeMillis() + 5_000;
        while (System.currentTimeMillis() < endTime) {
            Auditor auditor = getAuditorBookiesAuditor();
            if (auditor != null) {
                Future<?> task = auditor.submitAuditTask();
                task.get(5, TimeUnit.SECONDS);
                return;
            }
            Thread.sleep(100);
        }
        throw new TimeoutException("Could not find an audit within 5 seconds");
    }

    /**
     * Wait for ledger to be underreplicated, and to be missing all replicas specified.
     *
     * @param ledgerId      ledger to look up in the underreplication manager
     * @param secondsToWait maximum number of one-second polling rounds
     * @param replicas      replicas that must all appear in the ledger's replica list
     * @return {@code true} as soon as every replica is listed, {@code false} if the
     *         polling rounds are exhausted first
     */
    private boolean waitForLedgerMissingReplicas(Long ledgerId, long secondsToWait, String... replicas)
            throws Exception {
        for (int i = 0; i < secondsToWait; i++) {
            try {
                UnderreplicatedLedgerFormat data = urLedgerMgr.getLedgerUnreplicationInfo(ledgerId);
                boolean all = true;
                for (String r : replicas) {
                    if (!data.getReplicaList().contains(r)) {
                        all = false;
                        break;
                    }
                }
                if (all) {
                    return true;
                }
            } catch (Exception ignored) {
                // Deliberately best-effort: the lookup may fail (e.g. the node may not
                // exist yet), in which case we simply retry on the next round.
            }
            Thread.sleep(1000);
        }
        return false;
    }

    /**
     * Registers an {@code exists} watch, backed by a fresh {@link ChildWatcher}, on the
     * underreplication znode of every ledger in {@code ledgerList}.
     *
     * @param count initial count of the returned latch
     * @return the latch that each watcher counts down when it is notified
     */
    private CountDownLatch registerUrLedgerWatcher(int count) throws KeeperException, InterruptedException {
        final CountDownLatch underReplicaLatch = new CountDownLatch(count);
        for (Long ledgerId : ledgerList) {
            Watcher urLedgerWatcher = new ChildWatcher(underReplicaLatch);
            String znode = ZkLedgerUnderreplicationManager.getUrLedgerZnode(underreplicatedPath, ledgerId);
            zkc.exists(znode, urLedgerWatcher);
        }
        return underReplicaLatch;
    }

    /**
     * Takes {@code ledgerIds.length} ledgers from the underreplication manager, asserts
     * that each one is among the expected ids, then marks it replicated and releases it.
     *
     * @param ledgerIds the ledger ids that are acceptable results
     */
    private void doLedgerRereplication(Long... ledgerIds) throws UnavailableException {
        // Build the lookup view once instead of once per iteration.
        final List<Long> expectedIds = Arrays.asList(ledgerIds);
        for (int i = 0; i < ledgerIds.length; i++) {
            long lid = urLedgerMgr.getLedgerToRereplicate();
            assertTrue("Received unexpected ledgerid", expectedIds.contains(lid));
            urLedgerMgr.markLedgerReplicated(lid);
            urLedgerMgr.releaseUnderreplicatedLedger(lid);
        }
    }

    /**
     * Kills the bookie at the given index of {@code bs}, and shuts down and unregisters
     * the auditor elector stored under that bookie's address.
     *
     * @param bkShutdownIndex index into {@code bs} of the bookie to kill
     * @return the string form of the killed bookie's local address
     */
    private String shutdownBookie(int bkShutdownIndex) throws Exception {
        BookieServer bkServer = bs.get(bkShutdownIndex);
        String bookieAddr = bkServer.getLocalAddress().toString();
        LOG.debug("Shutting down bookie:" + bookieAddr);
        killBookie(bkShutdownIndex);
        auditorElectors.get(bookieAddr).shutdown();
        auditorElectors.remove(bookieAddr);
        return bookieAddr;
    }

    /**
     * Creates a ledger and writes 100 entries to it.
     *
     * @return the handle of the newly created ledger
     */
    private LedgerHandle createAndAddEntriesToLedger() throws BKException, InterruptedException {
        int numEntriesToWrite = 100;
        // Create a ledger
        LedgerHandle lh = bkc.createLedger(digestType, ledgerPassword);
        LOG.info("Ledger ID: " + lh.getId());
        addEntry(numEntriesToWrite, lh);
        return lh;
    }

    /**
     * Asynchronously adds {@code numEntriesToWrite} 4-byte random entries to the ledger and
     * blocks until every add callback has fired.
     *
     * @param numEntriesToWrite number of entries to add
     * @param lh                ledger to write to
     * @throws BKException built from the first non-OK return code reported by a callback
     */
    private void addEntry(int numEntriesToWrite, LedgerHandle lh) throws InterruptedException, BKException {
        final CountDownLatch completeLatch = new CountDownLatch(numEntriesToWrite);
        // Holds OK until the first failure; compareAndSet keeps only that first error code.
        final AtomicInteger rc = new AtomicInteger(BKException.Code.OK);

        for (int i = 0; i < numEntriesToWrite; i++) {
            ByteBuffer entry = ByteBuffer.allocate(4);
            entry.putInt(rng.nextInt(Integer.MAX_VALUE));
            entry.position(0);
            lh.asyncAddEntry(entry.array(), new AddCallback() {
                @Override
                public void addComplete(int rc2, LedgerHandle lh, long entryId, Object ctx) {
                    rc.compareAndSet(BKException.Code.OK, rc2);
                    completeLatch.countDown();
                }
            }, null);
        }
        completeLatch.await();
        if (rc.get() != BKException.Code.OK) {
            throw BKException.create(rc.get());
        }
    }

    /**
     * Reads the content of the underreplication znode of each given ledger.
     *
     * @param urLedgerList ledger ids whose znodes are read
     * @return map from ledger id to the znode content as text
     */
    private Map<Long, String> getUrLedgerData(Set<Long> urLedgerList)
            throws KeeperException, InterruptedException {
        Map<Long, String> urLedgerData = new HashMap<>();
        for (Long ledgerId : urLedgerList) {
            String znode = ZkLedgerUnderreplicationManager.getUrLedgerZnode(underreplicatedPath, ledgerId);
            byte[] data = zkc.getData(znode, false, null);
            // Decode with an explicit charset so the result does not depend on the
            // JVM's platform default encoding.
            urLedgerData.put(ledgerId, new String(data, java.nio.charset.StandardCharsets.UTF_8));
        }
        return urLedgerData;
    }

    /**
     * Watcher that records which ledgers of {@code ledgerList} a notification refers to
     * (by adding them to {@code urLedgerList}) and counts down the supplied latch.
     */
    private class ChildWatcher implements Watcher {
        private final CountDownLatch underReplicaLatch;

        public ChildWatcher(CountDownLatch underReplicaLatch) {
            this.underReplicaLatch = underReplicaLatch;
        }

        @Override
        public void process(WatchedEvent event) {
            final String path = event.getPath();
            LOG.info("Received notification for the ledger path : " + path);
            if (path == null) {
                // Session-state events carry no path. They say nothing about a ledger,
                // so ignore them instead of failing with a NullPointerException below.
                return;
            }
            for (Long ledgerId : ledgerList) {
                // NOTE(review): this is a substring match, so a ledger id that is a
                // substring of another id's path would also match - confirm acceptable.
                if (path.contains(ledgerId + "")) {
                    urLedgerList.add(ledgerId);
                }
            }
            LOG.debug("Count down and waiting for next notification");
            // count down and waiting for next notification
            underReplicaLatch.countDown();
        }
    }

    /**
     * Finds the bookie server currently acting as auditor by matching each server's port
     * against the data stored at {@code electionPath}.
     *
     * @return the single bookie server whose port appears in the election data
     */
    private BookieServer getAuditorBookie() throws Exception {
        List<BookieServer> auditors = new LinkedList<BookieServer>();
        byte[] data = zkc.getData(electionPath, false, null);
        assertNotNull("Auditor election failed", data);
        // Decode once, with an explicit charset, rather than once per bookie with the
        // platform default.
        final String electionData = new String(data, java.nio.charset.StandardCharsets.UTF_8);
        for (BookieServer bks : bs) {
            if (electionData.contains(bks.getLocalAddress().getPort() + "")) {
                auditors.add(bks);
            }
        }
        assertEquals("Multiple Bookies acting as Auditor!", 1, auditors.size());
        return auditors.get(0);
    }

    /**
     * Returns the {@code auditor} field of the elector registered for the current
     * auditor bookie; callers in this class treat a {@code null} result as "not ready yet".
     */
    private Auditor getAuditorBookiesAuditor() throws Exception {
        BookieServer auditorBookieServer = getAuditorBookie();
        String bookieAddr = auditorBookieServer.getLocalAddress().toString();
        return auditorElectors.get(bookieAddr).auditor;
    }

    /**
     * Shuts down the bookie that follows the auditor in {@code bs}, or the one that
     * precedes it when the auditor is the last element.
     *
     * @return the string form of the shut-down bookie's local address
     */
    private String shutDownNonAuditorBookie() throws Exception {
        // shutdown bookie which is not an auditor
        int indexOf = bs.indexOf(getAuditorBookie());
        int bkIndexDownBookie;
        if (indexOf < bs.size() - 1) {
            bkIndexDownBookie = indexOf + 1;
        } else {
            bkIndexDownBookie = indexOf - 1;
        }
        return shutdownBookie(bkIndexDownBookie);
    }

    /**
     * Picks the index of the first bookie in {@code bs} that is neither the auditor nor
     * the bookie whose address equals {@code exclude}. Nothing is shut down here.
     *
     * @param exclude address (string form) of a bookie that must not be chosen
     * @return the chosen index; falls back to 0 when no bookie qualifies
     */
    private int getShutDownNonAuditorBookieIdx(String exclude) throws Exception {
        // select a bookie which is not the auditor and not the excluded one
        int indexOf = bs.indexOf(getAuditorBookie());
        int bkIndexDownBookie = 0;
        for (int i = 0; i < bs.size(); i++) {
            if (i == indexOf || bs.get(i).getLocalAddress().toString().equals(exclude)) {
                continue;
            }
            bkIndexDownBookie = i;
            break;
        }
        return bkIndexDownBookie;
    }

    /**
     * Shuts down the first bookie that is neither the auditor nor {@code exclude}.
     *
     * @param exclude address (string form) of a bookie that must not be shut down
     * @return the string form of the shut-down bookie's local address
     */
    private String shutDownNonAuditorBookie(String exclude) throws Exception {
        return shutdownBookie(getShutDownNonAuditorBookieIdx(exclude));
    }
}