org.apache.hadoop.hbase.master.TestMasterFailover.java Source code

Introduction

Here is the source code for org.apache.hadoop.hbase.master.TestMasterFailover.java, an HBase test that exercises HMaster failover: with mocked region-in-transition states in ZooKeeper, with a dead region server, and in the simple kill-the-active-master case.
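
Before diving into the full listing, here is a minimal sketch (not part of the original file) of the core failover pattern that testSimpleMasterFailover below exercises: start a mini cluster with backup masters, stop the active master, and wait for a backup to take over. It only uses HBaseTestingUtility and MiniHBaseCluster calls that appear in the source, and it assumes the same imports as the listing; the method name is illustrative.

@Test(timeout = 240000)
public void sketchSimpleFailover() throws Exception {
    // Start a mini cluster with 2 masters and 3 region servers.
    HBaseTestingUtility util = new HBaseTestingUtility();
    util.startMiniCluster(2, 3);
    MiniHBaseCluster cluster = util.getHBaseCluster();
    assertTrue(cluster.waitForActiveAndReadyMaster());

    // Find and stop the currently active master.
    List<MasterThread> masters = cluster.getMasterThreads();
    for (int i = 0; i < masters.size(); i++) {
        if (masters.get(i).getMaster().isActiveMaster()) {
            cluster.stopMaster(i, false);
            cluster.waitOnMaster(i);
            break;
        }
    }

    // A backup master should take over and finish initialization.
    assertTrue(cluster.waitForActiveAndReadyMaster());
    util.shutdownMiniCluster();
}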

Source

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.LargeTests;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.RegionTransition;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableStateManager;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.RegionMergeTransaction;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTableStateManager;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.data.Stat;
import org.junit.Test;
import org.junit.experimental.categories.Category;

@Category(LargeTests.class)
public class TestMasterFailover {
    private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);

    /**
     * Complex test of master failover that tests as many permutations of the
     * different possible states that regions in transition could be in within ZK.
     * <p>
     * This tests the proper handling of these states by the failed-over master
     * and includes a thorough testing of the timeout code as well.
     * <p>
     * Starts with a single master and three regionservers.
     * <p>
     * Creates two tables, enabledTable and disabledTable, each containing 5
     * regions.  The disabledTable is then disabled.
     * <p>
     * After reaching steady-state, the master is killed.  We then mock several
     * states in ZK.
     * <p>
     * After mocking them, we will startup a new master which should become the
     * active master and also detect that it is a failover.  The primary test
     * passing condition will be that all regions of the enabled table are
     * assigned and all the regions of the disabled table are not assigned.
     * <p>
     * The different scenarios to be tested are below:
     * <p>
     * <b>ZK State:  OFFLINE</b>
     * <p>A node can get into OFFLINE state if</p>
     * <ul>
     * <li>An RS fails to open a region, so it reverts the state back to OFFLINE
     * <li>The Master is assigning the region to a RS before it sends RPC
     * </ul>
     * <p>We will mock the scenarios</p>
     * <ul>
     * <li>Master has assigned an enabled region but RS failed so a region is
     *     not assigned anywhere and is sitting in ZK as OFFLINE</li>
     * <li>(This single mocked scenario should cover both of the causes above)</li>
     * </ul>
     * <p>
     * <b>ZK State:  CLOSING</b>
     * <p>A node can get into CLOSING state if</p>
     * <ul>
     * <li>An RS has begun to close a region
     * </ul>
     * <p>We will mock the scenarios</p>
     * <ul>
     * <li>Region of enabled table was being closed but did not complete
     * <li>Region of disabled table was being closed but did not complete
     * </ul>
     * <p>
     * <b>ZK State:  CLOSED</b>
     * <p>A node can get into CLOSED state if</p>
     * <ul>
     * <li>An RS has completed closing a region but not acknowledged by master yet
     * </ul>
     * <p>We will mock the scenarios</p>
     * <ul>
     * <li>Region of a table that should be enabled was closed on an RS
     * <li>Region of a table that should be disabled was closed on an RS
     * </ul>
     * <p>
     * <b>ZK State:  OPENING</b>
     * <p>A node can get into OPENING state if</p>
     * <ul>
     * <li>An RS has begun to open a region
     * </ul>
     * <p>We will mock the scenarios</p>
     * <ul>
     * <li>RS was opening a region of enabled table but never finishes
     * </ul>
     * <p>
     * <b>ZK State:  OPENED</b>
     * <p>A node can get into OPENED state if</p>
     * <ul>
     * <li>An RS has finished opening a region but not acknowledged by master yet
     * </ul>
     * <p>We will mock the scenarios</p>
     * <ul>
     * <li>Region of a table that should be enabled was opened on an RS
     * <li>Region of a table that should be disabled was opened on an RS
     * </ul>
     * @throws Exception
     */
    @Test(timeout = 240000)
    public void testMasterFailoverWithMockedRIT() throws Exception {

        final int NUM_MASTERS = 1;
        final int NUM_RS = 3;

        // Create config to use for this cluster
        Configuration conf = HBaseConfiguration.create();

        // Start the cluster
        HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
        TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
        MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
        log("Cluster started");

        // Create a ZKW to use in the test
        ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);

        // get all the master threads
        List<MasterThread> masterThreads = cluster.getMasterThreads();
        assertEquals(1, masterThreads.size());

        // only one master thread, let's wait for it to be initialized
        assertTrue(cluster.waitForActiveAndReadyMaster());
        HMaster master = masterThreads.get(0).getMaster();
        assertTrue(master.isActiveMaster());
        assertTrue(master.isInitialized());

        // disable load balancing on this master
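        // With the balancer off, regions stay wherever the test assigns them, so the
        // mocked ZK states created below keep matching the actual region locations.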
        master.balanceSwitch(false);

        // create two tables in META, each with 10 regions
        byte[] FAMILY = Bytes.toBytes("family");
        byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
                Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"), Bytes.toBytes("fff"),
                Bytes.toBytes("ggg"), Bytes.toBytes("hhh"), Bytes.toBytes("iii"), Bytes.toBytes("jjj") };

        byte[] enabledTable = Bytes.toBytes("enabledTable");
        HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
        htdEnabled.addFamily(new HColumnDescriptor(FAMILY));

        FileSystem filesystem = FileSystem.get(conf);
        Path rootdir = FSUtils.getRootDir(conf);
        FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
        // Write the .tableinfo
        fstd.createTableDescriptor(htdEnabled);

        HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(), null, null);
        createRegion(hriEnabled, rootdir, conf, htdEnabled);

        List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(TEST_UTIL.getConfiguration(),
                htdEnabled, SPLIT_KEYS);

        TableName disabledTable = TableName.valueOf("disabledTable");
        HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
        htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
        // Write the .tableinfo
        fstd.createTableDescriptor(htdDisabled);
        HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
        createRegion(hriDisabled, rootdir, conf, htdDisabled);
        List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(TEST_UTIL.getConfiguration(),
                htdDisabled, SPLIT_KEYS);

        TableName tableWithMergingRegions = TableName.valueOf("tableWithMergingRegions");
        TEST_UTIL.createTable(tableWithMergingRegions, FAMILY, new byte[][] { Bytes.toBytes("m") });

        log("Regions in hbase:meta and namespace have been created");

        // at this point we only expect 4 regions to be assigned out
        // (hbase:meta, the namespace region, plus the 2 regions of tableWithMergingRegions)
        assertEquals(4, cluster.countServedRegions());

        // Move merging regions to the same region server
        AssignmentManager am = master.getAssignmentManager();
        RegionStates regionStates = am.getRegionStates();
        List<HRegionInfo> mergingRegions = regionStates.getRegionsOfTable(tableWithMergingRegions);
        assertEquals(2, mergingRegions.size());
        HRegionInfo a = mergingRegions.get(0);
        HRegionInfo b = mergingRegions.get(1);
        HRegionInfo newRegion = RegionMergeTransaction.getMergedRegionInfo(a, b);
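        // (newRegion is only used below to fake a MERGING znode in ZK; the merge itself never runs)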
        ServerName mergingServer = regionStates.getRegionServerOfRegion(a);
        ServerName serverB = regionStates.getRegionServerOfRegion(b);
        if (!serverB.equals(mergingServer)) {
            RegionPlan plan = new RegionPlan(b, serverB, mergingServer);
            am.balance(plan);
            assertTrue(am.waitForAssignment(b));
        }

        // Let's just assign everything to first RS
        HRegionServer hrs = cluster.getRegionServer(0);
        ServerName serverName = hrs.getServerName();
        HRegionInfo closingRegion = enabledRegions.remove(0);
        // we'll need some regions to already be assigned out properly on live RS
        List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
        enabledAndAssignedRegions.add(enabledRegions.remove(0));
        enabledAndAssignedRegions.add(enabledRegions.remove(0));
        enabledAndAssignedRegions.add(closingRegion);

        List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
        disabledAndAssignedRegions.add(disabledRegions.remove(0));
        disabledAndAssignedRegions.add(disabledRegions.remove(0));

        // now actually assign them
        for (HRegionInfo hri : enabledAndAssignedRegions) {
            master.assignmentManager.regionPlans.put(hri.getEncodedName(), new RegionPlan(hri, null, serverName));
            master.assignRegion(hri);
        }
        for (HRegionInfo hri : disabledAndAssignedRegions) {
            master.assignmentManager.regionPlans.put(hri.getEncodedName(), new RegionPlan(hri, null, serverName));
            master.assignRegion(hri);
        }

        // wait for no more RIT
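        // (ZKAssign.blockUntilNoRIT blocks until the ZK region-in-transition directory
        // has no child znodes left, i.e. all pending assignments have been processed)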
        log("Waiting for assignment to finish");
        ZKAssign.blockUntilNoRIT(zkw);
        log("Assignment completed");

        // Stop the master
        log("Aborting master");
        cluster.abortMaster(0);
        cluster.waitOnMaster(0);
        log("Master has aborted");

        /*
         * Now, let's start mocking up some weird states as described in the method
         * javadoc.
         */

        // Master is down, so is the meta. We need to assign it somewhere
        // so that regions can be assigned during the mocking phase.
        ZKAssign.createNodeOffline(zkw, HRegionInfo.FIRST_META_REGIONINFO, hrs.getServerName());
        ProtobufUtil.openRegion(hrs.getRSRpcServices(), hrs.getServerName(), HRegionInfo.FIRST_META_REGIONINFO);
        while (true) {
            ServerName sn = MetaRegionTracker.getMetaRegionLocation(zkw);
            if (sn != null && sn.equals(hrs.getServerName())) {
                break;
            }
            Thread.sleep(100);
        }

        List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
        List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();

        log("Beginning to mock scenarios");

        // Disable the disabledTable in ZK
        TableStateManager zktable = new ZKTableStateManager(zkw);
        zktable.setTableState(disabledTable, ZooKeeperProtos.Table.State.DISABLED);
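        // Writing the DISABLED state straight into ZK is enough here: the failed-over master
        // reads table states from ZK and must leave this table's regions unassigned.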

        /*
         *  ZK = OFFLINE
         */

        // Region that should be assigned but is not and is in ZK as OFFLINE
        // Cause: This can happen if the master crashed after creating the znode but before sending the
        //  request to the region server
        HRegionInfo region = enabledRegions.remove(0);
        regionsThatShouldBeOnline.add(region);
        ZKAssign.createNodeOffline(zkw, region, serverName);

        /*
         * ZK = CLOSING
         */
        // Cause: Same as offline.
        regionsThatShouldBeOnline.add(closingRegion);
        ZKAssign.createNodeClosing(zkw, closingRegion, serverName);

        /*
         * ZK = CLOSED
         */

        // Region of enabled table closed but not ack
        // Cause: Master was down while the region server updated the ZK status.
        region = enabledRegions.remove(0);
        regionsThatShouldBeOnline.add(region);
        int version = ZKAssign.createNodeClosing(zkw, region, serverName);
        ZKAssign.transitionNodeClosed(zkw, region, serverName, version);

        // Region of disabled table closed but not ack
        region = disabledRegions.remove(0);
        regionsThatShouldBeOffline.add(region);
        version = ZKAssign.createNodeClosing(zkw, region, serverName);
        ZKAssign.transitionNodeClosed(zkw, region, serverName, version);

        /*
         * ZK = OPENED
         */

        // Region of enabled table was opened on RS
        // Cause: Master went down before it could process the RS_ZK_REGION_OPENED transition
        region = enabledRegions.remove(0);
        regionsThatShouldBeOnline.add(region);
        ZKAssign.createNodeOffline(zkw, region, serverName);
        ProtobufUtil.openRegion(hrs.getRSRpcServices(), hrs.getServerName(), region);
        while (true) {
            byte[] bytes = ZKAssign.getData(zkw, region.getEncodedName());
            RegionTransition rt = RegionTransition.parseFrom(bytes);
            if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
                break;
            }
            Thread.sleep(100);
        }

        // Region of disabled table was opened on RS
        // Cause: Master failed while updating the status for this region server.
        region = disabledRegions.remove(0);
        regionsThatShouldBeOffline.add(region);
        ZKAssign.createNodeOffline(zkw, region, serverName);
        ProtobufUtil.openRegion(hrs.getRSRpcServices(), hrs.getServerName(), region);
        while (true) {
            byte[] bytes = ZKAssign.getData(zkw, region.getEncodedName());
            RegionTransition rt = RegionTransition.parseFrom(bytes);
            if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
                break;
            }
            Thread.sleep(100);
        }

        /*
         * ZK = MERGING
         */

        // Regions of the table with merging regions
        // Cause: Master was down while the merge was going on
        RegionMergeTransaction.createNodeMerging(zkw, newRegion, mergingServer, a, b);

        /*
         * ZK = NONE
         */

        /*
         * DONE MOCKING
         */

        log("Done mocking data up in ZK");

        // Start up a new master
        log("Starting up a new master");
        master = cluster.startMaster().getMaster();
        log("Waiting for master to be ready");
        cluster.waitForActiveAndReadyMaster();
        log("Master is ready");

        // Get new region states since master restarted
        regionStates = master.getAssignmentManager().getRegionStates();
        // Merging region should remain merging
        assertTrue(regionStates.isRegionInState(a, State.MERGING));
        assertTrue(regionStates.isRegionInState(b, State.MERGING));
        assertTrue(regionStates.isRegionInState(newRegion, State.MERGING_NEW));
        // Now remove the faked merging znode, merging regions should be
        // offlined automatically, otherwise it is a bug in AM.
        ZKAssign.deleteNodeFailSilent(zkw, newRegion);

        // Failover should be completed, now wait for no RIT
        log("Waiting for no more RIT");
        ZKAssign.blockUntilNoRIT(zkw);
        log("No more RIT in ZK, now doing final test verification");

        // Grab all the regions that are online across RSs
        Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
        for (JVMClusterUtil.RegionServerThread rst : cluster.getRegionServerThreads()) {
            onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rst.getRegionServer().getRSRpcServices()));
        }

        // Now, everything that should be online should be online
        for (HRegionInfo hri : regionsThatShouldBeOnline) {
            assertTrue(onlineRegions.contains(hri));
        }

        // Everything that should be offline should not be online
        for (HRegionInfo hri : regionsThatShouldBeOffline) {
            if (onlineRegions.contains(hri)) {
                LOG.debug(hri);
            }
            assertFalse(onlineRegions.contains(hri));
        }

        log("Done with verification, all passed, shutting down cluster");

        // Done, shutdown the cluster
        TEST_UTIL.shutdownMiniCluster();
    }

    /**
     * Complex test of master failover that tests as many permutations of the
     * different possible states that regions in transition could be in within ZK
     * pointing to an RS that has died while no master is around to process it.
     * <p>
     * This tests the proper handling of these states by the failed-over master
     * and includes a thorough testing of the timeout code as well.
     * <p>
     * Starts with a single master and two regionservers.
     * <p>
     * Creates two tables, enabledTable and disabledTable, each containing 5
     * regions.  The disabledTable is then disabled.
     * <p>
     * After reaching steady-state, the master is killed.  We then mock several
     * states in ZK.  And one of the RS will be killed.
     * <p>
     * After mocking them and killing an RS, we will startup a new master which
     * should become the active master and also detect that it is a failover.  The
     * primary test passing condition will be that all regions of the enabled
     * table are assigned and all the regions of the disabled table are not
     * assigned.
     * <p>
     * The different scenarios to be tested are below:
     * <p>
     * <b>ZK State:  CLOSING</b>
     * <p>A node can get into CLOSING state if</p>
     * <ul>
     * <li>An RS has begun to close a region
     * </ul>
     * <p>We will mock the scenarios</p>
     * <ul>
     * <li>Region was being closed but the RS died before finishing the close
     * </ul>
     * <b>ZK State:  OPENED</b>
     * <p>A node can get into OPENED state if</p>
     * <ul>
     * <li>An RS has finished opening a region but not acknowledged by master yet
     * </ul>
     * <p>We will mock the scenarios</p>
     * <ul>
     * <li>Region of a table that should be enabled was opened by a now-dead RS
     * <li>Region of a table that should be disabled was opened by a now-dead RS
     * </ul>
     * <p>
     * <b>ZK State:  NONE</b>
     * <p>A region could not have a transition node if</p>
     * <ul>
     * <li>The server hosting the region died and no master processed it
     * </ul>
     * <p>We will mock the scenarios</p>
     * <ul>
     * <li>Region of enabled table was on a dead RS that was not yet processed
     * <li>Region of disabled table was on a dead RS that was not yet processed
     * </ul>
     * @throws Exception
     */
    @Test(timeout = 180000)
    public void testMasterFailoverWithMockedRITOnDeadRS() throws Exception {

        final int NUM_MASTERS = 1;
        final int NUM_RS = 2;

        // Create and start the cluster
        HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
        Configuration conf = TEST_UTIL.getConfiguration();

        conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
        conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
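        // Let a (re)starting master finish initialization once a single region server has
        // checked in; the second RS is hard-killed later, so the failed-over master must
        // not block waiting for two servers to report.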
        TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
        MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
        log("Cluster started");

        // Create a ZKW to use in the test
        ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(), "unittest", new Abortable() {

            @Override
            public void abort(String why, Throwable e) {
                LOG.error("Fatal ZK Error: " + why, e);
                org.junit.Assert.fail("Fatal ZK error");
            }

            @Override
            public boolean isAborted() {
                return false;
            }

        });

        // get all the master threads
        List<MasterThread> masterThreads = cluster.getMasterThreads();
        assertEquals(1, masterThreads.size());

        // only one master thread, let's wait for it to be initialized
        assertTrue(cluster.waitForActiveAndReadyMaster());
        HMaster master = masterThreads.get(0).getMaster();
        assertTrue(master.isActiveMaster());
        assertTrue(master.isInitialized());

        // disable load balancing on this master
        master.balanceSwitch(false);

        // create two tables in META, each with 30 regions
        byte[] FAMILY = Bytes.toBytes("family");
        byte[][] SPLIT_KEYS = TEST_UTIL.getRegionSplitStartKeys(Bytes.toBytes("aaa"), Bytes.toBytes("zzz"), 30);
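        // getRegionSplitStartKeys spreads 30 region start keys between "aaa" and "zzz",
        // one per region of each test table.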

        byte[] enabledTable = Bytes.toBytes("enabledTable");
        HTableDescriptor htdEnabled = new HTableDescriptor(TableName.valueOf(enabledTable));
        htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
        FileSystem filesystem = FileSystem.get(conf);
        Path rootdir = FSUtils.getRootDir(conf);
        FSTableDescriptors fstd = new FSTableDescriptors(filesystem, rootdir);
        // Write the .tableinfo
        fstd.createTableDescriptor(htdEnabled);
        HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getTableName(), null, null);
        createRegion(hriEnabled, rootdir, conf, htdEnabled);

        List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(TEST_UTIL.getConfiguration(),
                htdEnabled, SPLIT_KEYS);

        TableName disabledTable = TableName.valueOf("disabledTable");
        HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
        htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
        // Write the .tableinfo
        fstd.createTableDescriptor(htdDisabled);
        HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getTableName(), null, null);
        createRegion(hriDisabled, rootdir, conf, htdDisabled);

        List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(TEST_UTIL.getConfiguration(),
                htdDisabled, SPLIT_KEYS);

        log("Regions in hbase:meta and Namespace have been created");

        // at this point we only expect 2 regions to be assigned out (catalogs and namespace  )
        assertEquals(2, cluster.countServedRegions());

        // The first RS will stay online
        List<RegionServerThread> regionservers = cluster.getRegionServerThreads();
        HRegionServer hrs = regionservers.get(0).getRegionServer();

        // The second RS is going to be hard-killed
        RegionServerThread hrsDeadThread = regionservers.get(1);
        HRegionServer hrsDead = hrsDeadThread.getRegionServer();
        ServerName deadServerName = hrsDead.getServerName();

        // we'll need some regions to already be assigned out properly on live RS
        List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
        enabledAndAssignedRegions.addAll(enabledRegions.subList(0, 6));
        enabledRegions.removeAll(enabledAndAssignedRegions);
        List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
        disabledAndAssignedRegions.addAll(disabledRegions.subList(0, 6));
        disabledRegions.removeAll(disabledAndAssignedRegions);

        // now actually assign them
        for (HRegionInfo hri : enabledAndAssignedRegions) {
            master.assignmentManager.regionPlans.put(hri.getEncodedName(),
                    new RegionPlan(hri, null, hrs.getServerName()));
            master.assignRegion(hri);
        }
        for (HRegionInfo hri : disabledAndAssignedRegions) {
            master.assignmentManager.regionPlans.put(hri.getEncodedName(),
                    new RegionPlan(hri, null, hrs.getServerName()));
            master.assignRegion(hri);
        }

        log("Waiting for assignment to finish");
        ZKAssign.blockUntilNoRIT(zkw);
        master.assignmentManager.waitUntilNoRegionsInTransition(60000);
        log("Assignment completed");

        assertTrue(" Table must be enabled.", master.getAssignmentManager().getTableStateManager()
                .isTableState(TableName.valueOf("enabledTable"), ZooKeeperProtos.Table.State.ENABLED));
        // we also need regions assigned out on the dead server
        List<HRegionInfo> enabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
        enabledAndOnDeadRegions.addAll(enabledRegions.subList(0, 6));
        enabledRegions.removeAll(enabledAndOnDeadRegions);
        List<HRegionInfo> disabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
        disabledAndOnDeadRegions.addAll(disabledRegions.subList(0, 6));
        disabledRegions.removeAll(disabledAndOnDeadRegions);

        // set region plan to server to be killed and trigger assign
        for (HRegionInfo hri : enabledAndOnDeadRegions) {
            master.assignmentManager.regionPlans.put(hri.getEncodedName(),
                    new RegionPlan(hri, null, deadServerName));
            master.assignRegion(hri);
        }
        for (HRegionInfo hri : disabledAndOnDeadRegions) {
            master.assignmentManager.regionPlans.put(hri.getEncodedName(),
                    new RegionPlan(hri, null, deadServerName));
            master.assignRegion(hri);
        }

        // wait for no more RIT
        log("Waiting for assignment to finish");
        ZKAssign.blockUntilNoRIT(zkw);
        master.assignmentManager.waitUntilNoRegionsInTransition(60000);
        log("Assignment completed");

        // master.assignRegion(hri) may not honor the region plan, so trim each list down
        // to the regions that actually ended up on the expected region server
        verifyRegionLocation(hrs, enabledAndAssignedRegions);
        verifyRegionLocation(hrs, disabledAndAssignedRegions);
        verifyRegionLocation(hrsDead, enabledAndOnDeadRegions);
        verifyRegionLocation(hrsDead, disabledAndOnDeadRegions);

        assertTrue(" Didn't get enough regions of enabledTalbe on live rs.", enabledAndAssignedRegions.size() >= 2);
        assertTrue(" Didn't get enough regions of disalbedTable on live rs.",
                disabledAndAssignedRegions.size() >= 2);
        assertTrue(" Didn't get enough regions of enabledTalbe on dead rs.", enabledAndOnDeadRegions.size() >= 2);
        assertTrue(" Didn't get enough regions of disalbedTable on dead rs.", disabledAndOnDeadRegions.size() >= 2);

        // Stop the master
        log("Aborting master");
        cluster.abortMaster(0);
        cluster.waitOnMaster(0);
        log("Master has aborted");

        /*
         * Now, let's start mocking up some weird states as described in the method
         * javadoc.
         */

        // Master is down, so is the meta. We need to assign it somewhere
        // so that regions can be assigned during the mocking phase.
        ZKAssign.createNodeOffline(zkw, HRegionInfo.FIRST_META_REGIONINFO, hrs.getServerName());
        ProtobufUtil.openRegion(hrs.getRSRpcServices(), hrs.getServerName(), HRegionInfo.FIRST_META_REGIONINFO);
        while (true) {
            ServerName sn = MetaRegionTracker.getMetaRegionLocation(zkw);
            if (sn != null && sn.equals(hrs.getServerName())) {
                break;
            }
            Thread.sleep(100);
        }

        List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
        List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();

        log("Beginning to mock scenarios");

        // Disable the disabledTable in ZK
        TableStateManager zktable = new ZKTableStateManager(zkw);
        zktable.setTableState(disabledTable, ZooKeeperProtos.Table.State.DISABLED);

        assertTrue(" The enabled table should be identified on master fail over.",
                zktable.isTableState(TableName.valueOf("enabledTable"), ZooKeeperProtos.Table.State.ENABLED));

        /*
         * ZK = CLOSING
         */

        // Region of enabled table being closed on dead RS but not finished
        HRegionInfo region = enabledAndOnDeadRegions.remove(0);
        regionsThatShouldBeOnline.add(region);
        ZKAssign.createNodeClosing(zkw, region, deadServerName);
        LOG.debug("\n\nRegion of enabled table was CLOSING on dead RS\n" + region + "\n\n");

        // Region of disabled table being closed on dead RS but not finished
        region = disabledAndOnDeadRegions.remove(0);
        regionsThatShouldBeOffline.add(region);
        ZKAssign.createNodeClosing(zkw, region, deadServerName);
        LOG.debug("\n\nRegion of disabled table was CLOSING on dead RS\n" + region + "\n\n");

        /*
         * ZK = CLOSED
         */

        // Region of enabled on dead server gets closed but not ack'd by master
        region = enabledAndOnDeadRegions.remove(0);
        regionsThatShouldBeOnline.add(region);
        int version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
        ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
        LOG.debug("\n\nRegion of enabled table was CLOSED on dead RS\n" + region + "\n\n");

        // Region of disabled on dead server gets closed but not ack'd by master
        region = disabledAndOnDeadRegions.remove(0);
        regionsThatShouldBeOffline.add(region);
        version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
        ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
        LOG.debug("\n\nRegion of disabled table was CLOSED on dead RS\n" + region + "\n\n");

        /*
         * ZK = OPENING
         */

        // RS was opening a region of enabled table then died
        region = enabledRegions.remove(0);
        regionsThatShouldBeOnline.add(region);
        ZKAssign.createNodeOffline(zkw, region, deadServerName);
        ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
        LOG.debug("\n\nRegion of enabled table was OPENING on dead RS\n" + region + "\n\n");

        // RS was opening a region of disabled table then died
        region = disabledRegions.remove(0);
        regionsThatShouldBeOffline.add(region);
        ZKAssign.createNodeOffline(zkw, region, deadServerName);
        ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
        LOG.debug("\n\nRegion of disabled table was OPENING on dead RS\n" + region + "\n\n");

        /*
         * ZK = OPENED
         */

        // Region of enabled table was opened on dead RS
        region = enabledRegions.remove(0);
        regionsThatShouldBeOnline.add(region);
        ZKAssign.createNodeOffline(zkw, region, deadServerName);
        ProtobufUtil.openRegion(hrsDead.getRSRpcServices(), hrsDead.getServerName(), region);
        while (true) {
            byte[] bytes = ZKAssign.getData(zkw, region.getEncodedName());
            RegionTransition rt = RegionTransition.parseFrom(bytes);
            if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
                break;
            }
            Thread.sleep(100);
        }
        LOG.debug("\n\nRegion of enabled table was OPENED on dead RS\n" + region + "\n\n");

        // Region of disabled table was opened on dead RS
        region = disabledRegions.remove(0);
        regionsThatShouldBeOffline.add(region);
        ZKAssign.createNodeOffline(zkw, region, deadServerName);
        ProtobufUtil.openRegion(hrsDead.getRSRpcServices(), hrsDead.getServerName(), region);
        while (true) {
            byte[] bytes = ZKAssign.getData(zkw, region.getEncodedName());
            RegionTransition rt = RegionTransition.parseFrom(bytes);
            if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
                break;
            }
            Thread.sleep(100);
        }
        LOG.debug("\n\nRegion of disabled table was OPENED on dead RS\n" + region + "\n\n");

        /*
         * ZK = NONE
         */

        // Region of enabled table was open at steady-state on dead RS
        region = enabledRegions.remove(0);
        regionsThatShouldBeOnline.add(region);
        ZKAssign.createNodeOffline(zkw, region, deadServerName);
        ProtobufUtil.openRegion(hrsDead.getRSRpcServices(), hrsDead.getServerName(), region);
        while (true) {
            byte[] bytes = ZKAssign.getData(zkw, region.getEncodedName());
            RegionTransition rt = RegionTransition.parseFrom(bytes);
            if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
                ZKAssign.deleteOpenedNode(zkw, region.getEncodedName(), rt.getServerName());
                LOG.debug("DELETED " + rt);
                break;
            }
            Thread.sleep(100);
        }
        LOG.debug("\n\nRegion of enabled table was open at steady-state on dead RS" + "\n" + region + "\n\n");

        // Region of disabled table was open at steady-state on dead RS
        region = disabledRegions.remove(0);
        regionsThatShouldBeOffline.add(region);
        ZKAssign.createNodeOffline(zkw, region, deadServerName);
        ProtobufUtil.openRegion(hrsDead.getRSRpcServices(), hrsDead.getServerName(), region);
        while (true) {
            byte[] bytes = ZKAssign.getData(zkw, region.getEncodedName());
            RegionTransition rt = RegionTransition.parseFrom(bytes);
            if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
                ZKAssign.deleteOpenedNode(zkw, region.getEncodedName(), rt.getServerName());
                break;
            }
            Thread.sleep(100);
        }
        LOG.debug("\n\nRegion of disabled table was open at steady-state on dead RS" + "\n" + region + "\n\n");

        /*
         * DONE MOCKING
         */

        log("Done mocking data up in ZK");

        // Kill the RS that had a hard death
        log("Killing RS " + deadServerName);
        hrsDead.abort("Killing for unit test");
        log("RS " + deadServerName + " killed");

        // Start up a new master.  Wait until regionserver is completely down
        // before starting new master because of hbase-4511.
        while (hrsDeadThread.isAlive()) {
            Threads.sleep(10);
        }
        log("Starting up a new master");
        master = cluster.startMaster().getMaster();
        log("Waiting for master to be ready");
        assertTrue(cluster.waitForActiveAndReadyMaster());
        log("Master is ready");

        // Wait until SSH processing completed for dead server.
        while (master.getServerManager().areDeadServersInProgress()) {
            Thread.sleep(10);
        }

        // Failover should be completed, now wait for no RIT
        log("Waiting for no more RIT");
        ZKAssign.blockUntilNoRIT(zkw);
        log("No more RIT in ZK");
        long now = System.currentTimeMillis();
        long maxTime = 120000;
        boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
        if (!done) {
            RegionStates regionStates = master.getAssignmentManager().getRegionStates();
            LOG.info("rit=" + regionStates.getRegionsInTransition());
        }
        long elapsed = System.currentTimeMillis() - now;
        assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done, elapsed < maxTime);
        log("No more RIT in RIT map, doing final test verification");

        // Grab all the regions that are online across RSs
        Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
        now = System.currentTimeMillis();
        maxTime = 30000;
        for (JVMClusterUtil.RegionServerThread rst : cluster.getRegionServerThreads()) {
            try {
                HRegionServer rs = rst.getRegionServer();
                while (!rs.getRegionsInTransitionInRS().isEmpty()) {
                    elapsed = System.currentTimeMillis() - now;
                    assertTrue("Test timed out in getting online regions", elapsed < maxTime);
                    if (rs.isAborted() || rs.isStopped()) {
                        // This region server is stopped, skip it.
                        break;
                    }
                    Thread.sleep(100);
                }
                onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rs.getRSRpcServices()));
            } catch (RegionServerStoppedException e) {
                LOG.info("Got RegionServerStoppedException", e);
            }
        }

        // Now, everything that should be online should be online
        for (HRegionInfo hri : regionsThatShouldBeOnline) {
            assertTrue("region=" + hri.getRegionNameAsString() + ", " + onlineRegions.toString(),
                    onlineRegions.contains(hri));
        }

        // Everything that should be offline should not be online
        for (HRegionInfo hri : regionsThatShouldBeOffline) {
            assertFalse(onlineRegions.contains(hri));
        }

        log("Done with verification, all passed, shutting down cluster");

        // Done, shutdown the cluster
        TEST_UTIL.shutdownMiniCluster();
    }

    /**
     * Keep only the regions that are actually online on the given region server;
     * regions that did not land there are removed from the passed-in list.
     */
    private void verifyRegionLocation(HRegionServer hrs, List<HRegionInfo> regions) throws IOException {
        List<HRegionInfo> tmpOnlineRegions = ProtobufUtil.getOnlineRegions(hrs.getRSRpcServices());
        Iterator<HRegionInfo> itr = regions.iterator();
        while (itr.hasNext()) {
            HRegionInfo tmp = itr.next();
            if (!tmpOnlineRegions.contains(tmp)) {
                itr.remove();
            }
        }
    }

    HRegion createRegion(final HRegionInfo hri, final Path rootdir, final Configuration c,
            final HTableDescriptor htd) throws IOException {
        HRegion r = HRegion.createHRegion(hri, rootdir, c, htd);
        // The above call to create a region will create an hlog file.  Each
        // log file create will also create a running thread to do syncing.  We need
        // to close out this log else we will have a running thread trying to sync
        // the file system continuously which is ugly when dfs is taken away at the
        // end of the test.
        HRegion.closeHRegion(r);
        return r;
    }

    // TODO: Next test to add should cover the permutations where the regions in
    //       transition, or the killed RS, are hosting the ROOT and hbase:meta regions.

    private void log(String string) {
        LOG.info("\n\n" + string + " \n\n");
    }

    @Test(timeout = 180000)
    public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState() throws Exception {
        LOG.info("Starting testShouldCheckMasterFailOverWhenMETAIsInOpenedState");
        final int NUM_MASTERS = 1;
        final int NUM_RS = 2;

        // Start the cluster
        HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
        Configuration conf = TEST_UTIL.getConfiguration();
        conf.setInt("hbase.master.info.port", -1);

        TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
        MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

        // Find regionserver carrying meta.
        HRegionServer regionServer = cluster.getMaster();
        HRegion metaRegion = regionServer.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());

        TEST_UTIL.shutdownMiniHBaseCluster();

        // Create a ZKW for the test and force the meta znode into the OPENED state for the stopped server
        ZooKeeperWatcher zkw = HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL, metaRegion,
                regionServer.getServerName());

        LOG.info("Staring cluster for second time");
        TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);

        HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
        while (!master.isInitialized()) {
            Thread.sleep(100);
        }
        // Failover should be completed, now wait for no RIT
        log("Waiting for no more RIT");
        ZKAssign.blockUntilNoRIT(zkw);

        zkw.close();
        // Stop the cluster
        TEST_UTIL.shutdownMiniCluster();
    }

    /**
     * Tests that a region in transition (RIT) left in the OFFLINE state gets re-assigned
     * after a master restart.
     */
    @Test(timeout = 240000)
    public void testOfflineRegionReAssginedAfterMasterRestart() throws Exception {
        final TableName table = TableName.valueOf("testOfflineRegionReAssginedAfterMasterRestart");
        final int NUM_MASTERS = 1;
        final int NUM_RS = 2;

        // Create config to use for this cluster
        Configuration conf = HBaseConfiguration.create();

        // Start the cluster
        final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
        TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
        log("Cluster started");

        TEST_UTIL.createTable(table, Bytes.toBytes("family"));
        HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
        RegionStates regionStates = master.getAssignmentManager().getRegionStates();
        HRegionInfo hri = regionStates.getRegionsOfTable(table).get(0);
        ServerName serverName = regionStates.getRegionServerOfRegion(hri);
        TEST_UTIL.assertRegionOnServer(hri, serverName, 200);

        ServerName dstName = null;
        for (ServerName tmpServer : master.serverManager.getOnlineServers().keySet()) {
            if (!tmpServer.equals(serverName)) {
                dstName = tmpServer;
                break;
            }
        }
        // make sure we found a different server to point the RIT node at
        assertNotNull(dstName);
        // shutdown HBase cluster
        TEST_UTIL.shutdownMiniHBaseCluster();
        // create a RIT node in offline state
        ZooKeeperWatcher zkw = TEST_UTIL.getZooKeeperWatcher();
        ZKAssign.createNodeOffline(zkw, hri, dstName);
        Stat stat = new Stat();
        byte[] data = ZKAssign.getDataNoWatch(zkw, hri.getEncodedName(), stat);
        assertNotNull(data);
        RegionTransition rt = RegionTransition.parseFrom(data);
        assertEquals(EventType.M_ZK_REGION_OFFLINE, rt.getEventType());

        LOG.info(hri.getEncodedName() + " region is in offline state with source server=" + serverName
                + " and dst server=" + dstName);

        // start HBase cluster
        TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);

        while (true) {
            master = TEST_UTIL.getHBaseCluster().getMaster();
            if (master != null && master.isInitialized()) {
                ServerManager serverManager = master.getServerManager();
                if (!serverManager.areDeadServersInProgress()) {
                    break;
                }
            }
            Thread.sleep(200);
        }

        // verify the region is assigned
        master = TEST_UTIL.getHBaseCluster().getMaster();
        master.getAssignmentManager().waitForAssignment(hri);
        regionStates = master.getAssignmentManager().getRegionStates();
        RegionState newState = regionStates.getRegionState(hri);
        assertTrue(newState.isOpened());
    }

    /**
     * Simple test of master failover.
     * <p>
     * Starts with three masters.  Kills a backup master.  Then kills the active
     * master.  Ensures the final master becomes active and we can still contact
     * the cluster.
     * @throws Exception
     */
    @Test(timeout = 240000)
    public void testSimpleMasterFailover() throws Exception {

        final int NUM_MASTERS = 3;
        final int NUM_RS = 3;

        // Start the cluster
        HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();

        TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
        MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();

        // get all the master threads
        List<MasterThread> masterThreads = cluster.getMasterThreads();

        // wait for each to come online
        for (MasterThread mt : masterThreads) {
            assertTrue(mt.isAlive());
        }

        // verify only one is the active master and we have right number
        int numActive = 0;
        int activeIndex = -1;
        ServerName activeName = null;
        HMaster active = null;
        for (int i = 0; i < masterThreads.size(); i++) {
            if (masterThreads.get(i).getMaster().isActiveMaster()) {
                numActive++;
                activeIndex = i;
                active = masterThreads.get(activeIndex).getMaster();
                activeName = active.getServerName();
            }
        }
        assertEquals(1, numActive);
        assertEquals(NUM_MASTERS, masterThreads.size());
        LOG.info("Active master " + activeName);

        // Check that ClusterStatus reports the correct active and backup masters
        assertNotNull(active);
        ClusterStatus status = active.getClusterStatus();
        assertTrue(status.getMaster().equals(activeName));
        assertEquals(2, status.getBackupMastersSize());
        assertEquals(2, status.getBackupMasters().size());

        // attempt to stop one of the inactive masters
        int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
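        // any index other than activeIndex would do; this simply picks a neighboring backup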
        HMaster master = cluster.getMaster(backupIndex);
        LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
        cluster.stopMaster(backupIndex, false);
        cluster.waitOnMaster(backupIndex);

        // Verify still one active master and it's the same
        for (int i = 0; i < masterThreads.size(); i++) {
            if (masterThreads.get(i).getMaster().isActiveMaster()) {
                assertTrue(activeName.equals(masterThreads.get(i).getMaster().getServerName()));
                activeIndex = i;
                active = masterThreads.get(activeIndex).getMaster();
            }
        }
        assertEquals(1, numActive);
        assertEquals(2, masterThreads.size());
        int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
        LOG.info("Active master " + active.getServerName() + " managing " + rsCount + " regions servers");
        assertEquals(5, rsCount);

        // Check that ClusterStatus reports the correct active and backup masters
        assertNotNull(active);
        status = active.getClusterStatus();
        assertTrue(status.getMaster().equals(activeName));
        assertEquals(1, status.getBackupMastersSize());
        assertEquals(1, status.getBackupMasters().size());

        // kill the active master
        LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
        cluster.stopMaster(activeIndex, false);
        cluster.waitOnMaster(activeIndex);

        // wait for an active master to show up and be ready
        assertTrue(cluster.waitForActiveAndReadyMaster());

        LOG.debug("\n\nVerifying backup master is now active\n");
        // should only have one master now
        assertEquals(1, masterThreads.size());

        // and it should be active
        active = masterThreads.get(0).getMaster();
        assertNotNull(active);
        status = active.getClusterStatus();
        ServerName mastername = status.getMaster();
        assertTrue(mastername.equals(active.getServerName()));
        assertTrue(active.isActiveMaster());
        assertEquals(0, status.getBackupMastersSize());
        assertEquals(0, status.getBackupMasters().size());
        int rss = status.getServersSize();
        LOG.info("Active master " + mastername.getServerName() + " managing " + rss + " region servers");
        assertEquals(4, rss);

        // Stop the cluster
        TEST_UTIL.shutdownMiniCluster();
    }
}