org.apache.hadoop.hbase.master.TestRollingRestart.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hbase.master.TestRollingRestart.java

Source

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.List;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.LargeTests;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Tests restarting every component of a running cluster -- both masters first,
 * then each region server one at a time -- as is done during a rolling restart.
 */
@Category(LargeTests.class)
public class TestRollingRestart {
    private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);

    /**
     * Walks a mini cluster through a complete rolling restart and checks region
     * assignment after every step.
     *
     * <p>Sequence: start 2 masters and 3 region servers, create a multi-region
     * table, disable and re-enable it, add a fourth region server, stop and
     * restart both masters, then stop and restart each region server one at a
     * time. The table handle, the ZooKeeper watcher and the mini cluster are
     * released in a {@code finally} block so a failing assertion does not leave
     * them behind.
     *
     * @throws Exception if any cluster operation fails
     */
    @Test(timeout = 500000)
    public void testBasicRollingRestart() throws Exception {

        // Start a cluster with 2 masters and 3 regionservers (a fourth
        // regionserver is added further down)
        final int NUM_MASTERS = 2;
        final int NUM_RS = 3;
        final int NUM_REGIONS_TO_CREATE = 20;

        // Tracks how many regionserver threads the cluster should currently report
        int expectedNumRS = NUM_RS;

        // Start the cluster
        log("Starting cluster");
        Configuration conf = HBaseConfiguration.create();
        HBaseTestingUtility testUtil = new HBaseTestingUtility(conf);
        testUtil.startMiniCluster(NUM_MASTERS, NUM_RS);

        // Declared outside the try so the finally block can release whatever
        // was actually created before a failure
        ZooKeeperWatcher zkw = null;
        HTable ht = null;
        try {
            MiniHBaseCluster cluster = testUtil.getHBaseCluster();
            log("Waiting for active/ready master");
            cluster.waitForActiveAndReadyMaster();
            zkw = new ZooKeeperWatcher(conf, "testRollingRestart", null);
            HMaster master = cluster.getMaster();

            // Create a table with regions
            byte[] table = Bytes.toBytes("tableRestart");
            byte[] family = Bytes.toBytes("family");
            log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
            ht = testUtil.createTable(table, family);
            // numRegions is only used in log messages; the assertions below work
            // from the `regions` snapshot instead.
            // NOTE(review): +1 is added for "catalogs", yet two non-user regions
            // (catalog and namespace) are asserted online below, so the logged
            // count may be one short -- confirm.
            int numRegions = testUtil.createMultiRegions(conf, ht, family, NUM_REGIONS_TO_CREATE);
            numRegions += 1; // catalogs
            log("Waiting for no more RIT\n");
            blockUntilNoRIT(zkw, master);
            log("Disabling table\n");
            testUtil.getHBaseAdmin().disableTable(table);
            log("Waiting for no more RIT\n");
            blockUntilNoRIT(zkw, master);
            NavigableSet<String> regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
            log("Verifying only catalog and namespace regions are assigned\n");
            if (regions.size() != 2) {
                for (String oregion : regions) {
                    log("Region still online: " + oregion);
                }
            }
            assertEquals(2, regions.size());
            log("Enabling table\n");
            testUtil.getHBaseAdmin().enableTable(table);
            log("Waiting for no more RIT\n");
            blockUntilNoRIT(zkw, master);
            log("Verifying there are " + numRegions + " assigned on cluster\n");
            // Snapshot of online regions used as the expected set for the rest of the test
            regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
            assertRegionsAssigned(cluster, regions);
            assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());

            // Add a new regionserver
            log("Adding a fourth RS");
            RegionServerThread restarted = cluster.startRegionServer();
            expectedNumRS++;
            restarted.waitForServerOnline();
            log("Additional RS is online");
            log("Waiting for no more RIT");
            blockUntilNoRIT(zkw, master);
            log("Verifying there are " + numRegions + " assigned on cluster");
            assertRegionsAssigned(cluster, regions);
            assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());

            // Master Restarts
            List<MasterThread> masterThreads = cluster.getMasterThreads();
            assertEquals(2, masterThreads.size());
            MasterThread activeMaster;
            MasterThread backupMaster;
            if (masterThreads.get(0).getMaster().isActiveMaster()) {
                activeMaster = masterThreads.get(0);
                backupMaster = masterThreads.get(1);
            } else {
                activeMaster = masterThreads.get(1);
                backupMaster = masterThreads.get(0);
            }

            // Bring down the backup master
            log("Stopping backup master\n\n");
            backupMaster.getMaster().stop("Stop of backup during rolling restart");
            cluster.hbaseCluster.waitOnMaster(backupMaster);

            // Bring down the primary master
            log("Stopping primary master\n\n");
            activeMaster.getMaster().stop("Stop of active during rolling restart");
            cluster.hbaseCluster.waitOnMaster(activeMaster);

            // Start primary master
            log("Restarting primary master\n\n");
            activeMaster = cluster.startMaster();
            cluster.waitForActiveAndReadyMaster();
            // From here on, wait on the freshly started master rather than the stopped one
            master = activeMaster.getMaster();

            // Start backup master
            log("Restarting backup master\n\n");
            backupMaster = cluster.startMaster();

            assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());

            // RegionServer Restarts

            // Bring them down, one at a time, waiting between each to complete
            List<RegionServerThread> regionServers = cluster.getLiveRegionServerThreads();
            int num = 1;
            int total = regionServers.size();
            for (RegionServerThread rst : regionServers) {
                ServerName serverName = rst.getRegionServer().getServerName();
                log("Stopping region server " + num + " of " + total + " [ " + serverName + "]");
                rst.getRegionServer().stop("Stopping RS during rolling restart");
                cluster.hbaseCluster.waitOnRegionServer(rst);
                log("Waiting for RS shutdown to be handled by master");
                waitForRSShutdownToStartAndFinish(activeMaster, serverName);
                log("RS shutdown done, waiting for no more RIT");
                blockUntilNoRIT(zkw, master);
                log("Verifying there are " + numRegions + " assigned on cluster");
                assertRegionsAssigned(cluster, regions);
                expectedNumRS--;
                assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
                log("Restarting region server " + num + " of " + total);
                restarted = cluster.startRegionServer();
                restarted.waitForServerOnline();
                expectedNumRS++;
                log("Region server " + num + " is back online");
                log("Waiting for no more RIT");
                blockUntilNoRIT(zkw, master);
                log("Verifying there are " + numRegions + " assigned on cluster");
                assertRegionsAssigned(cluster, regions);
                assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
                num++;
            }
            Thread.sleep(1000);
            assertRegionsAssigned(cluster, regions);

            // TODO: Bring random 3 of 4 RS down at the same time
        } finally {
            // Release everything even when a step above failed; nested so that
            // a failure in one cleanup step does not skip the following ones
            try {
                if (ht != null) {
                    ht.close();
                }
            } finally {
                try {
                    if (zkw != null) {
                        zkw.close();
                    }
                } finally {
                    // Stop the cluster
                    testUtil.shutdownMiniCluster();
                }
            }
        }
    }

    /**
     * Waits for regions in transition to clear, first through
     * {@code ZKAssign.blockUntilNoRIT} on the given watcher and then through the
     * master's assignment manager.
     *
     * @param zkw ZooKeeper watcher handed to {@code ZKAssign}
     * @param master master whose assignment manager is waited on
     * @throws KeeperException declared by the calls made here
     * @throws InterruptedException declared by the calls made here
     */
    private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
            throws KeeperException, InterruptedException {
        // Bound handed to the assignment manager wait (presumably milliseconds -- confirm)
        final int assignmentWaitTimeout = 60000;

        ZKAssign.blockUntilNoRIT(zkw);
        master.assignmentManager.waitUntilNoRegionsInTransition(assignmentWaitTimeout);
    }

    /**
     * Blocks until the master's server manager lists the given region server as
     * dead and then reports no dead servers still in progress.
     *
     * @param activeMaster thread of the master whose server manager is polled
     * @param serverName name of the region server that was stopped
     * @throws InterruptedException if interrupted while sleeping between polls
     */
    private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster, ServerName serverName)
            throws InterruptedException {
        // One pause for both polling loops. The first loop previously slept only
        // 1 ms per iteration, emitting a log line each time and flooding the log.
        final long pollIntervalMs = 100;

        ServerManager sm = activeMaster.getMaster().getServerManager();
        // First wait for it to be in dead list
        while (!sm.getDeadServers().isDeadServer(serverName)) {
            log("Waiting for [" + serverName + "] to be listed as dead in master");
            Thread.sleep(pollIntervalMs);
        }
        log("Server [" + serverName + "] marked as dead, waiting for it to finish dead processing");
        // Then wait until the server manager reports no dead servers in progress
        while (sm.areDeadServersInProgress()) {
            log("Server [" + serverName + "] still being processed, waiting");
            Thread.sleep(pollIntervalMs);
        }
        log("Server [" + serverName + "] done with server shutdown processing");
    }

    /**
     * Writes a debug-level message, prefixed with "TRR: " and surrounded by
     * blank lines, to this class's logger.
     *
     * @param msg text to log
     */
    private void log(String msg) {
        final String framed = new StringBuilder("\n\nTRR: ").append(msg).append("\n").toString();
        LOG.debug(framed);
    }

    /**
     * Sums the online-region counts reported by every live region server
     * thread and every master thread of the cluster.
     *
     * @param cluster cluster to inspect
     * @return total of the per-server online region counts
     */
    private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
        // Regions reported by the live region servers
        int onRegionServers = 0;
        for (RegionServerThread serverThread : cluster.getLiveRegionServerThreads()) {
            onRegionServers = onRegionServers + serverThread.getRegionServer().getNumberOfOnlineRegions();
        }

        // Regions reported by the masters
        int onMasters = 0;
        for (MasterThread masterThread : cluster.getMasterThreads()) {
            onMasters = onMasters + masterThread.getMaster().getNumberOfOnlineRegions();
        }

        return onRegionServers + onMasters;
    }

    /**
     * Asserts that the number of regions online on the cluster equals the size
     * of the expected set. On a mismatch it first logs which expected regions
     * are missing (too few online) or which regions appear doubly assigned (too
     * many online), then fails.
     *
     * @param cluster cluster to inspect
     * @param expectedRegions names of the regions expected to be online
     * @throws IOException declared by the region lookups used for diagnostics
     */
    private void assertRegionsAssigned(MiniHBaseCluster cluster, Set<String> expectedRegions) throws IOException {
        final int onlineCount = getNumberOfOnlineRegions(cluster);
        final int expectedCount = expectedRegions.size();

        // Happy path first: counts agree, nothing to diagnose
        if (expectedCount == onlineCount) {
            log("Success!  Found expected number of " + onlineCount + " regions");
            return;
        }

        if (expectedCount > onlineCount) {
            // Too few online: report each expected region that is not online
            log("Expected to find " + expectedCount + " but only found " + onlineCount);
            NavigableSet<String> onlineNow = HBaseTestingUtility.getAllOnlineRegions(cluster);
            for (String expected : expectedRegions) {
                if (onlineNow.contains(expected)) {
                    continue;
                }
                log("Missing region: " + expected);
            }
        } else {
            // Too many online: report regions seen on more than one server
            int surplus = onlineCount - expectedCount;
            log("Expected to find " + expectedCount + " but found " + onlineCount + " (" + surplus
                    + " double assignments?)");
            for (String duplicated : getDoubleAssignedRegions(cluster)) {
                log("Region is double assigned: " + duplicated);
            }
        }

        // Counts differ here, so this fails with the standard expected/actual message
        assertEquals(expectedCount, onlineCount);
    }

    /**
     * Collects the names of regions that are reported online by more than one
     * live region server thread.
     *
     * <p>Only live region server threads are examined here; master threads are
     * not consulted.
     *
     * @param cluster cluster to inspect
     * @return sorted set of region names seen more than once; empty if none
     * @throws IOException declared by the online-region lookup
     */
    private NavigableSet<String> getDoubleAssignedRegions(MiniHBaseCluster cluster) throws IOException {
        NavigableSet<String> duplicates = new TreeSet<String>();
        NavigableSet<String> seen = new TreeSet<String>();
        for (RegionServerThread serverThread : cluster.getLiveRegionServerThreads()) {
            for (HRegionInfo info : ProtobufUtil.getOnlineRegions(serverThread.getRegionServer().getRSRpcServices())) {
                String regionName = info.getRegionNameAsString();
                // add() returns false when the name was already recorded from an earlier server
                boolean firstSighting = seen.add(regionName);
                if (!firstSighting) {
                    duplicates.add(regionName);
                }
            }
        }
        return duplicates;
    }

}