org.apache.hadoop.hbase.regionserver.TestWALLockup.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.hbase.regionserver.TestWALLockup.java

Source

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WALProvider.Writer;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.mockito.Mockito;

/**
 * Testing for lock up of WAL subsystem.
 * Copied from TestHRegion.
 */
@Category({ MediumTests.class })
public class TestWALLockup {
    private static final Log LOG = LogFactory.getLog(TestWALLockup.class);
    @Rule
    public TestName name = new TestName();

    private static final String COLUMN_FAMILY = "MyCF";
    private static final byte[] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);

    HRegion region = null;
    // Do not run unit tests in parallel (? Why not?  It don't work?  Why not?  St.Ack)
    private static HBaseTestingUtility TEST_UTIL;
    private static Configuration CONF;
    private String dir;

    // Test names
    protected TableName tableName;

    @Before
    public void setup() throws IOException {
        TEST_UTIL = HBaseTestingUtility.createLocalHTU();
        CONF = TEST_UTIL.getConfiguration();
        // Disable block cache.
        CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
        dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
        tableName = TableName.valueOf(name.getMethodName());
    }

    @After
    public void tearDown() throws Exception {
        EnvironmentEdgeManagerTestHelper.reset();
        LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
        TEST_UTIL.cleanupTestDir();
    }

    String getName() {
        return name.getMethodName();
    }

    /**
     * Reproduce locking up that happens when we get an inopportune sync during setup for
     * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
     * it is locked up.
     * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
     * set up a dodgy WAL that will throw an exception when we go to append to it.
     */
    @Test(timeout = 30000)
    public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
        // A WAL that we can have throw exceptions when a flag is set.
        class DodgyFSLog extends FSHLog {
            // Set this when want the WAL to start throwing exceptions.
            volatile boolean throwException = false;

            // Latch to hold up processing until after another operation has had time to run.
            CountDownLatch latch = new CountDownLatch(1);

            public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) throws IOException {
                super(fs, root, logDir, conf);
            }

            @Override
            protected void afterCreatingZigZagLatch() {
                // If throwException set, then append will throw an exception causing the WAL to be
                // rolled. We'll come in here. Hold up processing until a sync can get in before
                // the zigzag has time to complete its setup and get its own sync in. This is what causes
                // the lock up we've seen in production.
                if (throwException) {
                    try {
                        LOG.info("LATCHED");
                        this.latch.await();
                    } catch (InterruptedException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            }

            @Override
            protected void beforeWaitOnSafePoint() {
                if (throwException) {
                    LOG.info("COUNTDOWN");
                    // Don't countdown latch until someone waiting on it otherwise, the above
                    // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
                    // be stuck; test won't go down
                    while (this.latch.getCount() <= 0)
                        Threads.sleep(1);
                    this.latch.countDown();
                }
            }

            @Override
            protected Writer createWriterInstance(Path path) throws IOException {
                final Writer w = super.createWriterInstance(path);
                return new Writer() {
                    @Override
                    public void close() throws IOException {
                        w.close();
                    }

                    @Override
                    public void sync() throws IOException {
                        if (throwException) {
                            throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
                        }
                        w.sync();
                    }

                    @Override
                    public void append(Entry entry) throws IOException {
                        if (throwException) {
                            throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
                        }
                        w.append(entry);
                    }

                    @Override
                    public long getLength() throws IOException {
                        return w.getLength();
                    }
                };
            }
        }

        // Mocked up server and regionserver services. Needed below.
        Server server = Mockito.mock(Server.class);
        Mockito.when(server.getConfiguration()).thenReturn(CONF);
        Mockito.when(server.isStopped()).thenReturn(false);
        Mockito.when(server.isAborted()).thenReturn(false);
        RegionServerServices services = Mockito.mock(RegionServerServices.class);

        // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
        FileSystem fs = FileSystem.get(CONF);
        Path rootDir = new Path(dir + getName());
        DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
        Path originalWAL = dodgyWAL.getCurrentFileName();
        // I need a log roller running.
        LogRoller logRoller = new LogRoller(server, services);
        logRoller.addWAL(dodgyWAL);
        // There is no 'stop' once a logRoller is running.. it just dies.
        logRoller.start();
        // Now get a region and start adding in edits.
        HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
        final HRegion region = initHRegion(tableName, null, null, dodgyWAL);
        byte[] bytes = Bytes.toBytes(getName());
        try {
            // First get something into memstore. Make a Put and then pull the Cell out of it. Will
            // manage append and sync carefully in below to manufacture hang. We keep adding same
            // edit. WAL subsystem doesn't care.
            Put put = new Put(bytes);
            put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
            WALKey key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(), htd.getTableName());
            WALEdit edit = new WALEdit();
            List<Cell> cells = new ArrayList<Cell>();
            for (CellScanner cs = put.cellScanner(); cs.advance();) {
                edit.add(cs.current());
                cells.add(cs.current());
            }
            // Put something in memstore and out in the WAL. Do a big number of appends so we push
            // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
            for (int i = 0; i < 1000; i++) {
                dodgyWAL.append(htd, region.getRegionInfo(), key, edit, region.getSequenceId(), true, cells);
            }
            // Set it so we start throwing exceptions.
            dodgyWAL.throwException = true;
            // This append provokes a WAL roll.
            dodgyWAL.append(htd, region.getRegionInfo(), key, edit, region.getSequenceId(), true, cells);
            boolean exception = false;
            try {
                dodgyWAL.sync();
            } catch (Exception e) {
                exception = true;
            }
            assertTrue("Did not get sync exception", exception);

            // Get a memstore flush going too so we have same hung profile as up in the issue over
            // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
            // by the zigzaglatch waiting on syncs to come home.
            Thread t = new Thread("flusher") {
                public void run() {
                    try {
                        region.flush(false);
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                };
            };
            t.setDaemon(true);
            t.start();
            // Wait till it gets into flushing. It will get stuck on getSequenceId. Then proceed.
            while (!region.writestate.flushing)
                Threads.sleep(1);
            // Now assert I got a new WAL file put in place even though loads of errors above.
            assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
            // Can I append to it?
            dodgyWAL.throwException = false;
            region.put(put);
        } finally {
            // To stop logRoller, its server has to say it is stopped.
            Mockito.when(server.isStopped()).thenReturn(true);
            if (logRoller != null)
                logRoller.interrupt();
            if (region != null)
                region.close();
            if (dodgyWAL != null)
                dodgyWAL.close();
        }
    }

    /**
     * @return A region on which you must call
     *         {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)} when done.
     */
    public static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, WAL wal)
            throws IOException {
        return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, false, Durability.SYNC_WAL, wal,
                COLUMN_FAMILY_BYTES);
    }
}