org.apache.hadoop.hdfs.server.namenode.ha.TestPipelinesFailover.java Source code

Introduction

Here is the source code for org.apache.hadoop.hdfs.server.namenode.ha.TestPipelinesFailover.java, a JUnit 4 suite that exercises HDFS write-pipeline and lease recovery across NameNode failover. This variant targets HopsFS (note the io.hops imports and the simpleHOPSTopology helper).
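
Since the class is a standard JUnit 4 suite, any JUnit 4 runner can execute it. As a minimal sketch of invoking it programmatically (assuming the Hadoop/HopsFS test classpath is available; the wrapper class name here is illustrative):

import org.junit.runner.JUnitCore;
import org.junit.runner.Result;
import org.junit.runner.notification.Failure;

public class RunPipelinesFailover {
    public static void main(String[] args) {
        // Run the whole suite, then print any failures and the overall verdict.
        Result result = JUnitCore.runClasses(
                org.apache.hadoop.hdfs.server.namenode.ha.TestPipelinesFailover.class);
        for (Failure failure : result.getFailures()) {
            System.out.println(failure.toString());
        }
        System.out.println("Successful: " + result.wasSuccessful());
    }
}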

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode.ha;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.concurrent.TimeoutException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.AppendTestUtil;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.retry.RetryInvocationHandler;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread;
import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
import org.apache.log4j.Level;
import org.junit.Test;

import com.google.common.base.Supplier;

import io.hops.common.INodeUtil;
import io.hops.exception.StorageException;
import io.hops.metadata.hdfs.entity.INodeIdentifier;
import io.hops.transaction.handler.HDFSOperationType;
import io.hops.transaction.handler.HopsTransactionalRequestHandler;
import io.hops.transaction.lock.LockFactory;
import io.hops.transaction.lock.TransactionLockTypes;
import io.hops.transaction.lock.TransactionLocks;

import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguousUnderConstruction;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;

import org.junit.Ignore;

/**
 * Test cases regarding pipeline recovery during NN failover.
 */
public class TestPipelinesFailover {
    static {
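        // Turn client retry and NameNode logging up to ALL so failover
        // behavior is visible in the test output.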
        GenericTestUtils.setLogLevel(LogFactory.getLog(RetryInvocationHandler.class), Level.ALL);
        DFSTestUtil.setNameNodeLogLevel(Level.ALL);
    }

    protected static final Log LOG = LogFactory.getLog(TestPipelinesFailover.class);
    private static final Path TEST_PATH = new Path("/test-file");
    private static final int BLOCK_SIZE = 4096;
    private static final int BLOCK_AND_A_HALF = BLOCK_SIZE * 3 / 2; // 6144 bytes, so writes cross a block boundary

    private static final int STRESS_NUM_THREADS = 25; // concurrent writer threads
    private static final int STRESS_RUNTIME = 40000;  // stress-test runtime, in milliseconds

    enum TestScenario {
        ORIGINAL_ACTIVE_CRASHED {
            @Override
            void run(MiniDFSCluster cluster) throws IOException {
                cluster.restartNameNode(0);
            }
        };

        abstract void run(MiniDFSCluster cluster) throws IOException;
    }

    enum MethodToTestIdempotence {
        ALLOCATE_BLOCK, COMPLETE_FILE;
    }

    /**
     * Tests continuing a write pipeline over a failover.
     */
    @Test(timeout = 30000)
    public void testAllocateBlockAfterCrashFailover() throws Exception {
        doWriteOverFailoverTest(TestScenario.ORIGINAL_ACTIVE_CRASHED, MethodToTestIdempotence.ALLOCATE_BLOCK);
    }

    @Test(timeout = 30000)
    public void testCompleteFileAfterCrashFailover() throws Exception {
        doWriteOverFailoverTest(TestScenario.ORIGINAL_ACTIVE_CRASHED, MethodToTestIdempotence.COMPLETE_FILE);
    }

    private void doWriteOverFailoverTest(TestScenario scenario, MethodToTestIdempotence methodToTest)
            throws Exception {
        Configuration conf = new Configuration();
        conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
        // Effectively disable periodic replication checks during the test
        // (the interval value is in seconds).
        conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 1000);

        FSDataOutputStream stm = null;
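        // Two NameNodes in a HopsFS topology, with three DataNodes to match
        // the default replication factor.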
        MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
                .nnTopology(MiniDFSNNTopology.simpleHOPSTopology(2)).numDataNodes(3).build();
        try {
            int sizeWritten = 0;

            cluster.waitActive();
            Thread.sleep(500); // brief settle time after waitActive()

            LOG.info("Starting with NN 0 active");
            FileSystem fs = cluster.getFileSystem(0);
            stm = fs.create(TEST_PATH);

            // write a block and a half
            AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF);
            sizeWritten += BLOCK_AND_A_HALF;

            // Make sure all of the blocks are written out before failover.
            stm.hflush();

            LOG.info("Failing over to NN 1");
            scenario.run(cluster);

            // NOTE: explicitly do *not* make any further metadata calls
            // to the NN here. The next IPC call should be to allocate the next
            // block. Any other call would notice the failover and not test
            // idempotence of the operation (HDFS-3031)

            FSNamesystem ns1 = cluster.getNameNode(1).getNamesystem();
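            // Refresh the block manager's computed stats so the assertions
            // below see up-to-date counts on the NN now serving the client.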
            BlockManagerTestUtil.updateState(ns1.getBlockManager());
            assertEquals(0, ns1.getPendingReplicationBlocks());
            assertEquals(0, ns1.getCorruptReplicaBlocks());
            assertEquals(0, ns1.getMissingBlocksCount());

            // If we're testing allocateBlock()'s idempotence, write another
            // block and a half, so we have to allocate a new block.
            // Otherwise, don't write anything, so our next RPC will be
            // completeFile() if we're testing idempotence of that operation.
            if (methodToTest == MethodToTestIdempotence.ALLOCATE_BLOCK) {
                // write another block and a half
                AppendTestUtil.write(stm, sizeWritten, BLOCK_AND_A_HALF);
                sizeWritten += BLOCK_AND_A_HALF;
            }

            stm.close();
            stm = null;

            AppendTestUtil.check(fs, TEST_PATH, sizeWritten);
        } finally {
            IOUtils.closeStream(stm);
            cluster.shutdown();
        }
    }

    /**
     * Tests continuing a write pipeline over a failover when a DN fails
     * after the failover - ensures that updating the pipeline succeeds
     * even when the pipeline was constructed on a different NN.
     */
    @Test(timeout = 30000)
    public void testWriteOverCrashFailoverWithDnFail() throws Exception {
        doTestWriteOverFailoverWithDnFail(TestScenario.ORIGINAL_ACTIVE_CRASHED);
    }

    private void doTestWriteOverFailoverWithDnFail(TestScenario scenario) throws Exception {
        Configuration conf = new Configuration();
        conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);

        FSDataOutputStream stm = null;
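        // Five DataNodes so the client can find replacement nodes after the
        // two DataNode failures injected below.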
        MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
                .nnTopology(MiniDFSNNTopology.simpleHOPSTopology(2)).numDataNodes(5).build();
        try {
            cluster.waitActive();
            Thread.sleep(500);

            LOG.info("Starting with NN 0 active");
            FileSystem fs = cluster.getFileSystem(0);
            stm = fs.create(TEST_PATH);

            // write a block and a half
            AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF);

            // Make sure all the blocks are written before failover
            stm.hflush();

            LOG.info("Failing over to NN 1");
            scenario.run(cluster);

            assertTrue(fs.exists(TEST_PATH));

            cluster.stopDataNode(0);
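            // With one pipeline node down, the next write forces the client
            // to update the pipeline through the NameNode.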

            // write another block and a half
            AppendTestUtil.write(stm, BLOCK_AND_A_HALF, BLOCK_AND_A_HALF);
            stm.hflush();

            cluster.stopDataNode(1);

            AppendTestUtil.write(stm, BLOCK_AND_A_HALF * 2, BLOCK_AND_A_HALF);
            stm.hflush();

            stm.close();
            stm = null;

            AppendTestUtil.check(fs, TEST_PATH, BLOCK_AND_A_HALF * 3);
        } finally {
            IOUtils.closeStream(stm);
            cluster.shutdown();
        }
    }

    /**
     * Tests lease recovery if a client crashes. This approximates the
     * use case of HBase WALs being recovered after a NN failover.
     */
    @Test(timeout = 150000)
    public void testLeaseRecoveryAfterFailover() throws Exception {
        final Configuration conf = new Configuration();
        // Disable permissions so that another user can recover the lease.
        conf.setBoolean(DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
        conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);

        FSDataOutputStream stm = null;
        final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
                .nnTopology(MiniDFSNNTopology.simpleHOPSTopology(2)).numDataNodes(3).build();
        try {
            cluster.waitActive();
            Thread.sleep(500);

            LOG.info("Starting with NN 0 active");
            FileSystem fs = cluster.getFileSystem(0);
            stm = fs.create(TEST_PATH);

            // write a block and a half
            AppendTestUtil.write(stm, 0, BLOCK_AND_A_HALF);
            stm.hflush();

            LOG.info("Failing over to NN 1");

            cluster.restartNameNode(0);
            cluster.waitActive();

            assertTrue(fs.exists(TEST_PATH));

            FileSystem fsOtherUser = createFsAsOtherUser(cluster, conf);
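            // A different user takes over the lease, simulating recovery of a
            // crashed client's file (as HBase does for its WALs).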
            loopRecoverLease(fsOtherUser, TEST_PATH);

            AppendTestUtil.check(fs, TEST_PATH, BLOCK_AND_A_HALF);

        } finally {
            IOUtils.closeStream(stm);
            cluster.shutdown();
        }
    }

    /**
     * Stress test for pipeline/lease recovery. Starts a number of
     * threads, each of which creates a file and has another client
     * break the lease. While these threads run, failover proceeds
     * back and forth between two namenodes.
     */
    @Test(timeout = STRESS_RUNTIME * 3)
    @Ignore
    public void testPipelineRecoveryStress() throws Exception {
        HAStressTestHarness harness = new HAStressTestHarness();
        // Disable permissions so that another user can recover the lease.
        harness.conf.setBoolean(DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
        // This test triggers rapid NN failovers.  The client retry policy uses an
        // exponential backoff.  This can quickly lead to long sleep times and even
        // timeout the whole test.  Cap the sleep time at 1s to prevent this.
        harness.conf.setInt(DFSConfigKeys.DFS_CLIENT_FAILOVER_SLEEPTIME_MAX_KEY, 1000);

        final MiniDFSCluster cluster = harness.startCluster();
        try {
            cluster.waitActive();

            FileSystem fs = harness.getFailoverFs();
            DistributedFileSystem fsAsOtherUser = createFsAsOtherUser(cluster, harness.conf);

            TestContext testers = new TestContext();
            for (int i = 0; i < STRESS_NUM_THREADS; i++) {
                Path p = new Path("/test-" + i);
                testers.addThread(new PipelineTestThread(testers, fs, fsAsOtherUser, p));
            }

            // Start a separate thread which will make sure that replication
            // happens quickly by triggering deletion reports and replication
            // work calculation frequently.
            harness.addReplicationTriggerThread(500);
            harness.addFailoverThread(5000);
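            // The harness flips the active NameNode every 5 seconds while the
            // writer threads above are running.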
            harness.startThreads();
            testers.startThreads();

            testers.waitFor(STRESS_RUNTIME);
            testers.stop();
            harness.stopThreads();
        } finally {
            // Separator to make the shutdown logs easy to spot in the output.
            System.err.println("===========================\n\n\n\n");
            harness.shutdown();
        }
    }

    /**
     * Test thread which creates a file, has another fake user recover
     * the lease on the file, and then ensures that the file's contents
     * are properly readable. If any of these steps fails, propagates
     * an exception back to the test context, causing the test case
     * to fail.
     */
    private static class PipelineTestThread extends RepeatingTestThread {
        private final FileSystem fs;
        private final FileSystem fsOtherUser;
        private final Path path;

        public PipelineTestThread(TestContext ctx, FileSystem fs, FileSystem fsOtherUser, Path p) {
            super(ctx);
            this.fs = fs;
            this.fsOtherUser = fsOtherUser;
            this.path = p;
        }

        @Override
        public void doAnAction() throws Exception {
            FSDataOutputStream stm = fs.create(path, true);
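            // (overwrite = true, so each iteration starts a fresh file)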
            try {
                AppendTestUtil.write(stm, 0, 100);
                stm.hflush();
                loopRecoverLease(fsOtherUser, path);
                AppendTestUtil.check(fs, path, 100);
            } finally {
                try {
                    stm.close();
                } catch (IOException e) {
                    // should expect this since we lost the lease
                }
            }
        }

        @Override
        public String toString() {
            return "Pipeline test thread for " + path;
        }
    }

    private DistributedFileSystem createFsAsOtherUser(final MiniDFSCluster cluster, final Configuration conf)
            throws IOException, InterruptedException {
        return (DistributedFileSystem) UserGroupInformation
                .createUserForTesting("otheruser", new String[] { "othergroup" })
                .doAs(new PrivilegedExceptionAction<FileSystem>() {
                    @Override
                    public FileSystem run() throws Exception {
                        return cluster.getFileSystem(0);
                    }
                });
    }

    private DatanodeStorageInfo[] getStorageInfosTx(final BlockInfoContiguousUnderConstruction b,
            final DatanodeManager datanodeManager) throws IOException {
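        // HopsFS transactional pattern: setUp() resolves the target inode
        // outside the transaction, acquireLock() declares the inode and block
        // locks, and performTask() runs inside the transaction.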

        return (DatanodeStorageInfo[]) new HopsTransactionalRequestHandler(
                HDFSOperationType.GET_EXPECTED_BLK_LOCATIONS) {
            INodeIdentifier inodeIdentifier;

            @Override
            public void setUp() throws StorageException {
                inodeIdentifier = INodeUtil.resolveINodeFromBlock(b);
            }

            @Override
            public void acquireLock(TransactionLocks locks) throws IOException {
                LockFactory lf = LockFactory.getInstance();
                locks.add(lf.getIndividualINodeLock(TransactionLockTypes.INodeLockType.READ, inodeIdentifier))
                        .add(lf.getIndividualBlockLock(b.getBlockId(), inodeIdentifier))
                        .add(lf.getBlockRelated(LockFactory.BLK.RE, LockFactory.BLK.UC));
            }

            @Override
            public Object performTask() throws StorageException, IOException {
                return b.getExpectedStorageLocations(datanodeManager);
            }
        }.handle();
    }

    /**
     * Try to recover the lease on the given file, polling every five seconds
     * for up to ten minutes.
     * @param fsOtherUser the filesystem to use for the recoverLease call
     * @param testPath the path on which to run lease recovery
     * @throws TimeoutException if lease recovery does not succeed within ten
     * minutes
     * @throws InterruptedException if the thread is interrupted
     */
    private static void loopRecoverLease(final FileSystem fsOtherUser, final Path testPath)
            throws TimeoutException, InterruptedException, IOException {
        try {
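            // Poll recoverLease() every 5 seconds, for up to 10 minutes.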
            GenericTestUtils.waitFor(new Supplier<Boolean>() {
                @Override
                public Boolean get() {
                    boolean success;
                    try {
                        success = ((DistributedFileSystem) fsOtherUser).recoverLease(testPath);
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                    if (!success) {
                        LOG.info("Waiting to recover lease successfully");
                    }
                    return success;
                }
            }, 5000, 600000);
        } catch (TimeoutException e) {
            throw new TimeoutException("Timed out recovering lease for " + testPath);
        }
    }
}