org.apache.hadoop.raid.TestDirectoryBlockFixer.java Source code

Introduction

Here is the source code for org.apache.hadoop.raid.TestDirectoryBlockFixer.java, a JUnit test suite that exercises directory-level RAID block reconstruction in HDFS.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.raid;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotSame;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import junit.framework.TestCase;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.RaidDFSUtil;
import org.apache.hadoop.hdfs.TestDatanodeBlockScanner;
import org.apache.hadoop.hdfs.TestRaidDfs;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.raid.DistBlockIntegrityMonitor.Worker.LostFileInfo;
import org.apache.hadoop.raid.LogUtils.LOGRESULTS;
import org.apache.hadoop.raid.LogUtils.LOGTYPES;
import org.apache.hadoop.raid.StripeStore.StripeInfo;
import org.apache.hadoop.util.StringUtils;
import org.junit.Test;

/**
 * Tests directory-level RAID block fixing: directories of files are raided,
 * blocks are corrupted or deleted, and the block integrity monitor (local or
 * distributed) is expected to reconstruct them.
 */
public class TestDirectoryBlockFixer extends TestCase {
    final static Log LOG = LogFactory.getLog("org.apache.hadoop.raid.TestDirectoryBlockFixer");
    final static String TEST_DIR = new File(System.getProperty("test.build.data", "build/contrib/raid/test/data"))
            .getAbsolutePath();
    final static String CHECKSUM_STORE_DIR = new File(TEST_DIR, "ckm_store." + System.currentTimeMillis())
            .getAbsolutePath();
    final static String STRIPE_STORE_DIR = new File(TEST_DIR, "stripe_store." + System.currentTimeMillis())
            .getAbsolutePath();
    final static String CONFIG_FILE = new File(TEST_DIR, "test-raid.xml").getAbsolutePath();
    final static long RELOAD_INTERVAL = 1000;
    final static int NUM_DATANODES = 3;
    final long blockSize = 8192L;
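    // Three test files whose sizes and per-file block sizes span blocks 0-7 of the directory.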
    final long[] fileSizes = new long[] { blockSize + blockSize / 2, // block 0, 1
            3 * blockSize, // block 2, 3
            blockSize + blockSize / 2 + 1 }; // block 4, 5, 6, 7
    final long[] blockSizes = new long[] { blockSize, 2 * blockSize, blockSize / 2 };
    final Integer[] rsCorruptFileIdx1 = new Integer[] { 0, 1, 2, 3, 5, 6, 7 };
    final int[] rsNumCorruptBlocksInFiles1 = new int[] { 2, 2, 3 };
    final Integer[] rsCorruptFileIdx2 = new Integer[] { 1, 3, 4, 5, 6 };
    final int[] rsNumCorruptBlocksInFiles2 = new int[] { 1, 1, 3 };
    Configuration conf;
    String namenode = null;
    MiniDFSCluster dfsCluster = null;
    String hftp = null;
    MiniMRCluster mr = null;
    FileSystem fileSys = null;
    RaidNode cnode = null;
    String jobTrackerName = null;
    Random rand = new Random();
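    // Disable the ParityFilePair cache (a test-only hook).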
    static {
        ParityFilePair.disableCacheUsedInTestOnly();
    }

    public Configuration getRaidNodeConfig(Configuration conf, boolean local) {
        // create an instance of the RaidNode
        Configuration localConf = new Configuration(conf);
        localConf.setInt("raid.blockfix.interval", 1000);
        if (local) {
            localConf.set("raid.blockfix.classname", "org.apache.hadoop.raid.LocalBlockIntegrityMonitor");
        } else {
            localConf.set("raid.blockfix.classname", "org.apache.hadoop.raid.DistBlockIntegrityMonitor");
        }
        localConf.setLong("raid.blockfix.filespertask", 2L);
        return localConf;
    }

    @Test
    public void testDirectoryFilterUnfixableFiles() throws Exception {
        conf = new Configuration();
        dfsCluster = new MiniDFSCluster(conf, NUM_DATANODES, true, null);
        dfsCluster.waitActive();
        FileSystem fs = dfsCluster.getFileSystem();

        Utils.loadTestCodecs(conf, 3, 5, 1, 3, "/destraid", "/destraidrs", false, true);
        try {
            Configuration testConf = fs.getConf();
            BlockIntegrityMonitor blockFixer = new LocalBlockIntegrityMonitor(testConf, false);

            String p1 = "/user/foo/f1";
            String p2 = "/user/foo/f2";
            String p3 = "/user1/foo/bar/f1";
            String p4 = "/a/b";
            String p5 = "/c";
            String p6 = "/destraidrs/user";
            String p7 = "/destraid/user1/foo";

            fs.mkdirs(new Path(p6));

            List<String> fileList = new ArrayList<String>();
            fileList.add(p1);
            fileList.add(p2);
            fileList.add(p3);
            fileList.add(p4);
            fileList.add(p5);

            blockFixer.filterUnreconstructableSourceFiles(fs, fileList.iterator());
            // p3 and p5 should be filtered out.
            assertEquals(3, fileList.size());

            Set<String> filtered = new HashSet<String>();
            for (String p : fileList)
                filtered.add(p);
            assertFalse("File not filtered", filtered.contains(p3));
            assertFalse("File not filtered", filtered.contains(p5));

            fileList.add(p3);
            fs.mkdirs(new Path(p7));
            blockFixer.filterUnreconstructableSourceFiles(fs, fileList.iterator());
            // Nothing is filtered.
            assertEquals(4, fileList.size());
        } finally {
            dfsCluster.shutdown();
        }
    }

    @Test
    public void testDirBlockFixLocal() throws Exception {
        implDirBlockFix(true, true, false);
    }

    @Test
    public void testDirBlockFixDist() throws Exception {
        implDirBlockFix(false, true, false);
    }

    @Test
    public void testDirBlockFixDistWithoutStripeInfo() throws Exception {
        implDirBlockFix(false, false, false);
    }

    @Test
    public void testDirBlockFixDistWithStripeVerificationFailure() throws Exception {
        implDirBlockFix(false, true, true);
    }

    private long getTotal(int[] elements) {
        long totalCorruptBlocks = 0;
        for (int numBlk : elements) {
            totalCorruptBlocks += numBlk;
        }
        return totalCorruptBlocks;
    }

    /**
     * Create a directory with three files, corrupt several blocks in each
     * file, and wait for the files to be fixed.
     */
    private void implDirBlockFix(boolean local, boolean hasStripeInfo, boolean corruptStripe) throws Exception {
        LOG.info("Test testDirBlockFix started. local:" + local + " hasStripeInfo:" + hasStripeInfo
                + " corruptStripe:" + corruptStripe);
        int stripeLength = 3;
        mySetup(stripeLength);
        long[] crcs = new long[3];
        int[] seeds = new int[3];
        Path dirPath = new Path("/user/dhruba/raidtestrs");
        Path[] files = TestRaidDfs.createTestFiles(dirPath, fileSizes, blockSizes, crcs, seeds, fileSys, (short) 1);
        Path destPath = new Path("/destraidrs/user/dhruba");
        LOG.info("Test testDirBlockFix created test files");
        Configuration localConf = this.getRaidNodeConfig(conf, local);
        // Do not allow multiple running jobs
        localConf.setLong("raid.blockfix.maxpendingjobs", 1L);

        try {
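            // Raid the directory, then stop the RaidNode before corrupting blocks.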
            cnode = RaidNode.createRaidNode(null, localConf);
            TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath);
            cnode.stop();
            cnode.join();

            DistributedFileSystem dfs = (DistributedFileSystem) fileSys;
            String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
            assertEquals("no corrupt files expected", 0, corruptFiles.length);
            assertEquals("filesFixed() should return 0 before fixing files", 0,
                    cnode.blockIntegrityMonitor.getNumFilesFixed());
            if (!hasStripeInfo) {
                // clear out all stripes
                LocalStripeStore lss = new LocalStripeStore();
                lss.initialize(localConf, false, dfs);
                lss.clear();
            }
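            // Corrupt the stripe store by rotating its block lists so that
            // stripe verification fails during reconstruction.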
            if (corruptStripe) {
                LocalStripeStore lss = new LocalStripeStore();
                lss.initialize(localConf, false, dfs);
                Set<List<Block>> corruptCandidates = new HashSet<List<Block>>(lss.stripeSet.keySet());
                for (List<Block> lb : corruptCandidates) {
                    for (Codec codec : Codec.getCodecs()) {
                        StripeInfo si = lss.getStripe(codec, lb.get(0));
                        if (si == null) {
                            continue;
                        }
                        String oldSi = si.toString();
                        Collections.rotate(si.parityBlocks, 1);
                        Collections.rotate(si.srcBlocks, 1);
                        lss.putStripe(codec, si.parityBlocks, si.srcBlocks);
                        String newSi = lss.getStripe(codec, lb.get(0)).toString();
                        LOG.info("Corrupt the stripe info old : " + oldSi + " new : " + newSi);
                    }
                }
            }

            this.corruptFiles(dirPath, crcs, rsCorruptFileIdx1, dfs, files, rsNumCorruptBlocksInFiles1);
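            // Restart the RaidNode; its block integrity monitor should detect and fix the corrupt files.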
            cnode = RaidNode.createRaidNode(null, localConf);
            long start = System.currentTimeMillis();
            while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 3
                    && cnode.blockIntegrityMonitor.getNumFileFixFailures() < 3
                    && System.currentTimeMillis() - start < 120000) {
                LOG.info("Test testDirBlockFix waiting for files to be fixed.");
                Thread.sleep(1000);
            }
            long totalCorruptBlocks = getTotal(rsNumCorruptBlocksInFiles1);
            if (hasStripeInfo) {
                if (!corruptStripe) {
                    TestBlockFixer.verifyMetrics(fileSys, cnode, local, 3L, totalCorruptBlocks);

                    dfs = getDFS(conf, dfs);
                    for (int i = 0; i < fileSizes.length; i++) {
                        assertTrue("file " + files[i] + " not fixed",
                                TestRaidDfs.validateFile(dfs, files[i], fileSizes[i], crcs[i]));
                    }
                } else {
                    TestBlockFixer.verifyMetrics(fileSys, cnode, local, 0L, 0L);
                    assertTrue("should fail to fix more than 3 files",
                            cnode.blockIntegrityMonitor.getNumFileFixFailures() >= 3L);
                    TestBlockFixer.verifyMetrics(fileSys, cnode, LOGTYPES.OFFLINE_RECONSTRUCTION_FILE,
                            LOGRESULTS.FAILURE, 3L, true);
                    // Will throw stripe mismatch exception for the first blocks of 3 files
                    TestBlockFixer.verifyMetrics(fileSys, cnode,
                            LOGTYPES.OFFLINE_RECONSTRUCTION_STRIPE_VERIFICATION, LOGRESULTS.FAILURE, 3L, true);
                }
            } else {
                TestBlockFixer.verifyMetrics(fileSys, cnode, local, 0L, 0L);
                assertTrue("should fail to fix more than 3 files",
                        cnode.blockIntegrityMonitor.getNumFileFixFailures() >= 3L);
                TestBlockFixer.verifyMetrics(fileSys, cnode, LOGTYPES.OFFLINE_RECONSTRUCTION_GET_STRIPE,
                        LOGRESULTS.FAILURE, totalCorruptBlocks, true);
                TestBlockFixer.verifyMetrics(fileSys, cnode, LOGTYPES.OFFLINE_RECONSTRUCTION_FILE,
                        LOGRESULTS.FAILURE, 3L, true);
            }
        } catch (Exception e) {
            LOG.info("Test testDirBlockFix Exception " + e, e);
            throw e;
        } finally {
            myTearDown();
        }
        LOG.info("Test testDirBlockFix completed.");
    }

    /**
     * Tests integrity of generated block.
     * Create a file and delete a block entirely. Wait for the block to be
     * regenerated. Now stop RaidNode and corrupt the generated block.
     * Test that corruption in the generated block can be detected by clients.
     */
    private void generatedBlockTestCommon(String testName, int blockToCorrupt, boolean local) throws Exception {
        LOG.info("Test " + testName + " started.");
        int stripeLength = 3;
        mySetup(stripeLength);
        long[] crcs = new long[3];
        int[] seeds = new int[3];
        Path dirPath = new Path("/user/dhruba/raidtest");
        Path[] files = TestRaidDfs.createTestFiles(dirPath, fileSizes, blockSizes, crcs, seeds, fileSys, (short) 1);
        Path destPath = new Path("/destraid/user/dhruba");
        LOG.info("Test " + testName + " created test files");
        Configuration localConf = this.getRaidNodeConfig(conf, local);

        try {
            cnode = RaidNode.createRaidNode(null, localConf);
            TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath);
            cnode.stop();
            cnode.join();

            DistributedFileSystem dfs = (DistributedFileSystem) fileSys;
            String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
            assertEquals("no corrupt files expected", 0, corruptFiles.length);
            assertEquals("filesFixed() should return 0 before fixing files", 0,
                    cnode.blockIntegrityMonitor.getNumFilesFixed());

            // Block index (relative to the whole directory) to remove; the
            // last-block tests pass -1.
            Integer[] corruptBlockIdxs = new Integer[] { blockToCorrupt };
            TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath, crcs, corruptBlockIdxs, fileSys,
                    dfsCluster, false, true);

            corruptFiles = DFSUtil.getCorruptFiles(dfs);
            assertEquals("files not corrupted", corruptBlockIdxs.length, corruptFiles.length);
            int corruptFileIdx = -1;
            for (int i = 0; i < files.length; i++) {
                if (files[i].toUri().getPath().equals(corruptFiles[0])) {
                    corruptFileIdx = i;
                    break;
                }
            }
            assertNotSame("Wrong corrupt file", -1, corruptFileIdx);
            cnode = RaidNode.createRaidNode(null, localConf);
            long start = System.currentTimeMillis();
            while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 1
                    && System.currentTimeMillis() - start < 120000) {
                LOG.info("Test testDirBlockFix waiting for files to be fixed.");
                Thread.sleep(1000);
            }
            TestBlockFixer.verifyMetrics(fileSys, cnode, local, 1L, corruptBlockIdxs.length);
            // Stop RaidNode
            cnode.stop();
            cnode.join();
            cnode = null;

            // The block has successfully been reconstructed.
            dfs = getDFS(conf, dfs);
            assertTrue("file not fixed", TestRaidDfs.validateFile(dfs, files[corruptFileIdx],
                    fileSizes[corruptFileIdx], crcs[corruptFileIdx]));

            // Now corrupt the generated block.
            TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath, crcs, corruptBlockIdxs, dfs, dfsCluster,
                    false, false);
            try {
                TestRaidDfs.validateFile(dfs, files[corruptFileIdx], fileSizes[corruptFileIdx],
                        crcs[corruptFileIdx]);
                fail("Expected exception not thrown");
            } catch (org.apache.hadoop.fs.ChecksumException ce) {
                // expected: the client detects the corruption via checksum mismatch
            } catch (org.apache.hadoop.fs.BlockMissingException bme) {
                // expected: the corrupt replica may instead be reported as missing
            }
        } catch (Exception e) {
            LOG.info("Test " + testName + " Exception " + e, e);
            throw e;
        } finally {
            myTearDown();
        }
        LOG.info("Test " + testName + " completed.");
    }

    /**
     * Tests integrity of generated block.
     * Create a file and delete a block entirely. Wait for the block to be
     * regenerated. Now stop RaidNode and corrupt the generated block.
     * Test that corruption in the generated block can be detected by clients.
     */
    @Test
    public void testGeneratedBlockLocal() throws Exception {
        generatedBlockTestCommon("testGeneratedBlock", 2, true);
    }

    /**
     * Tests integrity of generated block.
     * Create a file and delete a block entirely. Wait for the block to be
     * regenerated. Now stop RaidNode and corrupt the generated block.
     * Test that corruption in the generated block can be detected by clients.
     */
    @Test
    public void testGeneratedBlockDist() throws Exception {
        generatedBlockTestCommon("testGeneratedBlock", 2, false);
    }

    /**
     * Tests integrity of generated last block.
     * Create a file and delete a block entirely. Wait for the block to be
     * regenerated. Now stop RaidNode and corrupt the generated block.
     * Test that corruption in the generated block can be detected by clients.
     */
    @Test
    public void testGeneratedLastBlockLocal() throws Exception {
        generatedBlockTestCommon("testGeneratedLastBlock", -1, true);
    }

    /**
     * Tests integrity of generated last block.
     * Create a file and delete a block entirely. Wait for the block to be
     * regenerated. Now stop RaidNode and corrupt the generated block.
     * Test that corruption in the generated block can be detected by clients.
     */
    @Test
    public void testGeneratedLastBlockDist() throws Exception {
        generatedBlockTestCommon("testGeneratedLastBlock", -1, false);
    }

    @Test
    public void testParityBlockFixLocal() throws Exception {
        implParityBlockFix("testParityBlockFixLocal", true);
    }

    @Test
    public void testParityBlockFixDist() throws Exception {
        implParityBlockFix("testParityBlockFixDist", false);
    }

    /**
     * Corrupt a parity file and wait for it to get fixed.
     */
    private void implParityBlockFix(String testName, boolean local) throws Exception {
        LOG.info("Test " + testName + " started.");
        int stripeLength = 3;
        mySetup(stripeLength);
        long[] crcs = new long[3];
        int[] seeds = new int[3];
        Path dirPath = new Path("/user/dhruba/raidtest");
        Path[] files = TestRaidDfs.createTestFiles(dirPath, fileSizes, blockSizes, crcs, seeds, fileSys, (short) 1);
        Path destPath = new Path("/destraid/user/dhruba");
        Path parityFile = new Path("/destraid/user/dhruba/raidtest");
        LOG.info("Test " + testName + " created test files");
        Configuration localConf = this.getRaidNodeConfig(conf, local);

        try {
            cnode = RaidNode.createRaidNode(null, localConf);
            TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath, destPath);
            cnode.stop();
            cnode.join();

            // Record the parity file's CRC so the fix can be verified later.
            long parityCRC = RaidDFSUtil.getCRC(fileSys, parityFile);

            FileStatus parityStat = fileSys.getFileStatus(parityFile);
            DistributedFileSystem dfs = (DistributedFileSystem) fileSys;
            LocatedBlocks locs = RaidDFSUtil.getBlockLocations(dfs, parityFile.toUri().getPath(), 0,
                    parityStat.getLen());
            String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);

            assertEquals("no corrupt files expected", 0, corruptFiles.length);
            assertEquals("filesFixed() should return 0 before fixing files", 0,
                    cnode.blockIntegrityMonitor.getNumFilesFixed());

            // Corrupt parity blocks for different stripes.
            int[] corruptBlockIdxs = new int[] { 0, 1, 2 };
            for (int idx : corruptBlockIdxs)
                corruptBlock(locs.get(idx).getBlock(), dfsCluster);
            RaidDFSUtil.reportCorruptBlocks(dfs, parityFile, corruptBlockIdxs, 2 * blockSize);

            corruptFiles = DFSUtil.getCorruptFiles(dfs);
            assertEquals("file not corrupted", 1, corruptFiles.length);
            assertEquals("wrong file corrupted", corruptFiles[0], parityFile.toUri().getPath());

            cnode = RaidNode.createRaidNode(null, localConf);
            long start = System.currentTimeMillis();
            while (cnode.blockIntegrityMonitor.getNumFilesFixed() < 1
                    && System.currentTimeMillis() - start < 120000) {
                LOG.info("Test " + testName + " waiting for files to be fixed.");
                Thread.sleep(3000);
            }
            TestBlockFixer.verifyMetrics(fileSys, cnode, local, 1L, corruptBlockIdxs.length);

            long checkCRC = RaidDFSUtil.getCRC(fileSys, parityFile);

            assertEquals("file not fixed", parityCRC, checkCRC);

        } catch (Exception e) {
            LOG.info("Test " + testName + " Exception " + e + StringUtils.stringifyException(e));
            throw e;
        } finally {
            myTearDown();
        }
        LOG.info("Test " + testName + " completed.");
    }

    /** Corrupts the given block indexes under dirPath and verifies the corrupt file and block counts. */
    private void corruptFiles(Path dirPath, long[] crcs, Integer[] corruptBlockIdxs, DistributedFileSystem dfs,
            Path[] files, int[] numCorruptBlocksInFiles) throws IOException {
        int totalCorruptFiles = DFSUtil.getCorruptFiles(dfs).length;
        TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath, crcs, corruptBlockIdxs, fileSys, dfsCluster,
                false, true);

        String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
        for (int i = 0; i < numCorruptBlocksInFiles.length; i++) {
            if (numCorruptBlocksInFiles[i] > 0)
                totalCorruptFiles++;
        }
        assertEquals("files not corrupted", totalCorruptFiles, corruptFiles.length);
        for (int i = 0; i < fileSizes.length; i++) {
            assertEquals("wrong number of corrupt blocks for file " + files[i], numCorruptBlocksInFiles[i],
                    RaidDFSUtil.corruptBlocksInFile(dfs, files[i].toUri().getPath(), 0, fileSizes[i]).size());
        }
    }

    /**
     * Tests that multiple concurrent jobs can fix files
     * (distributed block fixer).
     */
    @Test
    public void testConcurrentJobs() throws Exception {
        LOG.info("Test testConcurrentJobs started.");
        int stripeLength = 3;
        mySetup(stripeLength);
        long[] crcs1 = new long[3];
        int[] seeds1 = new int[3];
        long[] crcs2 = new long[3];
        int[] seeds2 = new int[3];
        Path dirPath1 = new Path("/user/dhruba/raidtestrs/1");
        Path[] files1 = TestRaidDfs.createTestFiles(dirPath1, fileSizes, blockSizes, crcs1, seeds1, fileSys,
                (short) 1);
        Path dirPath2 = new Path("/user/dhruba/raidtestrs/2");
        Path[] files2 = TestRaidDfs.createTestFiles(dirPath2, fileSizes, blockSizes, crcs2, seeds2, fileSys,
                (short) 1);
        Path destPath = new Path("/destraidrs/user/dhruba/raidtestrs");

        LOG.info("Test testConcurrentJobs created test files");
        Configuration localConf = this.getRaidNodeConfig(conf, false);
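        // Check for corrupt blocks and submit fix jobs every 15 seconds, and
        // push the periodic scan out to an hour so it stays out of the way.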
        localConf.setLong(BlockIntegrityMonitor.BLOCKCHECK_INTERVAL, 15000L);
        localConf.setLong(DistBlockIntegrityMonitor.RAIDNODE_BLOCK_FIX_SUBMISSION_INTERVAL_KEY, 15000L);
        localConf.setLong(DistBlockIntegrityMonitor.RAIDNODE_BLOCK_FIX_SCAN_SUBMISSION_INTERVAL_KEY, 3600000);

        try {
            cnode = RaidNode.createRaidNode(null, localConf);
            TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath1, destPath);
            TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath2, destPath);
            cnode.stop();
            cnode.join();

            DistributedFileSystem dfs = (DistributedFileSystem) fileSys;
            String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
            assertEquals("no corrupt files expected", 0, corruptFiles.length);
            assertEquals("filesFixed() should return 0 before fixing files", 0,
                    cnode.blockIntegrityMonitor.getNumFilesFixed());
            //corrupt directory 1
            this.corruptFiles(dirPath1, crcs1, rsCorruptFileIdx1, dfs, files1, rsNumCorruptBlocksInFiles1);

            cnode = RaidNode.createRaidNode(null, localConf);
            DistBlockIntegrityMonitor blockFixer = (DistBlockIntegrityMonitor) cnode.blockIntegrityMonitor;
            long start = System.currentTimeMillis();

            // All files are HIGH-PRI corrupt files
            while (blockFixer.jobsRunning() < 1 && System.currentTimeMillis() - start < 60000) {
                LOG.info("Test testDirBlockFix waiting for fixing job 1 to start");
                Thread.sleep(1000);
            }
            assertEquals("job 1 not running", 1, blockFixer.jobsRunning());

            // Corrupt directory 2
            this.corruptFiles(dirPath2, crcs2, rsCorruptFileIdx2, dfs, files2, rsNumCorruptBlocksInFiles2);

            // 1 LOW-PRI file and 2 HIGH-PRI files 
            while (blockFixer.jobsRunning() < 3 && System.currentTimeMillis() - start < 60000) {
                LOG.info("Test testDirBlockFix waiting for fixing job 2 and 3 to start");
                Thread.sleep(1000);
            }
            assertTrue("more than 3 jobs are running", blockFixer.jobsRunning() >= 3);

            while (blockFixer.getNumFilesFixed() < 6 && System.currentTimeMillis() - start < 240000) {
                LOG.info("Test testDirBlockFix waiting for files to be fixed.");
                Thread.sleep(1000);
            }
            TestBlockFixer.verifyMetrics(fileSys, cnode, false, 6L,
                    getTotal(rsNumCorruptBlocksInFiles1) + getTotal(rsNumCorruptBlocksInFiles2));
            dfs = getDFS(conf, dfs);
            for (int i = 0; i < fileSizes.length; i++) {
                assertTrue("file " + files1[i] + " not fixed",
                        TestRaidDfs.validateFile(dfs, files1[i], fileSizes[i], crcs1[i]));
            }
            for (int i = 0; i < fileSizes.length; i++) {
                assertTrue("file " + files2[i] + " not fixed",
                        TestRaidDfs.validateFile(dfs, files2[i], fileSizes[i], crcs2[i]));
            }
        } catch (Exception e) {
            LOG.info("Test testConcurrentJobs exception " + e, e);
            throw e;
        } finally {
            myTearDown();
        }
    }

    /**
     * tests that the distributed block fixer obeys
     * the limit on how many jobs to submit simultaneously.
     */
    @Test
    public void testMaxPendingJobs() throws Exception {
        LOG.info("Test testMaxPendingJobs started.");
        int stripeLength = 3;
        mySetup(stripeLength);
        long[] crcs1 = new long[3];
        int[] seeds1 = new int[3];
        long[] crcs2 = new long[3];
        int[] seeds2 = new int[3];
        Path dirPath1 = new Path("/user/dhruba/raidtestrs/1");
        Path[] files1 = TestRaidDfs.createTestFiles(dirPath1, fileSizes, blockSizes, crcs1, seeds1, fileSys,
                (short) 1);
        Path dirPath2 = new Path("/user/dhruba/raidtestrs/2");
        Path[] files2 = TestRaidDfs.createTestFiles(dirPath2, fileSizes, blockSizes, crcs2, seeds2, fileSys,
                (short) 1);
        Path destPath = new Path("/destraidrs/user/dhruba/raidtestrs");
        LOG.info("Test testMaxPendingJobs created test files");
        Configuration localConf = this.getRaidNodeConfig(conf, false);
        localConf.setLong("raid.blockfix.maxpendingjobs", 1L);

        try {
            cnode = RaidNode.createRaidNode(null, localConf);
            TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath1, destPath);
            TestRaidDfs.waitForDirRaided(LOG, fileSys, dirPath2, destPath);
            cnode.stop();
            cnode.join();

            DistributedFileSystem dfs = (DistributedFileSystem) fileSys;
            String[] corruptFiles = DFSUtil.getCorruptFiles(dfs);
            assertEquals("no corrupt files expected", 0, corruptFiles.length);
            assertEquals("filesFixed() should return 0 before fixing files", 0,
                    cnode.blockIntegrityMonitor.getNumFilesFixed());
            //corrupt directory 1
            this.corruptFiles(dirPath1, crcs1, rsCorruptFileIdx1, dfs, files1, rsNumCorruptBlocksInFiles1);

            cnode = RaidNode.createRaidNode(null, localConf);
            DistBlockIntegrityMonitor blockFixer = (DistBlockIntegrityMonitor) cnode.blockIntegrityMonitor;
            long start = System.currentTimeMillis();

            while (blockFixer.jobsRunning() < 1 && System.currentTimeMillis() - start < 60000) {
                LOG.info("Test testDirBlockFix waiting for fixing job 1 to start");
                Thread.sleep(1000);
            }
            assertEquals("job not running", 1, blockFixer.jobsRunning());

            //corrupt directory 2
            this.corruptFiles(dirPath2, crcs2, rsCorruptFileIdx2, dfs, files2, rsNumCorruptBlocksInFiles2);

            // wait until all six files are fixed
            while (blockFixer.getNumFilesFixed() < 6 && System.currentTimeMillis() - start < 240000) {
                // make sure the block fixer does not start a second job while
                // the first one is still running
                assertTrue("too many jobs running", blockFixer.jobsRunning() <= 1);
                Thread.sleep(1000);
            }
            TestBlockFixer.verifyMetrics(fileSys, cnode, false, 6L,
                    getTotal(rsNumCorruptBlocksInFiles1) + getTotal(rsNumCorruptBlocksInFiles2));
            dfs = getDFS(conf, dfs);
            for (int i = 0; i < fileSizes.length; i++) {
                assertTrue("file " + files1[i] + " not fixed",
                        TestRaidDfs.validateFile(dfs, files1[i], fileSizes[i], crcs1[i]));
            }
            for (int i = 0; i < fileSizes.length; i++) {
                assertTrue("file " + files2[i] + " not fixed",
                        TestRaidDfs.validateFile(dfs, files2[i], fileSizes[i], crcs2[i]));
            }
        } catch (Exception e) {
            LOG.info("Test testMaxPendingJobs exception " + e + StringUtils.stringifyException(e));
            throw e;
        } finally {
            myTearDown();
        }
    }

    /** Records submitted jobs instead of launching them, so tests can count submissions. */
    static class FakeDistBlockIntegrityMonitor extends DistBlockIntegrityMonitor {
        Map<String, List<String>> submittedJobs = new HashMap<String, List<String>>();

        FakeDistBlockIntegrityMonitor(Configuration conf) throws Exception {
            super(conf);
        }

        @Override
        void submitJob(Job job, List<String> filesInJob, Priority priority, Map<Job, List<LostFileInfo>> jobIndex,
                Map<JobID, TrackingUrlInfo> idToTrackingUrlMap) {
            LOG.info("Job " + job.getJobName() + " was submitted ");
            submittedJobs.put(job.getJobName(), filesInJob);
        }
    }

    /**
     * Tests that corrupting additional blocks triggers additional job
     * submissions, using the fake monitor above to count submitted jobs.
     */
    @Test
    public void testMultiplePriorities() throws Exception {
        long[] crcs = new long[3];
        int[] seeds = new int[3];
        Path dirPath = new Path("/home/test");
        int stripeLength = 3;
        short repl = 1;
        mySetup(stripeLength);
        Codec codec = Codec.getCodec("rs");
        LOG.info("Starting testMultiplePriorities");
        try {
            // Create test file and raid it.
            Path[] files = TestRaidDfs.createTestFiles(dirPath, fileSizes, blockSizes, crcs, seeds, fileSys,
                    (short) 1);
            FileStatus stat = fileSys.getFileStatus(dirPath);
            RaidNode.doRaid(conf, stat, new Path(codec.parityDirectory), codec, new RaidNode.Statistics(),
                    RaidUtils.NULL_PROGRESSABLE, false, repl, repl);

            Integer[] corruptBlockIdxs = new Integer[] { 0, 2 };
            LOG.info("Corrupt block " + corruptBlockIdxs + " of directory " + dirPath);
            TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath, crcs, corruptBlockIdxs, fileSys,
                    dfsCluster, false, true);

            // Create Block Fixer and fix.
            FakeDistBlockIntegrityMonitor distBlockFixer = new FakeDistBlockIntegrityMonitor(conf);
            assertEquals(0, distBlockFixer.submittedJobs.size());
            // wait for at least one job to be submitted
            long startTime = System.currentTimeMillis();
            while (System.currentTimeMillis() - startTime < 120000 && distBlockFixer.submittedJobs.size() == 0) {
                distBlockFixer.getCorruptionMonitor().checkAndReconstructBlocks();
                LOG.info("Waiting for jobs to submit");
                Thread.sleep(10000);
            }
            int submittedJob = distBlockFixer.submittedJobs.size();
            LOG.info("Already Submitted " + submittedJob + " jobs");
            assertTrue("Should submit more than 1 jobs", submittedJob >= 1);

            // Corrupt two more blocks
            corruptBlockIdxs = new Integer[] { 4, 5 };
            LOG.info("Corrupt block " + corruptBlockIdxs + " of directory " + dirPath);
            TestDirectoryRaidDfs.corruptBlocksInDirectory(conf, dirPath, crcs, corruptBlockIdxs, fileSys,
                    dfsCluster, false, true);

            // A new job should be submitted since two blocks are corrupt.
            startTime = System.currentTimeMillis();
            while (System.currentTimeMillis() - startTime < 120000
                    && distBlockFixer.submittedJobs.size() == submittedJob) {
                distBlockFixer.getCorruptionMonitor().checkAndReconstructBlocks();
                LOG.info("Waiting for more jobs to submit");
                Thread.sleep(10000);
            }
            LOG.info("Already Submitted " + distBlockFixer.submittedJobs.size() + " jobs");
            assertTrue("should submit more than 1 jobs", distBlockFixer.submittedJobs.size() - submittedJob >= 1);
        } finally {
            myTearDown();
        }
    }

    /** Returns a fresh DistributedFileSystem client, bypassing the FileSystem cache. */
    public static DistributedFileSystem getDFS(Configuration conf, FileSystem dfs) throws IOException {
        Configuration clientConf = new Configuration(conf);
        clientConf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        clientConf.setBoolean("fs.hdfs.impl.disable.cache", true);
        URI dfsUri = dfs.getUri();
        FileSystem.closeAll();
        return (DistributedFileSystem) FileSystem.get(dfsUri, clientConf);
    }

    private void mySetup(int stripeLength) throws Exception {
        if (System.getProperty("hadoop.log.dir") == null) {
            String base = new File(".").getAbsolutePath();
            System.setProperty("hadoop.log.dir", new Path(base).toString() + "/logs");
        }

        new File(TEST_DIR).mkdirs(); // Make sure data directory exists
        conf = new Configuration();

        conf.set("raid.config.file", CONFIG_FILE);
        conf.setBoolean("raid.config.reload", true);
        conf.setLong("raid.config.reload.interval", RELOAD_INTERVAL);

        // scan all policies once every 5 seconds
        conf.setLong("raid.policy.rescan.interval", 5000);

        // do not use map-reduce cluster for Raiding
        conf.set("raid.classname", "org.apache.hadoop.raid.LocalRaidNode");
        conf.set("raid.server.address", "localhost:" + MiniDFSCluster.getFreePort());
        conf.set("mapred.raid.http.address", "localhost:0");

        Utils.loadTestCodecs(conf, stripeLength, stripeLength, 1, 3, "/destraid", "/destraidrs", false, true);

        conf.setBoolean("dfs.permissions", false);
        // Make sure initial repl is smaller than NUM_DATANODES
        conf.setInt(RaidNode.RAID_PARITY_INITIAL_REPL_KEY, 1);

        dfsCluster = new MiniDFSCluster(conf, NUM_DATANODES, true, null);
        dfsCluster.waitActive();
        fileSys = dfsCluster.getFileSystem();
        namenode = fileSys.getUri().toString();

        FileSystem.setDefaultUri(conf, namenode);
        mr = new MiniMRCluster(4, namenode, 3);
        jobTrackerName = "localhost:" + mr.getJobTrackerPort();
        hftp = "hftp://localhost.localdomain:" + dfsCluster.getNameNodePort();

        FileSystem.setDefaultUri(conf, namenode);
        conf.set("mapred.job.tracker", jobTrackerName);
        conf.set(RaidNode.RAID_CHECKSUM_STORE_CLASS_KEY, "org.apache.hadoop.raid.LocalChecksumStore");
        conf.setBoolean(RaidNode.RAID_CHECKSUM_STORE_REQUIRED_KEY, true);
        conf.set(LocalChecksumStore.LOCAL_CHECK_STORE_DIR_KEY, CHECKSUM_STORE_DIR);
        conf.set(RaidNode.RAID_STRIPE_STORE_CLASS_KEY, "org.apache.hadoop.raid.LocalStripeStore");
        conf.set(LocalStripeStore.LOCAL_STRIPE_STORE_DIR_KEY, STRIPE_STORE_DIR);
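        // Define RAID policies for the two test directories; the second uses the rs codec.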
        ConfigBuilder cb = new ConfigBuilder(CONFIG_FILE);
        cb.addPolicy("RaidTest1", "/user/dhruba/raidtest", 1, 1);
        cb.addPolicy("RaidTest2", "/user/dhruba/raidtestrs", 1, 1, "rs");
        cb.persist();
    }

    private void myTearDown() throws Exception {
        if (cnode != null) {
            cnode.stop();
            cnode.join();
        }
        if (mr != null) {
            mr.shutdown();
        }
        if (dfsCluster != null) {
            dfsCluster.shutdown();
        }
    }

    /** Attempts to corrupt every replica of the given block; fails the test if none was corrupted. */
    static void corruptBlock(Block block, MiniDFSCluster dfs) throws IOException {
        boolean corrupted = false;
        for (int i = 0; i < NUM_DATANODES; i++) {
            corrupted |= TestDatanodeBlockScanner.corruptReplica(block, i, dfs);
        }
        assertTrue("could not corrupt block", corrupted);
    }
}