org.apache.hadoop.tools.mapred.filechunk.TestCopyChunkMapper.java Source code

Introduction

Here is the source code for org.apache.hadoop.tools.mapred.filechunk.TestCopyChunkMapper.java.
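
This class is a JUnit test for CopyChunkMapper, the DistCp mapper that copies files in fixed-size chunks rather than whole files. It starts a single-DataNode MiniDFSCluster with a 4 KB block size, builds a source tree under /tmp/source (seven directories plus a 32 KB and a 321 KB file, yielding 8 and 81 chunks respectively), constructs a chunked copy listing with SimpleCopyListing, feeds every listed entry through the mapper, and asserts the COPY and BYTESCOPIED counter values. The option strings parsed in testCopy ("-pb", "-bychunk") correspond to the command-line switches such a copy would use, assuming a DistCp build that carries this chunk-copy patch.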

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools.mapred.filechunk;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.Options.ChecksumOpt;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.tools.*;
import org.apache.hadoop.tools.DistCp.Counter;
import org.apache.hadoop.tools.mapred.CopyCommitter;
import org.apache.hadoop.tools.mapred.CopyMapper;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.util.DataChecksum;
import org.apache.log4j.Level;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Random;

public class TestCopyChunkMapper {
    private static final Log LOG = LogFactory.getLog(TestCopyChunkMapper.class);
    private static List<Path> pathList = new ArrayList<Path>();
    private static int nFiles = 0;
    private static final int DEFAULT_FILE_SIZE = 1024;
    private static final long MID_FILE_SIZE = 32 * 1024; //8 chunks
    private static final long LARGE_FILE_SIZE = 321 * 1024; //81 chunks
    private static final long NON_DEFAULT_BLOCK_SIZE = 4 * 1024;

    private static MiniDFSCluster cluster;

    private static final String SOURCE_PATH = "/tmp/source";
    private static final String TARGET_PATH = "/tmp/target";

    private static Configuration configuration;

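    /**
     * Starts a single-DataNode MiniDFSCluster shared by all tests in this class.
     */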
    @BeforeClass
    public static void setup() throws Exception {
        configuration = getConfigurationForCluster();
        cluster = new MiniDFSCluster.Builder(configuration).numDataNodes(1).format(true).build();
        // TODO: debug-level root logging is only for troubleshooting this unit test; remove once the test is stable
        org.apache.log4j.LogManager.getRootLogger().setLevel(Level.DEBUG);
    }

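    /**
     * Builds the base cluster configuration: a 4 KB block size (so even small
     * files span multiple blocks/chunks) and no minimum block-size limit.
     */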
    private static Configuration getConfigurationForCluster() throws IOException {
        Configuration configuration = new Configuration();
        System.setProperty("test.build.data", "target/tmp/build/TEST_COPY_CHUNK_MAPPER/data");
        configuration.set("hadoop.log.dir", "target/tmp");
        configuration.set("dfs.namenode.fs-limits.min-block-size", "0");
        configuration.set("dfs.blocksize", NON_DEFAULT_BLOCK_SIZE + "");
        LOG.debug("fs.default.name  == " + configuration.get("fs.default.name"));
        LOG.debug("dfs.http.address == " + configuration.get("dfs.http.address"));
        return configuration;
    }

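    /**
     * Builds the DistCp job configuration used by the mapper context: target
     * work/final paths on the mini cluster, sync-folders mode, "br"
     * (block-size and replication) preservation, and chunked copying enabled
     * via CONF_LABEL_COPY_BY_CHUNK.
     */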
    private static Configuration getConfiguration() throws IOException {
        Configuration configuration = getConfigurationForCluster();
        final FileSystem fs = cluster.getFileSystem();
        Path workPath = new Path(TARGET_PATH).makeQualified(fs.getUri(), fs.getWorkingDirectory());
        configuration.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, workPath.toString());
        configuration.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, workPath.toString());
        configuration.setBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), false);
        configuration.setBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), false);
        configuration.setBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), true);
        configuration.set(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel(), "br");
        configuration.setBoolean(DistCpConstants.CONF_LABEL_COPY_BY_CHUNK, true);
        return configuration;
    }

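    /**
     * Creates the source tree: seven directories plus a 32 KB file (/5/6) and
     * a 321 KB file (/7/8/9). The trailing comments count the copy-listing
     * entries each call contributes.
     */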
    private static void createSourceData() throws Exception {
        mkdirs(SOURCE_PATH + "/1"); //1
        mkdirs(SOURCE_PATH + "/2"); //1
        mkdirs(SOURCE_PATH + "/2/3/4"); //2
        mkdirs(SOURCE_PATH + "/2/3"); //0
        mkdirs(SOURCE_PATH + "/5"); //1
        touchFile(SOURCE_PATH + "/5/6", MID_FILE_SIZE, true, new ChecksumOpt(DataChecksum.Type.CRC32, 512)); //8
        mkdirs(SOURCE_PATH + "/7"); //1
        mkdirs(SOURCE_PATH + "/7/8"); //1
        touchFile(SOURCE_PATH + "/7/8/9", LARGE_FILE_SIZE, true, new ChecksumOpt(DataChecksum.Type.CRC32, 512)); //81
        // 96 listing entries in total (89 file chunks + 7 directories)
    }

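    /** Appends DEFAULT_FILE_SIZE * 2 (2048) random bytes to every file recorded in pathList. */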
    private static void appendSourceData() throws Exception {
        FileSystem fs = cluster.getFileSystem();
        for (Path source : pathList) {
            if (fs.getFileStatus(source).isFile()) {
                // append 2048 bytes per file
                appendFile(source, DEFAULT_FILE_SIZE * 2);
            }
        }
    }

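    /**
     * Like createSourceData, but /5/6 is written with the non-default 4 KB
     * block size while /7/8/9 uses twice the filesystem default, so the two
     * files end up with different block sizes.
     */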
    private static void createSourceDataWithDifferentBlockSize() throws Exception {
        mkdirs(SOURCE_PATH + "/1");
        mkdirs(SOURCE_PATH + "/2");
        mkdirs(SOURCE_PATH + "/2/3/4");
        mkdirs(SOURCE_PATH + "/2/3");
        mkdirs(SOURCE_PATH + "/5");
        touchFile(SOURCE_PATH + "/5/6", true, null);
        mkdirs(SOURCE_PATH + "/7");
        mkdirs(SOURCE_PATH + "/7/8");
        touchFile(SOURCE_PATH + "/7/8/9");
    }

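    /**
     * Like createSourceData, but /5/6 is written with CRC32 checksums and
     * /7/8/9 with CRC32C, giving the two files different checksum types.
     */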
    private static void createSourceDataWithDifferentChecksumType() throws Exception {
        mkdirs(SOURCE_PATH + "/1");
        mkdirs(SOURCE_PATH + "/2");
        mkdirs(SOURCE_PATH + "/2/3/4");
        mkdirs(SOURCE_PATH + "/2/3");
        mkdirs(SOURCE_PATH + "/5");
        touchFile(SOURCE_PATH + "/5/6", new ChecksumOpt(DataChecksum.Type.CRC32, 512));
        mkdirs(SOURCE_PATH + "/7");
        mkdirs(SOURCE_PATH + "/7/8");
        touchFile(SOURCE_PATH + "/7/8/9", new ChecksumOpt(DataChecksum.Type.CRC32C, 512));
    }

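    /** Creates a directory on the mini cluster and records its qualified path in pathList. */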
    private static void mkdirs(String path) throws Exception {
        FileSystem fileSystem = cluster.getFileSystem();
        final Path qualifiedPath = new Path(path).makeQualified(fileSystem.getUri(),
                fileSystem.getWorkingDirectory());
        pathList.add(qualifiedPath);
        fileSystem.mkdirs(qualifiedPath);
    }

    private static void touchFile(String path) throws Exception {
        touchFile(path, false, null);
    }

    private static void touchFile(String path, ChecksumOpt checksumOpt) throws Exception {
        // create files with specific checksum opt and non-default block size
        touchFile(path, true, checksumOpt);
    }

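    /**
     * Creates a file of DEFAULT_FILE_SIZE bytes or, when createMultipleBlocks
     * is set, keeps writing until the file spans at least two of the
     * non-default-sized blocks. Doubles the default replication and records
     * the path in pathList.
     */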
    private static void touchFile(String path, boolean createMultipleBlocks, ChecksumOpt checksumOpt)
            throws Exception {
        FileSystem fs;
        DataOutputStream outputStream = null;
        try {
            fs = cluster.getFileSystem();
            final Path qualifiedPath = new Path(path).makeQualified(fs.getUri(), fs.getWorkingDirectory());
            final long blockSize = createMultipleBlocks ? NON_DEFAULT_BLOCK_SIZE
                    : fs.getDefaultBlockSize(qualifiedPath) * 2;
            FsPermission permission = FsPermission.getFileDefault().applyUMask(FsPermission.getUMask(fs.getConf()));
            outputStream = fs.create(qualifiedPath, permission, EnumSet.of(CreateFlag.CREATE, CreateFlag.OVERWRITE),
                    0, (short) (fs.getDefaultReplication(qualifiedPath) * 2), blockSize, null, checksumOpt);
            byte[] bytes = new byte[DEFAULT_FILE_SIZE];
            outputStream.write(bytes);
            long fileSize = DEFAULT_FILE_SIZE;
            if (createMultipleBlocks) {
                while (fileSize < 2 * blockSize) {
                    outputStream.write(bytes);
                    outputStream.flush();
                    fileSize += DEFAULT_FILE_SIZE;
                }
            }
            pathList.add(qualifiedPath);
            ++nFiles;

            FileStatus fileStatus = fs.getFileStatus(qualifiedPath);
            LOG.debug("block size = " + fileStatus.getBlockSize());
            LOG.debug("replication = " + fileStatus.getReplication());
        } finally {
            IOUtils.cleanup(null, outputStream);
        }
    }

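    /**
     * Creates a file of exactly totalFileSize bytes, written in
     * DEFAULT_FILE_SIZE buffers with a shortened final write; preserveBlockSize
     * selects the non-default 4 KB block size.
     */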
    private static void touchFile(String path, long totalFileSize, boolean preserveBlockSize,
            ChecksumOpt checksumOpt) throws Exception {
        FileSystem fs;
        DataOutputStream outputStream = null;
        try {
            fs = cluster.getFileSystem();
            final Path qualifiedPath = new Path(path).makeQualified(fs.getUri(), fs.getWorkingDirectory());
            final long blockSize = preserveBlockSize ? NON_DEFAULT_BLOCK_SIZE
                    : fs.getDefaultBlockSize(qualifiedPath) * 2;
            FsPermission permission = FsPermission.getFileDefault().applyUMask(FsPermission.getUMask(fs.getConf()));
            outputStream = fs.create(qualifiedPath, permission, EnumSet.of(CreateFlag.CREATE, CreateFlag.OVERWRITE),
                    0, (short) (fs.getDefaultReplication(qualifiedPath) * 2), blockSize, null, checksumOpt);
            byte[] bytes = new byte[DEFAULT_FILE_SIZE];
            long curFileSize = 0;
            int bufferLen = DEFAULT_FILE_SIZE;
            while (curFileSize < totalFileSize) {
                if (totalFileSize - curFileSize < DEFAULT_FILE_SIZE) {
                    bufferLen = (int) (totalFileSize - curFileSize);
                }
                outputStream.write(bytes, 0, bufferLen);
                outputStream.flush();
                curFileSize += bufferLen;
            }
            pathList.add(qualifiedPath);
            ++nFiles;

            FileStatus fileStatus = fs.getFileStatus(qualifiedPath);
            LOG.debug("block size = " + fileStatus.getBlockSize());
            LOG.debug("replication = " + fileStatus.getReplication());
        } finally {
            IOUtils.cleanup(null, outputStream);
        }
    }

    /**
     * Append specified length of bytes to a given file
     */
    private static void appendFile(Path p, int length) throws IOException {
        byte[] toAppend = new byte[length];
        Random random = new Random();
        random.nextBytes(toAppend);
        FSDataOutputStream out = cluster.getFileSystem().append(p);
        try {
            out.write(toAppend);
        } finally {
            IOUtils.closeStream(out);
        }
    }

    // TODO: checksum preservation is not yet supported for chunked copies; annotate with @Test when it is
    public void testCopyWithDifferentChecksumType() throws Exception {
        testCopy(true);
    }

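    /** Standard chunked-copy run, without checksum-type preservation. */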
    @Test(timeout = 40000)
    public void testRun() throws Exception {
        testCopy(false);
    }

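    /**
     * End-to-end chunked-copy check: builds the source tree, constructs a
     * chunked listing with SimpleCopyListing, maps every listed entry through
     * CopyChunkMapper, and asserts the COPY and BYTESCOPIED counter values.
     */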
    public void testCopy(boolean preserveChecksum) throws Exception {
        deleteState();
        if (preserveChecksum) {
            createSourceDataWithDifferentChecksumType();
        } else {
            createSourceData();
        }

        FileSystem fs = cluster.getFileSystem();
        CopyChunkMapper copyChunkMapper = new CopyChunkMapper();
        StubContext stubContext = new StubContext(getConfiguration(), null, 0);
        Mapper<Text, CopyListingChunkFileStatus, Text, Text>.Context context = stubContext.getContext();

        Configuration configuration = context.getConfiguration();
        EnumSet<DistCpOptions.FileAttribute> fileAttributes = EnumSet.of(DistCpOptions.FileAttribute.REPLICATION);
        if (preserveChecksum) {
            fileAttributes.add(DistCpOptions.FileAttribute.CHECKSUMTYPE);
        }
        configuration.set(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel(),
                DistCpUtils.packAttributes(fileAttributes));
        mkdirs(TARGET_PATH);
        copyChunkMapper.setup(context);

        // TODO: keep these parsed options consistent with the configuration built above
        SimpleCopyListing listing = new SimpleCopyListing(configuration);
        DistCpOptions options = OptionsParser.parse(
                new String[] { "-pb", "-bychunk", fs.getFileStatus(new Path(SOURCE_PATH)).getPath().toString(),
                        fs.getFileStatus(new Path(TARGET_PATH)).getPath().toString() });
        options.setTargetPathExists(true);
        Assert.assertTrue(options.getByChunk());
        Path listingFile = new Path("/tmp/chunkSeqList");
        listing.buildListing(listingFile, options);
        long numChunks = listing.getNumberOfPaths();
        Assert.assertEquals(81 + 8 + 7 + 1, numChunks);
        SequenceFile.Reader reader = new SequenceFile.Reader(configuration, SequenceFile.Reader.file(listingFile));
        if (options.getByChunk()) {
            CopyListingChunkFileStatus fileStatus = new CopyListingChunkFileStatus();
            Text relativePath = new Text();
            while (reader.next(relativePath, fileStatus)) {
                copyChunkMapper.map(relativePath, fileStatus, context);
            }
        }
        // TODO: verify every chunk

        // TODO: stitch the chunks together, then check that the maps worked:
        // verifyCopy(fs, preserveChecksum);
        Assert.assertEquals(numChunks, stubContext.getReporter().getCounter(Counter.COPY).getValue());
        Assert.assertEquals(MID_FILE_SIZE + LARGE_FILE_SIZE,
                stubContext.getReporter().getCounter(Counter.BYTESCOPIED).getValue());

        //    if (!preserveChecksum) {
        //      Assert.assertEquals(nFiles * DEFAULT_FILE_SIZE, stubContext
        //          .getReporter().getCounter(Counter.BYTESCOPIED)
        //          .getValue());
        //    } else {
        //      Assert.assertEquals(nFiles * NON_DEFAULT_BLOCK_SIZE * 2, stubContext
        //          .getReporter().getCounter(Counter.BYTESCOPIED)
        //          .getValue());
        //    }

        // TODO: test that already-copied files are skipped:
        //    testCopyingExistingFiles(fs, copyChunkMapper, context);
        //    for (Text value : stubContext.getWriter().values()) {
        //      Assert.assertTrue(value.toString() + " is not skipped", value
        //          .toString().startsWith("SKIP:"));
        //    }
    }

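    /**
     * Verifies that each source path exists at the target with the same type,
     * replication, and checksum (and block size when preserved). Currently
     * only referenced from the commented-out call in testCopy.
     */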
    private void verifyCopy(FileSystem fs, boolean preserveChecksum) throws Exception {
        for (Path path : pathList) {
            // String.replace() does a literal substitution; replaceAll() would treat SOURCE_PATH as a regex
            final Path targetPath = new Path(path.toString().replace(SOURCE_PATH, TARGET_PATH));
            Assert.assertTrue(fs.exists(targetPath));
            Assert.assertTrue(fs.isFile(targetPath) == fs.isFile(path));
            FileStatus sourceStatus = fs.getFileStatus(path);
            FileStatus targetStatus = fs.getFileStatus(targetPath);
            Assert.assertEquals(sourceStatus.getReplication(), targetStatus.getReplication());
            if (preserveChecksum) {
                Assert.assertEquals(sourceStatus.getBlockSize(), targetStatus.getBlockSize());
            }
            Assert.assertTrue(
                    !fs.isFile(targetPath) || fs.getFileChecksum(targetPath).equals(fs.getFileChecksum(path)));
        }
    }

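    /**
     * Deletes each source file just before mapping it so that every map
     * attempt fails, then asserts that failures are either tolerated
     * (ignoreFailures) or surface as an exception. Uses the plain CopyMapper;
     * no @Test here invokes it yet.
     */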
    private void doTestIgnoreFailures(boolean ignoreFailures) {
        try {
            deleteState();
            createSourceData();

            FileSystem fs = cluster.getFileSystem();
            CopyMapper copyMapper = new CopyMapper();
            StubContext stubContext = new StubContext(getConfiguration(), null, 0);
            Mapper<Text, CopyListingFileStatus, Text, Text>.Context context = stubContext.getContext();

            Configuration configuration = context.getConfiguration();
            configuration.setBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), ignoreFailures);
            configuration.setBoolean(DistCpOptionSwitch.OVERWRITE.getConfigLabel(), true);
            configuration.setBoolean(DistCpOptionSwitch.SKIP_CRC.getConfigLabel(), true);
            copyMapper.setup(context);

            for (Path path : pathList) {
                final FileStatus fileStatus = fs.getFileStatus(path);
                if (!fileStatus.isDirectory()) {
                    fs.delete(path, true);
                    copyMapper.map(new Text(DistCpUtils.getRelativePath(new Path(SOURCE_PATH), path)),
                            new CopyListingFileStatus(fileStatus), context);
                }
            }
            if (ignoreFailures) {
                for (Text value : stubContext.getWriter().values()) {
                    Assert.assertTrue(value.toString() + " is not skipped", value.toString().startsWith("FAIL:"));
                }
            }
            Assert.assertTrue("There should have been an exception.", ignoreFailures);
        } catch (Exception e) {
            Assert.assertTrue("Unexpected exception: " + e.getMessage(), !ignoreFailures);
            e.printStackTrace();
        }
    }

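    /** Clears the recorded paths and file count, and removes the source and target trees. */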
    private static void deleteState() throws IOException {
        pathList.clear();
        nFiles = 0;
        cluster.getFileSystem().delete(new Path(SOURCE_PATH), true);
        cluster.getFileSystem().delete(new Path(TARGET_PATH), true);
    }
}