com.inmobi.conduit.distcp.tools.TestDistCp.java Source code


Introduction

Here is the source code for com.inmobi.conduit.distcp.tools.TestDistCp.java.
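
The class is a JUnit 4 test for the conduit fork of Hadoop DistCp. It spins up in-process MiniDFSCluster and MiniMRCluster instances, seeds a small gzipped source tree under /tmp/source, and exercises the uniform-size and dynamic copy strategies, cleanup of the job staging directory on failure, and copying from root-level paths.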

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.inmobi.conduit.distcp.tools;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;
import org.junit.BeforeClass;
import org.junit.Assert;
import org.junit.AfterClass;

import com.inmobi.conduit.distcp.tools.DistCp;
import com.inmobi.conduit.distcp.tools.DistCpConstants;
import com.inmobi.conduit.distcp.tools.DistCpOptions;
import com.inmobi.conduit.distcp.tools.mapred.CopyOutputFormat;

import java.util.List;
import java.util.ArrayList;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;

public class TestDistCp {
    private static final Log LOG = LogFactory.getLog(TestDistCp.class);
    private static List<Path> pathList = new ArrayList<Path>();
    private static final int FILE_SIZE = 1024;

    private static Configuration configuration;
    private static MiniDFSCluster cluster;
    private static MiniMRCluster mrCluster;

    private static final String SOURCE_PATH = "/tmp/source";
    private static final String TARGET_PATH = "/tmp/target";
    private static final Path counterOutputPath = new Path("counters");

    @BeforeClass
    public static void setup() throws Exception {
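        // Bring up a one-node HDFS cluster and a one-node MapReduce cluster,
        // routing task-tracker and log directories to target/tmp, then copy
        // the JobTracker addresses into the configuration shared by all tests.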
        configuration = getConfigurationForCluster();
        cluster = new MiniDFSCluster(configuration, 1, true, null);
        System.setProperty("org.apache.hadoop.mapred.TaskTracker", "target/tmp");
        configuration.set("org.apache.hadoop.mapred.TaskTracker", "target/tmp");
        System.setProperty("hadoop.log.dir", "target/tmp");
        configuration.set("hadoop.log.dir", "target/tmp");
        mrCluster = new MiniMRCluster(1, configuration.get("fs.default.name"), 1);
        Configuration mrConf = mrCluster.createJobConf();
        final String mrJobTracker = mrConf.get("mapred.job.tracker");
        configuration.set("mapred.job.tracker", mrJobTracker);
        final String mrJobTrackerAddress = mrConf.get("mapred.job.tracker.http.address");
        configuration.set("mapred.job.tracker.http.address", mrJobTrackerAddress);
    }

    @AfterClass
    public static void cleanup() {
        if (mrCluster != null)
            mrCluster.shutdown();
        if (cluster != null)
            cluster.shutdown();
    }

    private static Configuration getConfigurationForCluster() throws IOException {
        Configuration configuration = new Configuration();
        System.setProperty("test.build.data", "target/build/TEST_DISTCP/data");
        configuration.set("hadoop.log.dir", "target/tmp");

        LOG.debug("fs.default.name  == " + configuration.get("fs.default.name"));
        LOG.debug("dfs.http.address == " + configuration.get("dfs.http.address"));
        return configuration;
    }

    private static void createSourceData() throws Exception {
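        // Build a small tree under /tmp/source: empty directories plus two
        // gzipped files; every created path is recorded in pathList so that
        // verifyResults() can compare it against the copy target.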
        mkdirs(SOURCE_PATH + "/1");
        mkdirs(SOURCE_PATH + "/2");
        mkdirs(SOURCE_PATH + "/2/3/4.gz");
        mkdirs(SOURCE_PATH + "/2/3");
        mkdirs(SOURCE_PATH + "/5");
        touchFile(SOURCE_PATH + "/5/6.gz");
        mkdirs(SOURCE_PATH + "/7");
        mkdirs(SOURCE_PATH + "/7/8");
        touchFile(SOURCE_PATH + "/7/8/9.gz");
    }

    private static void mkdirs(String path) throws Exception {
        FileSystem fileSystem = cluster.getFileSystem();
        final Path qualifiedPath = new Path(path).makeQualified(fileSystem);
        pathList.add(qualifiedPath);
        fileSystem.mkdirs(qualifiedPath);
    }

    private static void touchFile(String path) throws Exception {
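        // Create the file with doubled block size and replication, write
        // FILE_SIZE zero bytes through a pooled gzip compressor, and record
        // the qualified path for later verification.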
        FileSystem fs;
        DataOutputStream outputStream = null;
        GzipCodec gzipCodec = ReflectionUtils.newInstance(GzipCodec.class, getConfigurationForCluster());
        Compressor gzipCompressor = CodecPool.getCompressor(gzipCodec);
        OutputStream compressedOut = null;
        try {
            fs = cluster.getFileSystem();
            final Path qualifiedPath = new Path(path).makeQualified(fs);
            final long blockSize = fs.getDefaultBlockSize() * 2;
            outputStream = fs.create(qualifiedPath, true, 0, (short) (fs.getDefaultReplication() * 2), blockSize);
            compressedOut = gzipCodec.createOutputStream(outputStream, gzipCompressor);
            compressedOut.write(new byte[FILE_SIZE]);
            compressedOut.write("\n".getBytes());
            compressedOut.flush();
            pathList.add(qualifiedPath);
        } finally {
            // Close both streams quietly even if the write failed, then return
            // the compressor to the pool.
            IOUtils.cleanup(null, compressedOut, outputStream);
            CodecPool.returnCompressor(gzipCompressor);
        }
    }

    private static void clearState() throws Exception {
        pathList.clear();
        cluster.getFileSystem().delete(new Path(TARGET_PATH), true);
        createSourceData();
    }

    //@Test
    public void testUniformSizeDistCp() throws Exception {
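        // Exercises the default (uniform-size) copy strategy with atomic
        // commit enabled and blocking disabled, so the job can be watched
        // while it runs. Note that the @Test annotation above is commented
        // out, so JUnit currently skips this test.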
        try {
            clearState();
            Path sourcePath = new Path(SOURCE_PATH).makeQualified(cluster.getFileSystem());
            List<Path> sources = new ArrayList<Path>();
            sources.add(sourcePath);

            Path targetPath = new Path(TARGET_PATH).makeQualified(cluster.getFileSystem());
            DistCpOptions options = new DistCpOptions(sources, targetPath);
            options.setOutPutDirectory(counterOutputPath);
            options.setAtomicCommit(true);
            options.setBlocking(false);
            Job job = new DistCp(configuration, options).execute();
            Path workDir = CopyOutputFormat.getWorkingDirectory(job);
            Path finalDir = CopyOutputFormat.getCommitDirectory(job);

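            // Spin until the job either completes or its working directory
            // appears; after completion, atomic commit should have replaced
            // the working directory with the final (commit) directory.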
            while (!job.isComplete()) {
                if (cluster.getFileSystem().exists(workDir)) {
                    break;
                }
            }
            job.waitForCompletion(true);
            Assert.assertFalse(cluster.getFileSystem().exists(workDir));
            Assert.assertTrue(cluster.getFileSystem().exists(finalDir));
            Assert.assertFalse(cluster.getFileSystem()
                    .exists(new Path(job.getConfiguration().get(DistCpConstants.CONF_LABEL_META_FOLDER))));
            verifyResults();
        } catch (Exception e) {
            LOG.error("Exception encountered", e);
            Assert.fail("Unexpected exception: " + e.getMessage());
        }
    }

    @Test
    public void testCleanup() {
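        // A source path with an unresolvable scheme should make DistCp fail;
        // the test then verifies that the job staging directory is left empty,
        // i.e. that DistCp cleaned up its submission files on failure.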
        try {
            clearState();
            Path sourcePath = new Path("noscheme:///file");
            List<Path> sources = new ArrayList<Path>();
            sources.add(sourcePath);

            Path targetPath = new Path(TARGET_PATH).makeQualified(cluster.getFileSystem());
            DistCpOptions options = new DistCpOptions(sources, targetPath);
            options.setOutPutDirectory(counterOutputPath);

            // JobSubmissionFiles.getStagingDir has incompatible signatures in
            // hadoop-1 and hadoop-2, so the staging directory is resolved
            // reflectively against whichever API is on the classpath.
            Path stagingDir = null;
            boolean hadoop1 = false;
            try {
                LOG.info("Trying to get staging path using hadoop-2");
                Class<?> clusterClass = DistCp.class.getClassLoader().loadClass("org.apache.hadoop.mapreduce.Cluster");
                Method method = JobSubmissionFiles.class.getMethod("getStagingDir", clusterClass,
                        Configuration.class);
                Constructor<?> constructor = clusterClass.getConstructor(Configuration.class);
                stagingDir = (Path) method.invoke(null, constructor.newInstance(configuration), configuration);
            } catch (Exception ignored) {
                // fallback to hadoop-1 API
                hadoop1 = true;
            }
            if (hadoop1) {
                try {
                    LOG.info("Trying to get staging path using hadoop-1");
                    Method method = JobSubmissionFiles.class.getMethod("getStagingDir", JobClient.class,
                            Configuration.class);
                    stagingDir = (Path) method.invoke(null, new JobClient(new JobConf(configuration)),
                            configuration);
                } catch (Exception ignored) {
                    // do nothing
                }
            }

            Assert.assertNotNull("Could not resolve the job staging directory", stagingDir);
            stagingDir.getFileSystem(configuration).mkdirs(stagingDir);

            try {
                new DistCp(configuration, options).execute();
            } catch (Throwable t) {
                Assert.assertEquals(0, stagingDir.getFileSystem(configuration).listStatus(stagingDir).length);
            }
        } catch (Exception e) {
            LOG.error("Exception encountered ", e);
            Assert.fail("testCleanup failed " + e.getMessage());
        }
    }

    @Test
    public void testRootPath() throws Exception {
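        // Copying multiple root-level directories should recreate each one
        // under the target with its last path component preserved.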
        try {
            clearState();
            List<Path> sources = new ArrayList<Path>();
            sources.add(new Path("/a").makeQualified(cluster.getFileSystem()));
            sources.add(new Path("/b").makeQualified(cluster.getFileSystem()));
            touchFile("/a/a.gz");
            touchFile("/b/b.gz");

            Path targetPath = new Path("/c").makeQualified(cluster.getFileSystem());
            DistCpOptions options = new DistCpOptions(sources, targetPath);
            options.setOutPutDirectory(counterOutputPath);
            new DistCp(configuration, options).execute();
            Assert.assertTrue(cluster.getFileSystem().exists(new Path("/c/a/a.gz")));
            Assert.assertTrue(cluster.getFileSystem().exists(new Path("/c/b/b.gz")));
        } catch (Exception e) {
            LOG.error("Exception encountered", e);
            Assert.fail("Unexpected exception: " + e.getMessage());
        }
    }

    @Test
    public void testDynamicDistCp() throws Exception {
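        // Same flow as the uniform-size test, but using the "dynamic" copy
        // strategy and an explicit atomic work path at /work.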
        try {
            clearState();
            Path sourcePath = new Path(SOURCE_PATH).makeQualified(cluster.getFileSystem());
            List<Path> sources = new ArrayList<Path>();
            sources.add(sourcePath);

            Path targetPath = new Path(TARGET_PATH).makeQualified(cluster.getFileSystem());
            DistCpOptions options = new DistCpOptions(sources, targetPath);
            options.setCopyStrategy("dynamic");
            options.setOutPutDirectory(counterOutputPath);

            options.setAtomicCommit(true);
            options.setAtomicWorkPath(new Path("/work"));
            options.setBlocking(false);
            Job job = new DistCp(configuration, options).execute();
            Path workDir = CopyOutputFormat.getWorkingDirectory(job);
            Path finalDir = CopyOutputFormat.getCommitDirectory(job);

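            // Spin until the job either completes or its working directory
            // appears; atomic commit should then promote the work directory's
            // contents to the final (commit) directory.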
            while (!job.isComplete()) {
                if (cluster.getFileSystem().exists(workDir)) {
                    break;
                }
            }
            job.waitForCompletion(true);
            Assert.assertFalse(cluster.getFileSystem().exists(workDir));
            Assert.assertTrue(cluster.getFileSystem().exists(finalDir));

            verifyResults();
        } catch (Exception e) {
            LOG.error("Exception encountered", e);
            Assert.fail("Unexpected exception: " + e.getMessage());
        }
    }

    private static void verifyResults() throws Exception {
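        // For every path recorded while creating the source tree, the
        // corresponding path under the target must exist and agree on
        // whether it is a file or a directory.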
        for (Path path : pathList) {
            FileSystem fs = cluster.getFileSystem();

            Path sourcePath = path.makeQualified(fs);
            // Use literal replace(): replaceAll() would treat SOURCE_PATH as a regex.
            Path targetPath = new Path(sourcePath.toString().replace(SOURCE_PATH, TARGET_PATH));

            Assert.assertTrue(fs.exists(targetPath));
            Assert.assertEquals(fs.isFile(sourcePath), fs.isFile(targetPath));
        }
    }
}
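
Usage

Below is a minimal sketch, not part of the original test, of how the conduit DistCp API exercised above might be driven from application code. DistCp, DistCpOptions, setOutPutDirectory(), setBlocking(), and execute() come straight from the listing; the cluster URI, the input and output paths, and the class name DistCpExample are hypothetical placeholders.

package com.inmobi.conduit.distcp.examples;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

import com.inmobi.conduit.distcp.tools.DistCp;
import com.inmobi.conduit.distcp.tools.DistCpOptions;

public class DistCpExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical HDFS locations; substitute real cluster paths.
        List<Path> sources = new ArrayList<Path>();
        sources.add(new Path("hdfs://namenode:8020/data/in"));
        Path target = new Path("hdfs://namenode:8020/data/out");

        DistCpOptions options = new DistCpOptions(sources, target);
        // Directory for DistCp's counter output, mirroring the tests above.
        options.setOutPutDirectory(new Path("/tmp/distcp-counters"));
        options.setBlocking(true); // wait for the MapReduce job to finish

        Job job = new DistCp(new Configuration(), options).execute();
        System.out.println("DistCp finished, successful = " + job.isSuccessful());
    }
}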