com.netflix.bdp.s3.TestMRJob.java Source code

Introduction

Here is the source code for com.netflix.bdp.s3.TestMRJob.java. The test brings up a two-node MiniMRYarnCluster, runs a map-only job whose output path is an s3:// URI backed by a mocked FileSystem, and then asserts that the committer completed exactly one multipart upload per task output file, with no deletes and no aborted commits.

Source

/*
 * Copyright 2017 Netflix, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.netflix.bdp.s3;

import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest;
import com.google.common.collect.Sets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.v2.MiniMRYarnCluster;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.UUID;

import static com.netflix.bdp.s3.S3Committer.UPLOAD_UUID;
import static org.mockito.Mockito.mock;

public class TestMRJob extends TestUtil.MiniDFSTest {

    private static Path S3_OUTPUT_PATH = null;
    private static MiniMRYarnCluster MR_CLUSTER = null;

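    // Point the s3:// scheme at MockS3FileSystem so the job's output path
    // never reaches real S3, then bring up a two-node YARN mini cluster.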
    @BeforeClass
    public static void setupMiniMRCluster() {
        getConfiguration().set("fs.s3.impl", MockS3FileSystem.class.getName());
        S3_OUTPUT_PATH = new Path("s3://bucket-name/output/path");
        MR_CLUSTER = new MiniMRYarnCluster("test-s3-multipart-output-committer", 2);
        MR_CLUSTER.init(getConfiguration());
        MR_CLUSTER.start();
    }

    @AfterClass
    public static void stopMiniMRCluster() {
        if (MR_CLUSTER != null) {
            MR_CLUSTER.stop();
        }
        MR_CLUSTER = null;
    }

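    // TextOutputFormat subclass that swaps in the committer under test.
    // MockedS3Committer (defined elsewhere in this test package) records the
    // S3 client calls it would make and serializes them to "mock-results-file".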
    public static class S3TextOutputFormat<K, V> extends TextOutputFormat<K, V> {
        private MockedS3Committer committer = null;

        @Override
        public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
            if (committer == null) {
                committer = new MockedS3Committer(getOutputPath(context), context);
            }
            return committer;
        }
    }

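    // Identity mapper: passes each input record through unchanged so every
    // task produces output for the committer to commit.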
    public static class M extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    @Rule
    public final TemporaryFolder temp = new TemporaryFolder();

    @Test
    public void testMRJob() throws Exception {
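        // MockS3FileSystem delegates to a Mockito FileSystem mock, so the
        // s3:// output path is handled entirely in-process.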
        FileSystem mockS3 = mock(FileSystem.class);
        FileSystem s3 = S3_OUTPUT_PATH.getFileSystem(getConfiguration());
        if (s3 instanceof MockS3FileSystem) {
            ((MockS3FileSystem) s3).setMock(mockS3);
        } else {
            throw new RuntimeException("Cannot continue: S3 not mocked");
        }

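        // Random UUID that the committer appends to part file names; handed
        // to the tasks through the UPLOAD_UUID configuration key below.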
        String commitUUID = UUID.randomUUID().toString();

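        // Write three local input files and record the S3 paths the committer
        // is expected to produce for them (part-m-0000<i>-<uuid>).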
        int numFiles = 3;
        Set<String> expectedFiles = Sets.newHashSet();
        for (int i = 0; i < numFiles; i += 1) {
            File file = temp.newFile(String.valueOf(i) + ".text");
            try (FileOutputStream out = new FileOutputStream(file)) {
                out.write(("file " + i).getBytes(StandardCharsets.UTF_8));
            }
            expectedFiles.add(new Path(S3_OUTPUT_PATH, "part-m-0000" + i + "-" + commitUUID).toString());
        }

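        // Build the job against the mini cluster's configuration and route
        // its output through the committer under test.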
        Job mrJob = Job.getInstance(MR_CLUSTER.getConfig(), "test-committer-job");
        Configuration conf = mrJob.getConfiguration();

        mrJob.setOutputFormatClass(S3TextOutputFormat.class);
        S3TextOutputFormat.setOutputPath(mrJob, S3_OUTPUT_PATH);

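        // The mocked committer serializes its recorded client calls here;
        // delete the placeholder file so the committer can create it itself.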
        File mockResultsFile = temp.newFile("committer.bin");
        mockResultsFile.delete();
        String committerPath = "file:" + mockResultsFile;
        conf.set("mock-results-file", committerPath);
        conf.set(UPLOAD_UUID, commitUUID);

        mrJob.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(mrJob, new Path("file:" + temp.getRoot().toString()));

        mrJob.setMapperClass(M.class);
        mrJob.setNumReduceTasks(0);

        mrJob.submit();
        Assert.assertTrue("MR job should succeed", mrJob.waitForCompletion(true));

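        // Read back the committer's recorded S3 client calls.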
        TestUtil.ClientResults results;
        try (ObjectInputStream in = new ObjectInputStream(
                FileSystem.getLocal(conf).open(new Path(committerPath)))) {
            results = (TestUtil.ClientResults) in.readObject();
        }

        Assert.assertEquals("Should not delete files", 0, results.deletes.size());

        Assert.assertEquals("Should not abort commits", 0, results.aborts.size());

        Assert.assertEquals("Should commit task output files", numFiles, results.commits.size());

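        // Each recorded commit is a CompleteMultipartUploadRequest; rebuild
        // the s3:// path it targeted and compare against the expected set.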
        Set<String> actualFiles = Sets.newHashSet();
        for (CompleteMultipartUploadRequest commit : results.commits) {
            actualFiles.add("s3://" + commit.getBucketName() + "/" + commit.getKey());
        }

        Assert.assertEquals("Should commit the correct file paths", expectedFiles, actualFiles);
    }

}
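
For reference, a production job would wire in one of the real committers from this package the same way this test wires in MockedS3Committer: subclass TextOutputFormat and override getOutputCommitter. The sketch below follows that pattern; S3DirectoryOutputCommitter and its (outputPath, context) constructor are assumed to match the library's API, so treat this as an illustration rather than a verified drop-in.

import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;

public class S3CommitterTextOutputFormat<K, V> extends TextOutputFormat<K, V> {
    private OutputCommitter committer = null;

    @Override
    public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context)
            throws IOException {
        if (committer == null) {
            // Assumed API: S3DirectoryOutputCommitter(Path outputPath,
            // TaskAttemptContext context), mirroring how MockedS3Committer
            // is constructed in the test above.
            committer = new S3DirectoryOutputCommitter(getOutputPath(context), context);
        }
        return committer;
    }
}

The job driver then only needs job.setOutputFormatClass(S3CommitterTextOutputFormat.class) and a normal setOutputPath call, exactly as testMRJob does with S3TextOutputFormat.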