org.apache.beam.sdk.io.hadoop.format.HadoopFormatIOTest.java Source code

Introduction

Here is the source code for org.apache.beam.sdk.io.hadoop.format.HadoopFormatIOTest.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.hadoop.format;

import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.hadoop.inputformat.Employee;
import org.apache.beam.sdk.io.hadoop.inputformat.TestEmployeeDataSet;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.mockito.Mockito;
import org.mockito.runners.MockitoJUnitRunner;

/**
 * Unit tests for {@link HadoopFormatIO}.
 *
 * <p>Each test starts from a fresh {@link Configuration} built in {@link #setUp()} and a fresh
 * mocked {@link OutputCommitter} registered with {@link EmployeeOutputFormat}.
 */
@RunWith(MockitoJUnitRunner.class)
public class HadoopFormatIOTest {

    private static final int REDUCERS_COUNT = 2;
    private static final String LOCKS_FOLDER_NAME = "locks";

    /**
     * Configuration used by the writing and display-data tests. It is rebuilt before every test, so
     * it is kept as per-instance state rather than in a static field written from an instance
     * method.
     */
    private Configuration conf;

    @Rule
    public final transient TestPipeline p = TestPipeline.create();
    @Rule
    public final ExpectedException thrown = ExpectedException.none();
    @Rule
    public final TemporaryFolder tmpFolder = new TemporaryFolder();

    /**
     * Builds the default test configuration and re-initializes the output recorded by {@link
     * EmployeeOutputFormat} with a newly mocked {@link OutputCommitter}.
     */
    @Before
    public void setUp() {
        conf = loadTestConfiguration(EmployeeOutputFormat.class, Text.class, Employee.class);
        OutputCommitter mockedOutputCommitter = Mockito.mock(OutputCommitter.class);
        EmployeeOutputFormat.initWrittenOutput(mockedOutputCommitter);
    }

    /**
     * Creates a Hadoop configuration holding the given output format, key and value classes, {@link
     * #REDUCERS_COUNT} reducers and the job id {@code "1"}.
     *
     * @param outputFormatClassName output format class stored under {@link
     *     MRJobConfig#OUTPUT_FORMAT_CLASS_ATTR}
     * @param keyClass key class stored under {@link MRJobConfig#OUTPUT_KEY_CLASS}
     * @param valueClass value class stored under {@link MRJobConfig#OUTPUT_VALUE_CLASS}
     * @return the populated configuration
     */
    private static Configuration loadTestConfiguration(Class<?> outputFormatClassName, Class<?> keyClass,
            Class<?> valueClass) {
        Configuration configuration = new Configuration();
        configuration.setClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormatClassName, OutputFormat.class);
        configuration.setClass(MRJobConfig.OUTPUT_KEY_CLASS, keyClass, Object.class);
        configuration.setClass(MRJobConfig.OUTPUT_VALUE_CLASS, valueClass, Object.class);
        configuration.setInt(MRJobConfig.NUM_REDUCES, REDUCERS_COUNT);
        configuration.set(MRJobConfig.ID, String.valueOf(1));
        return configuration;
    }

    /**
     * This test validates that {@link HadoopFormatIO.Write Write} transform object creation fails
     * with a null configuration: a {@link NullPointerException} with the message "Hadoop
     * configuration cannot be null" is expected.
     */
    @Test
    public void testWriteObjectCreationFailsIfConfigurationIsNull() {
        thrown.expect(NullPointerException.class);
        thrown.expectMessage("Hadoop configuration cannot be null");
        HadoopFormatIO.<Text, Employee>write().withConfiguration(null).withPartitioning()
                .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath()));
    }

    /**
     * This test validates that running the write fails when the Hadoop OutputFormat class is not
     * provided by the user in the configuration.
     */
    @Test
    public void testWriteValidationFailsMissingOutputFormatInConf() {
        Configuration configuration = new Configuration();
        configuration.setClass(HadoopFormatIO.OUTPUT_KEY_CLASS, Text.class, Object.class);
        configuration.setClass(HadoopFormatIO.OUTPUT_VALUE_CLASS, Employee.class, Object.class);

        // Same wiring as the other validation tests, so the shared helper is used here too.
        runValidationPipeline(configuration);

        thrown.expect(Pipeline.PipelineExecutionException.class);
        thrown.expectMessage("Configuration must contain \"mapreduce.job.outputformat.class\"");

        p.run().waitUntilFinish();
    }

    /**
     * This test validates that running the write fails when the key class is not provided by the
     * user in the configuration.
     */
    @Test
    public void testWriteValidationFailsMissingKeyClassInConf() {
        Configuration configuration = new Configuration();
        configuration.setClass(HadoopFormatIO.OUTPUT_FORMAT_CLASS_ATTR, TextOutputFormat.class, OutputFormat.class);
        configuration.setClass(HadoopFormatIO.OUTPUT_VALUE_CLASS, Employee.class, Object.class);

        runValidationPipeline(configuration);

        thrown.expect(Pipeline.PipelineExecutionException.class);
        thrown.expectMessage("Configuration must contain \"mapreduce.job.output.key.class\"");

        p.run().waitUntilFinish();
    }

    /**
     * Applies the employee test data set followed by a partitioned, externally synchronized {@link
     * HadoopFormatIO.Write Write} using the given configuration to the test pipeline.
     *
     * <p>Despite its name this helper only builds the pipeline; callers run it themselves after
     * registering the exception they expect.
     *
     * @param configuration Hadoop configuration handed to the write transform
     */
    private void runValidationPipeline(Configuration configuration) {
        p.apply(Create.of(TestEmployeeDataSet.getEmployeeData()))
                .setTypeDescriptor(TypeDescriptors.kvs(new TypeDescriptor<Text>() {
                }, new TypeDescriptor<Employee>() {
                })).apply("Write",
                        HadoopFormatIO.<Text, Employee>write().withConfiguration(configuration).withPartitioning()
                                .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())));
    }

    /**
     * This test validates that running the write fails when the value class is not provided by the
     * user in the configuration.
     */
    @Test
    public void testWriteValidationFailsMissingValueClassInConf() {
        Configuration configuration = new Configuration();
        configuration.setClass(HadoopFormatIO.OUTPUT_FORMAT_CLASS_ATTR, TextOutputFormat.class, OutputFormat.class);
        configuration.setClass(HadoopFormatIO.OUTPUT_KEY_CLASS, Text.class, Object.class);

        runValidationPipeline(configuration);

        thrown.expect(Pipeline.PipelineExecutionException.class);
        thrown.expectMessage("Configuration must contain \"mapreduce.job.output.value.class\"");

        p.run().waitUntilFinish();
    }

    /**
     * This test validates that running the write fails when the job id is not provided by the user
     * in the configuration.
     */
    @Test
    public void testWriteValidationFailsMissingJobIDInConf() {
        Configuration configuration = new Configuration();
        configuration.setClass(HadoopFormatIO.OUTPUT_FORMAT_CLASS_ATTR, TextOutputFormat.class, OutputFormat.class);
        configuration.setClass(HadoopFormatIO.OUTPUT_KEY_CLASS, Text.class, Object.class);
        configuration.setClass(HadoopFormatIO.OUTPUT_VALUE_CLASS, Employee.class, Object.class);
        configuration.set(HadoopFormatIO.OUTPUT_DIR, tmpFolder.getRoot().getAbsolutePath());

        runValidationPipeline(configuration);

        thrown.expect(Pipeline.PipelineExecutionException.class);
        thrown.expectMessage("Configuration must contain \"mapreduce.job.id\"");

        p.run().waitUntilFinish();
    }

    /**
     * Writes the employee test data set through {@link HadoopFormatIO.Write Write} and verifies that
     * {@link EmployeeOutputFormat} recorded exactly those records, that the mocked committer
     * committed the job once and that it committed {@link #REDUCERS_COUNT} tasks.
     *
     * @throws IOException declared by the {@link OutputCommitter} methods being verified
     */
    @Test
    public void testWritingData() throws IOException {
        conf.set(HadoopFormatIO.OUTPUT_DIR, tmpFolder.getRoot().getAbsolutePath());
        List<KV<Text, Employee>> data = TestEmployeeDataSet.getEmployeeData();
        PCollection<KV<Text, Employee>> input = p.apply(Create.of(data))
                .setTypeDescriptor(TypeDescriptors.kvs(new TypeDescriptor<Text>() {
                }, new TypeDescriptor<Employee>() {
                }));

        input.apply("Write", HadoopFormatIO.<Text, Employee>write().withConfiguration(conf).withPartitioning()
                .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())));
        // Wait for completion, as the other tests do, so the assertions below never inspect the
        // output of a pipeline that is still running.
        p.run().waitUntilFinish();

        List<KV<Text, Employee>> writtenOutput = EmployeeOutputFormat.getWrittenOutput();
        assertEquals(data.size(), writtenOutput.size());
        assertTrue(data.containsAll(writtenOutput));
        assertTrue(writtenOutput.containsAll(data));

        Mockito.verify(EmployeeOutputFormat.getOutputCommitter()).commitJob(Mockito.any());
        Mockito.verify(EmployeeOutputFormat.getOutputCommitter(), Mockito.times(REDUCERS_COUNT))
                .commitTask(Mockito.any());
    }

    /**
     * Verifies that writing {@code String} keys with a configuration declaring {@link Text} keys
     * fails with a {@link Pipeline.PipelineExecutionException} whose message contains the name of
     * {@link String}.
     */
    @Test
    public void testWritingDataFailInvalidKeyType() {

        conf.set(HadoopFormatIO.OUTPUT_DIR, tmpFolder.getRoot().getAbsolutePath());
        List<KV<String, Employee>> data = new ArrayList<>();
        data.add(KV.of("key", new Employee("name", "address")));
        PCollection<KV<String, Employee>> input = p.apply("CreateData", Create.of(data))
                .setTypeDescriptor(TypeDescriptors.kvs(new TypeDescriptor<String>() {
                }, new TypeDescriptor<Employee>() {
                }));

        thrown.expect(Pipeline.PipelineExecutionException.class);
        thrown.expectMessage(String.class.getName());

        input.apply("Write", HadoopFormatIO.<String, Employee>write().withConfiguration(conf).withPartitioning()
                .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())));
        p.run().waitUntilFinish();
    }

    /**
     * Verifies that writing {@link Text} values with a configuration declaring {@link Employee}
     * values fails with a {@link Pipeline.PipelineExecutionException} whose message contains the
     * name of {@link Text}.
     */
    @Test
    public void testWritingDataFailInvalidValueType() {

        conf.set(HadoopFormatIO.OUTPUT_DIR, tmpFolder.getRoot().getAbsolutePath());
        List<KV<Text, Text>> data = new ArrayList<>();
        data.add(KV.of(new Text("key"), new Text("value")));
        TypeDescriptor<Text> textTypeDescriptor = new TypeDescriptor<Text>() {
        };
        PCollection<KV<Text, Text>> input = p.apply(Create.of(data))
                .setTypeDescriptor(TypeDescriptors.kvs(textTypeDescriptor, textTypeDescriptor));

        thrown.expect(Pipeline.PipelineExecutionException.class);
        thrown.expectMessage(Text.class.getName());

        input.apply("Write", HadoopFormatIO.<Text, Text>write().withConfiguration(conf).withPartitioning()
                .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())));

        p.run().waitUntilFinish();
    }

    /**
     * This test validates functionality of {@link
     * HadoopFormatIO.Write#populateDisplayData(DisplayData.Builder)
     * populateDisplayData(DisplayData.Builder)}: the display data must expose the output format, key
     * and value classes from the configuration together with the default partitioner class.
     */
    @Test
    public void testWriteDisplayData() {
        HadoopFormatIO.Write<String, String> write = HadoopFormatIO.<String, String>write().withConfiguration(conf)
                .withPartitioning().withExternalSynchronization(new HDFSSynchronization(getLocksDirPath()));
        DisplayData displayData = DisplayData.from(write);

        assertThat(displayData, hasDisplayItem(HadoopFormatIO.OUTPUT_FORMAT_CLASS_ATTR,
                conf.get(HadoopFormatIO.OUTPUT_FORMAT_CLASS_ATTR)));
        assertThat(displayData,
                hasDisplayItem(HadoopFormatIO.OUTPUT_KEY_CLASS, conf.get(HadoopFormatIO.OUTPUT_KEY_CLASS)));
        assertThat(displayData,
                hasDisplayItem(HadoopFormatIO.OUTPUT_VALUE_CLASS, conf.get(HadoopFormatIO.OUTPUT_VALUE_CLASS)));
        assertThat(displayData, hasDisplayItem(HadoopFormatIO.PARTITIONER_CLASS_ATTR,
                HadoopFormats.DEFAULT_PARTITIONER_CLASS_ATTR.getName()));
    }

    /**
     * Returns the absolute path of the {@link #LOCKS_FOLDER_NAME} directory inside the per-test
     * temporary folder.
     *
     * @return absolute path string handed to {@link HDFSSynchronization}
     */
    private String getLocksDirPath() {
        return Paths.get(tmpFolder.getRoot().getAbsolutePath(), LOCKS_FOLDER_NAME).toAbsolutePath().toString();
    }
}