org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIOTest.java Source code

Introduction

Here is the source code for org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIOTest.java, the unit-test class for Apache Beam's HadoopInputFormatIO connector.
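
The class exercises HadoopInputFormatIO.Read, Beam's bounded read transform for Hadoop InputFormat implementations. For orientation before the full listing, here is a minimal usage sketch of the transform under test; MyInputFormat is a hypothetical InputFormat with Text keys and LongWritable values and is not part of the file below:

    // Configure the same three properties the tests below set in loadTestConfiguration().
    Configuration conf = new Configuration();
    conf.setClass("mapreduce.job.inputformat.class", MyInputFormat.class, InputFormat.class);
    conf.setClass("key.class", Text.class, Object.class);
    conf.setClass("value.class", LongWritable.class, Object.class);

    // Apply the Read transform; withKeyTranslation()/withValueTranslation() are optional.
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());
    PCollection<KV<Text, LongWritable>> records =
        pipeline.apply("ReadFromHadoop",
            HadoopInputFormatIO.<Text, LongWritable>read().withConfiguration(conf));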

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.beam.sdk.io.hadoop.inputformat;

import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
import org.apache.beam.sdk.io.hadoop.WritableCoder;
import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.EmployeeRecordReader;
import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.NewObjectsEmployeeInputSplit;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.HadoopInputFormatBoundedSource;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableConfiguration;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableSplit;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.SourceTestUtils;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.mockito.Mockito;

/**
 * Unit tests for {@link HadoopInputFormatIO}.
 */
@RunWith(JUnit4.class)
public class HadoopInputFormatIOTest {
    static SerializableConfiguration serConf;
    static SimpleFunction<Text, String> myKeyTranslate;
    static SimpleFunction<Employee, String> myValueTranslate;

    @Rule
    public final transient TestPipeline p = TestPipeline.create();
    @Rule
    public ExpectedException thrown = ExpectedException.none();

    // Used as the input argument to Read.validate() in the validation tests below.
    private PBegin input = PBegin.in(p);

    @BeforeClass
    public static void setUp() throws IOException, InterruptedException {
        serConf = loadTestConfiguration(EmployeeInputFormat.class, Text.class, Employee.class);
        myKeyTranslate = new SimpleFunction<Text, String>() {
            @Override
            public String apply(Text input) {
                return input.toString();
            }
        };
        myValueTranslate = new SimpleFunction<Employee, String>() {
            @Override
            public String apply(Employee input) {
                return input.getEmpName() + "_" + input.getEmpAddress();
            }
        };
    }

    @Test
    public void testReadBuildsCorrectly() {
        HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
                .withConfiguration(serConf.getHadoopConfiguration()).withKeyTranslation(myKeyTranslate)
                .withValueTranslation(myValueTranslate);
        assertEquals(serConf.getHadoopConfiguration(), read.getConfiguration().getHadoopConfiguration());
        assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
        assertEquals(myValueTranslate, read.getValueTranslationFunction());
        assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
        assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
    }

    /**
     * This test validates that {@link HadoopInputFormatIO.Read Read} builds correctly when
     * withConfiguration(), withKeyTranslation() and withValueTranslation() are called in a
     * different order. It also validates that the output PCollection's key/value classes are set
     * correctly even if the Hadoop configuration is set after the key/value translations.
     */
    @Test
    public void testReadBuildsCorrectlyInDifferentOrder() {
        HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
                .withValueTranslation(myValueTranslate).withConfiguration(serConf.getHadoopConfiguration())
                .withKeyTranslation(myKeyTranslate);
        assertEquals(serConf.getHadoopConfiguration(), read.getConfiguration().getHadoopConfiguration());
        assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
        assertEquals(myValueTranslate, read.getValueTranslationFunction());
        assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
        assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
    }

    /**
     * This test validates {@link HadoopInputFormatIO.Read Read} object creation when
     * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()} is called more than
     * once; the most recently supplied configuration takes effect.
     * @throws InterruptedException
     * @throws IOException
     */
    @Test
    public void testReadBuildsCorrectlyIfWithConfigurationIsCalledMoreThanOneTime()
            throws IOException, InterruptedException {
        SerializableConfiguration diffConf = loadTestConfiguration(EmployeeInputFormat.class, Employee.class,
                Text.class);
        HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
                .withConfiguration(serConf.getHadoopConfiguration()).withKeyTranslation(myKeyTranslate)
                .withConfiguration(diffConf.getHadoopConfiguration());
        assertEquals(diffConf.getHadoopConfiguration(), read.getConfiguration().getHadoopConfiguration());
        assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
        assertEquals(null, read.getValueTranslationFunction());
        assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
        assertEquals(diffConf.getHadoopConfiguration().getClass("value.class", Object.class),
                read.getValueTypeDescriptor().getRawType());
    }

    /**
     * This test validates functionality of {@link HadoopInputFormatIO.Read#populateDisplayData()
     * populateDisplayData()}.
     */
    @Test
    public void testReadDisplayData() {
        HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
                .withConfiguration(serConf.getHadoopConfiguration()).withKeyTranslation(myKeyTranslate)
                .withValueTranslation(myValueTranslate);
        DisplayData displayData = DisplayData.from(read);
        Iterator<Entry<String, String>> propertyElement = serConf.getHadoopConfiguration().iterator();
        while (propertyElement.hasNext()) {
            Entry<String, String> element = propertyElement.next();
            assertThat(displayData, hasDisplayItem(element.getKey(), element.getValue()));
        }
    }

    /**
     * This test validates that {@link HadoopInputFormatIO.Read Read} transform object creation
     * fails with a null configuration: {@link HadoopInputFormatIO.Read#withConfiguration()
     * withConfiguration()} throws a NullPointerException when passed null.
     */
    @Test
    public void testReadObjectCreationFailsIfConfigurationIsNull() {
        thrown.expect(NullPointerException.class);
        HadoopInputFormatIO.<Text, Employee>read().withConfiguration(null);
    }

    /**
     * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with only
     * configuration.
     */
    @Test
    public void testReadObjectCreationWithConfiguration() {
        HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
                .withConfiguration(serConf.getHadoopConfiguration());
        assertEquals(serConf.getHadoopConfiguration(), read.getConfiguration().getHadoopConfiguration());
        assertEquals(null, read.getKeyTranslationFunction());
        assertEquals(null, read.getValueTranslationFunction());
        assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class),
                read.getKeyTypeDescriptor().getRawType());
        assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class),
                read.getValueTypeDescriptor().getRawType());
    }

    /**
     * This test validates that {@link HadoopInputFormatIO.Read Read} transform object creation
     * fails with a valid configuration but a null key translation:
     * {@link HadoopInputFormatIO.Read#withKeyTranslation() withKeyTranslation()} throws a
     * NullPointerException when a null value is passed.
     */
    @Test
    public void testReadObjectCreationFailsIfKeyTranslationFunctionIsNull() {
        thrown.expect(NullPointerException.class);
        HadoopInputFormatIO.<String, Employee>read().withConfiguration(serConf.getHadoopConfiguration())
                .withKeyTranslation(null);
    }

    /**
     * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
     * configuration and key translation.
     */
    @Test
    public void testReadObjectCreationWithConfigurationKeyTranslation() {
        HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
                .withConfiguration(serConf.getHadoopConfiguration()).withKeyTranslation(myKeyTranslate);
        assertEquals(serConf.getHadoopConfiguration(), read.getConfiguration().getHadoopConfiguration());
        assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
        assertEquals(null, read.getValueTranslationFunction());
        assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
                read.getKeyTypeDescriptor().getRawType());
        assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class),
                read.getValueTypeDescriptor().getRawType());
    }

    /**
     * This test validates that {@link HadoopInputFormatIO.Read Read} transform object creation
     * fails with a valid configuration but a null value translation:
     * {@link HadoopInputFormatIO.Read#withValueTranslation() withValueTranslation()} throws a
     * NullPointerException when a null value is passed.
     */
    @Test
    public void testReadObjectCreationFailsIfValueTranslationFunctionIsNull() {
        thrown.expect(NullPointerException.class);
        HadoopInputFormatIO.<Text, String>read().withConfiguration(serConf.getHadoopConfiguration())
                .withValueTranslation(null);
    }

    /**
     * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
     * configuration and value translation.
     */
    @Test
    public void testReadObjectCreationWithConfigurationValueTranslation() {
        HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
                .withConfiguration(serConf.getHadoopConfiguration()).withValueTranslation(myValueTranslate);
        assertEquals(serConf.getHadoopConfiguration(), read.getConfiguration().getHadoopConfiguration());
        assertEquals(null, read.getKeyTranslationFunction());
        assertEquals(myValueTranslate, read.getValueTranslationFunction());
        assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class),
                read.getKeyTypeDescriptor().getRawType());
        assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
                read.getValueTypeDescriptor().getRawType());
    }

    /**
     * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with
     * configuration, key translation and value translation.
     */
    @Test
    public void testReadObjectCreationWithConfigurationKeyTranslationValueTranslation() {
        HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
                .withConfiguration(serConf.getHadoopConfiguration()).withKeyTranslation(myKeyTranslate)
                .withValueTranslation(myValueTranslate);
        assertEquals(serConf.getHadoopConfiguration(), read.getConfiguration().getHadoopConfiguration());
        assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
        assertEquals(myValueTranslate, read.getValueTranslationFunction());
        assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
                read.getKeyTypeDescriptor().getRawType());
        assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
                read.getValueTypeDescriptor().getRawType());
    }

    /**
     * This test validates the behavior of {@link HadoopInputFormatIO.Read#validate()
     * Read.validate()} when the Read transform is created without calling
     * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()}.
     */
    @Test
    public void testReadValidationFailsMissingConfiguration() {
        HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read();
        thrown.expect(NullPointerException.class);
        read.validate(input);
    }

    /**
     * This test validates the behavior of {@link HadoopInputFormatIO.Read#withConfiguration()
     * withConfiguration()} when the Hadoop InputFormat class is not provided in the
     * configuration.
     */
    @Test
    public void testReadValidationFailsMissingInputFormatInConf() {
        Configuration configuration = new Configuration();
        configuration.setClass("key.class", Text.class, Object.class);
        configuration.setClass("value.class", Employee.class, Object.class);
        thrown.expect(NullPointerException.class);
        HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
    }

    /**
     * This test validates the behavior of {@link HadoopInputFormatIO.Read#withConfiguration()
     * withConfiguration()} when the key class is not provided in the configuration.
     */
    @Test
    public void testReadValidationFailsMissingKeyClassInConf() {
        Configuration configuration = new Configuration();
        configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class, InputFormat.class);
        configuration.setClass("value.class", Employee.class, Object.class);
        thrown.expect(NullPointerException.class);
        HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
    }

    /**
     * This test validates the behavior of {@link HadoopInputFormatIO.Read#withConfiguration()
     * withConfiguration()} when the value class is not provided in the configuration.
     */
    @Test
    public void testReadValidationFailsMissingValueClassInConf() {
        Configuration configuration = new Configuration();
        configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class, InputFormat.class);
        configuration.setClass("key.class", Text.class, Object.class);
        thrown.expect(NullPointerException.class);
        HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
    }

    /**
     * This test validates the behavior of {@link HadoopInputFormatIO.Read#validate()
     * Read.validate()} when the input type of myKeyTranslate (the simple function provided by the
     * user for key translation) is not the same as the Hadoop InputFormat's key class (the
     * property set in the configuration as "key.class").
     */
    @Test
    public void testReadValidationFailsWithWrongInputTypeKeyTranslationFunction() {
        SimpleFunction<LongWritable, String> myKeyTranslateWithWrongInputType = new SimpleFunction<LongWritable, String>() {
            @Override
            public String apply(LongWritable input) {
                return input.toString();
            }
        };
        HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
                .withConfiguration(serConf.getHadoopConfiguration())
                .withKeyTranslation(myKeyTranslateWithWrongInputType);
        thrown.expect(IllegalArgumentException.class);
        thrown.expectMessage(String.format(
                "Key translation's input type is not same as hadoop InputFormat : %s key " + "class : %s",
                serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class", InputFormat.class),
                serConf.getHadoopConfiguration().getClass("key.class", Object.class)));
        read.validate(input);
    }

    /**
     * This test validates the behavior of {@link HadoopInputFormatIO.Read#validate()
     * Read.validate()} when the input type of myValueTranslate (the simple function provided by
     * the user for value translation) is not the same as the Hadoop InputFormat's value class (the
     * property set in the configuration as "value.class").
     */
    @Test
    public void testReadValidationFailsWithWrongInputTypeValueTranslationFunction() {
        SimpleFunction<LongWritable, String> myValueTranslateWithWrongInputType = new SimpleFunction<LongWritable, String>() {
            @Override
            public String apply(LongWritable input) {
                return input.toString();
            }
        };
        HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
                .withConfiguration(serConf.getHadoopConfiguration())
                .withValueTranslation(myValueTranslateWithWrongInputType);
        String expectedMessage = String.format(
                "Value translation's input type is not same as hadoop InputFormat :  " + "%s value class : %s",
                serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class", InputFormat.class),
                serConf.getHadoopConfiguration().getClass("value.class", Object.class));
        thrown.expect(IllegalArgumentException.class);
        thrown.expectMessage(expectedMessage);
        read.validate(input);
    }

    @Test
    public void testReadingData() throws Exception {
        HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
                .withConfiguration(serConf.getHadoopConfiguration());
        List<KV<Text, Employee>> expected = TestEmployeeDataSet.getEmployeeData();
        PCollection<KV<Text, Employee>> actual = p.apply("ReadTest", read);
        PAssert.that(actual).containsInAnyOrder(expected);
        p.run();
    }

    /**
     * This test validates the behavior of {@link HadoopInputFormatBoundedSource} when
     * RecordReader creation fails.
     */
    @Test
    public void testReadIfCreateRecordReaderFails() throws Exception {
        thrown.expect(Exception.class);
        thrown.expectMessage("Exception in creating RecordReader");
        InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
        Mockito.when(mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
                Mockito.any(TaskAttemptContext.class)))
                .thenThrow(new IOException("Exception in creating RecordReader"));
        HadoopInputFormatBoundedSource<Text, Employee> boundedSource = new HadoopInputFormatBoundedSource<Text, Employee>(
                serConf, WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required.
                null, // No value translation required.
                new SerializableSplit());
        boundedSource.setInputFormatObj(mockInputFormat);
        SourceTestUtils.readFromSource(boundedSource, p.getOptions());
    }

    /**
     * This test validates the behavior of {@link HadoopInputFormatBoundedSource} when the
     * InputFormat's {@link InputFormat#createRecordReader() createRecordReader()} returns null.
     */
    @Test
    public void testReadWithNullCreateRecordReader() throws Exception {
        InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
        thrown.expect(IOException.class);
        thrown.expectMessage(String.format("Null RecordReader object returned by %s", mockInputFormat.getClass()));
        Mockito.when(mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
                Mockito.any(TaskAttemptContext.class))).thenReturn(null);
        HadoopInputFormatBoundedSource<Text, Employee> boundedSource = new HadoopInputFormatBoundedSource<Text, Employee>(
                serConf, WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required.
                null, // No value translation required.
                new SerializableSplit());
        boundedSource.setInputFormatObj(mockInputFormat);
        SourceTestUtils.readFromSource(boundedSource, p.getOptions());
    }

    /**
     * This test validates the behavior of the
     * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#start() start()} method when
     * the InputSplit passed to the reader contains zero records.
     */
    @Test
    public void testReadersStartWhenZeroRecords() throws Exception {
        InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
        EmployeeRecordReader mockReader = Mockito.mock(EmployeeRecordReader.class);
        Mockito.when(mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
                Mockito.any(TaskAttemptContext.class))).thenReturn(mockReader);
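        // The mocked reader reports no records, i.e. the split is empty.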
        Mockito.when(mockReader.nextKeyValue()).thenReturn(false);
        InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
        HadoopInputFormatBoundedSource<Text, Employee> boundedSource = new HadoopInputFormatBoundedSource<Text, Employee>(
                serConf, WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required.
                null, // No value translation required.
                new SerializableSplit(mockInputSplit));
        BoundedReader<KV<Text, Employee>> boundedReader = boundedSource.createReader(p.getOptions());
        assertEquals(false, boundedReader.start());
        assertEquals(Double.valueOf(1), boundedReader.getFractionConsumed());
    }

    /**
     * This test validates the method getFractionConsumed(), which indicates the progress of the
     * read in the range 0 to 1.
     */
    @Test
    public void testReadersGetFractionConsumed() throws Exception {
        List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
        HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(EmployeeInputFormat.class,
                Text.class, Employee.class, WritableCoder.of(Text.class), AvroCoder.of(Employee.class));
        long estimatedSize = hifSource.getEstimatedSizeBytes(p.getOptions());
        // Validate that the estimated size equals the number of records.
        assertEquals(referenceRecords.size(), estimatedSize);
        List<BoundedSource<KV<Text, Employee>>> boundedSourceList = hifSource.splitIntoBundles(0, p.getOptions());
        // Validate that splitIntoBundles() has split the source correctly.
        assertEquals(TestEmployeeDataSet.NUMBER_OF_SPLITS, boundedSourceList.size());
        List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
        for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
            List<KV<Text, Employee>> elements = new ArrayList<KV<Text, Employee>>();
            BoundedReader<KV<Text, Employee>> reader = source.createReader(p.getOptions());
            float recordsRead = 0;
            // When start is not called, getFractionConsumed() should return 0.
            assertEquals(Double.valueOf(0), reader.getFractionConsumed());
            boolean start = reader.start();
            assertEquals(true, start);
            if (start) {
                elements.add(reader.getCurrent());
                boolean advance = reader.advance();
                // Validate if getFractionConsumed() returns the correct fraction based on
                // the number of records read in the split.
                assertEquals(Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
                        reader.getFractionConsumed());
                assertEquals(true, advance);
                while (advance) {
                    elements.add(reader.getCurrent());
                    advance = reader.advance();
                    assertEquals(
                            Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
                            reader.getFractionConsumed());
                }
                bundleRecords.addAll(elements);
            }
            // Validate that getFractionConsumed() returns 1 after reading is complete.
            assertEquals(Double.valueOf(1), reader.getFractionConsumed());
            reader.close();
        }
        assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
    }

    /**
     * This test validates that a reader and its parent source read the same records.
     */
    @Test
    public void testReaderAndParentSourceReadsSameData() throws Exception {
        InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
        HadoopInputFormatBoundedSource<Text, Employee> boundedSource = new HadoopInputFormatBoundedSource<Text, Employee>(
                serConf, WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required.
                null, // No value translation required.
                new SerializableSplit(mockInputSplit));
        BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
        SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(reader, p.getOptions());
    }

    /**
     * This test verifies that the method
     * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#getCurrentSource()
     * getCurrentSource()} returns the correct source object.
     */
    @Test
    public void testGetCurrentSourceFunction() throws Exception {
        SerializableSplit split = new SerializableSplit();
        BoundedSource<KV<Text, Employee>> source = new HadoopInputFormatBoundedSource<Text, Employee>(serConf,
                WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required.
                null, // No value translation required.
                split);
        BoundedReader<KV<Text, Employee>> hifReader = source.createReader(p.getOptions());
        BoundedSource<KV<Text, Employee>> hifSource = hifReader.getCurrentSource();
        assertEquals(hifSource, source);
    }

    /**
     * This test validates the behavior of the {@link HadoopInputFormatBoundedSource#createReader()
     * createReader()} method when {@link HadoopInputFormatBoundedSource#splitIntoBundles()
     * splitIntoBundles()} has not been called.
     */
    @Test
    public void testCreateReaderIfSplitIntoBundlesNotCalled() throws Exception {
        HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(EmployeeInputFormat.class,
                Text.class, Employee.class, WritableCoder.of(Text.class), AvroCoder.of(Employee.class));
        thrown.expect(IOException.class);
        thrown.expectMessage("Cannot create reader as source is not split yet.");
        hifSource.createReader(p.getOptions());
    }

    /**
     * This test validates the behavior of
     * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplitsIfNecessary()}
     * when the Hadoop InputFormat's {@link InputFormat#getSplits() getSplits()} returns an empty
     * list.
     */
    @Test
    public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
        InputFormat<?, ?> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
        SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
        Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class)))
                .thenReturn(new ArrayList<InputSplit>());
        HadoopInputFormatBoundedSource<Text, Employee> hifSource = new HadoopInputFormatBoundedSource<Text, Employee>(
                serConf, WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required.
                null, // No value translation required.
                mockInputSplit);
        thrown.expect(IOException.class);
        thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");
        hifSource.setInputFormatObj(mockInputFormat);
        hifSource.computeSplitsIfNecessary();
    }

    /**
     * This test validates the behavior of
     * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplitsIfNecessary()}
     * when the Hadoop InputFormat's {@link InputFormat#getSplits() getSplits()} returns null.
     */
    @Test
    public void testComputeSplitsIfGetSplitsReturnsNullValue() throws Exception {
        InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
        SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
        Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(null);
        HadoopInputFormatBoundedSource<Text, Employee> hifSource = new HadoopInputFormatBoundedSource<Text, Employee>(
                serConf, WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required.
                null, // No value translation required.
                mockInputSplit);
        thrown.expect(IOException.class);
        thrown.expectMessage("Error in computing splits, getSplits() returns null.");
        hifSource.setInputFormatObj(mockInputFormat);
        hifSource.computeSplitsIfNecessary();
    }

    /**
     * This test validates the behavior of
     * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplitsIfNecessary()}
     * when the Hadoop InputFormat's {@link InputFormat#getSplits() getSplits()} returns an
     * InputSplit list containing null values.
     */
    @Test
    public void testComputeSplitsIfGetSplitsReturnsListHavingNullValues() throws Exception {
        // Build an InputSplit list that contains a null entry.
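        // The mock also implements Writable so that SerializableSplit can serialize it.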
        InputSplit mockInputSplit = Mockito.mock(InputSplit.class,
                Mockito.withSettings().extraInterfaces(Writable.class));
        List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
        inputSplitList.add(mockInputSplit);
        inputSplitList.add(null);
        InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
        Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(inputSplitList);
        HadoopInputFormatBoundedSource<Text, Employee> hifSource = new HadoopInputFormatBoundedSource<Text, Employee>(
                serConf, WritableCoder.of(Text.class), AvroCoder.of(Employee.class), null, // No key translation required.
                null, // No value translation required.
                new SerializableSplit());
        thrown.expect(IOException.class);
        thrown.expectMessage(
                "Error in computing splits, split is null in InputSplits list populated " + "by getSplits() : ");
        hifSource.setInputFormatObj(mockInputFormat);
        hifSource.computeSplitsIfNecessary();
    }

    /**
     * This test validates that records emitted in the PCollection are immutable when the
     * InputFormat's RecordReader returns the same objects (i.e. the same memory locations) with
     * updated values for each record.
     */
    @Test
    public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreMutable() throws Exception {
        List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
                ReuseObjectsEmployeeInputFormat.class, Text.class, Employee.class, WritableCoder.of(Text.class),
                AvroCoder.of(Employee.class));
        List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
        for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
            List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
            bundleRecords.addAll(elems);
        }
        List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
        assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
    }

    /**
     * Test reading if InputFormat implements {@link org.apache.hadoop.conf.Configurable
     * Configurable}.
     */
    @Test
    public void testReadingWithConfigurableInputFormat() throws Exception {
        List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
                ConfigurableEmployeeInputFormat.class, Text.class, Employee.class, WritableCoder.of(Text.class),
                AvroCoder.of(Employee.class));
        for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
            // Cast to HadoopInputFormatBoundedSource to access getInputFormat().
            @SuppressWarnings("unchecked")
            HadoopInputFormatBoundedSource<Text, Employee> hifSource = (HadoopInputFormatBoundedSource<Text, Employee>) source;
            hifSource.createInputFormatInstance();
            ConfigurableEmployeeInputFormat inputFormatObj = (ConfigurableEmployeeInputFormat) hifSource
                    .getInputFormat();
            assertEquals(true, inputFormatObj.isConfSet);
        }
    }

    /**
     * This test validates that records emitted in the PCollection are immutable when the
     * InputFormat's {@link org.apache.hadoop.mapreduce.RecordReader RecordReader} returns
     * different objects (i.e. different memory locations).
     */
    @Test
    public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreImmutable() throws Exception {
        List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(EmployeeInputFormat.class,
                Text.class, Employee.class, WritableCoder.of(Text.class), AvroCoder.of(Employee.class));
        List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
        for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
            List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
            bundleRecords.addAll(elems);
        }
        List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
        assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
    }

    private static SerializableConfiguration loadTestConfiguration(Class<?> inputFormatClassName, Class<?> keyClass,
            Class<?> valueClass) {
        Configuration conf = new Configuration();
        conf.setClass("mapreduce.job.inputformat.class", inputFormatClassName, InputFormat.class);
        conf.setClass("key.class", keyClass, Object.class);
        conf.setClass("value.class", valueClass, Object.class);
        return new SerializableConfiguration(conf);
    }

    private <K, V> HadoopInputFormatBoundedSource<K, V> getTestHIFSource(Class<?> inputFormatClass,
            Class<K> inputFormatKeyClass, Class<V> inputFormatValueClass, Coder<K> keyCoder, Coder<V> valueCoder) {
        SerializableConfiguration serConf = loadTestConfiguration(inputFormatClass, inputFormatKeyClass,
                inputFormatValueClass);
        return new HadoopInputFormatBoundedSource<K, V>(serConf, keyCoder, valueCoder, null, // No key translation required.
                null); // No value translation required.
    }

    private <K, V> List<BoundedSource<KV<K, V>>> getBoundedSourceList(Class<?> inputFormatClass,
            Class<K> inputFormatKeyClass, Class<V> inputFormatValueClass, Coder<K> keyCoder, Coder<V> valueCoder)
            throws Exception {
        HadoopInputFormatBoundedSource<K, V> boundedSource = getTestHIFSource(inputFormatClass, inputFormatKeyClass,
                inputFormatValueClass, keyCoder, valueCoder);
        return boundedSource.splitIntoBundles(0, p.getOptions());
    }
}