Java tutorial: unit tests for Apache Beam's HadoopInputFormatIO (HadoopInputFormatIOTest.java)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.beam.sdk.io.hadoop.inputformat;

import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.BoundedSource.BoundedReader;
import org.apache.beam.sdk.io.hadoop.WritableCoder;
import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.EmployeeRecordReader;
import org.apache.beam.sdk.io.hadoop.inputformat.EmployeeInputFormat.NewObjectsEmployeeInputSplit;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.HadoopInputFormatBoundedSource;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableConfiguration;
import org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO.SerializableSplit;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.SourceTestUtils;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.mockito.Mockito;

/**
 * Unit tests for {@link HadoopInputFormatIO}.
 */
@RunWith(JUnit4.class)
public class HadoopInputFormatIOTest {
  static SerializableConfiguration serConf;
  static SimpleFunction<Text, String> myKeyTranslate;
  static SimpleFunction<Employee, String> myValueTranslate;

  @Rule public final transient TestPipeline p = TestPipeline.create();
  @Rule public ExpectedException thrown = ExpectedException.none();

  private PBegin input = PBegin.in(p);

  @BeforeClass
  public static void setUp() throws IOException, InterruptedException {
    serConf = loadTestConfiguration(EmployeeInputFormat.class, Text.class, Employee.class);
    myKeyTranslate = new SimpleFunction<Text, String>() {
      @Override
      public String apply(Text input) {
        return input.toString();
      }
    };
    myValueTranslate = new SimpleFunction<Employee, String>() {
      @Override
      public String apply(Employee input) {
        return input.getEmpName() + "_" + input.getEmpAddress();
      }
    };
  }

  @Test
  public void testReadBuildsCorrectly() {
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate)
        .withValueTranslation(myValueTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(myValueTranslate, read.getValueTranslationFunction());
    assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
  }

  /**
   * This test validates that {@link HadoopInputFormatIO.Read Read} builds correctly when
   * withConfiguration/withKeyTranslation/withValueTranslation are called in a different order. It
   * also validates that the output PCollection key/value classes are set correctly even if the
   * Hadoop configuration is set after setting the key/value translation.
   */
  @Test
  public void testReadBuildsCorrectlyInDifferentOrder() {
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withValueTranslation(myValueTranslate)
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(myValueTranslate, read.getValueTranslationFunction());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
    assertEquals(myValueTranslate.getOutputTypeDescriptor(), read.getValueTypeDescriptor());
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} object creation if
   * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()} is called more than
   * once.
   * @throws InterruptedException
   * @throws IOException
   */
  @Test
  public void testReadBuildsCorrectlyIfWithConfigurationIsCalledMoreThanOneTime()
      throws IOException, InterruptedException {
    SerializableConfiguration diffConf =
        loadTestConfiguration(EmployeeInputFormat.class, Employee.class, Text.class);
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate)
        .withConfiguration(diffConf.getHadoopConfiguration());
    assertEquals(diffConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(null, read.getValueTranslationFunction());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor(), read.getKeyTypeDescriptor());
    assertEquals(diffConf.getHadoopConfiguration().getClass("value.class", Object.class),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#populateDisplayData()
   * populateDisplayData()}.
   */
  @Test
  public void testReadDisplayData() {
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate)
        .withValueTranslation(myValueTranslate);
    DisplayData displayData = DisplayData.from(read);
    Iterator<Entry<String, String>> propertyElement = serConf.getHadoopConfiguration().iterator();
    while (propertyElement.hasNext()) {
      Entry<String, String> element = propertyElement.next();
      assertThat(displayData, hasDisplayItem(element.getKey(), element.getValue()));
    }
  }

  /**
   * This test validates that {@link HadoopInputFormatIO.Read Read} transform object creation fails
   * with a null configuration. {@link HadoopInputFormatIO.Read#withConfiguration()
   * withConfiguration()} checks that the configuration is not null and throws an exception
   * otherwise.
   */
  @Test
  public void testReadObjectCreationFailsIfConfigurationIsNull() {
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, Employee>read().withConfiguration(null);
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with only
   * a configuration.
   */
  @Test
  public void testReadObjectCreationWithConfiguration() {
    HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration());
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(null, read.getKeyTranslationFunction());
    assertEquals(null, read.getValueTranslationFunction());
    assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class),
        read.getKeyTypeDescriptor().getRawType());
    assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates that {@link HadoopInputFormatIO.Read Read} transform object creation fails
   * with a configuration and a null key translation.
   * {@link HadoopInputFormatIO.Read#withKeyTranslation() withKeyTranslation()} checks that
   * keyTranslation is not null and throws an exception if a null value is passed.
   */
  @Test
  public void testReadObjectCreationFailsIfKeyTranslationFunctionIsNull() {
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<String, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(null);
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with a
   * configuration and a key translation.
   */
  @Test
  public void testReadObjectCreationWithConfigurationKeyTranslation() {
    HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(null, read.getValueTranslationFunction());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
        read.getKeyTypeDescriptor().getRawType());
    assertEquals(serConf.getHadoopConfiguration().getClass("value.class", Object.class),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates that {@link HadoopInputFormatIO.Read Read} transform object creation fails
   * with a configuration and a null value translation.
   * {@link HadoopInputFormatIO.Read#withValueTranslation() withValueTranslation()} checks that
   * valueTranslation is not null and throws an exception if a null value is passed.
   */
  @Test
  public void testReadObjectCreationFailsIfValueTranslationFunctionIsNull() {
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withValueTranslation(null);
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with a
   * configuration and a value translation.
   */
  @Test
  public void testReadObjectCreationWithConfigurationValueTranslation() {
    HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withValueTranslation(myValueTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(null, read.getKeyTranslationFunction());
    assertEquals(myValueTranslate, read.getValueTranslationFunction());
    assertEquals(serConf.getHadoopConfiguration().getClass("key.class", Object.class),
        read.getKeyTypeDescriptor().getRawType());
    assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates {@link HadoopInputFormatIO.Read Read} transform object creation with a
   * configuration, a key translation and a value translation.
   */
  @Test
  public void testReadObjectCreationWithConfigurationKeyTranslationValueTranslation() {
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslate)
        .withValueTranslation(myValueTranslate);
    assertEquals(serConf.getHadoopConfiguration(),
        read.getConfiguration().getHadoopConfiguration());
    assertEquals(myKeyTranslate, read.getKeyTranslationFunction());
    assertEquals(myValueTranslate, read.getValueTranslationFunction());
    assertEquals(myKeyTranslate.getOutputTypeDescriptor().getRawType(),
        read.getKeyTypeDescriptor().getRawType());
    assertEquals(myValueTranslate.getOutputTypeDescriptor().getRawType(),
        read.getValueTypeDescriptor().getRawType());
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
   * Read.validate()} when the Read transform is created without calling
   * {@link HadoopInputFormatIO.Read#withConfiguration() withConfiguration()}.
   */
  @Test
  public void testReadValidationFailsMissingConfiguration() {
    HadoopInputFormatIO.Read<String, String> read = HadoopInputFormatIO.<String, String>read();
    thrown.expect(NullPointerException.class);
    read.validate(input);
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
   * withConfiguration()} when the Hadoop InputFormat class is not provided by the user in the
   * configuration.
   */
  @Test
  public void testReadValidationFailsMissingInputFormatInConf() {
    Configuration configuration = new Configuration();
    configuration.setClass("key.class", Text.class, Object.class);
    configuration.setClass("value.class", Employee.class, Object.class);
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
   * withConfiguration()} when the key class is not provided by the user in the configuration.
   */
  @Test
  public void testReadValidationFailsMissingKeyClassInConf() {
    Configuration configuration = new Configuration();
    configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class,
        InputFormat.class);
    configuration.setClass("value.class", Employee.class, Object.class);
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#withConfiguration()
   * withConfiguration()} when the value class is not provided by the user in the configuration.
   */
  @Test
  public void testReadValidationFailsMissingValueClassInConf() {
    Configuration configuration = new Configuration();
    configuration.setClass("mapreduce.job.inputformat.class", EmployeeInputFormat.class,
        InputFormat.class);
    configuration.setClass("key.class", Text.class, Object.class);
    thrown.expect(NullPointerException.class);
    HadoopInputFormatIO.<Text, Employee>read().withConfiguration(configuration);
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
   * Read.validate()} when the input type of myKeyTranslate (the SimpleFunction provided by the
   * user for key translation) is not the same as the Hadoop InputFormat's key class (the property
   * set in the configuration as "key.class").
   */
  @Test
  public void testReadValidationFailsWithWrongInputTypeKeyTranslationFunction() {
    SimpleFunction<LongWritable, String> myKeyTranslateWithWrongInputType =
        new SimpleFunction<LongWritable, String>() {
          @Override
          public String apply(LongWritable input) {
            return input.toString();
          }
        };
    HadoopInputFormatIO.Read<String, Employee> read = HadoopInputFormatIO.<String, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withKeyTranslation(myKeyTranslateWithWrongInputType);
    thrown.expect(IllegalArgumentException.class);
    thrown.expectMessage(String.format(
        "Key translation's input type is not same as hadoop InputFormat : %s key " + "class : %s",
        serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class",
            InputFormat.class),
        serConf.getHadoopConfiguration().getClass("key.class", Object.class)));
    read.validate(input);
  }

  /**
   * This test validates functionality of {@link HadoopInputFormatIO.Read#validate()
   * Read.validate()} when the input type of myValueTranslate (the SimpleFunction provided by the
   * user for value translation) is not the same as the Hadoop InputFormat's value class (the
   * property set in the configuration as "value.class").
   */
  @Test
  public void testReadValidationFailsWithWrongInputTypeValueTranslationFunction() {
    SimpleFunction<LongWritable, String> myValueTranslateWithWrongInputType =
        new SimpleFunction<LongWritable, String>() {
          @Override
          public String apply(LongWritable input) {
            return input.toString();
          }
        };
    HadoopInputFormatIO.Read<Text, String> read = HadoopInputFormatIO.<Text, String>read()
        .withConfiguration(serConf.getHadoopConfiguration())
        .withValueTranslation(myValueTranslateWithWrongInputType);
    String expectedMessage = String.format(
        "Value translation's input type is not same as hadoop InputFormat : " + "%s value class : %s",
        serConf.getHadoopConfiguration().getClass("mapreduce.job.inputformat.class",
            InputFormat.class),
        serConf.getHadoopConfiguration().getClass("value.class", Object.class));
    thrown.expect(IllegalArgumentException.class);
    thrown.expectMessage(expectedMessage);
    read.validate(input);
  }

  @Test
  public void testReadingData() throws Exception {
    HadoopInputFormatIO.Read<Text, Employee> read = HadoopInputFormatIO.<Text, Employee>read()
        .withConfiguration(serConf.getHadoopConfiguration());
    List<KV<Text, Employee>> expected = TestEmployeeDataSet.getEmployeeData();
    PCollection<KV<Text, Employee>> actual = p.apply("ReadTest", read);
    PAssert.that(actual).containsInAnyOrder(expected);
    p.run();
  }

  /**
   * This test validates behavior of {@link HadoopInputFormatBoundedSource} if RecordReader object
   * creation fails.
   */
  @Test
  public void testReadIfCreateRecordReaderFails() throws Exception {
    thrown.expect(Exception.class);
    thrown.expectMessage("Exception in creating RecordReader");
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    Mockito.when(mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
        Mockito.any(TaskAttemptContext.class)))
        .thenThrow(new IOException("Exception in creating RecordReader"));
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit());
    boundedSource.setInputFormatObj(mockInputFormat);
    SourceTestUtils.readFromSource(boundedSource, p.getOptions());
  }

  /**
   * This test validates behavior of HadoopInputFormatSource if
   * {@link InputFormat#createRecordReader() createRecordReader()} of the InputFormat returns null.
   */
  @Test
  public void testReadWithNullCreateRecordReader() throws Exception {
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    thrown.expect(IOException.class);
    thrown.expectMessage(String.format("Null RecordReader object returned by %s",
        mockInputFormat.getClass()));
    Mockito.when(mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
        Mockito.any(TaskAttemptContext.class))).thenReturn(null);
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit());
    boundedSource.setInputFormatObj(mockInputFormat);
    SourceTestUtils.readFromSource(boundedSource, p.getOptions());
  }

  /**
   * This test validates behavior of the
   * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#start() start()} method if the
   * InputFormat's {@link InputFormat#getSplits() getSplits()} returns an InputSplit list having
   * zero records.
   */
  @Test
  public void testReadersStartWhenZeroRecords() throws Exception {
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    EmployeeRecordReader mockReader = Mockito.mock(EmployeeRecordReader.class);
    Mockito.when(mockInputFormat.createRecordReader(Mockito.any(InputSplit.class),
        Mockito.any(TaskAttemptContext.class))).thenReturn(mockReader);
    Mockito.when(mockReader.nextKeyValue()).thenReturn(false);
    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit(mockInputSplit));
    BoundedReader<KV<Text, Employee>> boundedReader = boundedSource.createReader(p.getOptions());
    assertEquals(false, boundedReader.start());
    assertEquals(Double.valueOf(1), boundedReader.getFractionConsumed());
  }

  /**
   * This test validates getFractionConsumed(), which indicates the progress of the read in the
   * range of 0 to 1.
   */
  @Test
  public void testReadersGetFractionConsumed() throws Exception {
    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
    HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(
        EmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    long estimatedSize = hifSource.getEstimatedSizeBytes(p.getOptions());
    // Validate if estimated size is equal to the size of records.
    assertEquals(referenceRecords.size(), estimatedSize);
    List<BoundedSource<KV<Text, Employee>>> boundedSourceList =
        hifSource.splitIntoBundles(0, p.getOptions());
    // Validate if splitIntoBundles() has split correctly.
    assertEquals(TestEmployeeDataSet.NUMBER_OF_SPLITS, boundedSourceList.size());
    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
      List<KV<Text, Employee>> elements = new ArrayList<KV<Text, Employee>>();
      BoundedReader<KV<Text, Employee>> reader = source.createReader(p.getOptions());
      float recordsRead = 0;
      // When start is not called, getFractionConsumed() should return 0.
      assertEquals(Double.valueOf(0), reader.getFractionConsumed());
      boolean start = reader.start();
      assertEquals(true, start);
      if (start) {
        elements.add(reader.getCurrent());
        boolean advance = reader.advance();
        // Validate if getFractionConsumed() returns the correct fraction based on
        // the number of records read in the split.
        assertEquals(
            Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
            reader.getFractionConsumed());
        assertEquals(true, advance);
        while (advance) {
          elements.add(reader.getCurrent());
          advance = reader.advance();
          assertEquals(
              Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
              reader.getFractionConsumed());
        }
        bundleRecords.addAll(elements);
      }
      // Validate if getFractionConsumed() returns 1 after reading is complete.
      assertEquals(Double.valueOf(1), reader.getFractionConsumed());
      reader.close();
    }
    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
  }

  /**
   * This test validates that a reader and its parent source read the same records.
   */
  @Test
  public void testReaderAndParentSourceReadsSameData() throws Exception {
    InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
    HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit(mockInputSplit));
    BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
    SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(reader, p.getOptions());
  }

  /**
   * This test verifies that
   * {@link HadoopInputFormatBoundedSource.HadoopInputFormatReader#getCurrentSource()
   * getCurrentSource()} returns the correct source object.
   */
  @Test
  public void testGetCurrentSourceFunction() throws Exception {
    SerializableSplit split = new SerializableSplit();
    BoundedSource<KV<Text, Employee>> source =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            split);
    BoundedReader<KV<Text, Employee>> hifReader = source.createReader(p.getOptions());
    BoundedSource<KV<Text, Employee>> hifSource = hifReader.getCurrentSource();
    assertEquals(hifSource, source);
  }

  /**
   * This test validates behavior of the {@link HadoopInputFormatBoundedSource#createReader()
   * createReader()} method when {@link HadoopInputFormatBoundedSource#splitIntoBundles()
   * splitIntoBundles()} is not called.
   */
  @Test
  public void testCreateReaderIfSplitIntoBundlesNotCalled() throws Exception {
    HadoopInputFormatBoundedSource<Text, Employee> hifSource = getTestHIFSource(
        EmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    thrown.expect(IOException.class);
    thrown.expectMessage("Cannot create reader as source is not split yet.");
    hifSource.createReader(p.getOptions());
  }

  /**
   * This test validates behavior of
   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when the
   * Hadoop InputFormat's {@link InputFormat#getSplits() getSplits()} returns an empty list.
   */
  @Test
  public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
    InputFormat<?, ?> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class)))
        .thenReturn(new ArrayList<InputSplit>());
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            mockInputSplit);
    thrown.expect(IOException.class);
    thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");
    hifSource.setInputFormatObj(mockInputFormat);
    hifSource.computeSplitsIfNecessary();
  }

  /**
   * This test validates behavior of
   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when the
   * Hadoop InputFormat's {@link InputFormat#getSplits() getSplits()} returns null.
   */
  @Test
  public void testComputeSplitsIfGetSplitsReturnsNullValue() throws Exception {
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class))).thenReturn(null);
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            mockInputSplit);
    thrown.expect(IOException.class);
    thrown.expectMessage("Error in computing splits, getSplits() returns null.");
    hifSource.setInputFormatObj(mockInputFormat);
    hifSource.computeSplitsIfNecessary();
  }

  /**
   * This test validates behavior of
   * {@link HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} if the
   * Hadoop InputFormat's {@link InputFormat#getSplits() getSplits()} returns an InputSplit list
   * containing some null values.
   */
  @Test
  public void testComputeSplitsIfGetSplitsReturnsListHavingNullValues() throws Exception {
    // InputSplit list having null value.
    InputSplit mockInputSplit =
        Mockito.mock(InputSplit.class, Mockito.withSettings().extraInterfaces(Writable.class));
    List<InputSplit> inputSplitList = new ArrayList<InputSplit>();
    inputSplitList.add(mockInputSplit);
    inputSplitList.add(null);
    InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
    Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class)))
        .thenReturn(inputSplitList);
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
        new HadoopInputFormatBoundedSource<Text, Employee>(
            serConf,
            WritableCoder.of(Text.class),
            AvroCoder.of(Employee.class),
            null, // No key translation required.
            null, // No value translation required.
            new SerializableSplit());
    thrown.expect(IOException.class);
    thrown.expectMessage(
        "Error in computing splits, split is null in InputSplits list populated "
            + "by getSplits() : ");
    hifSource.setInputFormatObj(mockInputFormat);
    hifSource.computeSplitsIfNecessary();
  }

  /**
   * This test validates that records emitted in the PCollection are immutable if the InputFormat's
   * RecordReader returns the same objects (i.e. the same locations in memory) but with updated
   * values for each record.
   */
  @Test
  public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreMutable() throws Exception {
    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
        ReuseObjectsEmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
      List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
      bundleRecords.addAll(elems);
    }
    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
  }

  /**
   * Test reading if the InputFormat implements {@link org.apache.hadoop.conf.Configurable
   * Configurable}.
   */
  @Test
  public void testReadingWithConfigurableInputFormat() throws Exception {
    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
        ConfigurableEmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
      // Cast to HadoopInputFormatBoundedSource to access getInputFormat().
      @SuppressWarnings("unchecked")
      HadoopInputFormatBoundedSource<Text, Employee> hifSource =
          (HadoopInputFormatBoundedSource<Text, Employee>) source;
      hifSource.createInputFormatInstance();
      ConfigurableEmployeeInputFormat inputFormatObj =
          (ConfigurableEmployeeInputFormat) hifSource.getInputFormat();
      assertEquals(true, inputFormatObj.isConfSet);
    }
  }

  /**
   * This test validates that records emitted in the PCollection are immutable if the InputFormat's
   * {@link org.apache.hadoop.mapreduce.RecordReader RecordReader} returns different objects (i.e.
   * different locations in memory).
   */
  @Test
  public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreImmutable() throws Exception {
    List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList(
        EmployeeInputFormat.class,
        Text.class,
        Employee.class,
        WritableCoder.of(Text.class),
        AvroCoder.of(Employee.class));
    List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
    for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
      List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
      bundleRecords.addAll(elems);
    }
    List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
    assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
  }

  private static SerializableConfiguration loadTestConfiguration(Class<?> inputFormatClassName,
      Class<?> keyClass, Class<?> valueClass) {
    Configuration conf = new Configuration();
    conf.setClass("mapreduce.job.inputformat.class", inputFormatClassName, InputFormat.class);
    conf.setClass("key.class", keyClass, Object.class);
    conf.setClass("value.class", valueClass, Object.class);
    return new SerializableConfiguration(conf);
  }

  private <K, V> HadoopInputFormatBoundedSource<K, V> getTestHIFSource(Class<?> inputFormatClass,
      Class<K> inputFormatKeyClass, Class<V> inputFormatValueClass, Coder<K> keyCoder,
      Coder<V> valueCoder) {
    SerializableConfiguration serConf =
        loadTestConfiguration(inputFormatClass, inputFormatKeyClass, inputFormatValueClass);
    return new HadoopInputFormatBoundedSource<K, V>(serConf, keyCoder, valueCoder,
        null, // No key translation required.
        null); // No value translation required.
  }

  private <K, V> List<BoundedSource<KV<K, V>>> getBoundedSourceList(Class<?> inputFormatClass,
      Class<K> inputFormatKeyClass, Class<V> inputFormatValueClass, Coder<K> keyCoder,
      Coder<V> valueCoder) throws Exception {
    HadoopInputFormatBoundedSource<K, V> boundedSource = getTestHIFSource(inputFormatClass,
        inputFormatKeyClass, inputFormatValueClass, keyCoder, valueCoder);
    return boundedSource.splitIntoBundles(0, p.getOptions());
  }
}
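For context on what these tests exercise, the snippet below is a minimal sketch of how HadoopInputFormatIO.read() is typically applied in a user pipeline: the Hadoop Configuration carries the same "mapreduce.job.inputformat.class", "key.class" and "value.class" properties that loadTestConfiguration() sets above, and a SimpleFunction may be supplied to translate values, as in myValueTranslate. Here, MyInputFormat is a hypothetical InputFormat<LongWritable, Text> standing in for a real one; Pipeline, PipelineOptionsFactory and the other classes are the standard Beam and Hadoop types already imported in the test file.

// Minimal usage sketch (not part of the test suite). MyInputFormat is hypothetical.
Configuration hadoopConf = new Configuration();
hadoopConf.setClass("mapreduce.job.inputformat.class", MyInputFormat.class, InputFormat.class);
hadoopConf.setClass("key.class", LongWritable.class, Object.class);
hadoopConf.setClass("value.class", Text.class, Object.class);

Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());
PCollection<KV<LongWritable, String>> rows = pipeline.apply(
    "ReadFromHadoopInputFormat",
    HadoopInputFormatIO.<LongWritable, String>read()
        .withConfiguration(hadoopConf)
        // Translate each Text value to String, mirroring the tests' value translation.
        .withValueTranslation(new SimpleFunction<Text, String>() {
          @Override
          public String apply(Text input) {
            return input.toString();
          }
        }));
pipeline.run().waitUntilFinish();

The tests above verify exactly the pieces this sketch relies on: that the configuration, key translation and value translation can be supplied in any order, that missing or null configuration entries fail fast, and that the resulting bounded source splits and reads records correctly.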