Java tutorial: PentahoMapReduceIntegrationTest, a JUnit integration test for Pentaho's Hadoop MapReduce wrappers (PentahoMapRunnable, GenericTransCombiner, and GenericTransReduce). The full source listing follows.
/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.hadoop.mapreduce.test;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.KettleEnvironment;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.logging.LoggingRegistry;
import org.pentaho.di.core.plugins.Plugin;
import org.pentaho.di.core.plugins.PluginInterface;
import org.pentaho.di.core.plugins.PluginMainClassType;
import org.pentaho.di.core.plugins.PluginRegistry;
import org.pentaho.di.core.plugins.StepPluginType;
import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.trans.TransConfiguration;
import org.pentaho.di.trans.TransExecutionConfiguration;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.steps.hadoopenter.HadoopEnterMeta;
import org.pentaho.di.trans.steps.hadoopexit.HadoopExitMeta;
import org.pentaho.hadoop.mapreduce.GenericTransCombiner;
import org.pentaho.hadoop.mapreduce.GenericTransReduce;
import org.pentaho.hadoop.mapreduce.PentahoMapRunnable;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;

import static org.junit.Assert.*;

@SuppressWarnings({ "nls", "unchecked", "deprecation", "rawtypes" })
public class PentahoMapReduceIntegrationTest {

  private boolean debug = false;

  @Before
  public void setup() throws KettleException {
    KettleEnvironment.init();
  }

  @After
  public void shutdown() {
  }

  @Test
  public void testMapperBadInjectorFields() throws IOException, KettleException {
    try {
      PentahoMapRunnable mapper = new PentahoMapRunnable();
      MockOutputCollector outputCollector = new MockOutputCollector();
      MockReporter reporter = new MockReporter();
      MockRecordReader reader = new MockRecordReader(Arrays.asList("test"));
      mapper.configure(createJobConf("./src/test/resources/bad-injector-fields.ktr",
          "./src/test/resources/wordcount-reducer.ktr"));
      mapper.run(reader, outputCollector, reporter);
      fail("Should have thrown an exception");
    } catch (IOException e) {
      assertTrue("Test for KettleException",
          e.getMessage().contains("key or value is not defined in transformation injector step"));
    }
  }

  @Test
  public void testMapperBadOutputFields() throws IOException, KettleException {
    try {
      PentahoMapRunnable mapper = new PentahoMapRunnable();
      MockOutputCollector outputCollector = new MockOutputCollector();
      MockReporter reporter = new MockReporter();
      MockRecordReader reader = new MockRecordReader(Arrays.asList("test"));
      mapper.configure(createJobConf("./src/test/resources/bad-output-fields.ktr",
          "./src/test/resources/bad-output-fields.ktr"));
      mapper.run(reader, outputCollector, reporter);
      fail("Should have thrown an exception");
    } catch (IOException e) {
      assertTrue("Test for KettleException",
          e.getMessage().contains("outKey or outValue is not defined in transformation output stream"));
    }
  }

  @Test
  public void testMapperNoInjectorStep() throws IOException, KettleException {
    try {
      PentahoMapRunnable mapper = new PentahoMapRunnable();
      MockOutputCollector outputCollector = new MockOutputCollector();
      MockReporter reporter = new MockReporter();
      MockRecordReader reader = new MockRecordReader(Arrays.asList("test"));
      mapper.configure(createJobConf("./src/test/resources/no-injector-step.ktr",
          "./src/test/resources/no-injector-step.ktr"));
      mapper.run(reader, outputCollector, reporter);
      fail("Should have thrown an exception");
    } catch (IOException e) {
      assertTrue("Test for KettleException",
          e.getMessage().contains("Unable to find thread with name Injector and copy number 0"));
    }
  }

  @Test
  public void testMapperNoOutputStep() throws IOException, KettleException {
    try {
      PentahoMapRunnable mapper = new PentahoMapRunnable();
      MockOutputCollector outputCollector = new MockOutputCollector();
      MockReporter reporter = new MockReporter();
      MockRecordReader reader = new MockRecordReader(Arrays.asList("test"));
      mapper.configure(createJobConf("./src/test/resources/no-output-step.ktr",
          "./src/test/resources/no-output-step.ktr"));
      mapper.run(reader, outputCollector, reporter);
      fail("Should have thrown an exception");
    } catch (IOException e) {
      assertTrue("Test for KettleException",
          e.getMessage().contains("Output step not defined in transformation"));
    }
  }

  @Test
  public void testReducerBadInjectorFields() throws IOException, KettleException {
    try {
      GenericTransReduce reducer = new GenericTransReduce();
      MockOutputCollector outputCollector = new MockOutputCollector();
      MockReporter reporter = new MockReporter();
      reducer.configure(createJobConf("./src/test/resources/bad-injector-fields.ktr",
          "./src/test/resources/bad-injector-fields.ktr"));
      reducer.reduce(new Text("key"), Arrays.asList(new Text("value")).iterator(), outputCollector, reporter);
      fail("Should have thrown an exception");
    } catch (IOException e) {
      assertTrue("Test for KettleException",
          e.getMessage().contains("key or value is not defined in transformation injector step"));
    }
  }

  @Test
  public void testReducerNoInjectorStep() throws IOException, KettleException {
    try {
      GenericTransReduce reducer = new GenericTransReduce();
      MockOutputCollector outputCollector = new MockOutputCollector();
      MockReporter reporter = new MockReporter();
      reducer.configure(createJobConf("./src/test/resources/no-injector-step.ktr",
          "./src/test/resources/no-injector-step.ktr"));
      reducer.reduce(new Text("key"), Arrays.asList(new Text("value")).iterator(), outputCollector, reporter);
      fail("Should have thrown an exception");
    } catch (IOException e) {
      assertTrue("Test for KettleException",
          e.getMessage().contains("Unable to find thread with name Injector and copy number 0"));
    }
  }

  @Test
  public void testReducerNoOutputStep() throws IOException, KettleException {
    try {
      GenericTransReduce reducer = new GenericTransReduce();
      MockOutputCollector outputCollector = new MockOutputCollector();
      MockReporter reporter = new MockReporter();
      reducer.configure(createJobConf("./src/test/resources/no-output-step.ktr",
          "./src/test/resources/no-output-step.ktr"));
      reducer.reduce(new Text("key"), Arrays.asList(new Text("value")).iterator(), outputCollector, reporter);
      fail("Should have thrown an exception");
    } catch (IOException e) {
      assertTrue("Test for KettleException",
          e.getMessage().contains("Output step not defined in transformation"));
    }
  }

  public static JobConf createJobConf(String mapperTransformationFile, String reducerTransformationFile)
      throws IOException, KettleException {
    return createJobConf(mapperTransformationFile, null, reducerTransformationFile, "localhost", "9000", "9001");
  }

  public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
      String reducerTransformationFile) throws IOException, KettleException {
    return createJobConf(mapperTransformationFile, combinerTransformationFile, reducerTransformationFile,
        "localhost", "9000", "9001");
  }

  public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
      String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
      throws IOException, KettleException {

    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class.getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
        mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
        "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
        new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
        mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
        "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
        new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();
    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
      conf.setMapRunnerClass(PentahoMapRunnable.class);
      transMeta = new TransMeta(mapperTransformationFile);
      transConfig = new TransConfiguration(transMeta, transExecConfig);
      conf.set("transformation-map-xml", transConfig.getXML());
      conf.set("transformation-map-input-stepname", "Injector");
conf.set("transformation-map-output-stepname", "Output"); } if (combinerTransformationFile != null) { conf.setCombinerClass(GenericTransCombiner.class); transMeta = new TransMeta(combinerTransformationFile); transConfig = new TransConfiguration(transMeta, transExecConfig); conf.set("transformation-combiner-xml", transConfig.getXML()); conf.set("transformation-combiner-input-stepname", "Injector"); conf.set("transformation-combiner-output-stepname", "Output"); } if (reducerTransformationFile != null) { conf.setReducerClass(GenericTransReduce.class); transMeta = new TransMeta(reducerTransformationFile); transConfig = new TransConfiguration(transMeta, transExecConfig); conf.set("transformation-reduce-xml", transConfig.getXML()); conf.set("transformation-reduce-input-stepname", "Injector"); conf.set("transformation-reduce-output-stepname", "Output"); } conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar"); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path("/")); FileOutputFormat.setOutputPath(conf, new Path("/")); conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort); conf.set("mapred.job.tracker", hostname + ":" + trackerPort); conf.setJar(jar.toURI().toURL().toExternalForm()); conf.setWorkingDirectory(new Path("/tmp/wordcount")); return conf; } public static String getTransformationXml(String transFilename) throws IOException { StringBuilder sb = new StringBuilder(); BufferedReader reader = new BufferedReader(new FileReader(transFilename)); String line = null; while ((line = reader.readLine()) != null) { sb.append(line + Const.CR); } return sb.toString(); } public static class MockOutputCollector implements OutputCollector { private Map<Object, ArrayList<Object>> collection = new HashMap<Object, ArrayList<Object>>(); private AtomicBoolean closed = new AtomicBoolean(false); public void close() { closed.set(true); } @Override public void collect(Object arg0, Object arg1) throws IOException { if (closed.get()) { return; } if (!collection.containsKey(arg0)) { collection.put(arg0, new ArrayList<Object>()); } collection.get(arg0).add(arg1); } public Map<Object, ArrayList<Object>> getCollection() { return collection; } } public static class MockReporter implements Reporter { @Override public void progress() { // TODO Auto-generated method stub } @Override public Counter getCounter(Enum<?> arg0) { // TODO Auto-generated method stub return null; } @Override public Counter getCounter(String arg0, String arg1) { // TODO Auto-generated method stub return null; } @Override public InputSplit getInputSplit() throws UnsupportedOperationException { // TODO Auto-generated method stub return null; } @Override public void incrCounter(Enum<?> arg0, long arg1) { // TODO Auto-generated method stub } @Override public void incrCounter(String arg0, String arg1, long arg2) { // TODO Auto-generated method stub } @Override public void setStatus(String arg0) { // TODO Auto-generated method stub } } class MockRecordReader implements RecordReader<Text, Text> { private Iterator<String> rowIter; int rowNum = -1; int totalRows; // Make them provide a pre-filled list so we don't confuse the overhead of generating strings // with the time it takes to run the mapper public MockRecordReader(List<String> rows) { totalRows = rows.size(); rowIter = rows.iterator(); } @Override public boolean next(Text key, Text value) throws IOException { 
      if (!rowIter.hasNext()) {
        return false;
      }
      rowNum++;
      key.set(String.valueOf(rowNum));
      value.set(rowIter.next());
      return true;
    }

    @Override
    public Text createKey() {
      return new Text();
    }

    @Override
    public Text createValue() {
      return new Text();
    }

    @Override
    public long getPos() throws IOException {
      return rowNum;
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public float getProgress() throws IOException {
      // Cast to float so we don't do integer division, which would report 0 progress until the last row
      return (rowNum + 1) / (float) totalRows;
    }
  }

  @Test
  public void testMapRunnable_wordCount() throws IOException, KettleException {
    PentahoMapRunnable mapRunnable = new PentahoMapRunnable();
    MockOutputCollector outputCollector = new MockOutputCollector();
    MockReporter reporter = new MockReporter();

    mapRunnable.configure(createJobConf("./src/test/resources/wordcount-mapper.ktr",
        "./src/test/resources/wordcount-reducer.ktr"));

    final int ROWS = 10000;
    List<String> strings = new ArrayList<String>();
    for (int i = 0; i < ROWS; i++) {
      strings.add("zebra giraffe hippo elephant tiger");
    }
    MockRecordReader reader = new MockRecordReader(strings);

    long start = System.currentTimeMillis();
    mapRunnable.run(reader, outputCollector, reporter);
    outputCollector.close();
    long stop = System.currentTimeMillis();

    System.out.println("Executed " + ROWS + " in " + (stop - start) + "ms");
    System.out.println("Average: " + ((stop - start) / (float) ROWS) + "ms");
    System.out.println("Rows/Second: " + (ROWS / ((stop - start) / 1000f)));

    class CountValues {
      private Object workingKey;

      public CountValues setWorkingKey(Object k) {
        workingKey = k;
        return this;
      }

      public void countValues(String k, Object v, MockOutputCollector oc) {
        if (workingKey.equals(new Text(k))) {
          assertEquals(k.toString(), v, oc.getCollection().get(new Text(k)).size());
        }
      }
    }

    assertNotNull(outputCollector);
    assertNotNull(outputCollector.getCollection());
    assertNotNull(outputCollector.getCollection().keySet());
    assertEquals(5, outputCollector.getCollection().keySet().size());

    CountValues cv = new CountValues();
    for (Object key : outputCollector.getCollection().keySet()) {
      cv.setWorkingKey(key).countValues("zebra", ROWS, outputCollector);
      cv.setWorkingKey(key).countValues("giraffe", ROWS, outputCollector);
      cv.setWorkingKey(key).countValues("hippo", ROWS, outputCollector);
      cv.setWorkingKey(key).countValues("elephant", ROWS, outputCollector);
      cv.setWorkingKey(key).countValues("tiger", ROWS, outputCollector);
      // TODO: Add words that will not exist: unique words, same words - diff case
      if (debug) {
        for (Object value : outputCollector.getCollection().get(key)) {
          System.out.println(key + ": " + value);
        }
      }
    }

    GenericTransReduce reducer = new GenericTransReduce();
    MockOutputCollector inputCollector = outputCollector;
    outputCollector = new MockOutputCollector();

    reducer.configure(createJobConf("./src/test/resources/wordcount-mapper.ktr",
        "./src/test/resources/wordcount-reducer.ktr"));

    start = System.currentTimeMillis();
    for (Object key : inputCollector.getCollection().keySet()) {
      System.out.println("reducing: " + key);
      reducer.reduce((Text) key, new ArrayList(inputCollector.getCollection().get(key)).iterator(),
          outputCollector, reporter);
    }
    reducer.close();
    outputCollector.close();
    stop = System.currentTimeMillis();

    System.out.println("Executed " + ROWS + " in " + (stop - start) + "ms");
    System.out.println("Average: " + ((stop - start) / (float) ROWS) + "ms");
    System.out.println("Rows/Second: " + (ROWS / ((stop - start) / 1000f)));

    assertNotNull(outputCollector);
    assertNotNull(outputCollector.getCollection());
    assertNotNull(outputCollector.getCollection().keySet());
    assertEquals(5, outputCollector.getCollection().keySet().size());

    class CheckValues {
      private Object workingKey;

      public CheckValues setWorkingKey(Object k) {
        workingKey = k;
        return this;
      }

      public void checkValues(String k, Object v, MockOutputCollector oc) {
        if (workingKey.equals(new Text(k))) {
          assertEquals(k.toString(), v, ((IntWritable) oc.getCollection().get(new Text(k)).get(0)).get());
        }
      }
    }

    CheckValues cv2 = new CheckValues();
    for (Object key : outputCollector.getCollection().keySet()) {
      cv2.setWorkingKey(key).checkValues("zebra", ROWS, outputCollector);
      cv2.setWorkingKey(key).checkValues("giraffe", ROWS, outputCollector);
      cv2.setWorkingKey(key).checkValues("hippo", ROWS, outputCollector);
      cv2.setWorkingKey(key).checkValues("elephant", ROWS, outputCollector);
      cv2.setWorkingKey(key).checkValues("tiger", ROWS, outputCollector);
      if (debug) {
        for (Object value : outputCollector.getCollection().get(key)) {
          System.out.println(key + ": " + value);
        }
      }
    }
  }

  @Test
  public void testMapper_null_output_value() throws Exception {
    PentahoMapRunnable mapper = new PentahoMapRunnable();
    MockOutputCollector outputCollector = new MockOutputCollector();
    MockReporter reporter = new MockReporter();

    mapper.configure(createJobConf("./src/test/resources/null-test.ktr", "./src/test/resources/null-test.ktr"));

    MockRecordReader reader = new MockRecordReader(Arrays.asList("test"));
    mapper.run(reader, outputCollector, reporter);
    outputCollector.close();

    Exception ex = mapper.getException();
    if (ex != null) {
      ex.printStackTrace();
    }
    assertNull("Exception thrown", ex);
    assertEquals("Received output when we didn't expect any. <null>s aren't passed through.", 0,
        outputCollector.getCollection().size());
  }

  @Test
  public void testCombiner_null_output_value() throws Exception {
    GenericTransCombiner combiner = new GenericTransCombiner();
    MockOutputCollector outputCollector = new MockOutputCollector();
    MockReporter reporter = new MockReporter();

    combiner.configure(createJobConf(null, "./src/test/resources/null-test.ktr", null));

    combiner.reduce(new Text("0"), Arrays.asList(new Text("test")).iterator(), outputCollector, reporter);
    outputCollector.close();

    Exception ex = combiner.getException();
    if (ex != null) {
      ex.printStackTrace();
    }
    assertNull("Exception thrown", ex);
    assertEquals("Received output when we didn't expect any. <null>s aren't passed through.", 0,
        outputCollector.getCollection().size());
  }

  @Test
  public void testReducer_null_output_value() throws Exception {
    GenericTransReduce reducer = new GenericTransReduce();
    MockOutputCollector outputCollector = new MockOutputCollector();
    MockReporter reporter = new MockReporter();

    reducer.configure(createJobConf("./src/test/resources/null-test.ktr", "./src/test/resources/null-test.ktr"));

    reducer.reduce(new Text("0"), Arrays.asList(new Text("test")).iterator(), outputCollector, reporter);
    outputCollector.close();

    Exception ex = reducer.getException();
    if (ex != null) {
      ex.printStackTrace();
    }
    assertNull("Exception thrown", ex);
    assertEquals("Received output when we didn't expect any. <null>s aren't passed through.", 0,
        outputCollector.getCollection().size());
  }

  @Test
  public void testLogChannelLeaking_mapper() throws Exception {
    JobConf jobConf = createJobConf("./src/test/resources/wordcount-mapper.ktr",
        "./src/test/resources/wordcount-reducer.ktr", "./src/test/resources/wordcount-reducer.ktr");
    PentahoMapRunnable mapper = new PentahoMapRunnable();
    mapper.configure(jobConf);
    MockReporter reporter = new MockReporter();

    // We expect 5 log channels per run. The total should never grow past logChannels + 5.
    final int EXPECTED_CHANNELS_PER_RUN = 5;
    final int logChannels = LoggingRegistry.getInstance().getMap().size();

    // Run the mapper this many times
    final int RUNS = 10;
    for (int i = 0; i < RUNS; i++) {
      MockRecordReader reader = new MockRecordReader(Arrays.asList("test"));
      MockOutputCollector outputCollector = new MockOutputCollector();
      mapper.run(reader, outputCollector, reporter);
      outputCollector.close();

      Exception ex = mapper.getException();
      if (ex != null) {
        ex.printStackTrace();
      }
      assertNull("Exception thrown", ex);
      assertEquals("Incorrect output", 1, outputCollector.getCollection().size());
      assertEquals("LogChannels are not being cleaned up. On Run #" + i + " we have too many.",
          logChannels + EXPECTED_CHANNELS_PER_RUN, LoggingRegistry.getInstance().getMap().size());
    }
    assertEquals(logChannels + EXPECTED_CHANNELS_PER_RUN, LoggingRegistry.getInstance().getMap().size());
  }

  @Test
  public void testLogChannelLeaking_combiner() throws Exception {
    JobConf jobConf = createJobConf("./src/test/resources/wordcount-mapper.ktr",
        "./src/test/resources/wordcount-reducer.ktr", "./src/test/resources/wordcount-reducer.ktr");
    List<IntWritable> input = Arrays.asList(new IntWritable(1));
    GenericTransCombiner combiner = new GenericTransCombiner();
    combiner.configure(jobConf);
    MockReporter reporter = new MockReporter();

    // We expect 4 log channels per run. The total should never grow past logChannels + 4.
    final int EXPECTED_CHANNELS_PER_RUN = 4;
    final int logChannels = LoggingRegistry.getInstance().getMap().size();

    // Run the combiner this many times
    final int RUNS = 10;
    for (int i = 0; i < RUNS; i++) {
      MockOutputCollector outputCollector = new MockOutputCollector();
      combiner.reduce(new Text(String.valueOf(i)), input.iterator(), outputCollector, reporter);
      combiner.close();
      outputCollector.close();

      Exception ex = combiner.getException();
      if (ex != null) {
        ex.printStackTrace();
      }
      assertNull("Exception thrown", ex);
      assertEquals("Incorrect output", 1, outputCollector.getCollection().size());
      assertEquals("LogChannels are not being cleaned up. On Run #" + i + " we have too many.",
          logChannels + EXPECTED_CHANNELS_PER_RUN, LoggingRegistry.getInstance().getMap().size());
    }
    assertEquals(logChannels + EXPECTED_CHANNELS_PER_RUN, LoggingRegistry.getInstance().getMap().size());
  }

  @Test
  public void testLogChannelLeaking_reducer() throws Exception {
    JobConf jobConf = createJobConf("./src/test/resources/wordcount-mapper.ktr",
        "./src/test/resources/wordcount-reducer.ktr", "./src/test/resources/wordcount-reducer.ktr");
    List<IntWritable> input = Arrays.asList(new IntWritable(1));
    GenericTransReduce reducer = new GenericTransReduce();
    reducer.configure(jobConf);
    MockReporter reporter = new MockReporter();

    // We expect 4 log channels per run. The total should never grow past logChannels + 4.
    final int EXPECTED_CHANNELS_PER_RUN = 4;
    final int logChannels = LoggingRegistry.getInstance().getMap().size();

    // Run the reducer this many times
    final int RUNS = 10;
    for (int i = 0; i < RUNS; i++) {
      MockOutputCollector outputCollector = new MockOutputCollector();
      reducer.reduce(new Text(String.valueOf(i)), input.iterator(), outputCollector, reporter);
      reducer.close();
      outputCollector.close();

      Exception ex = reducer.getException();
      if (ex != null) {
        ex.printStackTrace();
      }
      assertNull("Exception thrown", ex);
      assertEquals("Incorrect output", 1, outputCollector.getCollection().size());
      assertEquals("LogChannels are not being cleaned up. On Run #" + i + " we have too many.",
          logChannels + EXPECTED_CHANNELS_PER_RUN, LoggingRegistry.getInstance().getMap().size());
    }
    assertEquals(logChannels + EXPECTED_CHANNELS_PER_RUN, LoggingRegistry.getInstance().getMap().size());
  }

  // TODO Create tests for exception propagation from RowListeners in Mapper, Combiner, and Reducer

  @Test
  public void testMapReduce_InputOutput() throws Exception {
    JobConf jobConf = createJobConf("./src/test/resources/mr-input-output.ktr",
        "./src/test/resources/mr-passthrough.ktr", "./src/test/resources/mr-passthrough.ktr");
    PentahoMapRunnable mapper = new PentahoMapRunnable();
    mapper.configure(jobConf);
    MockReporter reporter = new MockReporter();
    MockOutputCollector outputCollector = new MockOutputCollector();
    MockRecordReader reader = new MockRecordReader(Arrays.asList("1", "2", "3"));

    mapper.run(reader, outputCollector, reporter);
    outputCollector.close();

    Exception ex = mapper.getException();
    if (ex != null) {
      ex.printStackTrace();
    }
    assertNull("Exception thrown", ex);
    assertEquals("Incorrect output", 3, outputCollector.getCollection().size());
    assertEquals("Validating output collector", new IntWritable(0),
        outputCollector.getCollection().get(new Text("1")).get(0));
    assertEquals("Validating output collector", new IntWritable(1),
        outputCollector.getCollection().get(new Text("2")).get(0));
    assertEquals("Validating output collector", new IntWritable(2),
        outputCollector.getCollection().get(new Text("3")).get(0));
  }

  @Test
  public void testCombinerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./src/test/resources/wordcount-mapper.ktr",
        "./src/test/resources/wordcount-reducer.ktr", "./src/test/resources/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransCombiner combiner = new GenericTransCombiner();
    combiner.configure(jobConf);

    assertEquals(jobConf.getMapOutputKeyClass(), combiner.getOutClassK());
    assertEquals(jobConf.getMapOutputValueClass(), combiner.getOutClassV());
  }

  @Test
  public void testReducerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./src/test/resources/wordcount-mapper.ktr",
        "./src/test/resources/wordcount-reducer.ktr", "./src/test/resources/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransReduce reducer = new GenericTransReduce();
    reducer.configure(jobConf);

    assertEquals(jobConf.getOutputKeyClass(), reducer.getOutClassK());
    assertEquals(jobConf.getOutputValueClass(), reducer.getOutClassV());
  }

  @Test
  public void testTaskIdExtraction() throws Exception {
    JobConf conf =
createJobConf("./src/test/resources/wordcount-mapper.ktr", "./src/test/resources/wordcount-reducer.ktr", "./src/test/resources/wordcount-reducer.ktr"); conf.set("mapred.task.id", "job_201208090841_0133"); PentahoMapRunnable mapRunnable = new PentahoMapRunnable(); mapRunnable.configure(conf); Field variableSpaceField = PentahoMapRunnable.class.getDeclaredField("variableSpace"); variableSpaceField.setAccessible(true); VariableSpace variableSpace = (VariableSpace) variableSpaceField.get(mapRunnable); String s = variableSpace.getVariable("Internal.Hadoop.NodeNumber"); assertEquals("133", s); } @Test public void testTaskIdExtraction_over_10000() throws Exception { JobConf conf = createJobConf("./src/test/resources/wordcount-mapper.ktr", "./src/test/resources/wordcount-reducer.ktr", "./src/test/resources/wordcount-reducer.ktr"); conf.set("mapred.task.id", "job_201208090841_013302"); PentahoMapRunnable mapRunnable = new PentahoMapRunnable(); mapRunnable.configure(conf); Field variableSpaceField = PentahoMapRunnable.class.getDeclaredField("variableSpace"); variableSpaceField.setAccessible(true); VariableSpace variableSpace = (VariableSpace) variableSpaceField.get(mapRunnable); String s = variableSpace.getVariable("Internal.Hadoop.NodeNumber"); assertEquals("13302", s); } }