/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.tuplemr.mapred.lib.output;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import junit.framework.Assert;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.junit.Test;

import com.datasalt.pangool.BaseTest;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Schema.Field;
import com.datasalt.pangool.io.Schema.Field.Type;
import com.datasalt.pangool.tuplemr.IdentityTupleMapper;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.MapOnlyJobBuilder;
import com.datasalt.pangool.tuplemr.MultipleOutputsCollector;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.mapred.MapOnlyMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.input.TupleTextInputFormat.FieldSelector;
import com.datasalt.pangool.utils.CommonUtils;
import com.datasalt.pangool.utils.HadoopUtils;

import com.google.common.io.Files;

@SuppressWarnings({ "rawtypes", "serial" })
public class TestTupleTextInputOutputFormat extends BaseTest implements Serializable {

  public static String OUT = TestTupleTextInputOutputFormat.class.getName() + "-out";
  public static String IN = TestTupleTextInputOutputFormat.class.getName() + "-in";

  public static enum TestEnum {
    MICKEY, MOUSE, MINIE;
  }

  /*
   * A test for finding race conditions in initializing InputSplits
   */
  @Test
  public void testSplits() throws Exception {
    BufferedWriter writer = new BufferedWriter(new FileWriter(IN));
    for (int i = 0; i < 10000; i++) {
      writer.write("str1" + " " + "str2" + " " + "30" + " " + "4000" + "\n");
    }
    writer.close();

    Schema schema = new Schema("schema", Fields.parse("a:string, b:string, c:int, d:long"));
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
        TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);

    Configuration conf = getConf();
    // Use tiny split and block sizes so the input file is divided into many InputSplits
    conf.setLong("mapred.min.split.size", 10 * 1024);
    conf.setLong("dfs.block.size", 10 * 1024);
    conf.setLong("mapred.max.split.size", 10 * 1024);

    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);

    MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
    mapOnly.addInput(new Path(IN), inputFormat,
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context)
              throws IOException, InterruptedException {
            Assert.assertEquals("str1", key.get("a").toString());
            Assert.assertEquals("str2", key.get("b").toString());
            Assert.assertEquals((Integer) 30, (Integer) key.get("c"));
            Assert.assertEquals((Long) 4000L, (Long) key.get("d"));
            context.getCounter("stats", "nlines").increment(1);
          }
        });

    HadoopUtils.deleteIfExists(fS, outPath);
    mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class),
        NullWritable.class, NullWritable.class);
    Job job = mapOnly.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mapOnly.cleanUpInstanceFiles();
    }
    HadoopUtils.deleteIfExists(fS, new Path(IN));

    assertEquals(10000, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
  }
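
  /*
   * Reads the gzipped files under src/test/resources to check that compressed text
   * input is transparently decompressed and parsed into tuples (100 lines expected).
   */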
  @Test
  public void testInputCompression() throws Exception {
    Schema schema = new Schema("schema", Fields.parse("a:string, b:string, c:int, d:long"));
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
        TupleTextInputFormat.NO_QUOTE_CHARACTER, TupleTextInputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);

    MapOnlyJobBuilder mapOnly = new MapOnlyJobBuilder(conf);
    mapOnly.addInput(new Path("src/test/resources/*.gz"), inputFormat,
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context)
              throws IOException, InterruptedException {
            Assert.assertNotNull(key.get("a").toString());
            Assert.assertNotNull(key.get("b").toString());
            Assert.assertTrue((Integer) key.get("c") > 0);
            Assert.assertTrue((Long) key.get("d") > 0);
            context.getCounter("stats", "nlines").increment(1);
          }
        });

    HadoopUtils.deleteIfExists(fS, outPath);
    mapOnly.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class),
        NullWritable.class, NullWritable.class);
    Job job = mapOnly.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mapOnly.cleanUpInstanceFiles();
    }
    HadoopUtils.deleteIfExists(fS, new Path(IN));

    assertEquals(100, job.getCounters().getGroup("stats").findCounter("nlines").getValue());
  }

  @Test
  public void test() throws TupleMRException, IOException, InterruptedException,
      ClassNotFoundException {

    String line1 = "foo1\t10.0\t ar \t1.0\t100\t1000000\ttrue\tMICKEY";
    String line2 = "foo2\t20.0\tbar2\t2.0\t200\t2000000\tfalse\tMOUSE";
    String line3 = "foo3\t30.0\tbar3\t3.0\t300\t3000000\ttrue\tMINIE";

    // The input is a simple tab-separated file with no quotes
    CommonUtils.writeTXT(line1 + "\n" + line2 + "\n" + line3, new File(IN));
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    // Define the Schema according to the text file
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("strField1", Type.STRING));
    fields.add(Field.create("floatField", Type.FLOAT));
    fields.add(Field.create("strField2", Type.STRING));
    fields.add(Field.create("doubleField", Type.DOUBLE));
    fields.add(Field.create("intField", Type.INT));
    fields.add(Field.create("longField", Type.LONG));
    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));
    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really

    /*
     * Define the Input Format and the Output Format!
     */
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, '\t',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, '\t',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);

    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals(line1 + "\n" + line2 + "\n" + line3,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
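
  /*
   * Round-trips a CSV file that uses a quote character and an escape character. Note
   * that the output quotes every field, including the numeric ones.
   */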
  @Test
  public void test2() throws TupleMRException, IOException, InterruptedException,
      ClassNotFoundException {

    String line1 = "1,\"Kabul\",\"AFG\",\"Kabol\",1780000";
    String line2 = "2,\"Qandahar\",\"AFG\",\"Qandahar\",237500";
    String line1out = "\"1\",\"Kabul\",\"AFG\",\"Kabol\",\"1780000\"";
    String line2out = "\"2\",\"Qandahar\",\"AFG\",\"Qandahar\",\"237500\"";

    // The input is a comma-separated file where only the string fields are quoted
    CommonUtils.writeTXT(line1 + "\n" + line2, new File(IN));
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    // Define the Schema according to the text file
    Schema schema = new Schema("schema",
        Fields.parse("id:int,name:string,country_code:string,district:string,population:int"));

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("id"); // but we don't care, really

    /*
     * Define the Input Format and the Output Format!
     */
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ',', '"', '\\');

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);

    try {
      Job job = builder.createJob();
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals(line1out + "\n" + line2out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
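
  /*
   * The second constructor argument (true here for both formats) makes the input
   * format skip a header line and the output format write one back, so the file
   * round-trips unchanged.
   */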
  @Test
  public void testHeader() throws TupleMRException, IOException, InterruptedException,
      ClassNotFoundException {

    String line1 = "foo1 10.0 bar1 1.0 100 1000000 true MICKEY";
    String line2 = "foo2 20.0 bar2 2.0 200 2000000 false MOUSE";
    String line3 = "foo3 30.0 bar3 3.0 300 3000000 true MINIE";
    String outHeader = "strField1 floatField strField2 doubleField intField longField booleanField enumField";

    // The input is a simple space-separated file with no quotes
    CommonUtils.writeTXT(outHeader + "\n" + line1 + "\n" + line2 + "\n" + line3, new File(IN));
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    // Define the Schema according to the text file
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("strField1", Type.STRING));
    fields.add(Field.create("floatField", Type.FLOAT));
    fields.add(Field.create("strField2", Type.STRING));
    fields.add(Field.create("doubleField", Type.DOUBLE));
    fields.add(Field.create("intField", Type.INT));
    fields.add(Field.create("longField", Type.LONG));
    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));
    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really

    /*
     * Define the Input Format and the Output Format!
     */
    InputFormat inputFormat = new TupleTextInputFormat(schema, true, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER,
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, true, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);

    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals(outHeader + "\n" + line1 + "\n" + line2 + "\n" + line3,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }

  /*
   * \N and empty unquoted fields are read as null; escaped quotes are unescaped; a
   * quoted empty field is the empty string, not null.
   */
  @Test
  public void testNulls() throws IOException, InterruptedException, ClassNotFoundException,
      TupleMRException, URISyntaxException {

    String line1 = "\"Joe\",\\N,,\"\\\"Joan\\\"\",\"\"";
    CommonUtils.writeTXT(line1, new File(IN));

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    Schema schema = new Schema("schema",
        Fields.parse("name:string,name2:string,age:int,name3:string,emptystring:string"));

    MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
    mO.addInput(inPath, new TupleTextInputFormat(schema, false, true, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context,
              MultipleOutputsCollector collector) throws IOException, InterruptedException {
            try {
              Assert.assertNull(key.get("name2"));
              Assert.assertNull(key.get("age"));
              Assert.assertEquals("Joe", key.get("name"));
              Assert.assertEquals("\"Joan\"", key.get("name3"));
              Assert.assertEquals("", key.get("emptystring"));
            } catch (Throwable t) {
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        });
    mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mO.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mO.cleanUpInstanceFiles();
    }
    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }

  /*
   * Fields that cannot be parsed as numbers ("", "-", " ", ".") are read as nulls.
   */
  @Test
  public void testNumberNulls() throws IOException, InterruptedException,
      ClassNotFoundException, TupleMRException, URISyntaxException {

    String line1 = ",-, ,.";
    CommonUtils.writeTXT(line1, new File(IN));

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    Schema schema = new Schema("schema", Fields.parse("n1:int,n2:long,n3:float,n4:double"));

    MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
    mO.addInput(inPath, new TupleTextInputFormat(schema, false, true, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context,
              MultipleOutputsCollector collector) throws IOException, InterruptedException {
            try {
              Assert.assertNull(key.get("n1"));
              Assert.assertNull(key.get("n2"));
              Assert.assertNull(key.get("n3"));
              Assert.assertNull(key.get("n4"));
            } catch (Throwable t) {
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        });
    mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mO.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mO.cleanUpInstanceFiles();
    }
    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
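
  /*
   * Checks quote-aware parsing: a quoted field that contains the separator character
   * ("Constitutional Monarchy, Federation") must be read as a single field.
   */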
  @Test
  public void testQuotes() throws IOException, InterruptedException, ClassNotFoundException,
      TupleMRException, URISyntaxException {

    String line1 = "\"MYS\",\"Malaysia\",\"Asia\",\"Southeast Asia\",329758.00,1957,22244000,70.8,69213.00,97884.00,\"Malaysia\",\"Constitutional Monarchy, Federation\",\"Salahuddin Abdul Aziz Shah Alhaj\",2464,\"MY\"";
    CommonUtils.writeTXT(line1, new File(IN));

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    Schema schema = new Schema("schema", Fields.parse("code:string,"
        + "name:string,"
        + "continent:string,"
        + "region:string,"
        + "surface_area:double,"
        + "indep_year:int,"
        + "population:int,"
        + "life_expectancy:double,"
        + "gnp:double,"
        + "gnp_old:double,"
        + "local_name:string,"
        + "government_form:string,"
        + "head_of_state:string,"
        + "capital:int,"
        + "code2:string"));

    MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
    mO.addInput(inPath, new TupleTextInputFormat(schema, false, false, ',', '"', '\\',
        FieldSelector.NONE, TupleTextInputFormat.NO_NULL_STRING),
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context,
              MultipleOutputsCollector collector) throws IOException, InterruptedException {
            try {
              Assert.assertEquals("Constitutional Monarchy, Federation",
                  key.get("government_form").toString());
              Assert.assertEquals("Salahuddin Abdul Aziz Shah Alhaj",
                  key.get("head_of_state").toString());
              Assert.assertEquals(2464, key.get("capital"));
            } catch (Throwable t) {
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        });
    mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mO.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mO.cleanUpInstanceFiles();
    }
    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
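
  /*
   * Uses a FieldSelector to read only three of the eight columns in the file into a
   * smaller schema.
   */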
  @Test
  public void testFieldSelection() throws IOException, TupleMRException, InterruptedException,
      ClassNotFoundException {

    String line1 = "foo1 10.0 bar1 1.0 100 1000000 true MICKEY";
    String line2 = "foo2 20.0 bar2 2.0 200 2000000 false MOUSE";
    String line3 = "foo3 30.0 bar3 3.0 300 3000000 true MINIE";

    // The input is a simple space-separated file with no quotes
    CommonUtils.writeTXT(line1 + "\n" + line2 + "\n" + line3, new File(IN));
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    // Define the Schema according to the text file
    // We will only select a subset of the file columns
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("floatField", Type.FLOAT));
    fields.add(Field.create("intField", Type.INT));
    fields.add(Field.create("booleanField", Type.BOOLEAN));
    Schema schema = new Schema("schema", fields);

    // Define a FieldSelector to select only columns 1, 4, 6
    // 0 is the first column
    FieldSelector selector = new FieldSelector(1, 4, 6);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("floatField"); // but we don't care, really

    // Define the Input Format and the Output Format!
    // Add the selector to the input format
    InputFormat inputFormat = new TupleTextInputFormat(schema, false, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER,
        selector, TupleTextInputFormat.NO_NULL_STRING);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);

    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    // This is what we expect as output after field selection
    line1 = "10.0 100 true";
    line2 = "20.0 200 false";
    line3 = "30.0 300 true";

    Assert.assertEquals(line1 + "\n" + line2 + "\n" + line3,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
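
  /*
   * Fixed-width input: fieldsPos lists pairs of character positions that delimit each
   * field. The asserted output shows surrounding whitespace and leading '+' signs
   * being normalized away by the parse/serialize round trip.
   */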
  @Test
  public void testInputFixedWidth() throws TupleMRException, IOException, InterruptedException,
      ClassNotFoundException {

    String line1 = "foo1 +10.0 ar 1.0 +10 +10000 true MICKEY";
    String line2 = "foo2 20.0 bar2 2.0 -20 +20000 false MOUSE ";
    String line3 = "foo3 30.0 bar3 3.0 30 3000000 true MINIE";
    // "01234567890123456789012345678901234567890123"
    int fieldsPos[] = new int[] { 0, 3, 5, 9, 11, 14, 16, 18, 20, 22, 24, 30, 32, 36, 38, 43 };
    String line1out = "foo1 10.0 ar 1.0 10 10000 true MICKEY";
    String line2out = "foo2 20.0 bar2 2.0 -20 20000 false MOUSE";
    String line3out = "foo3 30.0 bar3 3.0 30 3000000 true MINIE";

    // The input is a fixed-width file (the ruler comment above shows character positions)
    CommonUtils.writeTXT(line1 + "\n" + line2 + "\n" + line3, new File(IN));
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    // Define the Schema according to the text file
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("strField1", Type.STRING));
    fields.add(Field.create("floatField", Type.FLOAT));
    fields.add(Field.create("strField2", Type.STRING));
    fields.add(Field.create("doubleField", Type.DOUBLE));
    fields.add(Field.create("intField", Type.INT));
    fields.add(Field.create("longField", Type.LONG));
    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));
    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really

    /*
     * Define the Input Format and the Output Format!
     */
    InputFormat inputFormat = new TupleTextInputFormat(schema, fieldsPos, false, null);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);

    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals(line1out + "\n" + line2out + "\n" + line3out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
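
  /*
   * Same fixed-width layout as above, with no null string configured (the last
   * constructor argument is null).
   */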
  @Test
  public void testInputFixedWidthNull() throws TupleMRException, IOException,
      InterruptedException, ClassNotFoundException {

    String line1 = "foo1 +10.0 bar1 1.0 100 1000000 true MICKEY";
    String line2 = "foo2 20.0 bar2 2.0 200 2000000 false MOUSE ";
    String line3 = "foo3 30.0 bar3 3.0 300 3000000 true MINIE";
    // "01234567890123456789012345678901234567890123"
    int fieldsPos[] = new int[] { 0, 3, 5, 9, 11, 14, 16, 18, 20, 22, 24, 30, 32, 36, 38, 43 };
    String line1out = "foo1 10.0 bar1 1.0 100 1000000 true MICKEY";
    String line2out = "foo2 20.0 bar2 2.0 200 2000000 false MOUSE";
    String line3out = "foo3 30.0 bar3 3.0 300 3000000 true MINIE";

    // The input is a fixed-width file (the ruler comment above shows character positions)
    CommonUtils.writeTXT(line1 + "\n" + line2 + "\n" + line3, new File(IN));
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    // Define the Schema according to the text file
    List<Field> fields = new ArrayList<Field>();
    fields.add(Field.create("strField1", Type.STRING));
    fields.add(Field.create("floatField", Type.FLOAT));
    fields.add(Field.create("strField2", Type.STRING));
    fields.add(Field.create("doubleField", Type.DOUBLE));
    fields.add(Field.create("intField", Type.INT));
    fields.add(Field.create("longField", Type.LONG));
    fields.add(Field.create("booleanField", Type.BOOLEAN));
    fields.add(Field.createEnum("enumField", TestEnum.class));
    Schema schema = new Schema("schema", fields);

    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("strField1"); // but we don't care, really

    /*
     * Define the Input Format and the Output Format!
     */
    InputFormat inputFormat = new TupleTextInputFormat(schema, fieldsPos, false, null);
    OutputFormat outputFormat = new TupleTextOutputFormat(schema, false, ' ',
        TupleTextOutputFormat.NO_QUOTE_CHARACTER, TupleTextOutputFormat.NO_ESCAPE_CHARACTER);

    builder.addInput(inPath, inputFormat, new IdentityTupleMapper());
    builder.setTupleReducer(new IdentityTupleReducer());
    builder.setOutput(outPath, outputFormat, ITuple.class, NullWritable.class);

    Job job = builder.createJob();
    try {
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    Assert.assertEquals(line1out + "\n" + line2out + "\n" + line3out,
        Files.toString(new File(OUT + "/" + "part-r-00000"), Charset.forName("UTF-8")).trim());

    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }

  /*
   * In fixed-width mode, a field whose content matches the configured null string ("-")
   * is read as null.
   */
  @Test
  public void testFixedWidthNulls() throws IOException, InterruptedException,
      ClassNotFoundException, TupleMRException, URISyntaxException {

    String line1 = "1000 - ";
    int fieldsPos[] = new int[] { 0, 3, 5, 7 };
    CommonUtils.writeTXT(line1, new File(IN));

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    Path outPath = new Path(OUT);
    Path inPath = new Path(IN);
    HadoopUtils.deleteIfExists(fS, outPath);

    Schema schema = new Schema("schema", Fields.parse("name:string,name2:string"));

    MapOnlyJobBuilder mO = new MapOnlyJobBuilder(conf);
    mO.addInput(inPath, new TupleTextInputFormat(schema, fieldsPos, false, "-"),
        new MapOnlyMapper<ITuple, NullWritable, NullWritable, NullWritable>() {

          protected void map(ITuple key, NullWritable value, Context context,
              MultipleOutputsCollector collector) throws IOException, InterruptedException {
            try {
              Assert.assertNull(key.get("name2"));
              Assert.assertEquals("1000", key.get("name"));
            } catch (Throwable t) {
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        });
    mO.setOutput(outPath, new HadoopOutputFormat(NullOutputFormat.class), NullWritable.class,
        NullWritable.class);
    Job job = mO.createJob();
    try {
      assertTrue(job.waitForCompletion(true));
    } finally {
      mO.cleanUpInstanceFiles();
    }
    HadoopUtils.deleteIfExists(fS, inPath);
    HadoopUtils.deleteIfExists(fS, outPath);
  }
}