Java tutorial: Pangool TestRollup.java
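This page walks through TestRollup.java from Pangool (package com.datasalt.pangool.tuplemr.mapred), a JUnit test for the rollup feature of Tuple MapReduce. When a job is configured with TupleMRBuilder.setRollupFrom(...), a TupleRollupReducer receives onOpenGroup/onCloseGroup callbacks at every level of the group-by hierarchy, in addition to the usual reduce calls. The tests below run small jobs over SequenceFile input and then assert the exact sequence of OPEN/ELEMENT/CLOSE records the reducer emitted.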
/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.tuplemr.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import junit.framework.Assert;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.junit.Test;

import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.Criteria.Order;
import com.datasalt.pangool.tuplemr.OrderBy;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMRException;
import com.datasalt.pangool.tuplemr.TupleMapper;
import com.datasalt.pangool.tuplemr.TupleReducer;
import com.datasalt.pangool.tuplemr.TupleRollupReducer;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;
import com.datasalt.pangool.tuplemr.mapred.lib.output.HadoopOutputFormat;
import com.datasalt.pangool.utils.test.AbstractHadoopTestLibrary;

public class TestRollup extends AbstractHadoopTestLibrary {

  public static final String TEST_OUT = "TEST-OUTPUT";

  // Example Avro schema; not referenced by the tests below.
  public static final org.apache.avro.Schema AVRO_SCHEMA;

  static {
    AVRO_SCHEMA = org.apache.avro.Schema.createRecord("MyRecordSchema", null, null, false);
    List<org.apache.avro.Schema.Field> avroFields = new ArrayList<org.apache.avro.Schema.Field>();
    avroFields.add(new org.apache.avro.Schema.Field("my_int",
        org.apache.avro.Schema.create(org.apache.avro.Schema.Type.INT), null, null));
    avroFields.add(new org.apache.avro.Schema.Field("my_string",
        org.apache.avro.Schema.create(org.apache.avro.Schema.Type.STRING), null, null));
    AVRO_SCHEMA.setFields(avroFields);
  }

  private static class Map extends TupleMapper<Text, NullWritable> {

    private static final long serialVersionUID = 1L;

    private Schema schema;

    /**
     * Called once at the start of the task.
     */
    @Override
    public void setup(TupleMRContext context, Collector collector) throws IOException, InterruptedException {
      this.schema = context.getTupleMRConfig().getIntermediateSchema(0);
    }

    @Override
    public void map(Text key, NullWritable value, TupleMRContext context, Collector collector) throws IOException,
        InterruptedException {
      // Parse each input line into a tuple and emit it.
      Tuple outputKey = createTuple(key.toString(), schema);
      collector.write(outputKey);
    }
  }

  private static class IdentityRed extends TupleRollupReducer<Text, Text> {

    private static final long serialVersionUID = 1L;

    private transient Text outputKey;
    private transient Text outputValue;

    @Override
    public void setup(TupleMRContext context, Collector collector) throws IOException, InterruptedException {
      outputKey = new Text();
      outputValue = new Text();
    }

    @Override
    public void cleanup(TupleMRContext context, Collector collector) throws IOException, InterruptedException {
    }

    @Override
    public void onOpenGroup(int depth, String field, ITuple firstElement, TupleMRContext context, Collector collector)
        throws IOException, InterruptedException {
      outputKey.set("OPEN " + depth);
      outputValue.set(firstElement.toString());
      collector.write(outputKey, outputValue);
      System.out.println(outputKey + " => " + outputValue);
    }

    @Override
    public void onCloseGroup(int depth, String field, ITuple lastElement, TupleMRContext context, Collector collector)
        throws IOException, InterruptedException {
      outputKey.set("CLOSE " + depth);
      outputValue.set(lastElement.toString());
      collector.write(outputKey, outputValue);
      System.out.println(outputKey + " => " + outputValue);
    }

    @Override
    public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
        throws IOException, InterruptedException {
      Iterator<ITuple> iterator = tuples.iterator();
      outputKey.set("ELEMENT");
      while (iterator.hasNext()) {
        ITuple tuple = iterator.next();
        outputValue.set(tuple.toString());
        collector.write(outputKey, outputValue);
        System.out.println(outputKey + " => " + outputValue);
      }
    }
  }

  private static Tuple createTuple(String text, Schema schema) {
    Tuple tuple = new Tuple(schema);
    String[] tokens = text.split("\\s+");
    String country = tokens[0];
    Integer age = Integer.parseInt(tokens[1]);
    String name = tokens[2];
    Integer height = Integer.parseInt(tokens[3]);
    tuple.set(0, country);
    tuple.set(1, age);
    tuple.set(2, name);
    tuple.set(3, height);
    return tuple;
  }

  @Test
  public void test1() throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
      IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";
    String[] inputElements = new String[] { "ES 20 listo 250", "US 14 beber 202", "US 14 perro 180",
        "US 14 perro 170", "US 15 jauja 160", "US 16 listo 160", "XE 20 listo 230" };
    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    ITuple[] tuples = new ITuple[inputElements.length];
    int i = 0;
    for (String inputElement : inputElements) {
      withInput(input, writable(inputElement));
      tuples[i++] = createTuple(inputElement, schema);
    }

    Path outputPath = new Path(output);
    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("country", "age", "name");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC)
        .add("height", Order.DESC));
    builder.setRollupFrom("country");
    builder.setTupleReducer(new IdentityRed());
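    // Rollup is enabled by setRollupFrom("country"): onOpenGroup/onCloseGroup
    // fire at depth 0 (country), depth 1 (age) and depth 2 (name), which is
    // what the checkRollupOutput(outputFile, 0, 2) call below verifies.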
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class, Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());

    Job job = builder.createJob();
    try {
      job.setNumReduceTasks(1);
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }

    FileSystem fs = FileSystem.get(getConf());
    Path outputFile = new Path(output + "/part-r-00000");
    checkRollupOutput(outputFile, 0, 2);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile, getConf());

    assertOutput(reader, "OPEN 0", tuples[0]);
    assertOutput(reader, "OPEN 1", tuples[0]);
    assertOutput(reader, "OPEN 2", tuples[0]);
    assertOutput(reader, "ELEMENT", tuples[0]);
    assertOutput(reader, "CLOSE 2", tuples[0]);
    assertOutput(reader, "CLOSE 1", tuples[0]);
    assertOutput(reader, "CLOSE 0", tuples[0]);

    assertOutput(reader, "OPEN 0", tuples[1]);
    assertOutput(reader, "OPEN 1", tuples[1]);
    assertOutput(reader, "OPEN 2", tuples[1]);
    assertOutput(reader, "ELEMENT", tuples[1]);
    assertOutput(reader, "CLOSE 2", tuples[1]);

    assertOutput(reader, "OPEN 2", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[3]);
    assertOutput(reader, "CLOSE 2", tuples[3]);
    assertOutput(reader, "CLOSE 1", tuples[3]);

    assertOutput(reader, "OPEN 1", tuples[4]);
    assertOutput(reader, "OPEN 2", tuples[4]);
    assertOutput(reader, "ELEMENT", tuples[4]);
    assertOutput(reader, "CLOSE 2", tuples[4]);
    assertOutput(reader, "CLOSE 1", tuples[4]);

    assertOutput(reader, "OPEN 1", tuples[5]);
    assertOutput(reader, "OPEN 2", tuples[5]);
    assertOutput(reader, "ELEMENT", tuples[5]);
    assertOutput(reader, "CLOSE 2", tuples[5]);
    assertOutput(reader, "CLOSE 1", tuples[5]);
    assertOutput(reader, "CLOSE 0", tuples[5]);

    assertOutput(reader, "OPEN 0", tuples[6]);
    assertOutput(reader, "OPEN 1", tuples[6]);
    assertOutput(reader, "OPEN 2", tuples[6]);
    assertOutput(reader, "ELEMENT", tuples[6]);
    assertOutput(reader, "CLOSE 2", tuples[6]);
    assertOutput(reader, "CLOSE 1", tuples[6]);
    assertOutput(reader, "CLOSE 0", tuples[6]);

    reader.close();
    cleanUp();
    trash(TEST_OUT);
  }

  @Test
  public void test2() throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
      IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";
    String[] inputElements = new String[] { "ES 20 listo 250", "US 14 beber 202", "US 14 perro 180",
        "US 14 perro 170", "US 15 jauja 160", "US 16 listo 160", "XE 16 listo 230" };
    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    ITuple[] tuples = new ITuple[inputElements.length];
    int i = 0;
    for (String inputElement : inputElements) {
      withInput(input, writable(inputElement));
      tuples[i++] = createTuple(inputElement, schema);
    }

    Path outputPath = new Path(output);
    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("age", "name", "country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC)
        .add("height", Order.DESC));
    // Rolling up from "age" skips the top level: no OPEN 0/CLOSE 0 records.
    builder.setRollupFrom("age");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class, Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());

    Job job = builder.createJob();
    try {
      job.setNumReduceTasks(1);
      assertRun(job);
    } finally {
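      // Clean up the instance files the builder serialized, even if the job failed.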
      builder.cleanUpInstanceFiles();
    }

    FileSystem fs = FileSystem.get(getConf());
    Path outputFile = new Path(output + "/part-r-00000");
    checkRollupOutput(outputFile, 1, 2);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile, getConf());

    assertOutput(reader, "OPEN 1", tuples[0]);
    assertOutput(reader, "OPEN 2", tuples[0]);
    assertOutput(reader, "ELEMENT", tuples[0]);
    assertOutput(reader, "CLOSE 2", tuples[0]);
    assertOutput(reader, "CLOSE 1", tuples[0]);

    assertOutput(reader, "OPEN 1", tuples[1]);
    assertOutput(reader, "OPEN 2", tuples[1]);
    assertOutput(reader, "ELEMENT", tuples[1]);
    assertOutput(reader, "CLOSE 2", tuples[1]);

    assertOutput(reader, "OPEN 2", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[3]);
    assertOutput(reader, "CLOSE 2", tuples[3]);
    assertOutput(reader, "CLOSE 1", tuples[3]);

    assertOutput(reader, "OPEN 1", tuples[4]);
    assertOutput(reader, "OPEN 2", tuples[4]);
    assertOutput(reader, "ELEMENT", tuples[4]);
    assertOutput(reader, "CLOSE 2", tuples[4]);
    assertOutput(reader, "CLOSE 1", tuples[4]);

    assertOutput(reader, "OPEN 1", tuples[5]);
    assertOutput(reader, "OPEN 2", tuples[5]);
    assertOutput(reader, "ELEMENT", tuples[5]);
    assertOutput(reader, "CLOSE 2", tuples[5]);
    assertOutput(reader, "CLOSE 1", tuples[5]);

    assertOutput(reader, "OPEN 1", tuples[6]);
    assertOutput(reader, "OPEN 2", tuples[6]);
    assertOutput(reader, "ELEMENT", tuples[6]);
    assertOutput(reader, "CLOSE 2", tuples[6]);
    assertOutput(reader, "CLOSE 1", tuples[6]);

    reader.close();
    cleanUp();
    trash(TEST_OUT);
  }

  private enum State {
    OPEN, CLOSE, ELEMENT
  }

  /**
   * Checks that {@link RollupReducer} properly calls {@link TupleRollupReducer#onOpenGroup},
   * {@link TupleRollupReducer#onCloseGroup} and {@link TupleRollupReducer#reduce}, and that the elements (tuples)
   * passed are coherent. This method assumes a specific output from the {@link TupleReducer}: Text,Text key/value
   * pairs, where every record is one of
   *
   * <pre>
   * key("OPEN depth"),  value("serialized first element")
   * key("CLOSE depth"), value("serialized last element")
   * key("ELEMENT"),     value("serialized element")   (one record per element received in reduce)
   * </pre>
   *
   * For instance:
   *
   * <pre>
   * key("OPEN 0"),  value("element1")
   * key("OPEN 1"),  value("element1")
   * key("ELEMENT"), value("element1")
   * key("ELEMENT"), value("element2")
   * key("CLOSE 1"), value("element2")
   * key("CLOSE 0"), value("element2")
   * </pre>
   */
  public void checkRollupOutput(Path path, int minDepth, int maxDepth) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(getConf()), path, getConf());

    Text actualKey = new Text();
    Text actualValue = new Text();
    Assert.assertTrue("Output must contain at least one record", reader.next(actualKey, actualValue));

    String currentKey = actualKey.toString();
    String currentValue = actualValue.toString();
    Assert.assertTrue("First output needs to be an OPEN", currentKey.startsWith("OPEN"));
    int currentDepth = Integer.parseInt(currentKey.split(" ")[1]);
    Assert.assertEquals("First OPEN needs to match minDepth", minDepth, currentDepth);

    int lastDepth = currentDepth;
    String lastValue = currentValue;
    State lastState = State.OPEN;

    while (reader.next(actualKey, actualValue)) {
      currentKey = actualKey.toString();
      currentValue = actualValue.toString();
      if (currentKey.startsWith("OPEN")) {
        currentDepth = Integer.parseInt(currentKey.split(" ")[1]);
        Assert.assertEquals("OPEN needs to increase depth by +1", lastDepth + 1, currentDepth);
        Assert.assertTrue("Too many OPENs, over maxDepth", maxDepth >= currentDepth);
        if (lastState == State.OPEN) {
          Assert.assertEquals("First element in OPEN needs to match first element in previous OPEN", lastValue,
              currentValue);
        } else if (lastState == State.CLOSE) {
          // Value equality: assertNotSame only compared references, which always differ here.
          Assert.assertFalse("Element from new group needs to be different from last element of previous group",
              lastValue.equals(currentValue));
        } else {
          Assert.fail("OPEN not allowed after ELEMENT");
        }
        lastState = State.OPEN;
        lastValue = currentValue;
        lastDepth = currentDepth;
      } else if (currentKey.startsWith("CLOSE")) {
        currentDepth = Integer.parseInt(currentKey.split(" ")[1]);
        Assert.assertNotSame("CLOSE not allowed right after OPEN, needs at least one ELEMENT in between", State.OPEN,
            lastState);
        Assert.assertEquals("CLOSE depth needs to match previous OPEN depth", lastDepth, currentDepth);
        Assert.assertEquals("Element in CLOSE needs to match last element in group", lastValue, currentValue);
        lastState = State.CLOSE;
        lastValue = currentValue;
        lastDepth = currentDepth - 1;
      } else if (currentKey.startsWith("ELEMENT")) {
        Assert.assertNotSame("ELEMENT not allowed right after CLOSE, needs an OPEN or ELEMENT before", State.CLOSE,
            lastState);
        lastState = State.ELEMENT;
        lastValue = currentValue;
      }
    }

    Assert.assertEquals("File doesn't properly finish with a CLOSE", State.CLOSE, lastState);
    Assert.assertEquals("Last CLOSE doesn't close the minDepth", minDepth - 1, lastDepth);
    reader.close();
  }

  private void assertOutput(SequenceFile.Reader reader, String expectedKey, ITuple expectedValue) throws IOException {
    Text actualKey = new Text();
    Text actualValue = new Text();
    Assert.assertTrue("Unexpected end of output", reader.next(actualKey, actualValue));
    Assert.assertEquals(new Text(expectedKey), actualKey);
    Assert.assertEquals(new Text(expectedValue.toString()), actualValue);
  }

  @SuppressWarnings("serial")
  private static class DoNothingMap extends TupleMapper<Text, NullWritable> {

    @Override
    public void map(Text key, NullWritable value, TupleMRContext context, Collector collector)
        throws IOException, InterruptedException {
    }
  }

  /**
   * Tests the case in which the reducer receives no data.
   */
  @Test
  public void testNoDataReducer() throws IOException, InterruptedException, ClassNotFoundException,
      InstantiationException, IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";
    withInput(input, writable("ES 20 listo 250"));

    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("age", "name", "country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC));
    builder.setRollupFrom("age");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class, Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new DoNothingMap());

    Job job = builder.createJob();
    try {
      job.setNumReduceTasks(1);
      assertRun(job);
    } finally {
      builder.cleanUpInstanceFiles();
    }
    cleanUp();
    trash(TEST_OUT);
  }
}
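To recap, the rollup machinery exercised above boils down to a handful of builder calls. A minimal sketch, assuming the schema, Map and IdentityRed classes from the listing, a Hadoop Configuration named conf, and placeholder "in"/"out" paths:

    // Minimal rollup configuration, extracted from test1 above.
    TupleMRBuilder builder = new TupleMRBuilder(conf);
    builder.addIntermediateSchema(schema);
    // The group-by fields form the hierarchy country > age > name.
    builder.setGroupByFields("country", "age", "name");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC)
        .add("age", Order.ASC).add("name", Order.ASC).add("height", Order.DESC));
    // Rollup from the top of the hierarchy: onOpenGroup/onCloseGroup fire
    // at depth 0 (country), depth 1 (age) and depth 2 (name).
    builder.setRollupFrom("country");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(new Path("out"), new HadoopOutputFormat(SequenceFileOutputFormat.class),
        Text.class, Text.class);
    builder.addInput(new Path("in"), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());
    Job job = builder.createJob();

The depth numbering in the comments matches what test1 verifies through checkRollupOutput(outputFile, 0, 2).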