// Java tutorial
/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tuple.hadoop;

import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Random;
import java.util.Set;

import cascading.CascadingTestCase;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.tuple.Tuple;
import cascading.tuple.collect.SpillableProps;
import cascading.tuple.hadoop.collect.HadoopSpillableTupleList;
import cascading.tuple.hadoop.collect.HadoopSpillableTupleMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.serializer.WritableSerialization;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;

/**
 * Tests for the Hadoop-backed spillable tuple collections
 * ({@link HadoopSpillableTupleList} and {@link HadoopSpillableTupleMap}):
 * verifies that tuples written past the in-memory threshold are spilled to
 * disk (optionally compressed) and read back intact, including custom
 * serializations and nested tuples containing Writable values.
 */
public class SpillableTupleHadoopTest extends CascadingTestCase
  {
  public SpillableTupleHadoopTest()
    {
    super();
    }

  /** Spills an uncompressed list at several sizes and checks the expected spill counts. */
  @Test
  public void testSpillList()
    {
    long time = System.currentTimeMillis();

    // each spill segment holds `threshold` (50) tuples; sizes straddle the
    // boundaries to pin the exact number of spill files produced
    performListTest( 5, 50, null, 0 );
    performListTest( 49, 50, null, 0 );
    performListTest( 50, 50, null, 0 );
    performListTest( 51, 50, null, 1 );
    performListTest( 499, 50, null, 9 );
    performListTest( 500, 50, null, 9 );
    performListTest( 501, 50, null, 10 );

    System.out.println( "time = " + ( System.currentTimeMillis() - time ) );
    }

  /** Same boundary cases as {@link #testSpillList()}, but with gzip-compressed spill files. */
  @Test
  public void testSpillListCompressed()
    {
    GzipCodec codec = ReflectionUtils.newInstance( GzipCodec.class, new Configuration() );

    long time = System.currentTimeMillis();

    performListTest( 5, 50, codec, 0 );
    performListTest( 49, 50, codec, 0 );
    performListTest( 50, 50, codec, 0 );
    performListTest( 51, 50, codec, 1 );
    performListTest( 499, 50, codec, 9 );
    performListTest( 500, 50, codec, 9 );
    performListTest( 501, 50, codec, 10 );

    System.out.println( "time = " + ( System.currentTimeMillis() - time ) );
    }

  /**
   * Fills a {@link HadoopSpillableTupleList} with {@code size} tuples, then verifies the
   * spill count, round-trips every tuple (including nested Writable values), and checks
   * that a second iterator starts from the beginning.
   *
   * @param size      number of tuples to add
   * @param threshold in-memory tuple count before a spill is triggered
   * @param codec     compression codec for spill files, or {@code null} for none
   * @param spills    expected number of spill files after all adds
   */
  private void performListTest( int size, int threshold, CompressionCodec codec, int spills )
    {
    Configuration jobConf = new Configuration();

    jobConf.set( "io.serializations", TestSerialization.class.getName() + "," + WritableSerialization.class.getName() ); // disable/replace WritableSerialization class
    jobConf.set( "cascading.serialization.tokens", "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName() ); // not using Text, just testing parsing

    HadoopSpillableTupleList list = new HadoopSpillableTupleList( threshold, codec, jobConf );

    for( int i = 0; i < size; i++ )
      {
      String aString = "string number " + i;
      double random = Math.random();

      // explicit charset: don't depend on the platform default encoding
      list.add( new Tuple( i, aString, random, new Text( aString ), new TestText( aString ), new Tuple( "inner tuple", new BytesWritable( aString.getBytes( StandardCharsets.UTF_8 ) ) ) ) );
      }

    assertEquals( "not equal: list.size();", size, list.size() );
    assertEquals( "not equal: list.getNumFiles()", spills, list.spillCount() );

    int i = -1; // tuples carry 0-based indices, so the first diff check expects 0 - (-1) == 1
    int count = 0;

    for( Tuple tuple : list )
      {
      int value = tuple.getInteger( 0 );
      assertTrue( "wrong diff", value - i == 1 );
      assertEquals( "wrong value", "string number " + count, tuple.getObject( 3 ).toString() );
      assertEquals( "wrong value", "string number " + count, tuple.getObject( 4 ).toString() );

      assertTrue( "wrong type", tuple.getObject( 5 ) instanceof Tuple );

      BytesWritable bytesWritable = (BytesWritable) ( (Tuple) tuple.getObject( 5 ) ).getObject( 1 );
      byte[] bytes = bytesWritable.getBytes();
      // use getLength(): the backing array may be larger than the payload
      String actual = new String( bytes, 0, bytesWritable.getLength(), StandardCharsets.UTF_8 );

      assertEquals( "wrong value", "string number " + count, actual );

      i = value;
      count++;
      }

    assertEquals( "not equal: list.size();", size, count );

    // a fresh iterator must restart from the first element, not resume
    Iterator<Tuple> iterator = list.iterator();

    assertEquals( "not equal: iterator.next().get(1)", "string number 0", iterator.next().getObject( 1 ) );
    assertEquals( "not equal: iterator.next().get(1)", "string number 1", iterator.next().getObject( 1 ) );
    }

  /** Spills an uncompressed map at several key/list sizes. */
  @Test
  public void testSpillMap()
    {
    long time = System.currentTimeMillis();

    Configuration jobConf = new Configuration();

    performMapTest( 5, 5, 100, 20, jobConf );
    performMapTest( 5, 50, 100, 20, jobConf );
    performMapTest( 50, 5, 200, 20, jobConf );
    performMapTest( 500, 50, 7000, 20, jobConf );

    System.out.println( "time = " + ( System.currentTimeMillis() - time ) );
    }

  /** Same cases as {@link #testSpillMap()}, with gzip compression configured via {@link SpillableProps}. */
  @Test
  public void testSpillMapCompressed()
    {
    long time = System.currentTimeMillis();

    Configuration jobConf = new Configuration();

    jobConf.set( SpillableProps.SPILL_CODECS, "org.apache.hadoop.io.compress.GzipCodec" );

    performMapTest( 5, 5, 100, 20, jobConf );
    performMapTest( 5, 50, 100, 20, jobConf );
    performMapTest( 50, 5, 200, 20, jobConf );
    performMapTest( 500, 50, 7000, 20, jobConf );

    System.out.println( "time = " + ( System.currentTimeMillis() - time ) );
    }

  /**
   * Fills a {@link HadoopSpillableTupleMap} with tuples bucketed under pseudo-random keys
   * (seeded, so the key distribution is reproducible) and verifies the map ends up with
   * exactly one entry per distinct key.
   *
   * @param numKeys       approximate upper bound on distinct keys generated
   * @param listSize      tuples per key, on average ({@code listSize * numKeys} total adds)
   * @param mapThreshold  map-level spill threshold
   * @param listThreshold per-key list spill threshold
   * @param jobConf       configuration to seed serializations into; may carry spill codecs
   */
  private void performMapTest( int numKeys, int listSize, int mapThreshold, int listThreshold, Configuration jobConf )
    {
    jobConf.set( "io.serializations", TestSerialization.class.getName() + "," + WritableSerialization.class.getName() ); // disable/replace WritableSerialization class
    jobConf.set( "cascading.serialization.tokens", "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName() ); // not using Text, just testing parsing

    HadoopFlowProcess flowProcess = new HadoopFlowProcess( jobConf );
    HadoopSpillableTupleMap map = new HadoopSpillableTupleMap( SpillableProps.defaultMapInitialCapacity, SpillableProps.defaultMapLoadFactor, mapThreshold, listThreshold, flowProcess );

    Set<Integer> keySet = new HashSet<Integer>();
    Random gen = new Random( 1 ); // fixed seed keeps the key distribution deterministic

    for( int i = 0; i < listSize * numKeys; i++ )
      {
      String aString = "string number " + i;
      double random = Math.random();

      double keys = numKeys / 3.0;
      // sum of three uniforms approximates a normal distribution over the key range
      int key = (int) ( gen.nextDouble() * keys + gen.nextDouble() * keys + gen.nextDouble() * keys );

      Tuple tuple = new Tuple( i, aString, random, new Text( aString ), new TestText( aString ), new Tuple( "inner tuple", new BytesWritable( aString.getBytes( StandardCharsets.UTF_8 ) ) ) );

      map.get( new Tuple( key ) ).add( tuple );

      keySet.add( key );
      }

    // the list test above verifies the contents are being serialized, the Map is just a container of lists.
    assertEquals( "not equal: map.size();", keySet.size(), map.size() );
    }
  }