Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mapred; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Random; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.mapred.TestMapCollection.FakeIF; import org.apache.hadoop.mapred.lib.NullOutputFormat; import junit.framework.TestCase; @SuppressWarnings("deprecation") public class TestNewCollector extends TestCase { private static Log LOG = LogFactory.getLog(TestNewCollector.class); private MiniMRCluster mrCluster; protected void setUp() { JobConf conf = new JobConf(); try { mrCluster = new MiniMRCluster(2, "file:///", 3, null, null, conf); } catch (IOException e) { throw new RuntimeException(e); } } protected void tearDown() { mrCluster.shutdown(); } public static class BytesWritableFactory { private static Random random = new Random(); public static BytesWritable getRandomBytesWritable(int size) { byte[] bytes = new byte[size]; random.nextBytes(bytes); BytesWritable bytesWritable = new BytesWritable(bytes); return bytesWritable; } public static BytesWritable getRepeatedBytesWritable(byte[] bytes, int repeatNum) { int newLen = bytes.length * repeatNum; byte[] bb = new byte[newLen]; for (int i = 0; i < repeatNum; i++) { System.arraycopy(bytes, 0, bb, bytes.length * i, bytes.length); } BytesWritable bytesWritable = new BytesWritable(bb); return bytesWritable; } } public static class TestNewCollectorKey extends BytesWritable { private int hashCode = -1; public TestNewCollectorKey(BytesWritable k) { super(k.getBytes()); } public TestNewCollectorKey() { super(); } public int hashCode() { if (hashCode < 0) { hashCode = super.hashCode(); } return hashCode; } public void setHashCode(int hashCode) { this.hashCode = hashCode; } } public static class RecordNumStore { private static String RECORD_NUM_CONF = "test.reducer.records.num"; /* * conf to specify number of big records to spill right after the mapper * starts, it is comma separated string for a list of values, each value is * one reducer. */ private static String BIG_RECORDS_BEGINNING = "test.reducer.bigrecords.start"; /* * conf to specify number of big records to spill in the middle of a * mapper, it is comma separated string for a list of values, each value is * one reducer. */ private static String BIG_RECORDS_MIDDLE = "test.reducer.bigrecords.middle"; /* * conf to specify number of big records to spill right before the mapper * finish, it is comma separated string for a list of values, each value is * one reducer. */ private static String BIG_RECORDS_END = "test.reducer.bigrecords.end"; private JobConf currentJobConf; private List<Integer> reducerToReciveRecNum; private int[] mapperOutNumForEachReducer; private static RecordNumStore inst; private static Object instanceLock = new Object(); private RecordNumStore(JobConf job) { this.currentJobConf = job; init(job); } public void init(JobConf job) { String recordNumStr = job.get(RECORD_NUM_CONF); int numMappers = job.getNumMapTasks(); reducerToReciveRecNum = new ArrayList<Integer>(numMappers); if (recordNumStr != null) { String[] splits = recordNumStr.split(","); for (String num : splits) { if (num == null || num.trim().equals("")) { continue; } reducerToReciveRecNum.add(Integer.parseInt(num)); } } for (int i = reducerToReciveRecNum.size(); i < numMappers; i++) { reducerToReciveRecNum.add(0); } } public static RecordNumStore getInst(JobConf job) { synchronized (instanceLock) { if (job != null && (inst == null || job != inst.getCurrentJobConf())) { inst = new RecordNumStore(job); } return inst; } } protected JobConf getCurrentJobConf() { return currentJobConf; } public synchronized int[] getMapperOutNumForEachReducer() { int numReducers = currentJobConf.getNumReduceTasks(); int numMappers = currentJobConf.getNumMapTasks(); if (mapperOutNumForEachReducer == null) { mapperOutNumForEachReducer = new int[numReducers]; } List<Integer> reducerToReciveNum = this.reducerToReciveRecNum; for (int i = 0; i < numReducers; i++) { mapperOutNumForEachReducer[i] = reducerToReciveNum.get(i) / numMappers; } return mapperOutNumForEachReducer; } public synchronized int[] getBigRecodsStart() { String bigRecordNumStartStr = currentJobConf.get(BIG_RECORDS_BEGINNING); int[] bigRecordsStart = splitConfToIntArray(bigRecordNumStartStr); return bigRecordsStart; } public synchronized int[] getBigRecodsMiddle() { String bigRecordNumMiddleStr = currentJobConf.get(BIG_RECORDS_MIDDLE); int[] bigRecordsMiddle = splitConfToIntArray(bigRecordNumMiddleStr); return bigRecordsMiddle; } public synchronized int[] getBigRecodsEnd() { String bigRecordNumEndStr = currentJobConf.get(BIG_RECORDS_END); int[] bigRecordsEnd = splitConfToIntArray(bigRecordNumEndStr); return bigRecordsEnd; } private int[] splitConfToIntArray(String confStr) { String[] splits = confStr.split(","); int[] numArray = new int[splits.length]; for (int i = 0; i < splits.length; i++) { String num = splits[i]; if (num == null || num.trim().equals("")) { numArray[i] = 0; } else { numArray[i] = Integer.parseInt(num); } } return numArray; } public boolean checkReducerReceiveRecNum(int reducerNum) { return reducerToReciveRecNum.remove(Integer.valueOf(reducerNum)); } /** * Each mapper is omitting the same number of records. And * reducerRecPercents array decides how many should go to each reducer. One * reducer will receive the same number of records from different mappers. * * @param numReducers * number of reducers to run * @param mappers * number of mappers to run * @param recordNumPerMapper * how many records each mapper outputs * @param reducerRecPercents * for one mapper, how to allocate output records to reducers * @param numBigRecordsStart * @param numBigRecordsMiddle * @param numBigRecordsEnd * @param job */ public static void setJobConf(int numReducers, int mappers, int recordNumPerMapper, double[] reducerRecPercents, int[] numBigRecordsStart, int[] numBigRecordsMiddle, int[] numBigRecordsEnd, JobConf job) { int[] recNumReducerOneMapper = new int[numReducers]; double left = 1.0f; int preAllocated = 0; int leftToAllocate = recordNumPerMapper; if (numBigRecordsStart == null) { numBigRecordsStart = new int[numReducers]; fillZero(numBigRecordsStart); } if (numBigRecordsMiddle == null) { numBigRecordsMiddle = new int[numReducers]; fillZero(numBigRecordsMiddle); } if (numBigRecordsEnd == null) { numBigRecordsEnd = new int[numReducers]; fillZero(numBigRecordsEnd); } if (reducerRecPercents != null) { if (reducerRecPercents.length > numReducers) { throw new IllegalArgumentException("percents array length is " + reducerRecPercents.length + " while numReducers is " + numReducers); } preAllocated = reducerRecPercents.length; } for (int i = 0; i < preAllocated; i++) { left -= reducerRecPercents[i]; if (left < 0) { throw new IllegalArgumentException("sum of percents array is bigger than 1.0"); } recNumReducerOneMapper[i] = (int) (recordNumPerMapper * reducerRecPercents[i]); leftToAllocate -= recNumReducerOneMapper[i]; } int toAllocateReducer = preAllocated; while (leftToAllocate > 0 && toAllocateReducer < numReducers) { recNumReducerOneMapper[toAllocateReducer] += 1; toAllocateReducer++; if (toAllocateReducer == numReducers) { toAllocateReducer = preAllocated; } leftToAllocate--; } for (int i = 0; i < recNumReducerOneMapper.length; i++) { recNumReducerOneMapper[i] = recNumReducerOneMapper[i] * mappers; int bigRecords = numBigRecordsStart[i] + numBigRecordsMiddle[i] + numBigRecordsEnd[i]; if (bigRecords > recNumReducerOneMapper[i]) { throw new IllegalArgumentException("big records number is bigger than total."); } } String recordNumConf = getStringConf(recNumReducerOneMapper); job.set(RECORD_NUM_CONF, recordNumConf); String bigRecordStartConf = getStringConf(numBigRecordsStart); job.set(BIG_RECORDS_BEGINNING, bigRecordStartConf); String bigRecordMiddleConf = getStringConf(numBigRecordsMiddle); job.set(BIG_RECORDS_MIDDLE, bigRecordMiddleConf); String bigRecordEndConf = getStringConf(numBigRecordsEnd); job.set(BIG_RECORDS_END, bigRecordEndConf); System.out.println("RECORD_NUM_CONF is " + recordNumConf); System.out.println("BIG_RECORDS_BEGINNING is " + bigRecordStartConf); System.out.println("BIG_RECORDS_MIDDLE is " + bigRecordMiddleConf); System.out.println("BIG_RECORDS_END is " + bigRecordEndConf); } private static String getStringConf(int[] numArray) { StringBuilder sb = new StringBuilder(); boolean first = true; for (int num : numArray) { if (first) { first = false; } else { sb.append(","); } sb.append(num); } return sb.toString(); } private static void fillZero(int[] numBigRecordsStart) { for (int i = 0; i < numBigRecordsStart.length; i++) { numBigRecordsStart[i] = 0; } } } public static String toString(int[] numArray) { StringBuilder sb = new StringBuilder(); for (int num : numArray) { sb.append(num); sb.append(","); } return sb.toString(); } public static class TestNewCollectorMapper implements Mapper<NullWritable, NullWritable, BytesWritable, BytesWritable> { private int keylen = 1; private int vallen = 1; private int bigKeyLen = 10000; private int bigValLen = 10000; private int[] recNumForReducer; private int[] bigRecordsStart; private int[] normalKVNum; private int[] bigRecordsMiddle; private int[] bigRecordsEnd; public void configure(JobConf job) { recNumForReducer = RecordNumStore.getInst(job).getMapperOutNumForEachReducer(); keylen = job.getInt("test.key.length", 1); vallen = job.getInt("test.value.length", 1); bigKeyLen = job.getInt("test.bigkey.length", 10000); bigValLen = job.getInt("test.bigvalue.length", 10000); bigRecordsStart = RecordNumStore.getInst(job).getBigRecodsStart(); bigRecordsMiddle = RecordNumStore.getInst(job).getBigRecodsMiddle(); bigRecordsEnd = RecordNumStore.getInst(job).getBigRecodsEnd(); normalKVNum = new int[bigRecordsStart.length]; for (int i = 0; i < normalKVNum.length; i++) { normalKVNum[i] = recNumForReducer[i] - (bigRecordsStart[i] + bigRecordsMiddle[i] + bigRecordsEnd[i]); } } public void close() { } @Override public void map(NullWritable key, NullWritable value, OutputCollector<BytesWritable, BytesWritable> output, Reporter reporter) throws IOException { boolean outputed = false; int i = -1; while (true) { reporter.progress(); i++; if (i == recNumForReducer.length) { if (!outputed) { break; } i = 0; outputed = false; } if (recNumForReducer[i] == 0) { continue; } if (bigRecordsStart[i] > 0) { collectBigKV(output, i); bigRecordsStart[i]--; recNumForReducer[i]--; } else if (normalKVNum[i] > 0 || bigRecordsMiddle[i] > 0) { if (normalKVNum[i] > 0) { collectNormalKV(output, i); normalKVNum[i]--; recNumForReducer[i]--; } if (bigRecordsMiddle[i] > 0) { collectBigKV(output, i); bigRecordsMiddle[i]--; recNumForReducer[i]--; } } else if (bigRecordsEnd[i] > 0) { collectBigKV(output, i); bigRecordsEnd[i]--; recNumForReducer[i]--; } else { throw new RuntimeException("Uncatched situation."); } outputed = true; } } private void collectKV(OutputCollector<BytesWritable, BytesWritable> output, int reducerNo, int keyLen, int valueLen) throws IOException { BytesWritable k = BytesWritableFactory.getRandomBytesWritable(keyLen); BytesWritable val = BytesWritableFactory.getRandomBytesWritable(valueLen); TestNewCollectorKey collectorKey = new TestNewCollectorKey(k); collectorKey.setHashCode(reducerNo); output.collect(collectorKey, val); } private void collectBigKV(OutputCollector<BytesWritable, BytesWritable> output, int reduceNo) throws IOException { this.collectKV(output, reduceNo, bigKeyLen, bigValLen); } private void collectNormalKV(OutputCollector<BytesWritable, BytesWritable> output, int reducerNo) throws IOException { this.collectKV(output, reducerNo, keylen, vallen); } } public static class TestNewCollectorReducer implements Reducer<BytesWritable, BytesWritable, NullWritable, NullWritable> { private int received = 0; private JobConf job; private BytesWritable lastKey = null; private RawComparator rawComparator; public void configure(JobConf job) { this.job = job; this.rawComparator = WritableComparator.get(BytesWritable.class); } public void close() { boolean found = RecordNumStore.getInst(job).checkReducerReceiveRecNum(received); System.out.println("received count is " + received + ", found is " + found); assertTrue("Unexpected record count (" + received + ")", found); } @Override @SuppressWarnings("unchecked") public void reduce(BytesWritable key, Iterator<BytesWritable> values, OutputCollector<NullWritable, NullWritable> output, Reporter reporter) throws IOException { if (lastKey == null) { lastKey = new BytesWritable(); lastKey.set(key.getBytes(), 0, key.getLength()); } else { int ret = rawComparator.compare(lastKey, key); assertTrue("Incorrect comparasion result given by mapreduce", ret < 0); lastKey.set(key.getBytes(), 0, key.getLength()); } while (values.hasNext()) { values.next(); ++received; } } private void printBytes(BytesWritable key) { byte[] bytes = key.getBytes(); for (int i = 0; i < key.getLength(); i++) { System.out.printf("%02x", bytes[i]); } System.out.println(); } } private void runTest(String name, int keyLen, int valLen, int recordsNumPerMapper, int sortMb, float spillPer, int numMapperTasks, int numReducerTask, double[] reducerRecPercents) throws Exception { this.runTest(name, keyLen, valLen, 0, 0, recordsNumPerMapper, sortMb, spillPer, numMapperTasks, numReducerTask, reducerRecPercents, null, null, null); } private void runTest(String name, int keyLen, int valLen, int bigKeyLen, int bigValLen, int recordsNumPerMapper, int sortMb, float spillPer, int numMapperTasks, int numReducerTask, double[] reducerRecPercents, int[] numBigRecordsStart, int[] numBigRecordsMiddle, int[] numBigRecordsEnd) throws Exception { JobConf conf = mrCluster.createJobConf(); conf.setInt("io.sort.mb", sortMb); conf.set("io.sort.spill.percent", Float.toString(spillPer)); conf.setInt("test.key.length", keyLen); conf.setInt("test.value.length", valLen); conf.setInt("test.bigkey.length", bigKeyLen); conf.setInt("test.bigvalue.length", bigValLen); conf.setNumMapTasks(numMapperTasks); conf.setNumReduceTasks(numReducerTask); conf.setInputFormat(FakeIF.class); conf.setOutputFormat(NullOutputFormat.class); conf.setMapperClass(TestNewCollectorMapper.class); conf.setReducerClass(TestNewCollectorReducer.class); conf.setMapOutputKeyClass(TestNewCollectorKey.class); conf.setMapOutputValueClass(BytesWritable.class); conf.setBoolean("mapred.map.output.blockcollector", true); RecordNumStore.setJobConf(numReducerTask, numMapperTasks, recordsNumPerMapper, reducerRecPercents, numBigRecordsStart, numBigRecordsMiddle, numBigRecordsEnd, conf); RecordNumStore.getInst(conf); LOG.info("Running " + name); JobClient.runJob(conf); } public void testNormalInMemory() throws Exception { runTest("testSmallScale_1", 1, 1, 1, 40, 0.5f, 1, 1, new double[] { 1.0f }); // 200 bytes for each record, and 10000 records for each mapper, so // serialized data will use 2MB // data, and should be able to hold in memory. runTest("testSmallScale_2", 100, 100, 10000, 4, 0.8f, 1, 1, new double[] { 1.0f }); runTest("testSmallScale_2", 100, 100, 10000, 4, 0.8f, 10, 1, new double[] { 1.0f }); // run 2 mappers and 1 reducers, and each mapper output 4MB data. runTest("testSmallScale_3", 100, 100, 10000, 4, 0.8f, 2, 1, new double[] { 1.0f }); // run 2 mappers and 2 reducers, and each mapper output 4MB data. runTest("testSmallScale_4", 100, 100, 10000, 4, 0.8f, 2, 2, new double[] { 0.5f, 0.5f }); } //test cases that require spilling data to disk public void testSpill() throws Exception { // 600 bytes for each mapper, 10K records for each mapper, and totally use // 6MB data. So it will require spill to disk runTest("testSpill_1", 100, 500, 10000, 4, 0.8f, 1, 1, new double[] { 1.0f }); runTest("testSpill_2", 100, 500, 10000, 4, 0.8f, 2, 1, new double[] { 1.0f }); runTest("testSpill_3", 100, 500, 10000, 4, 0.8f, 2, 2, new double[] { 0.5f, 0.5f }); } //test cases that require spilling data to disk public void testSpillMore() throws Exception { // 600 bytes for each mapper, 10K records for each mapper, and totally use // 6MB data. So it will require spill to disk runTest("testSpillMore_1", 100, 500, 10000, 1, 0.8f, 1, 1, new double[] { 1.0f }); runTest("testSpillMore_2", 100, 500, 10000, 1, 0.8f, 2, 1, new double[] { 1.0f }); runTest("testSpillMore_3", 100, 500, 10000, 1, 0.8f, 2, 2, new double[] { 0.5f, 0.5f }); } //test skew cases public void testSkew() throws Exception { // first reducer got 90% records runTest("testSpillSkew_1", 100, 500, 10000, 4, 0.8f, 1, 10, new double[] { 0.9f }); // first got 40%, and second got 40% runTest("testSpillSkew_2", 100, 500, 10000, 4, 0.8f, 1, 10, new double[] { 0.4f, 0.4f }); // first got 60%, and second got 30% runTest("testSpillSkew_3", 100, 500, 10000, 4, 0.8f, 2, 10, new double[] { 0.6f, 0.3f }); } public void testBigRecords() throws Exception { // 600 bytes for each small kv, and also output 60 big // records, 20 at the beginning, 20 in the middle, and 20 at the end runTest("testSpillBigRecords_1", 100, 500, 10000, 500000, 3000, 1, 0.8f, 1, 1, new double[] { 1.0f }, new int[] { 20 }, new int[] { 20 }, new int[] { 20 }); runTest("testSpillBigRecords_2", 100, 500, 10000, 500000, 3000, 1, 0.8f, 2, 1, new double[] { 1.0f }, new int[] { 20 }, new int[] { 20 }, new int[] { 20 }); runTest("testSpillBigRecords_3", 100, 500, 10000, 500000, 3000, 1, 0.8f, 2, 2, new double[] { 0.5f, 0.5f }, new int[] { 20, 20 }, new int[] { 20, 20 }, new int[] { 20, 20 }); } }