Source code

Java tutorial


Here is the source code for org.apache.hadoop.mapred.TestNewCollector.


/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.TestMapCollection.FakeIF;
import org.apache.hadoop.mapred.lib.NullOutputFormat;

import junit.framework.TestCase;

public class TestNewCollector extends TestCase {

    private static Log LOG = LogFactory.getLog(TestNewCollector.class);

    private MiniMRCluster mrCluster;

    /**
     * Creates the MiniMRCluster that the tests in this class submit jobs to.
     * An IOException raised while creating it is rethrown as a RuntimeException.
     */
    protected void setUp() {
        JobConf conf = new JobConf();
        try {
            // "file:///" is passed as the file system URI; the numeric arguments
            // are presumably the task tracker count (2) and the directory count
            // (3) - TODO confirm against the MiniMRCluster constructor.
            mrCluster = new MiniMRCluster(2, "file:///", 3, null, null, conf);
        } catch (IOException e) {
            throw new RuntimeException(e);
    // NOTE(review): the body of tearDown is not present in this copy of the
    // file; presumably it shuts down mrCluster - TODO confirm against upstream.
    protected void tearDown() {

    /**
     * Static helpers that build BytesWritable instances for the test jobs.
     */
    public static class BytesWritableFactory {

        // NOTE(review): never referenced by the code visible in this class.
        private static Random random = new Random();

        /**
         * Wraps a newly allocated byte array of the given size in a BytesWritable.
         *
         * NOTE(review): as written the array is never filled, so the content is
         * all zero bytes despite the method name; presumably a call such as
         * random.nextBytes(bytes) belongs here - TODO confirm.
         *
         * @param size length of the backing byte array
         * @return a BytesWritable wrapping the new array
         */
        public static BytesWritable getRandomBytesWritable(int size) {
            byte[] bytes = new byte[size];
            BytesWritable bytesWritable = new BytesWritable(bytes);
            return bytesWritable;

        /**
         * Builds a BytesWritable whose content is the given bytes repeated
         * repeatNum times back to back.
         *
         * @param bytes     the pattern to repeat
         * @param repeatNum how many copies of the pattern to concatenate
         * @return a BytesWritable wrapping the concatenated array
         */
        public static BytesWritable getRepeatedBytesWritable(byte[] bytes, int repeatNum) {
            int newLen = bytes.length * repeatNum;
            byte[] bb = new byte[newLen];
            for (int i = 0; i < repeatNum; i++) {
                // Copy number i lands at offset bytes.length * i.
                System.arraycopy(bytes, 0, bb, bytes.length * i, bytes.length);
            BytesWritable bytesWritable = new BytesWritable(bb);
            return bytesWritable;

    /**
     * BytesWritable key whose hash code can be overridden through
     * setHashCode(int).
     */
    public static class TestNewCollectorKey extends BytesWritable {
        // Stored hash code; a negative value means no usable value is stored yet.
        private int hashCode = -1;

        // NOTE(review): the constructor bodies are not present in this copy of the
        // file, so how the bytes of k are taken over cannot be told from here.
        public TestNewCollectorKey(BytesWritable k) {

        public TestNewCollectorKey() {

        /**
         * Returns the stored hash code. While the stored value is negative it is
         * first replaced by super.hashCode().
         */
        public int hashCode() {
            if (hashCode < 0) {
                hashCode = super.hashCode();
            return hashCode;

        /**
         * Overrides the value returned by hashCode().
         *
         * @param hashCode the value to store; a negative value makes hashCode()
         *                 fall back to super.hashCode() again
         */
        public void setHashCode(int hashCode) {
            this.hashCode = hashCode;

    public static class RecordNumStore {

        private static String RECORD_NUM_CONF = "test.reducer.records.num";

        /**
         * Conf key for the number of big records to spill right after the mapper
         * starts. The value is a comma-separated list with one entry per
         * reducer.
         */
        private static String BIG_RECORDS_BEGINNING = "test.reducer.bigrecords.start";

        /**
         * Conf key for the number of big records to spill in the middle of a
         * mapper. The value is a comma-separated list with one entry per
         * reducer.
         */
        private static String BIG_RECORDS_MIDDLE = "test.reducer.bigrecords.middle";

        /**
         * Conf key for the number of big records to spill right before the mapper
         * finishes. The value is a comma-separated list with one entry per
         * reducer.
         */
        private static String BIG_RECORDS_END = "test.reducer.bigrecords.end";

        // Job configuration this store was created for.
        private JobConf currentJobConf;
        // Record counts parsed from RECORD_NUM_CONF; only assigned in init().
        private List<Integer> reducerToReciveRecNum;
        // Lazily allocated result array of getMapperOutNumForEachReducer().
        private int[] mapperOutNumForEachReducer;

        // Shared instance handed out by getInst(); guarded by instanceLock.
        private static RecordNumStore inst;
        private static Object instanceLock = new Object();

        // Private: instances are obtained through getInst(JobConf).
        private RecordNumStore(JobConf job) {
            this.currentJobConf = job;

        /**
         * Rebuilds reducerToReciveRecNum from the comma-separated RECORD_NUM_CONF
         * value of the given job.
         *
         * NOTE(review): the bodies of the inner "if" and of both loops are not
         * present in this copy of the file; presumably blank entries are skipped,
         * the remaining entries are parsed and added, and the list is then padded
         * up to numMappers entries - TODO confirm against upstream.
         *
         * NOTE(review): the list is sized and padded by the number of map tasks,
         * while setJobConf writes one entry per reducer and
         * getMapperOutNumForEachReducer reads it by reducer index - confirm that
         * this is intended.
         *
         * @param job configuration to read RECORD_NUM_CONF and the map task count from
         */
        public void init(JobConf job) {
            String recordNumStr = job.get(RECORD_NUM_CONF);

            int numMappers = job.getNumMapTasks();
            reducerToReciveRecNum = new ArrayList<Integer>(numMappers);
            if (recordNumStr != null) {
                String[] splits = recordNumStr.split(",");
                for (String num : splits) {
                    if (num == null || num.trim().equals("")) {

            for (int i = reducerToReciveRecNum.size(); i < numMappers; i++) {

        /**
         * Returns the shared RecordNumStore. A new instance replaces the shared one
         * when job is non-null and either no instance exists yet or job is not the
         * same object (reference comparison) as the current instance's JobConf.
         * With a null job the existing instance, possibly null, is returned as is.
         *
         * NOTE(review): no call to init(job) is visible here, although
         * reducerToReciveRecNum is only populated by init(); a line may be missing
         * from this copy of the file - TODO confirm.
         *
         * @param job configuration the store should belong to, or null to fetch
         *            whatever instance currently exists
         */
        public static RecordNumStore getInst(JobConf job) {
            synchronized (instanceLock) {
                if (job != null && (inst == null || job != inst.getCurrentJobConf())) {
                    inst = new RecordNumStore(job);
                return inst;

        // Returns the JobConf this store was constructed with.
        protected JobConf getCurrentJobConf() {
            return currentJobConf;

        /**
         * Returns, for every reducer index, the stored record count for that
         * reducer divided (integer division) by the number of map tasks.
         *
         * The result array is allocated once and reused, so callers share it.
         * reducerToReciveRecNum must already hold at least numReducers entries;
         * it is only assigned in init().
         */
        public synchronized int[] getMapperOutNumForEachReducer() {
            int numReducers = currentJobConf.getNumReduceTasks();
            int numMappers = currentJobConf.getNumMapTasks();
            if (mapperOutNumForEachReducer == null) {
                mapperOutNumForEachReducer = new int[numReducers];

            List<Integer> reducerToReciveNum = this.reducerToReciveRecNum;
            for (int i = 0; i < numReducers; i++) {
                mapperOutNumForEachReducer[i] = reducerToReciveNum.get(i) / numMappers;

            return mapperOutNumForEachReducer;

        // Parses the BIG_RECORDS_BEGINNING value of the current job into an int
        // array, one entry per comma-separated field.
        public synchronized int[] getBigRecodsStart() {
            String bigRecordNumStartStr = currentJobConf.get(BIG_RECORDS_BEGINNING);
            int[] bigRecordsStart = splitConfToIntArray(bigRecordNumStartStr);

            return bigRecordsStart;

        // Parses the BIG_RECORDS_MIDDLE value of the current job into an int
        // array, one entry per comma-separated field.
        public synchronized int[] getBigRecodsMiddle() {
            String bigRecordNumMiddleStr = currentJobConf.get(BIG_RECORDS_MIDDLE);
            int[] bigRecordsMiddle = splitConfToIntArray(bigRecordNumMiddleStr);

            return bigRecordsMiddle;

        // Parses the BIG_RECORDS_END value of the current job into an int
        // array, one entry per comma-separated field.
        public synchronized int[] getBigRecodsEnd() {
            String bigRecordNumEndStr = currentJobConf.get(BIG_RECORDS_END);
            int[] bigRecordsEnd = splitConfToIntArray(bigRecordNumEndStr);

            return bigRecordsEnd;

        private int[] splitConfToIntArray(String confStr) {
            String[] splits = confStr.split(",");
            int[] numArray = new int[splits.length];
            for (int i = 0; i < splits.length; i++) {
                String num = splits[i];
                if (num == null || num.trim().equals("")) {
                    numArray[i] = 0;
                } else {
                    numArray[i] = Integer.parseInt(num);
            return numArray;

        /**
         * Removes the first entry equal to reducerNum from reducerToReciveRecNum.
         * The argument is boxed, so it is matched as a value, not used as an index.
         *
         * @param reducerNum the record count to look for
         * @return true if such an entry was present and has been removed
         */
        public boolean checkReducerReceiveRecNum(int reducerNum) {
            return reducerToReciveRecNum.remove(Integer.valueOf(reducerNum));

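        /**
         * Each mapper emits the same number of records, and the
         * reducerRecPercents array decides how many of them go to each reducer.
         * One reducer will receive the same number of records from different
         * mappers.
         *
         * @param numReducers
         *          number of reducers to run
         * @param mappers
         *          number of mappers to run
         * @param recordNumPerMapper
         *          how many records each mapper outputs
         * @param reducerRecPercents
         *          for one mapper, how to allocate output records to reducers
         * @param numBigRecordsStart
         *          per-reducer number of big records to spill right after the
         *          mapper starts; null means none
         * @param numBigRecordsMiddle
         *          per-reducer number of big records to spill in the middle of a
         *          mapper; null means none
         * @param numBigRecordsEnd
         *          per-reducer number of big records to spill right before the
         *          mapper finishes; null means none
         * @param job
         *          the job configuration that receives the computed settings
         */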
        // Computes how many records each reducer should receive and stores that,
        // together with the three per-reducer big-record lists, in the given job
        // as comma-separated strings. The four resulting values are also printed
        // to System.out.
        public static void setJobConf(int numReducers, int mappers, int recordNumPerMapper,
                double[] reducerRecPercents, int[] numBigRecordsStart, int[] numBigRecordsMiddle,
                int[] numBigRecordsEnd, JobConf job) {
            int[] recNumReducerOneMapper = new int[numReducers];
            // Fraction of one mapper's output not yet claimed by reducerRecPercents.
            double left = 1.0f;
            // Number of leading reducers whose share is given explicitly.
            int preAllocated = 0;
            // Records of one mapper not yet assigned to any reducer.
            int leftToAllocate = recordNumPerMapper;

            // A missing big-record array means "no big records": a new int array
            // is all zeros.
            if (numBigRecordsStart == null) {
                numBigRecordsStart = new int[numReducers];

            if (numBigRecordsMiddle == null) {
                numBigRecordsMiddle = new int[numReducers];

            if (numBigRecordsEnd == null) {
                numBigRecordsEnd = new int[numReducers];

            if (reducerRecPercents != null) {
                if (reducerRecPercents.length > numReducers) {
                    throw new IllegalArgumentException("percents array length is " + reducerRecPercents.length
                            + " while numReducers is " + numReducers);
                preAllocated = reducerRecPercents.length;
            // Give each explicitly listed reducer its share; the cast truncates
            // toward zero.
            for (int i = 0; i < preAllocated; i++) {
                left -= reducerRecPercents[i];
                if (left < 0) {
                    throw new IllegalArgumentException("sum of percents array is bigger than 1.0");
                recNumReducerOneMapper[i] = (int) (recordNumPerMapper * reducerRecPercents[i]);
                leftToAllocate -= recNumReducerOneMapper[i];

            // Hand the remaining records, one at a time, to the reducers that had
            // no explicit share.
            // NOTE(review): in this copy of the file neither leftToAllocate nor
            // toAllocateReducer is updated inside the loop, so it could not
            // terminate as shown; presumably a decrement and an increment are
            // missing - TODO confirm against upstream.
            int toAllocateReducer = preAllocated;
            while (leftToAllocate > 0 && toAllocateReducer < numReducers) {
                recNumReducerOneMapper[toAllocateReducer] += 1;
                if (toAllocateReducer == numReducers) {
                    toAllocateReducer = preAllocated;

            // From here on the array holds the total per reducer over all mappers.
            // NOTE(review): the big-record check compares against that total,
            // whereas TestNewCollectorMapper.configure subtracts the big-record
            // counts from the per-mapper share - confirm which bound is intended.
            for (int i = 0; i < recNumReducerOneMapper.length; i++) {
                recNumReducerOneMapper[i] = recNumReducerOneMapper[i] * mappers;
                int bigRecords = numBigRecordsStart[i] + numBigRecordsMiddle[i] + numBigRecordsEnd[i];
                if (bigRecords > recNumReducerOneMapper[i]) {
                    throw new IllegalArgumentException("big records number is bigger than total.");

            String recordNumConf = getStringConf(recNumReducerOneMapper);
            job.set(RECORD_NUM_CONF, recordNumConf);

            String bigRecordStartConf = getStringConf(numBigRecordsStart);
            job.set(BIG_RECORDS_BEGINNING, bigRecordStartConf);

            String bigRecordMiddleConf = getStringConf(numBigRecordsMiddle);
            job.set(BIG_RECORDS_MIDDLE, bigRecordMiddleConf);

            String bigRecordEndConf = getStringConf(numBigRecordsEnd);
            job.set(BIG_RECORDS_END, bigRecordEndConf);

            System.out.println("RECORD_NUM_CONF is " + recordNumConf);
            System.out.println("BIG_RECORDS_BEGINNING is " + bigRecordStartConf);
            System.out.println("BIG_RECORDS_MIDDLE is " + bigRecordMiddleConf);
            System.out.println("BIG_RECORDS_END is " + bigRecordEndConf);

        // Builds the string form of numArray that setJobConf stores in the job.
        // NOTE(review): the statements that append to sb are not present in this
        // copy of the file; presumably a "," separator goes before every element
        // except the first, matching the split(",") used when the value is read
        // back - TODO confirm against upstream.
        private static String getStringConf(int[] numArray) {
            StringBuilder sb = new StringBuilder();
            boolean first = true;
            for (int num : numArray) {
                if (first) {
                    first = false;
                } else {
            return sb.toString();

        // Sets every element of the given array to 0, in place.
        // NOTE(review): not called by any code visible in this class.
        private static void fillZero(int[] numBigRecordsStart) {
            for (int i = 0; i < numBigRecordsStart.length; i++) {
                numBigRecordsStart[i] = 0;

    // Returns a string built from the elements of numArray.
    // NOTE(review): the loop body is not present in this copy of the file, so
    // the exact format (separator, brackets) cannot be told from here.
    public static String toString(int[] numArray) {
        StringBuilder sb = new StringBuilder();
        for (int num : numArray) {

        return sb.toString();

    /**
     * Mapper that ignores its input and emits generated key/value pairs. The
     * per-reducer record counts and the key/value sizes are read from the job
     * configuration in configure().
     */
    public static class TestNewCollectorMapper
            implements Mapper<NullWritable, NullWritable, BytesWritable, BytesWritable> {

        // Sizes in bytes of normal and big keys/values; overwritten in configure().
        private int keylen = 1;
        private int vallen = 1;
        private int bigKeyLen = 10000;
        private int bigValLen = 10000;

        // All arrays below are indexed by reducer number.
        private int[] recNumForReducer;
        private int[] bigRecordsStart;
        private int[] normalKVNum;
        private int[] bigRecordsMiddle;
        private int[] bigRecordsEnd;

        /**
         * Loads the record counts from RecordNumStore and the key/value lengths
         * from the "test.*.length" settings, then derives the number of
         * normal-sized records per reducer as the per-mapper count minus all big
         * records for that reducer.
         */
        public void configure(JobConf job) {
            recNumForReducer = RecordNumStore.getInst(job).getMapperOutNumForEachReducer();
            keylen = job.getInt("test.key.length", 1);
            vallen = job.getInt("test.value.length", 1);
            bigKeyLen = job.getInt("test.bigkey.length", 10000);
            bigValLen = job.getInt("test.bigvalue.length", 10000);
            bigRecordsStart = RecordNumStore.getInst(job).getBigRecodsStart();
            bigRecordsMiddle = RecordNumStore.getInst(job).getBigRecodsMiddle();
            bigRecordsEnd = RecordNumStore.getInst(job).getBigRecodsEnd();
            normalKVNum = new int[bigRecordsStart.length];
            for (int i = 0; i < normalKVNum.length; i++) {
                normalKVNum[i] = recNumForReducer[i]
                        - (bigRecordsStart[i] + bigRecordsMiddle[i] + bigRecordsEnd[i]);

        public void close() {

        /**
         * Emits records reducer by reducer. For the reducer at index i the order
         * of preference is: a "start" big record, then normal records and/or
         * "middle" big records, then an "end" big record. The outputed flag
         * tracks whether the current pass over the reducers emitted anything.
         *
         * NOTE(review): the statements that advance i, decrement the per-reducer
         * counters and leave the loop are not present in this copy of the file -
         * TODO confirm the exact control flow against upstream.
         */
        public void map(NullWritable key, NullWritable value, OutputCollector<BytesWritable, BytesWritable> output,
                Reporter reporter) throws IOException {
            boolean outputed = false;
            int i = -1;
            while (true) {
                // End of one pass over all reducers: restart from reducer 0.
                if (i == recNumForReducer.length) {
                    if (!outputed) {
                    i = 0;
                    outputed = false;
                if (recNumForReducer[i] == 0) {
                if (bigRecordsStart[i] > 0) {
                    collectBigKV(output, i);
                } else if (normalKVNum[i] > 0 || bigRecordsMiddle[i] > 0) {
                    if (normalKVNum[i] > 0) {
                        collectNormalKV(output, i);
                    if (bigRecordsMiddle[i] > 0) {
                        collectBigKV(output, i);
                } else if (bigRecordsEnd[i] > 0) {
                    collectBigKV(output, i);
                } else {
                    throw new RuntimeException("Uncatched situation.");
                outputed = true;

        /**
         * Emits one key/value pair with the given key and value lengths.
         *
         * NOTE(review): reducerNo is not used in the visible body; presumably it
         * is passed to collectorKey.setHashCode(...) so that the record is routed
         * to that reducer - TODO confirm.
         */
        private void collectKV(OutputCollector<BytesWritable, BytesWritable> output, int reducerNo, int keyLen,
                int valueLen) throws IOException {
            BytesWritable k = BytesWritableFactory.getRandomBytesWritable(keyLen);
            BytesWritable val = BytesWritableFactory.getRandomBytesWritable(valueLen);
            TestNewCollectorKey collectorKey = new TestNewCollectorKey(k);
            output.collect(collectorKey, val);

        // Emits one record using the big key/value lengths.
        private void collectBigKV(OutputCollector<BytesWritable, BytesWritable> output, int reduceNo)
                throws IOException {
            this.collectKV(output, reduceNo, bigKeyLen, bigValLen);

        // Emits one record using the normal key/value lengths.
        private void collectNormalKV(OutputCollector<BytesWritable, BytesWritable> output, int reducerNo)
                throws IOException {
            this.collectKV(output, reducerNo, keylen, vallen);


    /**
     * Reducer that produces no output. It checks the order in which keys arrive
     * and, on close, checks the number of records it received against
     * RecordNumStore.
     */
    public static class TestNewCollectorReducer
            implements Reducer<BytesWritable, BytesWritable, NullWritable, NullWritable> {

        // Count reported in close().
        // NOTE(review): no increment is visible in this copy of the file;
        // presumably it is counted in the values loop of reduce() - TODO confirm.
        private int received = 0;
        private JobConf job;
        // Copy of the previously seen key, used for the ordering check.
        private BytesWritable lastKey = null;
        private RawComparator rawComparator;

        public void configure(JobConf job) {
            this.job = job;
            this.rawComparator = WritableComparator.get(BytesWritable.class);

        /**
         * Asserts that the received count matches one of the expected per-reducer
         * counts still held by RecordNumStore; a matching entry is removed by the
         * check.
         */
        public void close() {
            boolean found = RecordNumStore.getInst(job).checkReducerReceiveRecNum(received);
            System.out.println("received count is " + received + ", found is " + found);
            assertTrue("Unexpected record count (" + received + ")", found);

        /**
         * Remembers a copy of the first key. For each later key it asserts that
         * the comparison result against the previous key is negative, then
         * remembers the new key.
         *
         * NOTE(review): the assignment to ret is garbled in this copy of the file;
         * presumably it reads rawComparator.compare(lastKey, key) - TODO confirm.
         * The body of the values loop is also missing.
         */
        public void reduce(BytesWritable key, Iterator<BytesWritable> values,
                OutputCollector<NullWritable, NullWritable> output, Reporter reporter) throws IOException {
            if (lastKey == null) {
                lastKey = new BytesWritable();
                // Copy only the valid prefix of the key's backing array.
                lastKey.set(key.getBytes(), 0, key.getLength());
            } else {
                int ret =, key);
                assertTrue("Incorrect comparasion result given by mapreduce", ret < 0);
                lastKey.set(key.getBytes(), 0, key.getLength());
            while (values.hasNext()) {

        // Prints the valid bytes of the key to System.out as two-digit hex
        // values, without separators.
        private void printBytes(BytesWritable key) {
            byte[] bytes = key.getBytes();
            for (int i = 0; i < key.getLength(); i++) {
                System.out.printf("%02x", bytes[i]);

    // Convenience overload for runs without big records: the big key/value
    // lengths are passed as 0 and the three big-record arrays as null.
    private void runTest(String name, int keyLen, int valLen, int recordsNumPerMapper, int sortMb, float spillPer,
            int numMapperTasks, int numReducerTask, double[] reducerRecPercents) throws Exception {
        this.runTest(name, keyLen, valLen, 0, 0, recordsNumPerMapper, sortMb, spillPer, numMapperTasks,
                numReducerTask, reducerRecPercents, null, null, null);

    // Builds a JobConf from the mini cluster, stores the sort-buffer settings,
    // the key/value lengths and the per-reducer record counts in it, and
    // creates the RecordNumStore for that configuration.
    // NOTE(review): this copy of the file is damaged from here on. The
    // setBoolean key is empty, the last line looks like a getInst(conf) call
    // fused with a logging call ("Running " + name), and the statements that
    // configure and submit the job are not visible - TODO restore from upstream.
    private void runTest(String name, int keyLen, int valLen, int bigKeyLen, int bigValLen, int recordsNumPerMapper,
            int sortMb, float spillPer, int numMapperTasks, int numReducerTask, double[] reducerRecPercents,
            int[] numBigRecordsStart, int[] numBigRecordsMiddle, int[] numBigRecordsEnd) throws Exception {
        JobConf conf = mrCluster.createJobConf();
        conf.setInt("io.sort.mb", sortMb);
        conf.set("io.sort.spill.percent", Float.toString(spillPer));
        conf.setInt("test.key.length", keyLen);
        conf.setInt("test.value.length", valLen);
        conf.setInt("test.bigkey.length", bigKeyLen);
        conf.setInt("test.bigvalue.length", bigValLen);
        conf.setBoolean("", true);

        RecordNumStore.setJobConf(numReducerTask, numMapperTasks, recordsNumPerMapper, reducerRecPercents,
                numBigRecordsStart, numBigRecordsMiddle, numBigRecordsEnd, conf);
        RecordNumStore.getInst(conf);"Running " + name);

    // Runs whose per-mapper output (about 2MB at most) is smaller than the
    // configured sort buffer (io.sort.mb).
    public void testNormalInMemory() throws Exception {
        // A single record with a 1-byte key and a 1-byte value, 40MB sort buffer.
        runTest("testSmallScale_1", 1, 1, 1, 40, 0.5f, 1, 1, new double[] { 1.0f });
        // 200 bytes for each record and 10000 records for each mapper, so the
        // serialized data is about 2MB per mapper and should fit in the 4MB
        // sort buffer.
        runTest("testSmallScale_2", 100, 100, 10000, 4, 0.8f, 1, 1, new double[] { 1.0f });
        // Same sizes with 10 mappers and 1 reducer.
        // NOTE(review): reuses the name "testSmallScale_2" of the previous run.
        runTest("testSmallScale_2", 100, 100, 10000, 4, 0.8f, 10, 1, new double[] { 1.0f });
        // Run 2 mappers and 1 reducer; each mapper outputs about 2MB of data
        // (200 bytes x 10000 records).
        runTest("testSmallScale_3", 100, 100, 10000, 4, 0.8f, 2, 1, new double[] { 1.0f });
        // Run 2 mappers and 2 reducers; each mapper outputs about 2MB of data,
        // split evenly between the reducers.
        runTest("testSmallScale_4", 100, 100, 10000, 4, 0.8f, 2, 2, new double[] { 0.5f, 0.5f });

    //test cases that require spilling data to disk
    public void testSpill() throws Exception {
        // 600 bytes for each record (100-byte key, 500-byte value) and 10K
        // records for each mapper, about 6MB in total per mapper. That exceeds
        // the 4MB sort buffer, so the data has to be spilled to disk.
        runTest("testSpill_1", 100, 500, 10000, 4, 0.8f, 1, 1, new double[] { 1.0f });
        runTest("testSpill_2", 100, 500, 10000, 4, 0.8f, 2, 1, new double[] { 1.0f });
        runTest("testSpill_3", 100, 500, 10000, 4, 0.8f, 2, 2, new double[] { 0.5f, 0.5f });

    //test cases that require spilling data to disk
    public void testSpillMore() throws Exception {
        // Same data as testSpill (600 bytes per record, 10K records, about 6MB
        // per mapper) but with a 1MB sort buffer instead of 4MB.
        runTest("testSpillMore_1", 100, 500, 10000, 1, 0.8f, 1, 1, new double[] { 1.0f });
        runTest("testSpillMore_2", 100, 500, 10000, 1, 0.8f, 2, 1, new double[] { 1.0f });
        runTest("testSpillMore_3", 100, 500, 10000, 1, 0.8f, 2, 2, new double[] { 0.5f, 0.5f });

    //test skew cases
    // Each run uses 10 reducers but lists percentages for only the first one or
    // two; the remaining share is left to RecordNumStore.setJobConf to assign.
    public void testSkew() throws Exception {
        // The first reducer gets 90% of the records.
        runTest("testSpillSkew_1", 100, 500, 10000, 4, 0.8f, 1, 10, new double[] { 0.9f });
        // The first reducer gets 40% and the second gets 40%.
        runTest("testSpillSkew_2", 100, 500, 10000, 4, 0.8f, 1, 10, new double[] { 0.4f, 0.4f });
        // The first reducer gets 60% and the second gets 30%; 2 mappers.
        runTest("testSpillSkew_3", 100, 500, 10000, 4, 0.8f, 2, 10, new double[] { 0.6f, 0.3f });

    public void testBigRecords() throws Exception {
        // 600 bytes for each small kv (100-byte key, 500-byte value), 3000
        // records per mapper and a 1MB sort buffer. For each reducer 60 of the
        // records are big (10000-byte key, 500000-byte value): 20 at the
        // beginning, 20 in the middle and 20 at the end.
        runTest("testSpillBigRecords_1", 100, 500, 10000, 500000, 3000, 1, 0.8f, 1, 1, new double[] { 1.0f },
                new int[] { 20 }, new int[] { 20 }, new int[] { 20 });
        runTest("testSpillBigRecords_2", 100, 500, 10000, 500000, 3000, 1, 0.8f, 2, 1, new double[] { 1.0f },
                new int[] { 20 }, new int[] { 20 }, new int[] { 20 });
        runTest("testSpillBigRecords_3", 100, 500, 10000, 500000, 3000, 1, 0.8f, 2, 2, new double[] { 0.5f, 0.5f },
                new int[] { 20, 20 }, new int[] { 20, 20 }, new int[] { 20, 20 });
