org.apache.hadoop.mapred.TestNewCollector.java Source code

Introduction

Here is the source code for org.apache.hadoop.mapred.TestNewCollector.java, a JUnit test that runs MapReduce jobs on a MiniMRCluster to exercise the map output collector with varying record sizes, record counts, and reducer skew.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.TestMapCollection.FakeIF;
import org.apache.hadoop.mapred.lib.NullOutputFormat;

import junit.framework.TestCase;

@SuppressWarnings("deprecation")
public class TestNewCollector extends TestCase {

    private static final Log LOG = LogFactory.getLog(TestNewCollector.class);

    private MiniMRCluster mrCluster;

    protected void setUp() {
        JobConf conf = new JobConf();
        try {
            mrCluster = new MiniMRCluster(2, "file:///", 3, null, null, conf);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    protected void tearDown() {
        mrCluster.shutdown();
    }

    public static class BytesWritableFactory {

        private static Random random = new Random();

        public static BytesWritable getRandomBytesWritable(int size) {
            byte[] bytes = new byte[size];
            random.nextBytes(bytes);
            BytesWritable bytesWritable = new BytesWritable(bytes);
            return bytesWritable;
        }

        public static BytesWritable getRepeatedBytesWritable(byte[] bytes, int repeatNum) {
            int newLen = bytes.length * repeatNum;
            byte[] bb = new byte[newLen];
            for (int i = 0; i < repeatNum; i++) {
                System.arraycopy(bytes, 0, bb, bytes.length * i, bytes.length);
            }
            BytesWritable bytesWritable = new BytesWritable(bb);
            return bytesWritable;
        }
    }

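    /**
     * A BytesWritable key whose hash code can be pinned to a chosen value, so that
     * (with the job's default HashPartitioner) a record can be routed to a specific
     * reduce task.
     */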
    public static class TestNewCollectorKey extends BytesWritable {
        private int hashCode = -1;

        public TestNewCollectorKey(BytesWritable k) {
            super(k.getBytes());
        }

        public TestNewCollectorKey() {
            super();
        }

        public int hashCode() {
            if (hashCode < 0) {
                hashCode = super.hashCode();
            }
            return hashCode;
        }

        public void setHashCode(int hashCode) {
            this.hashCode = hashCode;
        }
    }

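    /**
     * Shared bookkeeping for the record counts used by this test. The driver encodes
     * the expected per-reducer counts into the JobConf via setJobConf(), the mapper
     * reads them back through getInst(job) to decide how many records to emit, and
     * each reducer checks the count it actually received in close().
     */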
    public static class RecordNumStore {

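        /*
         * conf to specify the total number of records each reducer is expected to
         * receive; a comma-separated list of values, one value per reducer.
         */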
        private static String RECORD_NUM_CONF = "test.reducer.records.num";

        /*
         * conf to specify the number of big records to spill right after the mapper
         * starts; a comma-separated list of values, one value per reducer.
         */
        private static String BIG_RECORDS_BEGINNING = "test.reducer.bigrecords.start";

        /*
         * conf to specify the number of big records to spill in the middle of a
         * mapper's run; a comma-separated list of values, one value per reducer.
         */
        private static String BIG_RECORDS_MIDDLE = "test.reducer.bigrecords.middle";

        /*
         * conf to specify the number of big records to spill right before the mapper
         * finishes; a comma-separated list of values, one value per reducer.
         */
        private static String BIG_RECORDS_END = "test.reducer.bigrecords.end";

        private JobConf currentJobConf;
        private List<Integer> reducerToReciveRecNum;
        private int[] mapperOutNumForEachReducer;

        private static RecordNumStore inst;
        private static Object instanceLock = new Object();

        private RecordNumStore(JobConf job) {
            this.currentJobConf = job;
            init(job);
        }

        public void init(JobConf job) {
            String recordNumStr = job.get(RECORD_NUM_CONF);

            // one entry per reducer: the record count each reduce task is expected to receive
            int numReducers = job.getNumReduceTasks();
            reducerToReciveRecNum = new ArrayList<Integer>(numReducers);
            if (recordNumStr != null) {
                String[] splits = recordNumStr.split(",");
                for (String num : splits) {
                    if (num == null || num.trim().equals("")) {
                        continue;
                    }
                    reducerToReciveRecNum.add(Integer.parseInt(num));
                }
            }

            // pad with zeros so there is an entry for every reducer
            for (int i = reducerToReciveRecNum.size(); i < numReducers; i++) {
                reducerToReciveRecNum.add(0);
            }
        }

        public static RecordNumStore getInst(JobConf job) {
            synchronized (instanceLock) {
                if (job != null && (inst == null || job != inst.getCurrentJobConf())) {
                    inst = new RecordNumStore(job);
                }
                return inst;
            }
        }

        protected JobConf getCurrentJobConf() {
            return currentJobConf;
        }

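        /*
         * Per-mapper output count for each reducer: the per-reducer totals from
         * RECORD_NUM_CONF divided by the number of map tasks. For example,
         * "10000,10000" with 2 map tasks means each mapper sends 5000 records to
         * each of the 2 reducers.
         */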
        public synchronized int[] getMapperOutNumForEachReducer() {
            int numReducers = currentJobConf.getNumReduceTasks();
            int numMappers = currentJobConf.getNumMapTasks();
            if (mapperOutNumForEachReducer == null) {
                mapperOutNumForEachReducer = new int[numReducers];
            }

            List<Integer> reducerToReciveNum = this.reducerToReciveRecNum;
            for (int i = 0; i < numReducers; i++) {
                mapperOutNumForEachReducer[i] = reducerToReciveNum.get(i) / numMappers;
            }

            return mapperOutNumForEachReducer;
        }

        public synchronized int[] getBigRecordsStart() {
            String bigRecordNumStartStr = currentJobConf.get(BIG_RECORDS_BEGINNING);
            int[] bigRecordsStart = splitConfToIntArray(bigRecordNumStartStr);

            return bigRecordsStart;
        }

        public synchronized int[] getBigRecordsMiddle() {
            String bigRecordNumMiddleStr = currentJobConf.get(BIG_RECORDS_MIDDLE);
            int[] bigRecordsMiddle = splitConfToIntArray(bigRecordNumMiddleStr);

            return bigRecordsMiddle;
        }

        public synchronized int[] getBigRecordsEnd() {
            String bigRecordNumEndStr = currentJobConf.get(BIG_RECORDS_END);
            int[] bigRecordsEnd = splitConfToIntArray(bigRecordNumEndStr);

            return bigRecordsEnd;
        }

        private int[] splitConfToIntArray(String confStr) {
            String[] splits = confStr.split(",");
            int[] numArray = new int[splits.length];
            for (int i = 0; i < splits.length; i++) {
                String num = splits[i];
                if (num == null || num.trim().equals("")) {
                    numArray[i] = 0;
                } else {
                    numArray[i] = Integer.parseInt(num);
                }
            }
            return numArray;
        }

        public boolean checkReducerReceiveRecNum(int reducerNum) {
            return reducerToReciveRecNum.remove(Integer.valueOf(reducerNum));
        }

        /**
         * Each mapper emits the same number of records, and the reducerRecPercents
         * array decides how many of them go to each reducer. A given reducer
         * receives the same number of records from every mapper.
         * 
         * @param numReducers
         *          number of reducers to run
         * @param mappers
         *          number of mappers to run
         * @param recordNumPerMapper
         *          how many records each mapper outputs
         * @param reducerRecPercents
         *          for one mapper, the fraction of its output that goes to each reducer
         * @param numBigRecordsStart
         *          per reducer, the number of big records to emit right after the mapper starts
         * @param numBigRecordsMiddle
         *          per reducer, the number of big records to emit in the middle of the mapper's run
         * @param numBigRecordsEnd
         *          per reducer, the number of big records to emit right before the mapper finishes
         * @param job
         *          the JobConf the settings are written into
         */
        public static void setJobConf(int numReducers, int mappers, int recordNumPerMapper,
                double[] reducerRecPercents, int[] numBigRecordsStart, int[] numBigRecordsMiddle,
                int[] numBigRecordsEnd, JobConf job) {
            int[] recNumReducerOneMapper = new int[numReducers];
            double left = 1.0f;
            int preAllocated = 0;
            int leftToAllocate = recordNumPerMapper;

            if (numBigRecordsStart == null) {
                numBigRecordsStart = new int[numReducers];
                fillZero(numBigRecordsStart);
            }

            if (numBigRecordsMiddle == null) {
                numBigRecordsMiddle = new int[numReducers];
                fillZero(numBigRecordsMiddle);
            }

            if (numBigRecordsEnd == null) {
                numBigRecordsEnd = new int[numReducers];
                fillZero(numBigRecordsEnd);
            }

            if (reducerRecPercents != null) {
                if (reducerRecPercents.length > numReducers) {
                    throw new IllegalArgumentException("percents array length is " + reducerRecPercents.length
                            + " while numReducers is " + numReducers);
                }
                preAllocated = reducerRecPercents.length;
            }
            for (int i = 0; i < preAllocated; i++) {
                left -= reducerRecPercents[i];
                if (left < 0) {
                    throw new IllegalArgumentException("sum of percents array is bigger than 1.0");
                }
                recNumReducerOneMapper[i] = (int) (recordNumPerMapper * reducerRecPercents[i]);
                leftToAllocate -= recNumReducerOneMapper[i];
            }

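            // Any records not covered by reducerRecPercents are handed out one at a
            // time, round-robin, over the reducers that were not given an explicit
            // percentage. For example, with 10 reducers, 10000 records per mapper and
            // reducerRecPercents = { 0.9 }, reducer 0 gets 9000 of each mapper's
            // records and the remaining 1000 are spread over reducers 1 through 9.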
            int toAllocateReducer = preAllocated;
            while (leftToAllocate > 0 && toAllocateReducer < numReducers) {
                recNumReducerOneMapper[toAllocateReducer] += 1;
                toAllocateReducer++;
                if (toAllocateReducer == numReducers) {
                    toAllocateReducer = preAllocated;
                }
                leftToAllocate--;
            }

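            // Scale the per-mapper counts up to the per-reducer totals across all
            // mappers (this is what each reducer checks against), and sanity-check
            // that the per-mapper big-record counts do not exceed those totals.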
            for (int i = 0; i < recNumReducerOneMapper.length; i++) {
                recNumReducerOneMapper[i] = recNumReducerOneMapper[i] * mappers;
                int bigRecords = numBigRecordsStart[i] + numBigRecordsMiddle[i] + numBigRecordsEnd[i];
                if (bigRecords > recNumReducerOneMapper[i]) {
                    throw new IllegalArgumentException("big records number is bigger than total.");
                }
            }

            String recordNumConf = getStringConf(recNumReducerOneMapper);
            job.set(RECORD_NUM_CONF, recordNumConf);

            String bigRecordStartConf = getStringConf(numBigRecordsStart);
            job.set(BIG_RECORDS_BEGINNING, bigRecordStartConf);

            String bigRecordMiddleConf = getStringConf(numBigRecordsMiddle);
            job.set(BIG_RECORDS_MIDDLE, bigRecordMiddleConf);

            String bigRecordEndConf = getStringConf(numBigRecordsEnd);
            job.set(BIG_RECORDS_END, bigRecordEndConf);

            System.out.println("RECORD_NUM_CONF is " + recordNumConf);
            System.out.println("BIG_RECORDS_BEGINNING is " + bigRecordStartConf);
            System.out.println("BIG_RECORDS_MIDDLE is " + bigRecordMiddleConf);
            System.out.println("BIG_RECORDS_END is " + bigRecordEndConf);
        }

        private static String getStringConf(int[] numArray) {
            StringBuilder sb = new StringBuilder();
            boolean first = true;
            for (int num : numArray) {
                if (first) {
                    first = false;
                } else {
                    sb.append(",");
                }
                sb.append(num);
            }
            return sb.toString();
        }

        private static void fillZero(int[] numBigRecordsStart) {
            for (int i = 0; i < numBigRecordsStart.length; i++) {
                numBigRecordsStart[i] = 0;
            }
        }
    }

    public static String toString(int[] numArray) {
        StringBuilder sb = new StringBuilder();
        for (int num : numArray) {
            sb.append(num);
            sb.append(",");
        }

        return sb.toString();
    }

    public static class TestNewCollectorMapper
            implements Mapper<NullWritable, NullWritable, BytesWritable, BytesWritable> {

        private int keylen = 1;
        private int vallen = 1;
        private int bigKeyLen = 10000;
        private int bigValLen = 10000;

        private int[] recNumForReducer;
        private int[] bigRecordsStart;
        private int[] normalKVNum;
        private int[] bigRecordsMiddle;
        private int[] bigRecordsEnd;

        public void configure(JobConf job) {
            recNumForReducer = RecordNumStore.getInst(job).getMapperOutNumForEachReducer();
            keylen = job.getInt("test.key.length", 1);
            vallen = job.getInt("test.value.length", 1);
            bigKeyLen = job.getInt("test.bigkey.length", 10000);
            bigValLen = job.getInt("test.bigvalue.length", 10000);
            bigRecordsStart = RecordNumStore.getInst(job).getBigRecordsStart();
            bigRecordsMiddle = RecordNumStore.getInst(job).getBigRecordsMiddle();
            bigRecordsEnd = RecordNumStore.getInst(job).getBigRecordsEnd();
            normalKVNum = new int[bigRecordsStart.length];
            for (int i = 0; i < normalKVNum.length; i++) {
                normalKVNum[i] = recNumForReducer[i]
                        - (bigRecordsStart[i] + bigRecordsMiddle[i] + bigRecordsEnd[i]);
            }
        }

        public void close() {
        }

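        /*
         * Emits this mapper's share of records by cycling over the reducers: for each
         * reducer, first its "start" big records, then normal records interleaved with
         * "middle" big records, and finally its "end" big records. The loop terminates
         * once a full pass over all reducers produces no output.
         */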
        @Override
        public void map(NullWritable key, NullWritable value, OutputCollector<BytesWritable, BytesWritable> output,
                Reporter reporter) throws IOException {
            boolean outputed = false;
            int i = -1;
            while (true) {
                reporter.progress();
                i++;
                if (i == recNumForReducer.length) {
                    if (!outputed) {
                        break;
                    }
                    i = 0;
                    outputed = false;
                }
                if (recNumForReducer[i] == 0) {
                    continue;
                }
                if (bigRecordsStart[i] > 0) {
                    collectBigKV(output, i);
                    bigRecordsStart[i]--;
                    recNumForReducer[i]--;
                } else if (normalKVNum[i] > 0 || bigRecordsMiddle[i] > 0) {
                    if (normalKVNum[i] > 0) {
                        collectNormalKV(output, i);
                        normalKVNum[i]--;
                        recNumForReducer[i]--;
                    }
                    if (bigRecordsMiddle[i] > 0) {
                        collectBigKV(output, i);
                        bigRecordsMiddle[i]--;
                        recNumForReducer[i]--;
                    }
                } else if (bigRecordsEnd[i] > 0) {
                    collectBigKV(output, i);
                    bigRecordsEnd[i]--;
                    recNumForReducer[i]--;
                } else {
                    throw new RuntimeException("Unhandled situation.");
                }
                outputed = true;
            }
        }

        private void collectKV(OutputCollector<BytesWritable, BytesWritable> output, int reducerNo, int keyLen,
                int valueLen) throws IOException {
            BytesWritable k = BytesWritableFactory.getRandomBytesWritable(keyLen);
            BytesWritable val = BytesWritableFactory.getRandomBytesWritable(valueLen);
            TestNewCollectorKey collectorKey = new TestNewCollectorKey(k);
            collectorKey.setHashCode(reducerNo);
            output.collect(collectorKey, val);
        }

        private void collectBigKV(OutputCollector<BytesWritable, BytesWritable> output, int reduceNo)
                throws IOException {
            this.collectKV(output, reduceNo, bigKeyLen, bigValLen);
        }

        private void collectNormalKV(OutputCollector<BytesWritable, BytesWritable> output, int reducerNo)
                throws IOException {
            this.collectKV(output, reducerNo, keylen, vallen);
        }

    }

    public static class TestNewCollectorReducer
            implements Reducer<BytesWritable, BytesWritable, NullWritable, NullWritable> {

        private int received = 0;
        private JobConf job;
        private BytesWritable lastKey = null;
        private RawComparator rawComparator;

        public void configure(JobConf job) {
            this.job = job;
            this.rawComparator = WritableComparator.get(BytesWritable.class);
        }

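        /*
         * Verifies that the number of records this reducer received matches one of
         * the expected per-reducer totals stored in RecordNumStore.
         */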
        public void close() {
            boolean found = RecordNumStore.getInst(job).checkReducerReceiveRecNum(received);
            System.out.println("received count is " + received + ", found is " + found);
            assertTrue("Unexpected record count (" + received + ")", found);
        }

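        /*
         * Keys must arrive in strictly increasing order; every value is counted so
         * that close() can check the total against the expected count.
         */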
        @Override
        @SuppressWarnings("unchecked")
        public void reduce(BytesWritable key, Iterator<BytesWritable> values,
                OutputCollector<NullWritable, NullWritable> output, Reporter reporter) throws IOException {
            if (lastKey == null) {
                lastKey = new BytesWritable();
                lastKey.set(key.getBytes(), 0, key.getLength());
            } else {
                int ret = rawComparator.compare(lastKey, key);
                assertTrue("Incorrect comparison result given by mapreduce", ret < 0);
                lastKey.set(key.getBytes(), 0, key.getLength());
            }
            while (values.hasNext()) {
                values.next();
                ++received;
            }
        }

        private void printBytes(BytesWritable key) {
            byte[] bytes = key.getBytes();
            for (int i = 0; i < key.getLength(); i++) {
                System.out.printf("%02x", bytes[i]);
            }
            System.out.println();
        }
    }

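    /*
     * Configures and runs a job on the MiniMRCluster with the given key/value sizes,
     * io.sort.mb and spill threshold; TestNewCollectorReducer then asserts that each
     * reducer received the expected number of records in sorted order.
     */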
    private void runTest(String name, int keyLen, int valLen, int recordsNumPerMapper, int sortMb, float spillPer,
            int numMapperTasks, int numReducerTask, double[] reducerRecPercents) throws Exception {
        this.runTest(name, keyLen, valLen, 0, 0, recordsNumPerMapper, sortMb, spillPer, numMapperTasks,
                numReducerTask, reducerRecPercents, null, null, null);
    }

    private void runTest(String name, int keyLen, int valLen, int bigKeyLen, int bigValLen, int recordsNumPerMapper,
            int sortMb, float spillPer, int numMapperTasks, int numReducerTask, double[] reducerRecPercents,
            int[] numBigRecordsStart, int[] numBigRecordsMiddle, int[] numBigRecordsEnd) throws Exception {
        JobConf conf = mrCluster.createJobConf();
        conf.setInt("io.sort.mb", sortMb);
        conf.set("io.sort.spill.percent", Float.toString(spillPer));
        conf.setInt("test.key.length", keyLen);
        conf.setInt("test.value.length", valLen);
        conf.setInt("test.bigkey.length", bigKeyLen);
        conf.setInt("test.bigvalue.length", bigValLen);
        conf.setNumMapTasks(numMapperTasks);
        conf.setNumReduceTasks(numReducerTask);
        conf.setInputFormat(FakeIF.class);
        conf.setOutputFormat(NullOutputFormat.class);
        conf.setMapperClass(TestNewCollectorMapper.class);
        conf.setReducerClass(TestNewCollectorReducer.class);
        conf.setMapOutputKeyClass(TestNewCollectorKey.class);
        conf.setMapOutputValueClass(BytesWritable.class);
        conf.setBoolean("mapred.map.output.blockcollector", true);

        RecordNumStore.setJobConf(numReducerTask, numMapperTasks, recordsNumPerMapper, reducerRecPercents,
                numBigRecordsStart, numBigRecordsMiddle, numBigRecordsEnd, conf);
        RecordNumStore.getInst(conf);
        LOG.info("Running " + name);
        JobClient.runJob(conf);
    }

    public void testNormalInMemory() throws Exception {
        runTest("testSmallScale_1", 1, 1, 1, 40, 0.5f, 1, 1, new double[] { 1.0f });
        // 200 bytes for each record and 10000 records for each mapper, so each
        // mapper produces about 2MB of serialized data, which should fit in memory.
        runTest("testSmallScale_2", 100, 100, 10000, 4, 0.8f, 1, 1, new double[] { 1.0f });
        // same as above, but with 10 mappers
        runTest("testSmallScale_2b", 100, 100, 10000, 4, 0.8f, 10, 1, new double[] { 1.0f });
        // run 2 mappers and 1 reducer; each mapper outputs about 2MB of data.
        runTest("testSmallScale_3", 100, 100, 10000, 4, 0.8f, 2, 1, new double[] { 1.0f });
        // run 2 mappers and 2 reducers; each mapper outputs about 2MB of data, split evenly between the reducers.
        runTest("testSmallScale_4", 100, 100, 10000, 4, 0.8f, 2, 2, new double[] { 0.5f, 0.5f });
    }

    //test cases that require spilling data to disk
    public void testSpill() throws Exception {
        // 600 bytes for each record and 10K records for each mapper, about 6MB of
        // data in total, so spilling to disk is required
        runTest("testSpill_1", 100, 500, 10000, 4, 0.8f, 1, 1, new double[] { 1.0f });
        runTest("testSpill_2", 100, 500, 10000, 4, 0.8f, 2, 1, new double[] { 1.0f });
        runTest("testSpill_3", 100, 500, 10000, 4, 0.8f, 2, 2, new double[] { 0.5f, 0.5f });
    }

    //test cases that require spilling data to disk
    public void testSpillMore() throws Exception {
        // 600 bytes for each record and 10K records for each mapper, about 6MB of
        // data in total, so spilling to disk is required
        runTest("testSpillMore_1", 100, 500, 10000, 1, 0.8f, 1, 1, new double[] { 1.0f });
        runTest("testSpillMore_2", 100, 500, 10000, 1, 0.8f, 2, 1, new double[] { 1.0f });
        runTest("testSpillMore_3", 100, 500, 10000, 1, 0.8f, 2, 2, new double[] { 0.5f, 0.5f });
    }

    //test skew cases
    public void testSkew() throws Exception {
        // first reducer gets 90% of the records; the remainder is spread over the other reducers
        runTest("testSpillSkew_1", 100, 500, 10000, 4, 0.8f, 1, 10, new double[] { 0.9f });
        // first reducer gets 40% and the second gets 40%; the remainder is spread over the rest
        runTest("testSpillSkew_2", 100, 500, 10000, 4, 0.8f, 1, 10, new double[] { 0.4f, 0.4f });
        // first reducer gets 60% and the second gets 30%; the remainder is spread over the rest
        runTest("testSpillSkew_3", 100, 500, 10000, 4, 0.8f, 2, 10, new double[] { 0.6f, 0.3f });
    }

    public void testBigRecords() throws Exception {
        // 600 bytes for each small record, and each mapper also outputs 60 big
        // records: 20 at the beginning, 20 in the middle, and 20 at the end
        runTest("testSpillBigRecords_1", 100, 500, 10000, 500000, 3000, 1, 0.8f, 1, 1, new double[] { 1.0f },
                new int[] { 20 }, new int[] { 20 }, new int[] { 20 });
        runTest("testSpillBigRecords_2", 100, 500, 10000, 500000, 3000, 1, 0.8f, 2, 1, new double[] { 1.0f },
                new int[] { 20 }, new int[] { 20 }, new int[] { 20 });
        runTest("testSpillBigRecords_3", 100, 500, 10000, 500000, 3000, 1, 0.8f, 2, 2, new double[] { 0.5f, 0.5f },
                new int[] { 20, 20 }, new int[] { 20, 20 }, new int[] { 20, 20 });
    }

}