eu.stratosphere.pact.runtime.hash.MultiLevelHashITCase.java Source code

Java tutorial

Introduction

Here is the source code for eu.stratosphere.pact.runtime.hash.MultiLevelHashITCase.java

Source

/***********************************************************************************************************************
 *
 * Copyright (C) 2010 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/

package eu.stratosphere.pact.runtime.hash;

import java.util.ArrayList;
import java.util.Formatter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Assert;
import org.junit.Test;

import eu.stratosphere.pact.runtime.hash.MultiLevelHashTester.BucketBoundaries;
import eu.stratosphere.pact.runtime.hash.util.LastBitsToRange;
import eu.stratosphere.pact.runtime.hash.util.RandomIterator;
import eu.stratosphere.pact.runtime.hash.util.RangeCalculator;
import eu.stratosphere.pact.runtime.hash.util.RangeIterator;
import eu.stratosphere.pact.runtime.hash.util.StepRangeIterator;

/**
 * Test distribution of hash function for multiple levels
 * 
 * @author Matthias Ringwald
 * 
 */
public class MultiLevelHashITCase {

    private static final Log LOG = LogFactory.getLog(MultiLevelHashITCase.class);

    private static final long SEED = 561349061987311L;

    @Test
    public void testStepSeventeen() {

        // Define numbers of buckets on each level
        RangeCalculator[] rangeCalculators = { new LastBitsToRange(10), // 2^10=1024 Buckets on level 0
                new LastBitsToRange(10), // 2^10=1024 Buckets on level 1
                new LastBitsToRange(10) }; // 2^10=1024 Buckets on level 2

        Iterator<Integer> importIterator = new StepRangeIterator(-30000000, 30000000, 17);

        MultiLevelHashTester ht = new MultiLevelHashTester(importIterator, rangeCalculators);

        BucketBoundaries[] boundaries = { new BucketBoundaries(3000, 3700, 5, 0.01),
                new BucketBoundaries(0, 20, BucketBoundaries.MAX_EMPTY_UNBOUNDED, 0.0001),
                new BucketBoundaries(0, 3, BucketBoundaries.MAX_EMPTY_UNBOUNDED, 0.000001) };

        LOG.debug("Start Step Seventeen hash test");
        ht.runTest(boundaries);
        LOG.debug("End Step Seventeen hash test");
    }

    @Test
    public void testThreeLevel() {

        // Define numbers of buckets on each level
        RangeCalculator[] rangeCalculators = { new LastBitsToRange(10), // 2^10=1024 Buckets on level 0
                new LastBitsToRange(10), // 2^10=1024 Buckets on level 1
                new LastBitsToRange(10) }; // 2^10=1024 Buckets on level 2

        Iterator<Integer> importIterator = new RangeIterator(-1000000, 1000000);

        MultiLevelHashTester ht = new MultiLevelHashTester(importIterator, rangeCalculators);

        BucketBoundaries[] boundaries = { new BucketBoundaries(1800, 2110, 5, 0.01),
                new BucketBoundaries(0, 15, BucketBoundaries.MAX_EMPTY_UNBOUNDED, 0.0001),
                new BucketBoundaries(0, 2, BucketBoundaries.MAX_EMPTY_UNBOUNDED, 0.000001) };

        LOG.debug("Start Three Level hash test");
        ht.runTest(boundaries);
        LOG.debug("End Three Level hash test");
    }

    @Test
    public void testRandom() {

        // Define numbers of buckets on each level
        RangeCalculator[] rangeCalculators = { new LastBitsToRange(10), // 2^10=1024 Buckets on level 0
                new LastBitsToRange(10), // 2^10=1024 Buckets on level 1
                new LastBitsToRange(10) }; // 2^10=1024 Buckets on level 2

        Iterator<Integer> importIterator = new RandomIterator(SEED, 2000000);

        MultiLevelHashTester ht = new MultiLevelHashTester(importIterator, rangeCalculators);

        BucketBoundaries[] boundaries = { new BucketBoundaries(1800, 2110, 5, 0.01),
                new BucketBoundaries(0, 15, BucketBoundaries.MAX_EMPTY_UNBOUNDED, 0.0001),
                new BucketBoundaries(0, 2, BucketBoundaries.MAX_EMPTY_UNBOUNDED, 0.000001) };

        LOG.debug("Start Random hash test");
        ht.runTest(boundaries);
        LOG.debug("End Random hash test");
    }

    @Test
    public void testTwoLevel() {

        // Define numbers of buckets on each level
        RangeCalculator[] rangeCalculators = { new LastBitsToRange(12), // 2^12=4096 Buckets on level 0
                new LastBitsToRange(12) }; // 2^12=4096 Buckets on level 1

        Iterator<Integer> importIterator = new RangeIterator(-1000000, 1000000);

        MultiLevelHashTester ht = new MultiLevelHashTester(importIterator, rangeCalculators);

        BucketBoundaries[] boundaries = { new BucketBoundaries(400, 600, 5, 0.01),
                new BucketBoundaries(0, 4, BucketBoundaries.MAX_EMPTY_UNBOUNDED, 0.0001) };

        LOG.debug("Start Two Level hash test");
        ht.runTest(boundaries);
        LOG.debug("End Two Level hash test");
    }

}

class MultiLevelHashTester {

    private static final Log LOG = LogFactory.getLog(MultiLevelHashTester.class);

    private final int maxLevel;
    private final Iterator<Integer> importIterator;
    private final RangeCalculator[] rangeCalculators;
    private final HashMap<Integer, Object> rootMap = new HashMap<Integer, Object>();
    private final ArrayList<SortedMap<Integer, Integer>> bucketSizesPerLevel;

    /**
     * 
     * @param hashFunction
     *            HashFunction to be tested
     * @param importIterator
     *            Iterator over values to be used in test run
     * @param rangeCalculators
     *            For each level a range calculator which defines how to map
     *            from hash to bucket
     */
    public MultiLevelHashTester(Iterator<Integer> importIterator, RangeCalculator[] rangeCalculators) {
        this.importIterator = importIterator;
        this.rangeCalculators = rangeCalculators;
        this.maxLevel = rangeCalculators.length;
        this.bucketSizesPerLevel = new ArrayList<SortedMap<Integer, Integer>>(maxLevel);

        for (int i = 0; i < maxLevel; i++) {
            bucketSizesPerLevel.add(i, new TreeMap<Integer, Integer>());
        }
    }

    /**
     * Run the test by: - Adding values from iterator to map - Creating
     * histogram over bucket sizes per level - Printing histogram informations
     * 
     * @param boundaries
     *            Expected results for each level
     */
    public void runTest(BucketBoundaries[] boundaries) {
        addValues();
        collectStatistics(rootMap, 0);
        if (LOG.isDebugEnabled()) {
            printStatistics();
        }
        checkBoundaries(boundaries);
    }

    private void checkBoundaries(BucketBoundaries[] boundaries) {
        for (int level = 0; level < boundaries.length; level++) {
            int lowerBound = boundaries[level].getLowerBound();
            int upperBound = boundaries[level].getUpperBound();
            int bucketCountInLevel = 0;
            int bucketCountOutOfRange = 0;

            SortedMap<Integer, Integer> levelMap = bucketSizesPerLevel.get(level);
            Iterator<Integer> bucketSizeIterator = levelMap.keySet().iterator();

            while (bucketSizeIterator.hasNext()) {
                int bucketSize = bucketSizeIterator.next();
                if (bucketSize != 0) {
                    int countForBucketSize = levelMap.get(bucketSize);
                    bucketCountInLevel += countForBucketSize;
                    if (lowerBound > bucketSize || upperBound < bucketSize) {
                        bucketCountOutOfRange += countForBucketSize;
                    }

                }
            }
            double bucketsOutOfRange = (double) bucketCountOutOfRange / (double) bucketCountInLevel;
            double maxBucketsOutOfRange = boundaries[level].getPercentOutOfRange();
            Assert.assertTrue(
                    "More than " + (maxBucketsOutOfRange * 100) + "% of buckets out of range in level " + level,
                    bucketsOutOfRange <= maxBucketsOutOfRange);

            int maxEmpty = boundaries[level].getMaxEmpty();
            Assert.assertTrue("More than " + maxEmpty + " empty buckets in level " + level,
                    (maxEmpty == BucketBoundaries.MAX_EMPTY_UNBOUNDED)
                            || (levelMap.get(0) <= boundaries[level].getMaxEmpty()));
        }
    }

    /**
     * Find for each value the right bucket on the deepest level and increase
     * its count
     */
    @SuppressWarnings("unchecked")
    private void addValues() {

        while (importIterator.hasNext()) {
            int nextValue = importIterator.next();

            HashMap<Integer, Object> mapForCurrentLevel = rootMap;

            for (int i = 0; i < maxLevel - 1; i++) {
                int hashValue = MutableHashTable.hash(nextValue, i);
                int bucket = rangeCalculators[i].getBucket(hashValue);
                Object nextObject = mapForCurrentLevel.get(bucket);
                if (nextObject == null) {
                    HashMap<Integer, Object> mapForNextLevel = new HashMap<Integer, Object>();
                    mapForCurrentLevel.put(bucket, mapForNextLevel);
                    mapForCurrentLevel = mapForNextLevel;

                } else {
                    mapForCurrentLevel = (HashMap<Integer, Object>) nextObject;
                }
            }

            int lastHashValue = MutableHashTable.hash(nextValue, maxLevel - 1);
            int deepestBucketNr = rangeCalculators[maxLevel - 1].getBucket(lastHashValue);
            Object countOnDeepestLevel = mapForCurrentLevel.get(deepestBucketNr);
            if (countOnDeepestLevel == null) {
                mapForCurrentLevel.put(deepestBucketNr, 1);
            } else {
                mapForCurrentLevel.put(deepestBucketNr, ((Integer) countOnDeepestLevel) + 1);
            }

        }
    }

    private void printStatistics() {
        for (int level = 0; level < maxLevel; level++) {
            int bucketCountInLevel = 0;

            SortedMap<Integer, Integer> levelMap = bucketSizesPerLevel.get(level);
            Iterator<Integer> bucketSizeIterator = levelMap.keySet().iterator();

            LOG.debug("Statistics for level: " + level);
            LOG.debug("----------------------------------------------");
            LOG.debug("");
            LOG.debug("Bucket Size |      Count");
            LOG.debug("------------------------");

            int i = 0;
            while (bucketSizeIterator.hasNext()) {
                int bucketSize = bucketSizeIterator.next();
                if (bucketSize != 0) {
                    int countForBucketSize = levelMap.get(bucketSize);
                    bucketCountInLevel += countForBucketSize;
                    Formatter formatter = new Formatter();
                    formatter.format(" %10d | %10d", bucketSize, countForBucketSize);

                    if (levelMap.size() < 20 || i < 3 || i >= (levelMap.size() - 3)) {
                        LOG.debug(formatter.out());
                    } else if (levelMap.size() / 2 == i) {
                        LOG.debug("         .. |         ..");
                        LOG.debug(formatter.out());
                        LOG.debug("         .. |         ..");
                    }
                    i++;
                }
            }

            LOG.debug("");
            LOG.debug("Number of non-empty buckets in level: " + bucketCountInLevel);
            LOG.debug("Number of empty buckets in level    : " + levelMap.get(0));
            LOG.debug("Number of different bucket sizes    : " + (levelMap.size() - 1));
            LOG.debug("");
            LOG.debug("");
            LOG.debug("");
        }
    }

    /**
     * Create histogram over bucket sizes
     * 
     * @param map
     *            Map to be analyzed
     * @param level
     *            Level on which the map is located in
     * @return The total count of hashed values in the map
     */
    private int collectStatistics(HashMap<Integer, Object> map, int level) {
        SortedMap<Integer, Integer> bucketSizesForLevel = bucketSizesPerLevel.get(level);

        Iterator<Object> bucketIterator = map.values().iterator();
        int bucketCount = 0;
        int totalValueCount = 0;

        while (bucketIterator.hasNext()) {
            bucketCount++;

            Integer hashValuesInBucket;
            // If we are already on the deepest level, get the count in the
            // bucket, otherwise
            // recursively examine the subtree
            if (level == maxLevel - 1) {
                hashValuesInBucket = (Integer) bucketIterator.next();
            } else {
                @SuppressWarnings("unchecked")
                HashMap<Integer, Object> nextMap = (HashMap<Integer, Object>) bucketIterator.next();
                hashValuesInBucket = collectStatistics(nextMap, level + 1);
            }
            totalValueCount += hashValuesInBucket;
            Integer countOfBucketSizes = bucketSizesForLevel.get(hashValuesInBucket);
            if (countOfBucketSizes == null) {
                countOfBucketSizes = 1;
            } else {
                countOfBucketSizes += 1;
            }
            bucketSizesForLevel.put(hashValuesInBucket, countOfBucketSizes);
        }

        Integer countOfEmptyBuckets = bucketSizesForLevel.get(0);
        if (countOfEmptyBuckets == null) {
            countOfEmptyBuckets = rangeCalculators[level].getBucketCount() - bucketCount;
        } else {
            countOfEmptyBuckets += rangeCalculators[level].getBucketCount() - bucketCount;
        }
        bucketSizesForLevel.put(0, countOfEmptyBuckets);

        return totalValueCount;
    }

    /**
     * Expected results for bucket sizes per level
     * 
     * @author Matthias Ringwald
     *
     */
    static class BucketBoundaries {

        public static final int MAX_EMPTY_UNBOUNDED = -1;
        private int lowerBound;
        private int upperBound;
        private int maxEmpty;
        private double percentOutOfRange;

        /**
         * 
         * 
         * @param lowerBound Lower bound for bucket sizes
         * @param upperBound Upper bound for bucket sizes
         * @param maxEmpty Maximum number of empty buckets
         * @param percentOutOfRange Maximum percentage of buckets out of range
         */
        public BucketBoundaries(int lowerBound, int upperBound, int maxEmpty, double percentOutOfRange) {
            this.lowerBound = lowerBound;
            this.upperBound = upperBound;
            this.maxEmpty = maxEmpty;
            this.percentOutOfRange = percentOutOfRange;
        }

        /**
         * 
         * @return Lower bound for bucket sizes
         */
        public int getLowerBound() {
            return lowerBound;
        }

        /**
         * 
         * @return Upper bound for bucket sizes
         */
        public int getUpperBound() {
            return upperBound;
        }

        /**
         * 
         * @return Maximum number of empty buckets
         */
        public int getMaxEmpty() {
            return maxEmpty;
        }

        /**
         * 
         * @return Maximum percentage of buckets out of range
         */
        public double getPercentOutOfRange() {
            return percentOutOfRange;
        }
    }
}