org.apache.hadoop.hbase.util.RegionSplitCalculator.java Source code

Introduction

Here is the source code for org.apache.hadoop.hbase.util.RegionSplitCalculator.java
Source

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;

/**
 * A generic region split calculator. It works on ranges that expose a start
 * key and an end key (see {@link KeyRange}) and on a caller-supplied comparator
 * for those ranges.
 *
 * It is used in two phases. First, ranges are registered with
 * {@link #add(KeyRange)}, which rejects backwards ranges. Then
 * {@link #calcCoverage()} is called to build a multimap whose keys are split
 * points and whose values are the ranges covering each split point.
 *
 * To traverse, one normally gets the split set from {@link #getSplits()} and
 * looks each split up in the result of {@link #calcCoverage()}. A normal region
 * yields exactly one entry, a hole yields zero, and an overlap yields several.
 *
 * The interface is a bit cumbersome currently but is exposed this way so that
 * clients can choose how to iterate through the region splits.
 *
 * @param <R> the concrete range type handled by this calculator
 */
@InterfaceAudience.Private
public class RegionSplitCalculator<R extends KeyRange> {
    static final Log LOG = LogFactory.getLog(RegionSplitCalculator.class);

    /**
     * SPECIAL CASE: sentinel that stands in for an empty end key. It is
     * {@code null} so that {@link #BYTES_COMPARATOR} can sort it after every
     * real key.
     */
    private static final byte[] ENDKEY = null;

    /**
     * Byte array comparator that additionally accepts {@code null}: a
     * {@code null} key compares equal to another {@code null} and greater than
     * any non-null key. Non-null keys are compared by the parent
     * {@link ByteArrayComparator}.
     */
    public static final Comparator<byte[]> BYTES_COMPARATOR = new ByteArrayComparator() {
        @Override
        public int compare(byte[] l, byte[] r) {
            if (l == null) {
                return (r == null) ? 0 : 1;
            }
            if (r == null) {
                return -1;
            }
            return super.compare(l, r);
        }
    };

    /** Orders the ranges stored under a single split key in the coverage map. */
    private final Comparator<R> rangeCmp;

    /**
     * Sorted set of all the possible split points, i.e. every start key and
     * every (special-cased) end key of the accepted ranges.
     *
     * Invariant: empty until a range is accepted; afterwards it holds at most
     * 2n distinct keys, where n == number of accepted ranges (each accepted
     * range contributes its start and its end key). A chain of n ranges with no
     * holes and no overlaps yields n+1 keys.
     */
    private final TreeSet<byte[]> splits = new TreeSet<byte[]>(BYTES_COMPARATOR);

    /**
     * Map from start key to the ranges registered under that start key.
     *
     * Invariant: this always has n values in total, where n == number of
     * accepted ranges.
     *
     * NOTE: ArrayListMultimap hashes its keys and byte[] uses identity
     * hashCode/equals, so two ranges only share a key here when they return the
     * very same array instance. Content-based grouping is performed by
     * {@link #calcCoverage()}, not by this map.
     */
    private final Multimap<byte[], R> starts = ArrayListMultimap.create();

    /**
     * Creates an empty calculator.
     *
     * @param cmp comparator used to order ranges that share a split key in the
     *            result of {@link #calcCoverage()}
     */
    public RegionSplitCalculator(Comparator<R> cmp) {
        rangeCmp = cmp;
    }

    /**
     * SPECIAL CASE wrapper for empty end key.
     *
     * The end key returned by the range is dereferenced, so it must not be
     * {@code null}.
     *
     * @param range the range whose end key is wanted
     * @return ENDKEY if end key is empty, else normal endkey.
     */
    private static <R extends KeyRange> byte[] specialEndKey(R range) {
        byte[] end = range.getEndKey();
        return (end.length == 0) ? ENDKEY : end;
    }

    /**
     * Adds an edge to the split calculator.
     *
     * A range whose start key sorts after its (non-empty) end key is rejected
     * and leaves the calculator untouched. A range whose start equals its end
     * is accepted.
     *
     * @param range the range to register
     * @return true if is included, false if backwards/invalid
     */
    public boolean add(R range) {
        byte[] start = range.getStartKey();
        byte[] end = specialEndKey(range);

        // An ENDKEY end sorts after everything, so only real end keys can be backwards.
        if (end != ENDKEY && Bytes.compareTo(start, end) > 0) {
            // don't allow backwards edges
            if (LOG.isDebugEnabled()) {
                // Keys are arbitrary bytes; render them escaped rather than decoding as UTF-8.
                LOG.debug("attempted to add backwards edge: " + Bytes.toStringBinary(start) + " "
                        + Bytes.toStringBinary(end));
            }
            return false;
        }

        splits.add(start);
        splits.add(end);
        starts.put(start, range);
        return true;
    }

    /**
     * Generates a coverage multimap from split key to the ranges that cover
     * that split key.
     *
     * Every accepted range is listed under its own start key and under every
     * known split point in {@code [start, end)}.
     *
     * @return coverage multimap, sorted by {@link #BYTES_COMPARATOR} on keys and
     *         by the constructor-supplied comparator on values
     */
    public Multimap<byte[], R> calcCoverage() {
        // This needs to be sorted to force the use of the comparator on the values,
        // otherwise byte array comparison isn't used
        Multimap<byte[], R> regions = TreeMultimap.create(BYTES_COMPARATOR, rangeCmp);

        // march through all registered ranges from their start points
        for (Entry<byte[], R> registered : starts.entries()) {
            R r = registered.getValue();

            // Explicit put: when start == end the subSet below is empty, yet the
            // range must still show up under its start key.
            regions.put(registered.getKey(), r);

            for (byte[] coveredSplit : splits.subSet(r.getStartKey(), specialEndKey(r))) {
                regions.put(coveredSplit, r);
            }
        }
        return regions;
    }

    /**
     * @return the live, sorted set of split points held by this calculator (not
     *         a copy)
     */
    public TreeSet<byte[]> getSplits() {
        return splits;
    }

    /**
     * @return the live multimap of accepted ranges keyed by the start key array
     *         each one was registered with (not a copy)
     */
    public Multimap<byte[], R> getStarts() {
        return starts;
    }

    /**
     * Find specified number of top ranges in a big overlap group.
     * It could return less if there are not that many top ranges.
     * Once these top ranges are excluded, the big overlap group will
     * be broken into ranges with no overlapping, or smaller overlapped
     * groups, and most likely some holes.
     *
     * @param bigOverlap a list of ranges that overlap with each other
     * @param count the max number of ranges to find; a value of zero or less
     *              yields an empty list
     * @return a list of ranges that overlap with most others
     */
    public static <R extends KeyRange> List<R> findBigRanges(Collection<R> bigOverlap, int count) {
        List<R> bigRanges = new ArrayList<R>();
        if (count <= 0) {
            // Nothing may be selected, so skip the quadratic overlap scan.
            return bigRanges;
        }

        // The key is the count of overlaps,
        // The value is a list of ranges that have that many overlaps
        TreeMap<Integer, List<R>> overlapRangeMap = new TreeMap<Integer, List<R>>();
        for (R r : bigOverlap) {
            int overlappedRegions = countOverlaps(r, bigOverlap);

            // A range normally overlaps with itself, so the count has to be
            // more than 1 for an actual overlap with another range.
            if (overlappedRegions > 1) {
                Integer key = Integer.valueOf(overlappedRegions);
                List<R> ranges = overlapRangeMap.get(key);
                if (ranges == null) {
                    ranges = new ArrayList<R>();
                    overlapRangeMap.put(key, ranges);
                }
                ranges.add(r);
            }
        }

        // Take whole buckets, most-overlapping first, while they fit.
        int toBeAdded = count;
        for (List<R> chunk : overlapRangeMap.descendingMap().values()) {
            int chunkSize = chunk.size();
            if (chunkSize > toBeAdded) {
                // Try to use the middle chunk in case the overlapping is
                // chained, for example: [a, c), [b, e), [d, g), [f, h)...
                // In such a case, sidelining the middle chunk will break
                // the group efficiently.
                int start = (chunkSize - toBeAdded) / 2;
                bigRanges.addAll(chunk.subList(start, start + toBeAdded));
                break;
            }
            bigRanges.addAll(chunk);
            toBeAdded -= chunkSize;
            if (toBeAdded <= 0) {
                break;
            }
        }
        return bigRanges;
    }

    /**
     * Counts the members of {@code group} whose key interval intersects that of
     * {@code range}. {@code range} itself is counted when it is a member of the
     * group and its start sorts strictly before its end.
     */
    private static <R extends KeyRange> int countOverlaps(R range, Collection<R> group) {
        byte[] startKey = range.getStartKey();
        byte[] endKey = specialEndKey(range);

        int overlaps = 0;
        for (R other : group) {
            byte[] otherStart = other.getStartKey();
            byte[] otherEnd = specialEndKey(other);

            // Half-open intervals intersect iff each one starts before the other ends.
            if (BYTES_COMPARATOR.compare(startKey, otherEnd) < 0
                    && BYTES_COMPARATOR.compare(endKey, otherStart) > 0) {
                overlaps++;
            }
        }
        return overlaps;
    }
}