org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.persistence;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.common.MemoryEstimate;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.debug.Utils;
import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.WriteBuffers;

import com.google.common.annotations.VisibleForTesting;

/**
 * HashMap that maps byte arrays to byte arrays with limited functionality necessary for
 * MapJoin hash tables, with small memory overhead. Supports multiple values for single key.
 * Values can be added for key (but cannot be removed); values can be gotten for the key.
 * Some things (like entrySet) are easy to add; some e.g. deletion are pretty hard to do well.
 * Additionally, for each key it contains a magic "state byte" which is not part of the key and
 * can be updated on every put for that key. That is really silly, we use it to store aliasFilter.
 * Magic byte could be removed for generality.
 * Initially inspired by HPPC LongLongOpenHashMap; however, the code is almost completely reworked
 * and there's very little in common left save for quadratic probing (and that with some changes).
 */
public final class BytesBytesMultiHashMap implements MemoryEstimate {
    public static final Logger LOG = LoggerFactory.getLogger(BytesBytesMultiHashMap.class);

    /*
     * This hashtable stores "references" in an array of longs;  index in the array is hash of
     * the key; these references point into infinite byte buffer (see below). This buffer contains
     * records written one after another. There are several simple record formats.
     * - single record for the key
     *    [key bytes][value bytes][vlong value length][vlong key length][padding]
     *    We leave padding to ensure we have at least 5 bytes after key and value.
     * - first of multiple records for the key (updated from "single value for the key")
     *    [key bytes][value bytes][5-byte long offset to a list start record]
     *  - list start record
     *    [vlong value length][vlong key length][5-byte long offset to the 2nd list record]
     *    Lengths are preserved from the first record. Offset is discussed above.
     *  - subsequent values in the list
     *    [value bytes][value length][vlong relative offset to next record].
     *
     * In summary, because we have separate list record, we have very little list overhead for
     * the typical case of primary key join, where there's no list for any key; large lists also
     * don't have a lot of relative overhead (also see the todo below).
     *
     * So the record looks as follows for one value per key (hash is fixed, 4 bytes, and is
     * stored to expand w/o rehashing, and to more efficiently deal with collision
     *
     *             i = key hash
     *           ._______.
     * REFS: ... |offset | ....
     *           `--\----'
     *               `-------------.
     *                            \|/
     *          .______._____._____'__.__._.
     * WBS: ... | hash | key | val |vl|kl| | ....
     *          `------'-----'-----'--'--'-'
     *
     * After that refs don't change so they are not pictured.
     * When we add the 2nd value, we rewrite lengths with relative offset to the list start record.
     * That way, the first record points to the "list record".
     *                         ref .---------.
     *                         \|/ |        \|/
     *       .______._____._____'__|___.     '__.__.______.
     * WBS:  | hash | key | val |offset| ... |vl|kl|      |
     *       `------'-----'-----'------'     '--'--'------'
     * After that refs don't change so they are not pictured. List record points to the 2nd value.
     *                         ref .---------.        .---------------.
     *                         \|/ |        \|/       |              \|/
     *       .______._____._____'__|___.     '__.__.__|___.     ._____'__._.
     * WBS:  | hash | key | val |offset| ... |vl|kl|offset| ... | val |vl|0|
     *       `------'-----'-----'------'     '--'--'------'     '-----'--'-'
     * If we add another value, we overwrite the list record.
     * We don't need to overwrite any vlongs and suffer because of that.
     *                         ref .---------.         .-------------------------------.
     *                         \|/ |        \|/        |                              \|/
     *       .______._____._____'__|___.     '__.__.___|__.     ._____.__._.     ._____'__.______.
     * WBS:  | hash | key | val |offset| ... |vl|kl|offset| ... | val |vl|0| ... | val |vl|offset|
     *       `------'-----'-----'------'     '--'--'------'     '-----:--'-'     '-----'--'--|---'
     *                                                               /|\                     |
     *                                                                `----------------------'
     * And another value (for example)
     * ... ---.         .-----------------------------------------------------.
     *       \|/        |                                                    \|/
     *        '__.__.___|__.     ._____.__._.     ._____.__.______.     ._____'__.______.
     * ...    |vl|kl|offset| ... | val |vl|0| ... | val |vl|offset| ... | val |vl|offset|
     *        '--'--'------'     '-----:--'-'     '-----'--:--|---'     '-----'--'--|---'
     *                                /|\                 /|\ |                     |
     *                                 `-------------------+--'                     |
     *                                                     `------------------------'
     */

    /**
     * Write buffers for keys and values. For the description of the structure above, think
     * of this as one infinite byte buffer.
     */
    private WriteBuffers writeBuffers;

    private final float loadFactor;

    private int resizeThreshold;
    private int keysAssigned;
    private int numValues;

    /**
     * Largest number of probe steps ever taken to find location for a key. When getting, we can
     * conclude that they key is not in hashtable when we make this many steps and don't find it.
     */
    private int largestNumberOfSteps = 0;

    /**
     * References to keys of the hashtable. The index is hash of the key; collisions are
     * resolved using open addressing with quadratic probing. Reference format
     * [40: offset into writeBuffers][8: state byte][1: has list flag]
     * [15: part of hash used to optimize probing]
     * Offset is tail offset of the first record for the key (the one containing the key).
     * It is not necessary to store 15 bits in particular to optimize probing; in fact when
     * we always store the hash it is not necessary. But we have nothing else to do with them.
     * TODO: actually we could also use few bits to store largestNumberOfSteps for each,
     *      so we'd stop earlier on read collision. Need to profile on real queries.
     */
    private long[] refs;
    private int startingHashBitCount, hashBitCount;

    private int metricPutConflict = 0, metricGetConflict = 0, metricExpands = 0, metricExpandsMs = 0;

    /** We have 39 bits to store list pointer from the first record; this is size limit */
    final static long MAX_WB_SIZE = ((long) 1) << 38;
    /** 8 Gb of refs is the max capacity if memory limit is not specified. If someone has 100s of
     * Gbs of memory (this might happen pretty soon) we'd need to string together arrays anyway. */
    private final static int DEFAULT_MAX_CAPACITY = 1024 * 1024 * 1024;
    /** Make sure maxCapacity has a lower limit */
    private final static int DEFAULT_MIN_MAX_CAPACITY = 16 * 1024 * 1024;

    public BytesBytesMultiHashMap(int initialCapacity, float loadFactor, int wbSize, long maxProbeSize) {
        if (loadFactor < 0 || loadFactor > 1) {
            throw new AssertionError("Load factor must be between (0, 1].");
        }
        assert initialCapacity > 0;
        initialCapacity = (Long.bitCount(initialCapacity) == 1) ? initialCapacity
                : nextHighestPowerOfTwo(initialCapacity);
        // 8 bytes per long in the refs, assume data will be empty. This is just a sanity check.
        int maxCapacity = (maxProbeSize <= 0) ? DEFAULT_MAX_CAPACITY
                : (int) Math.min((long) DEFAULT_MAX_CAPACITY, maxProbeSize / 8);
        if (maxCapacity < DEFAULT_MIN_MAX_CAPACITY) {
            maxCapacity = DEFAULT_MIN_MAX_CAPACITY;
        }
        if (maxCapacity < initialCapacity || initialCapacity <= 0) {
            // Either initialCapacity is too large, or nextHighestPowerOfTwo overflows
            initialCapacity = (Long.bitCount(maxCapacity) == 1) ? maxCapacity : nextLowestPowerOfTwo(maxCapacity);
        }

        validateCapacity(initialCapacity);
        startingHashBitCount = 63 - Long.numberOfLeadingZeros(initialCapacity);
        this.loadFactor = loadFactor;
        refs = new long[initialCapacity];
        writeBuffers = new WriteBuffers(wbSize, MAX_WB_SIZE);
        resizeThreshold = (int) (initialCapacity * this.loadFactor);
    }

    @VisibleForTesting
    BytesBytesMultiHashMap(int initialCapacity, float loadFactor, int wbSize) {
        this(initialCapacity, loadFactor, wbSize, -1);
    }

    /**
     * The result of looking up a key in the multi-hash map.
     *
     * This object can read through the 0, 1, or more values found for the key.
     */
    public static class Result {

        // Whether there are more than 0 rows.
        private boolean hasRows;

        // We need a pointer to the hash map since this class must be static to support having
        // multiple hash tables with Hybrid Grace partitioning.
        private BytesBytesMultiHashMap hashMap;

        // And, a mutable read position for thread safety when sharing a hash map.
        private WriteBuffers.Position readPos;

        // These values come from setValueResult when it finds a key.  These values allow this
        // class to read (and re-read) the values.
        private long firstOffset;
        private boolean hasList;
        private long offsetAfterListRecordKeyLen;

        // When we have multiple values, we save the next value record's offset here.
        private long nextTailOffset;

        // 0-based index of which row we are on.
        private long readIndex;

        // A reference to the current row.
        private WriteBuffers.ByteSegmentRef byteSegmentRef;

        public Result() {
            hasRows = false;
            byteSegmentRef = new WriteBuffers.ByteSegmentRef();
            readPos = new WriteBuffers.Position();
        }

        /**
         * Return the thread-safe read position.
         */
        public WriteBuffers.Position getReadPos() {
            return readPos;
        }

        /**
         * @return Whether there are 1 or more values.
         */
        public boolean hasRows() {
            // NOTE: Originally we named this isEmpty, but that name conflicted with another interface.
            return hasRows;
        }

        /**
         * @return Whether there is just 1 value row.
         */
        public boolean isSingleRow() {
            return !hasList;
        }

        /**
         * Set internal values for reading the values after finding a key.
         *
         * @param hashMap
         *          The hash map we found the key in.
         * @param firstOffset
         *          The absolute offset of the first record in the write buffers.
         * @param hasList
         *          Whether there are multiple values (true) or just a single value (false).
         * @param offsetAfterListRecordKeyLen
         *          The offset of just after the key length in the list record.  Or, 0 when single row.
         */
        public void set(BytesBytesMultiHashMap hashMap, long firstOffset, boolean hasList,
                long offsetAfterListRecordKeyLen) {

            this.hashMap = hashMap;

            this.firstOffset = firstOffset;
            this.hasList = hasList;
            this.offsetAfterListRecordKeyLen = offsetAfterListRecordKeyLen;

            // Position at first row.
            readIndex = 0;
            nextTailOffset = -1;

            hasRows = true;
        }

        public WriteBuffers.ByteSegmentRef first() {
            if (!hasRows) {
                return null;
            }

            // Position at first row.
            readIndex = 0;
            nextTailOffset = -1;

            return internalRead();
        }

        public WriteBuffers.ByteSegmentRef next() {
            if (!hasRows) {
                return null;
            }

            return internalRead();
        }

        /**
         * Read the current value.
         *
         * @return
         *           The ByteSegmentRef to the current value read.
         */
        private WriteBuffers.ByteSegmentRef internalRead() {

            if (!hasList) {

                /*
                 * Single value.
                 */

                if (readIndex > 0) {
                    return null;
                }

                // For a non-list (i.e. single value), the offset is for the variable length long (VLong)
                // holding the value length (followed by the key length).
                hashMap.writeBuffers.setReadPoint(firstOffset, readPos);
                int valueLength = (int) hashMap.writeBuffers.readVLong(readPos);

                // The value is before the offset.  Make byte segment reference absolute.
                byteSegmentRef.reset(firstOffset - valueLength, valueLength);
                hashMap.writeBuffers.populateValue(byteSegmentRef);

                readIndex++;
                return byteSegmentRef;
            }

            /*
             * Multiple values.
             */

            if (readIndex == 0) {
                // For a list, the value and key lengths of 1st record were overwritten with the
                // relative offset to a new list record.
                long relativeOffset = hashMap.writeBuffers.readNByteLong(firstOffset, 5, readPos);

                // At the beginning of the list record will be the value length.
                hashMap.writeBuffers.setReadPoint(firstOffset + relativeOffset, readPos);
                int valueLength = (int) hashMap.writeBuffers.readVLong(readPos);

                // The value is before the list record offset.  Make byte segment reference absolute.
                byteSegmentRef.reset(firstOffset - valueLength, valueLength);
                hashMap.writeBuffers.populateValue(byteSegmentRef);

                readIndex++;
                return byteSegmentRef;
            }

            if (readIndex == 1) {
                // We remembered the offset of just after the key length in the list record.
                // Read the absolute offset to the 2nd value.
                nextTailOffset = hashMap.writeBuffers.readNByteLong(offsetAfterListRecordKeyLen, 5, readPos);
                if (nextTailOffset <= 0) {
                    throw new Error("Expecting a second value");
                }
            } else if (nextTailOffset <= 0) {
                return null;
            }

            hashMap.writeBuffers.setReadPoint(nextTailOffset, readPos);

            // Get the value length.
            int valueLength = (int) hashMap.writeBuffers.readVLong(readPos);

            // Now read the relative offset to next record. Next record is always before the
            // previous record in the write buffers (see writeBuffers javadoc).
            long delta = hashMap.writeBuffers.readVLong(readPos);
            long newTailOffset = delta == 0 ? 0 : (nextTailOffset - delta);

            // The value is before the value record offset.  Make byte segment reference absolute.
            byteSegmentRef.reset(nextTailOffset - valueLength, valueLength);
            hashMap.writeBuffers.populateValue(byteSegmentRef);

            nextTailOffset = newTailOffset;
            readIndex++;
            return byteSegmentRef;
        }

        /**
         * Lets go of any references to a hash map.
         */
        public void forget() {
            hashMap = null;
            byteSegmentRef.reset(0, 0);
            hasRows = false;
            readIndex = 0;
            nextTailOffset = -1;
        }
    }

    /** The source of keys and values to put into hashtable; avoids byte copying. */
    public static interface KvSource {
        /** Write key into output. */
        public void writeKey(RandomAccessOutput dest) throws SerDeException;

        /** Write value into output. */
        public void writeValue(RandomAccessOutput dest) throws SerDeException;

        /**
         * Provide updated value for state byte for a key.
         * @param previousValue Previous value; null if this is the first call per key.
         * @return The updated value.
         */
        public byte updateStateByte(Byte previousValue);
    }

    /**
     * Adds new value to new or existing key in hashmap. Not thread-safe.
     * @param kv Keyvalue writer. Each method will be called at most once.
     */
    private static final byte[] FOUR_ZEROES = new byte[] { 0, 0, 0, 0 };

    public void put(KvSource kv, int keyHashCode) throws SerDeException {
        if (resizeThreshold <= keysAssigned) {
            expandAndRehash();
        }

        // Reserve 4 bytes for the hash (don't just reserve, there may be junk there)
        writeBuffers.write(FOUR_ZEROES);

        // Write key to buffer to compute hashcode and compare; if it's a new key, it will
        // become part of the record; otherwise, we will just write over it later.
        long keyOffset = writeBuffers.getWritePoint();

        kv.writeKey(writeBuffers);
        int keyLength = (int) (writeBuffers.getWritePoint() - keyOffset);
        int hashCode = (keyHashCode == -1) ? writeBuffers.unsafeHashCode(keyOffset, keyLength) : keyHashCode;

        int slot = findKeySlotToWrite(keyOffset, keyLength, hashCode);
        // LOG.info("Write hash code is " + Integer.toBinaryString(hashCode) + " - " + slot);

        long ref = refs[slot];
        if (ref == 0) {
            // This is a new key, keep writing the first record.
            long tailOffset = writeFirstValueRecord(kv, keyOffset, keyLength, hashCode);
            byte stateByte = kv.updateStateByte(null);
            refs[slot] = Ref.makeFirstRef(tailOffset, stateByte, hashCode, startingHashBitCount);
            ++keysAssigned;
        } else {
            // This is not a new key; we'll overwrite the key and hash bytes - not needed anymore.
            writeBuffers.setWritePoint(keyOffset - 4);
            long lrPtrOffset = createOrGetListRecord(ref);
            long tailOffset = writeValueAndLength(kv);
            addRecordToList(lrPtrOffset, tailOffset);
            byte oldStateByte = Ref.getStateByte(ref);
            byte stateByte = kv.updateStateByte(oldStateByte);
            if (oldStateByte != stateByte) {
                ref = Ref.setStateByte(ref, stateByte);
            }
            refs[slot] = Ref.setListFlag(ref);
        }
        ++numValues;
    }

    /**
     * Finds a key.  Values can be read with the supplied result object.
     *
     * Important Note: The caller is expected to pre-allocate the hashMapResult and not
     * share it among other threads.
     *
     * @param key Key buffer.
     * @param offset the offset to the key in the buffer
     * @param hashMapResult The object to fill in that can read the values.
     * @return The state byte.
     */
    public byte getValueResult(byte[] key, int offset, int length, Result hashMapResult) {

        hashMapResult.forget();

        WriteBuffers.Position readPos = hashMapResult.getReadPos();

        // First, find first record for the key.
        long ref = findKeyRefToRead(key, offset, length, readPos);
        if (ref == 0) {
            return 0;
        }

        boolean hasList = Ref.hasList(ref);

        // This relies on findKeyRefToRead doing key equality check and leaving read ptr where needed.
        long offsetAfterListRecordKeyLen = hasList ? writeBuffers.getReadPoint(readPos) : 0;

        hashMapResult.set(this, Ref.getOffset(ref), hasList, offsetAfterListRecordKeyLen);

        return Ref.getStateByte(ref);
    }

    /**
     * Take the segment reference from {@link #getValueRefs(byte[], int, List)}
     * result and makes it self-contained - adds byte array where the value is stored, and
     * updates the offset from "global" write buffers offset to offset within that array.
     */
    public void populateValue(WriteBuffers.ByteSegmentRef valueRef) {
        writeBuffers.populateValue(valueRef);
    }

    /**
     * Number of keys in the hashmap
     * @return number of keys
     */
    public int size() {
        return keysAssigned;
    }

    /**
     * Number of values in the hashmap
     * This is equal to or bigger than number of keys, since some values may share the same key
     * @return number of values
     */
    public int getNumValues() {
        return numValues;
    }

    /**
     * Number of bytes used by the hashmap
     * There are two main components that take most memory: writeBuffers and refs
     * Others include instance fields: 100
     * @return number of bytes
     */
    public long memorySize() {
        return getEstimatedMemorySize();
    }

    @Override
    public long getEstimatedMemorySize() {
        JavaDataModel jdm = JavaDataModel.get();
        long size = 0;
        size += writeBuffers.getEstimatedMemorySize();
        size += jdm.lengthForLongArrayOfSize(refs.length);
        // 11 primitive1 fields, 2 refs above with alignment
        size += JavaDataModel.alignUp(15 * jdm.primitive1(), jdm.memoryAlign());
        return size;
    }

    public void seal() {
        writeBuffers.seal();
    }

    public void clear() {
        // This will make the object completely unusable. Semantics of clear are not defined...
        this.writeBuffers.clear();
        this.refs = new long[1];
        this.keysAssigned = 0;
        this.numValues = 0;
    }

    public void expandAndRehashToTarget(int estimateNewRowCount) {
        int oldRefsCount = refs.length;
        int newRefsCount = oldRefsCount + estimateNewRowCount;
        if (resizeThreshold <= newRefsCount) {
            newRefsCount = (Long.bitCount(newRefsCount) == 1) ? estimateNewRowCount
                    : nextHighestPowerOfTwo(newRefsCount);
            expandAndRehashImpl(newRefsCount);
            LOG.info("Expand and rehash to " + newRefsCount + " from " + oldRefsCount);
        }
    }

    private static void validateCapacity(long capacity) {
        if (Long.bitCount(capacity) != 1) {
            throw new AssertionError("Capacity must be a power of two");
        }
        if (capacity <= 0) {
            throw new AssertionError("Invalid capacity " + capacity);
        }
        if (capacity > Integer.MAX_VALUE) {
            throw new RuntimeException("Attempting to expand the hash table to " + capacity
                    + " that overflows maximum array size. For this query, you may want to disable "
                    + ConfVars.HIVEDYNAMICPARTITIONHASHJOIN.varname + " or reduce "
                    + ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD.varname);
        }
    }

    /**
     * Finds the slot to use for writing, based on the key bytes already written to buffers.
     * @param keyOffset Offset to the key.
     * @param keyLength Length of the key.
     * @param hashCode Hash code of the key (passed in because java doesn't have ref/pointers).
     * @return The slot to use for writing; can be new, or matching existing key.
     */
    private int findKeySlotToWrite(long keyOffset, int keyLength, int hashCode) {
        final int bucketMask = (refs.length - 1);
        int slot = hashCode & bucketMask;
        long probeSlot = slot;
        int i = 0;
        while (true) {
            long ref = refs[slot];
            if (ref == 0 || isSameKey(keyOffset, keyLength, ref, hashCode)) {
                break;
            }
            ++metricPutConflict;
            // Some other key (collision) - keep probing.
            probeSlot += (++i);
            slot = (int) (probeSlot & bucketMask);
        }
        if (largestNumberOfSteps < i) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Probed " + i + " slots (the longest so far) to find space");
            }
            largestNumberOfSteps = i;
            // debugDumpKeyProbe(keyOffset, keyLength, hashCode, slot);
        }
        return slot;
    }

    /**
     * Finds the slot to use for reading.
     * @param key Read key array.
     * @param length Read key length.
     * @return The ref to use for reading.
     */
    private long findKeyRefToRead(byte[] key, int offset, int length, WriteBuffers.Position readPos) {
        final int bucketMask = (refs.length - 1);
        int hashCode = writeBuffers.hashCode(key, offset, length);
        int slot = hashCode & bucketMask;
        // LOG.info("Read hash code for " + Utils.toStringBinary(key, 0, length)
        //   + " is " + Integer.toBinaryString(hashCode) + " - " + slot);
        long probeSlot = slot;
        int i = 0;
        while (true) {
            long ref = refs[slot];
            // When we were inserting the key, we would have inserted here; so, there's no key.
            if (ref == 0) {
                return 0;
            }
            if (isSameKey(key, offset, length, ref, hashCode, readPos)) {
                return ref;
            }
            ++metricGetConflict;
            probeSlot += (++i);
            if (i > largestNumberOfSteps) {
                // We know we never went that far when we were inserting.
                return 0;
            }
            slot = (int) (probeSlot & bucketMask);
        }
    }

    /**
     * Puts ref into new refs array.
     * @param newRefs New refs array.
     * @param newRef New ref value.
     * @param hashCode Hash code to use.
     * @return The number of probe steps taken to find key position.
     */
    private int relocateKeyRef(long[] newRefs, long newRef, int hashCode) {
        final int bucketMask = newRefs.length - 1;
        int newSlot = hashCode & bucketMask;
        long probeSlot = newSlot;
        int i = 0;
        while (true) {
            long current = newRefs[newSlot];
            if (current == 0) {
                newRefs[newSlot] = newRef;
                break;
            }
            // New array cannot contain the records w/the same key, so just advance, don't check.
            probeSlot += (++i);
            newSlot = (int) (probeSlot & bucketMask);
        }
        return i;
    }

    /**
     * Verifies that the key matches a requisite key.
     * @param cmpOffset The offset to the key to compare with.
     * @param cmpLength The length of the key to compare with.
     * @param ref The ref that can be used to retrieve the candidate key.
     * @param hashCode
     * @return -1 if the key referenced by ref is different than the one referenced by cmp...
     *         0 if the keys match, and there's only one value for this key (no list).
     *         Offset if the keys match, and there are multiple values for this key (a list).
     */
    private boolean isSameKey(long cmpOffset, int cmpLength, long ref, int hashCode) {
        if (!compareHashBits(ref, hashCode)) {
            return false; // Hash bits in ref don't match.
        }
        writeBuffers.setUnsafeReadPoint(getFirstRecordLengthsOffset(ref, null));
        int valueLength = (int) writeBuffers.unsafeReadVLong(), keyLength = (int) writeBuffers.unsafeReadVLong();
        if (keyLength != cmpLength) {
            return false;
        }
        long keyOffset = Ref.getOffset(ref) - (valueLength + keyLength);
        // There's full hash code stored in front of the key. We could check that first. If keyLength
        // is <= 4 it obviously doesn't make sense, less bytes to check in a key. Then, if there's a
        // match, we check it in vain. But what is the proportion of matches? For writes it could be 0
        // if all keys are unique, for reads we hope it's really high. Then if there's a mismatch what
        // probability is there that key mismatches in <4 bytes (so just checking the key is faster)?
        // We assume the latter is pretty high, so we don't check for now.
        return writeBuffers.isEqual(cmpOffset, cmpLength, keyOffset, keyLength);
    }

    /**
     * Same as {@link #isSameKey(long, int, long, int)} but for externally stored key.
     */
    private boolean isSameKey(byte[] key, int offset, int length, long ref, int hashCode,
            WriteBuffers.Position readPos) {
        if (!compareHashBits(ref, hashCode)) {
            return false; // Hash bits don't match.
        }
        writeBuffers.setReadPoint(getFirstRecordLengthsOffset(ref, readPos), readPos);
        int valueLength = (int) writeBuffers.readVLong(readPos), keyLength = (int) writeBuffers.readVLong(readPos);
        long keyOffset = Ref.getOffset(ref) - (valueLength + keyLength);
        // See the comment in the other isSameKey
        if (offset == 0) {
            return writeBuffers.isEqual(key, length, keyOffset, keyLength);
        } else {
            return writeBuffers.isEqual(key, offset, length, keyOffset, keyLength);
        }
    }

    private boolean compareHashBits(long ref, int hashCode) {
        long fakeRef = Ref.makeFirstRef(0, (byte) 0, hashCode, startingHashBitCount);
        return (Ref.getHashBits(fakeRef) == Ref.getHashBits(ref));
    }

    /**
     * @param ref Reference.
     * @return The offset to value and key length vlongs of the first record referenced by ref.
     */
    private long getFirstRecordLengthsOffset(long ref, WriteBuffers.Position readPos) {
        long tailOffset = Ref.getOffset(ref);
        if (Ref.hasList(ref)) {
            long relativeOffset = (readPos == null) ? writeBuffers.unsafeReadNByteLong(tailOffset, 5)
                    : writeBuffers.readNByteLong(tailOffset, 5, readPos);
            tailOffset += relativeOffset;
        }
        return tailOffset;
    }

    private void expandAndRehash() {
        expandAndRehashImpl(((long) refs.length) << 1);
    }

    private void expandAndRehashImpl(long capacity) {
        long expandTime = System.currentTimeMillis();
        final long[] oldRefs = refs;
        validateCapacity(capacity);
        long[] newRefs = new long[(int) capacity];

        // We store some hash bits in ref; for every expansion, we need to add one bit to hash.
        // If we have enough bits, we'll do that; if we don't, we'll rehash.
        // LOG.info("Expanding the hashtable to " + capacity + " capacity");
        int newHashBitCount = hashBitCount + 1;

        // Relocate all assigned slots from the old hash table.
        int maxSteps = 0;
        for (int oldSlot = 0; oldSlot < oldRefs.length; ++oldSlot) {
            long oldRef = oldRefs[oldSlot];
            if (oldRef == 0) {
                continue;
            }
            // TODO: we could actually store a bit flag in ref indicating whether this is a hash
            //       match or a probe, and in the former case use hash bits (for a first few resizes).
            // int hashCodeOrPart = oldSlot | Ref.getNthHashBit(oldRef, startingHashBitCount, newHashBitCount);
            writeBuffers.setUnsafeReadPoint(getFirstRecordLengthsOffset(oldRef, null));
            // Read the value and key length for the first record.
            int hashCode = (int) writeBuffers.unsafeReadNByteLong(
                    Ref.getOffset(oldRef) - writeBuffers.unsafeReadVLong() - writeBuffers.unsafeReadVLong() - 4, 4);
            int probeSteps = relocateKeyRef(newRefs, oldRef, hashCode);
            maxSteps = Math.max(probeSteps, maxSteps);
        }
        this.refs = newRefs;
        this.largestNumberOfSteps = maxSteps;
        this.hashBitCount = newHashBitCount;
        this.resizeThreshold = (int) (capacity * loadFactor);
        metricExpandsMs += (System.currentTimeMillis() - expandTime);
        ++metricExpands;
    }

    /**
     * @param ref The ref.
     * @return The offset to list record pointer; list record is created if it doesn't exist.
     */
    private long createOrGetListRecord(long ref) {
        if (Ref.hasList(ref)) {
            // LOG.info("Found list record at " + writeBuffers.getReadPoint());
            return writeBuffers.getUnsafeReadPoint(); // Assumes we are here after key compare.
        }
        long firstTailOffset = Ref.getOffset(ref);
        // LOG.info("First tail offset to create list record is " + firstTailOffset);

        // Determine the length of storage for value and key lengths of the first record.
        writeBuffers.setUnsafeReadPoint(firstTailOffset);
        writeBuffers.unsafeSkipVLong();
        writeBuffers.unsafeSkipVLong();
        int lengthsLength = (int) (writeBuffers.getUnsafeReadPoint() - firstTailOffset);

        // Create the list record, copy first record value/key lengths there.
        writeBuffers.writeBytes(firstTailOffset, lengthsLength);
        long lrPtrOffset = writeBuffers.getWritePoint();
        // LOG.info("Creating list record: copying " + lengthsLength + ", lrPtrOffset " + lrPtrOffset);

        // Reserve 5 bytes for writeValueRecord to fill. There might be junk there so null them.
        writeBuffers.write(FIVE_ZEROES);
        // Link the list record to the first element.
        writeBuffers.writeFiveByteULong(firstTailOffset, lrPtrOffset - lengthsLength - firstTailOffset);
        return lrPtrOffset;
    }

    /**
     * Adds a newly-written record to existing list.
     * @param lrPtrOffset List record pointer offset.
     * @param tailOffset New record offset.
     */
    private void addRecordToList(long lrPtrOffset, long tailOffset) {
        // Now, insert this record into the list.
        long prevHeadOffset = writeBuffers.unsafeReadNByteLong(lrPtrOffset, 5);
        // LOG.info("Reading offset " + prevHeadOffset + " at " + lrPtrOffset);
        assert prevHeadOffset < tailOffset; // We replace an earlier element, must have lower offset.
        writeBuffers.writeFiveByteULong(lrPtrOffset, tailOffset);
        // LOG.info("Writing offset " + tailOffset + " at " + lrPtrOffset);
        writeBuffers.writeVLong(prevHeadOffset == 0 ? 0 : (tailOffset - prevHeadOffset));
    }

    /**
     * Writes first value and lengths to finish the first record after the key has been written.
     * @param kv Key-value writer.
     * @param keyOffset
     * @param keyLength Key length (already written).
     * @param hashCode
     * @return The offset of the new record.
     */
    private long writeFirstValueRecord(KvSource kv, long keyOffset, int keyLength, int hashCode)
            throws SerDeException {
        long valueOffset = writeBuffers.getWritePoint();
        kv.writeValue(writeBuffers);
        long tailOffset = writeBuffers.getWritePoint();
        int valueLength = (int) (tailOffset - valueOffset);
        // LOG.info("Writing value at " + valueOffset + " length " + valueLength);
        // In an unlikely case of 0-length key and value for the very first entry, we want to tell
        // this apart from an empty value. We'll just advance one byte; this byte will be lost.
        if (tailOffset == 0) {
            writeBuffers.reserve(1);
            ++tailOffset;
        }
        // LOG.info("First tail offset " + writeBuffers.getWritePoint());
        writeBuffers.writeVLong(valueLength);
        writeBuffers.writeVLong(keyLength);
        long lengthsLength = writeBuffers.getWritePoint() - tailOffset;
        if (lengthsLength < 5) { // Reserve space for potential future list
            writeBuffers.reserve(5 - (int) lengthsLength);
        }
        // Finally write the hash code.
        writeBuffers.writeInt(keyOffset - 4, hashCode);
        return tailOffset;
    }

    /**
     * Writes the value and value length for non-first record.
     * @param kv Key-value writer.
     * @return The offset of the new record.
     */
    private long writeValueAndLength(KvSource kv) throws SerDeException {
        long valueOffset = writeBuffers.getWritePoint();
        kv.writeValue(writeBuffers);
        long tailOffset = writeBuffers.getWritePoint();
        writeBuffers.writeVLong(tailOffset - valueOffset);
        // LOG.info("Writing value at " + valueOffset + " length " + (tailOffset - valueOffset));
        return tailOffset;
    }

    /** Writes the debug dump of the table into logs. Not thread-safe. */
    public void debugDumpTable() {
        StringBuilder dump = new StringBuilder(keysAssigned + " keys\n");
        TreeMap<Long, Integer> byteIntervals = new TreeMap<Long, Integer>();
        int examined = 0;
        for (int slot = 0; slot < refs.length; ++slot) {
            long ref = refs[slot];
            if (ref == 0) {
                continue;
            }
            ++examined;
            long recOffset = getFirstRecordLengthsOffset(ref, null);
            long tailOffset = Ref.getOffset(ref);
            writeBuffers.setUnsafeReadPoint(recOffset);
            int valueLength = (int) writeBuffers.unsafeReadVLong(),
                    keyLength = (int) writeBuffers.unsafeReadVLong();
            long ptrOffset = writeBuffers.getUnsafeReadPoint();
            if (Ref.hasList(ref)) {
                byteIntervals.put(recOffset, (int) (ptrOffset + 5 - recOffset));
            }
            long keyOffset = tailOffset - valueLength - keyLength;
            byte[] key = new byte[keyLength];
            WriteBuffers.ByteSegmentRef fakeRef = new WriteBuffers.ByteSegmentRef(keyOffset, keyLength);
            byteIntervals.put(keyOffset - 4, keyLength + 4);
            writeBuffers.populateValue(fakeRef);
            System.arraycopy(fakeRef.getBytes(), (int) fakeRef.getOffset(), key, 0, keyLength);
            dump.append(Utils.toStringBinary(key, 0, key.length)).append(" ref [").append(dumpRef(ref))
                    .append("]: ");
            Result hashMapResult = new Result();
            getValueResult(key, 0, key.length, hashMapResult);
            List<WriteBuffers.ByteSegmentRef> results = new ArrayList<WriteBuffers.ByteSegmentRef>();
            WriteBuffers.ByteSegmentRef byteSegmentRef = hashMapResult.first();
            while (byteSegmentRef != null) {
                results.add(hashMapResult.byteSegmentRef);
                byteSegmentRef = hashMapResult.next();
            }
            dump.append(results.size()).append(" rows\n");
            for (int i = 0; i < results.size(); ++i) {
                WriteBuffers.ByteSegmentRef segment = results.get(i);
                byteIntervals.put(segment.getOffset(), segment.getLength() + ((i == 0) ? 1 : 0)); // state byte in the first record
            }
        }
        if (examined != keysAssigned) {
            dump.append("Found " + examined + " keys!\n");
        }
        // Report suspicious gaps in writeBuffers
        long currentOffset = 0;
        for (Map.Entry<Long, Integer> e : byteIntervals.entrySet()) {
            long start = e.getKey(), len = e.getValue();
            if (start - currentOffset > 4) {
                dump.append("Gap! [" + currentOffset + ", " + start + ")\n");
            }
            currentOffset = start + len;
        }
        LOG.info("Hashtable dump:\n " + dump.toString());
    }

    private final static byte[] FIVE_ZEROES = new byte[] { 0, 0, 0, 0, 0 };

    private static int nextHighestPowerOfTwo(int v) {
        return Integer.highestOneBit(v) << 1;
    }

    private static int nextLowestPowerOfTwo(int v) {
        return Integer.highestOneBit(v);
    }

    @VisibleForTesting
    int getCapacity() {
        return refs.length;
    }

    /** Static helper for manipulating refs */
    private final static class Ref {
        private final static int OFFSET_SHIFT = 24;
        private final static int STATE_BYTE_SHIFT = 16;
        private final static long STATE_BYTE_MASK = ((long) 1 << (OFFSET_SHIFT - STATE_BYTE_SHIFT)) - 1;
        public final static long HASH_BITS_COUNT = STATE_BYTE_SHIFT - 1;
        private final static long HAS_LIST_MASK = (long) 1 << HASH_BITS_COUNT;
        private final static long HASH_BITS_MASK = HAS_LIST_MASK - 1;

        private final static long REMOVE_STATE_BYTE = ~(STATE_BYTE_MASK << STATE_BYTE_SHIFT);

        public static long getOffset(long ref) {
            return ref >>> OFFSET_SHIFT;
        }

        public static byte getStateByte(long ref) {
            return (byte) ((ref >>> STATE_BYTE_SHIFT) & STATE_BYTE_MASK);
        }

        public static boolean hasList(long ref) {
            return (ref & HAS_LIST_MASK) == HAS_LIST_MASK;
        }

        public static long getHashBits(long ref) {
            return ref & HASH_BITS_MASK;
        }

        public static long makeFirstRef(long offset, byte stateByte, int hashCode, int skipHashBits) {
            long hashPart = (hashCode >>> skipHashBits) & HASH_BITS_MASK;
            return offset << OFFSET_SHIFT | hashPart | ((stateByte & 0xffl) << STATE_BYTE_SHIFT);
        }

        public static int getNthHashBit(long ref, int skippedBits, int position) {
            long hashPart = getHashBits(ref) << skippedBits; // Original hashcode, with 0-d low bits.
            return (int) (hashPart & (1 << (position - 1)));
        }

        public static long setStateByte(long ref, byte stateByte) {
            return (ref & REMOVE_STATE_BYTE) | ((stateByte & 0xffl) << STATE_BYTE_SHIFT);
        }

        public static long setListFlag(long ref) {
            return ref | HAS_LIST_MASK;
        }
    }

    private static String dumpRef(long ref) {
        return StringUtils.leftPad(Long.toBinaryString(ref), 64, "0") + " o=" + Ref.getOffset(ref) + " s="
                + Ref.getStateByte(ref) + " l=" + Ref.hasList(ref) + " h="
                + Long.toBinaryString(Ref.getHashBits(ref));
    }

    public void debugDumpMetrics() {
        LOG.info("Map metrics: keys allocated " + this.refs.length + ", keys assigned " + keysAssigned
                + ", write conflict " + metricPutConflict + ", write max dist " + largestNumberOfSteps
                + ", read conflict " + metricGetConflict + ", expanded " + metricExpands + " times in "
                + metricExpandsMs + "ms");
    }

    private void debugDumpKeyProbe(long keyOffset, int keyLength, int hashCode, int finalSlot) {
        final int bucketMask = refs.length - 1;
        WriteBuffers.ByteSegmentRef fakeRef = new WriteBuffers.ByteSegmentRef(keyOffset, keyLength);
        writeBuffers.populateValue(fakeRef);
        int slot = hashCode & bucketMask;
        long probeSlot = slot;
        StringBuilder sb = new StringBuilder("Probe path debug for [");
        sb.append(Utils.toStringBinary(fakeRef.getBytes(), (int) fakeRef.getOffset(), fakeRef.getLength()));
        sb.append("] hashCode ").append(Integer.toBinaryString(hashCode)).append(" is: ");
        int i = 0;
        while (slot != finalSlot) {
            probeSlot += (++i);
            slot = (int) (probeSlot & bucketMask);
            sb.append(slot).append(" - ").append(probeSlot).append(" - ").append(Long.toBinaryString(refs[slot]))
                    .append("\n");
        }
        LOG.info(sb.toString());
    }
}