org.apache.hadoop.hive.common.ndv.hll.HyperLogLog.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.common.ndv.hll.HyperLogLog.java, Apache Hive's HyperLogLog implementation of the NumDistinctValueEstimator interface.
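
HyperLogLog (HLL) estimates the number of distinct values in a stream using a small, fixed amount of memory. A minimal usage sketch, assuming only the public API visible in the listing below (builder defaults: p = 14, SPARSE encoding, bit packing and bias correction enabled):

    HyperLogLog hll = HyperLogLog.builder().build();
    for (int i = 0; i < 1_000_000; i++) {
      hll.addLong(i % 100_000); // 100,000 distinct values
    }
    System.out.println(hll.count()); // within about 1% of 100,000 for p = 14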

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.common.ndv.hll;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hive.common.util.Murmur3;

/**
 * <pre>
 * This is an implementation of the following variants of hyperloglog (HLL)
 * algorithm 
 * Original  - Original HLL algorithm from Flajolet et al.:
 *             http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
 * HLLNoBias - Google's implementation of bias correction based on lookup table
 *             http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
 * HLL++     - Google's implementation of HLL++ algorithm that uses SPARSE registers
 *             http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
 * 
 * The following constructor parameters determine which variant is used:
 * <b>numRegisterIndexBits</b> - number of LSB hashcode bits to be used as register index.
 *                        <i>Default is 14</i>. min = 4 and max = 16
 * <b>numHashBits</b> - number of bits of the hashcode. <i>Fixed at 64 in this implementation</i> (see chosenHashBits below)
 * <b>encoding</b> - Type of encoding to use (SPARSE or DENSE). The algorithm automatically
 *            switches to DENSE beyond a threshold. <i>Default: SPARSE</i>
 * <b>enableBitPacking</b> - To enable bit packing or not. Bit packing improves compression
 *                    at the cost of more CPU cycles. <i>Default: true</i>
 * <b>noBias</b> - Use Google's bias table lookup for short range bias correction.
 *          Enabling this greatly improves estimation accuracy for small
 *          cardinalities. <i>Default: true</i>
 * 
 * </pre>
 */
public class HyperLogLog implements NumDistinctValueEstimator {
    private static final int DEFAULT_HASH_BITS = 64;
    private static final long HASH64_ZERO = Murmur3.hash64(new byte[] { 0 });
    private static final long HASH64_ONE = Murmur3.hash64(new byte[] { 1 });
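    // NOTE: these scratch buffers are shared static state, so the add* methods
    // that use them are not safe to call concurrently from multiple threads.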
    private static final ByteBuffer SHORT_BUFFER = ByteBuffer.allocate(Short.BYTES);
    private static final ByteBuffer INT_BUFFER = ByteBuffer.allocate(Integer.BYTES);
    private static final ByteBuffer LONG_BUFFER = ByteBuffer.allocate(Long.BYTES);

    public enum EncodingType {
        SPARSE, DENSE
    }

    // number of bits to address registers
    private final int p;

    // number of registers - 2^p
    private final int m;

    // refer paper
    private float alphaMM;

    // enable/disable bias correction using table lookup
    private final boolean noBias;

    // enable/disable bitpacking
    private final boolean bitPacking;

    // Not making it configurable for perf reasons (avoid checks)
    private final int chosenHashBits = DEFAULT_HASH_BITS;

    private HLLDenseRegister denseRegister;
    private HLLSparseRegister sparseRegister;

    // counts are cached to avoid repeated complex computation. If register value
    // is updated the count will be computed again.
    private long cachedCount;
    private boolean invalidateCount;

    private EncodingType encoding;

    // threshold to switch from SPARSE to DENSE encoding
    private int encodingSwitchThreshold;

    private HyperLogLog(HyperLogLogBuilder hllBuilder) {
        if (hllBuilder.numRegisterIndexBits < HLLConstants.MIN_P_VALUE
                || hllBuilder.numRegisterIndexBits > HLLConstants.MAX_P_VALUE) {
            throw new IllegalArgumentException(
                    "p value should be between " + HLLConstants.MIN_P_VALUE + " and " + HLLConstants.MAX_P_VALUE);
        }
        this.p = hllBuilder.numRegisterIndexBits;
        this.m = 1 << p;
        this.noBias = hllBuilder.noBias;
        this.bitPacking = hllBuilder.bitPacking;

        // the threshold should stay under 12K bytes for p = 14. We divide by 5
        // because, in sparse mode, the serialized entries in the sparse map are
        // compressed and delta encoded as varints, and the worst-case size of a
        // varint is 5 bytes. Hence 12K/5 ~= 2400 entries in the sparse map.
        if (bitPacking) {
            this.encodingSwitchThreshold = ((m * 6) / 8) / 5;
        } else {
            // if bit packing is disabled, each register value takes 8 bits and
            // hence the threshold can be more relaxed; for p = 14 this allows
            // m / 3 ~= 5461 entries in the sparse map.
            this.encodingSwitchThreshold = m / 3;
        }

        // initializeAlpha(DEFAULT_HASH_BITS);
        // the alphaMM constant for >64-bit hashes seems to perform better for the default 64 hash bits
        this.alphaMM = 0.7213f / (1 + 1.079f / m);
        // For efficiency alpha is multiplied by m^2
        this.alphaMM = this.alphaMM * m * m;
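        // e.g. for the default p = 14: m = 16384, alpha ~= 0.72125,
        // so alphaMM ~= 0.72125 * 16384 * 16384 ~= 1.94e8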

        this.cachedCount = -1;
        this.invalidateCount = false;
        this.encoding = hllBuilder.encoding;
        if (encoding.equals(EncodingType.SPARSE)) {
            this.sparseRegister = new HLLSparseRegister(p, HLLConstants.P_PRIME_VALUE, HLLConstants.Q_PRIME_VALUE);
            this.denseRegister = null;
        } else {
            this.sparseRegister = null;
            this.denseRegister = new HLLDenseRegister(p, bitPacking);
        }
    }

    public static HyperLogLogBuilder builder() {
        return new HyperLogLogBuilder();
    }

    public static class HyperLogLogBuilder {
        private int numRegisterIndexBits = 14;
        private EncodingType encoding = EncodingType.SPARSE;
        private boolean bitPacking = true;
        private boolean noBias = true;

        public HyperLogLogBuilder() {
        }

        public HyperLogLogBuilder setNumRegisterIndexBits(int b) {
            this.numRegisterIndexBits = b;
            return this;
        }

        public HyperLogLogBuilder setEncoding(EncodingType enc) {
            this.encoding = enc;
            return this;
        }

        public HyperLogLogBuilder enableBitPacking(boolean b) {
            this.bitPacking = b;
            return this;
        }

        public HyperLogLogBuilder enableNoBias(boolean nb) {
            this.noBias = nb;
            return this;
        }

        public HyperLogLog build() {
            return new HyperLogLog(this);
        }
    }

    // see paper for alpha initialization. Currently unused: the constructor inlines the >64-bit formula directly.
    private void initializeAlpha(final int hashBits) {
        if (hashBits <= 16) {
            alphaMM = 0.673f;
        } else if (hashBits <= 32) {
            alphaMM = 0.697f;
        } else if (hashBits <= 64) {
            alphaMM = 0.709f;
        } else {
            alphaMM = 0.7213f / (float) (1 + 1.079f / m);
        }

        // For efficiency alpha is multiplied by m^2
        alphaMM = alphaMM * m * m;
    }

    public void addBoolean(boolean val) {
        add(val ? HASH64_ONE : HASH64_ZERO);
    }

    public void addByte(byte val) {
        add(Murmur3.hash64(new byte[] { val }));
    }

    public void addBytes(byte[] val) {
        add(Murmur3.hash64(val));
    }

    public void addShort(short val) {
        SHORT_BUFFER.putShort(0, val);
        add(Murmur3.hash64(SHORT_BUFFER.array()));
    }

    public void addInt(int val) {
        INT_BUFFER.putInt(0, val);
        add(Murmur3.hash64(INT_BUFFER.array()));
    }

    public void addLong(long val) {
        LONG_BUFFER.putLong(0, val);
        add(Murmur3.hash64(LONG_BUFFER.array()));
    }

    public void addFloat(float val) {
        INT_BUFFER.putFloat(0, val);
        add(Murmur3.hash64(INT_BUFFER.array()));
    }

    public void addDouble(double val) {
        LONG_BUFFER.putDouble(0, val);
        add(Murmur3.hash64(LONG_BUFFER.array()));
    }

    public void addChar(char val) {
        SHORT_BUFFER.putChar(0, val);
        add(Murmur3.hash64(SHORT_BUFFER.array()));
    }

    /**
     * Java's default charset will be used for strings.
     * @param val
     *          - input string
     */
    public void addString(String val) {
        add(Murmur3.hash64(val.getBytes()));
    }

    public void addString(String val, Charset charset) {
        add(Murmur3.hash64(val.getBytes(charset)));
    }

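    // The low p bits of the hashcode select a register; the register then keeps
    // the longest run of trailing zero bits (plus one) observed in the remaining
    // hash bits (see HLLDenseRegister / HLLSparseRegister). add() only routes
    // the hashcode to whichever register representation is currently active.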
    public void add(long hashcode) {
        if (encoding.equals(EncodingType.SPARSE)) {
            if (sparseRegister.add(hashcode)) {
                invalidateCount = true;
            }

            // if the size of the sparse map exceeds the threshold, convert the
            // sparse map to a dense register and switch to DENSE encoding
            if (sparseRegister.getSize() > encodingSwitchThreshold) {
                encoding = EncodingType.DENSE;
                denseRegister = sparseToDenseRegister(sparseRegister);
                sparseRegister = null;
                invalidateCount = true;
            }
        } else {
            if (denseRegister.add(hashcode)) {
                invalidateCount = true;
            }
        }
    }

    public long estimateNumDistinctValues() {
        // FMSketch treats the ndv of all nulls as 1, whereas HLL treats it as 0.
        // To avoid downstream divide-by-zero problems, we follow FMSketch.
        long result = count();
        return result > 0 ? result : 1;
    }

    public long count() {
        // compute count only if the register values are updated else return the
        // cached count
        if (invalidateCount || cachedCount < 0) {
            if (encoding.equals(EncodingType.SPARSE)) {

                // if encoding is still SPARSE, use linear counting with increased
                // accuracy (pPrime bits are used for the register index)
                int mPrime = 1 << sparseRegister.getPPrime();
                cachedCount = linearCount(mPrime, mPrime - sparseRegister.getSize());
            } else {

                // for DENSE encoding, use bias table lookup for HLLNoBias algorithm
                // else fallback to HLLOriginal algorithm
                double sum = denseRegister.getSumInversePow2();
                long numZeros = denseRegister.getNumZeroes();

                // cardinality estimate from normalized bias corrected harmonic mean on
                // the registers
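                // raw estimate: E = alphaMM / sum_j 2^(-register[j])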
                cachedCount = (long) (alphaMM * (1.0 / sum));
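                // 2^chosenHashBits; the double-to-long cast saturates at
                // Long.MAX_VALUE for a 64-bit hash, but pow is only used on the
                // chosenHashBits < 64 branch below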
                long pow = (long) Math.pow(2, chosenHashBits);

                // when bias correction is enabled
                if (noBias) {
                    cachedCount = cachedCount <= 5 * m ? (cachedCount - estimateBias(cachedCount)) : cachedCount;
                    long h = cachedCount;
                    if (numZeros != 0) {
                        h = linearCount(m, numZeros);
                    }

                    if (h < getThreshold()) {
                        cachedCount = h;
                    }
                } else {
                    // HLL algorithm shows stronger bias for values in (2.5 * m) range.
                    // To compensate for this short range bias, linear counting is used
                    // for values before this short range. The original paper also says
                    // similar bias is seen for long range values due to hash collisions
                    // in range >1/30*(2^32). For the default case, we do not have to
                    // worry about this long range bias as the paper used 32-bit hashing
                    // and we use 64-bit hashing as default. 2^64 values are too high to
                    // observe long range bias (hash collisions).
                    if (cachedCount <= 2.5 * m) {

                        // for short range use linear counting
                        if (numZeros != 0) {
                            cachedCount = linearCount(m, numZeros);
                        }
                    } else if (chosenHashBits < 64 && cachedCount > (0.033333 * pow)) {

                        // long range bias for 32-bit hashcodes
                        // note: (1 / 30) * pow would be integer division (== 0)
                        if (cachedCount > (pow / 30.0)) {
                            cachedCount = (long) (-pow * Math.log(1.0 - (double) cachedCount / (double) pow));
                        }
                    }
                }
            }
            invalidateCount = false;
        }

        return cachedCount;
    }

    private long getThreshold() {
        return (long) (HLLConstants.thresholdData[p - 4] + 0.5);
    }

    /**
     * Estimate bias from the lookup table.
     * @param count
     *          - cardinality before bias correction
     * @return the estimated bias, to be subtracted from the raw estimate
     */
    private long estimateBias(long count) {
        double[] rawEstForP = HLLConstants.rawEstimateData[p - 4];

        // compute distance and store it in sorted map
        TreeMap<Double, Integer> estIndexMap = new TreeMap<>();
        double distance = 0;
        for (int i = 0; i < rawEstForP.length; i++) {
            distance = Math.pow(count - rawEstForP[i], 2);
            estIndexMap.put(distance, i);
        }

        // average the bias of the k nearest neighbors (by squared distance)
        long result = 0;
        double[] biasForP = HLLConstants.biasData[p - 4];
        double biasSum = 0;
        int kNeighbors = HLLConstants.K_NEAREST_NEIGHBOR;
        for (Map.Entry<Double, Integer> entry : estIndexMap.entrySet()) {
            biasSum += biasForP[entry.getValue()];
            kNeighbors--;
            if (kNeighbors <= 0) {
                break;
            }
        }

        // 0.5 added for rounding off
        result = (long) ((biasSum / HLLConstants.K_NEAREST_NEIGHBOR) + 0.5);
        return result;
    }

    public void setCount(long count) {
        this.cachedCount = count;
        this.invalidateCount = true;
    }

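    // Linear counting: with numZeros of the mVal registers still empty, the
    // estimate is mVal * ln(mVal / numZeros). Used for small cardinalities
    // where the raw HLL estimate is biased.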
    private long linearCount(int mVal, long numZeros) {
        return Math.round(mVal * Math.log(mVal / ((double) numZeros)));
    }

    // refer paper
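    // e.g. p = 14 gives m = 16384 registers and a relative standard error of
    // about 1.04 / sqrt(16384) ~= 0.81%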
    public double getStandardError() {
        return 1.04 / Math.sqrt(m);
    }

    public HLLDenseRegister getHLLDenseRegister() {
        return denseRegister;
    }

    public HLLSparseRegister getHLLSparseRegister() {
        return sparseRegister;
    }

    /**
     * Reconstruct sparse map from serialized integer list
     * @param reg
     *          - uncompressed and delta decoded integer list
     */
    public void setHLLSparseRegister(int[] reg) {
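        // each serialized int packs the register index in its high bits and a
        // 6-bit register value in its low bits (shift width Q_PRIME_VALUE)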
        for (int i : reg) {
            int key = i >>> HLLConstants.Q_PRIME_VALUE;
            byte value = (byte) (i & 0x3f);
            sparseRegister.set(key, value);
        }
    }

    /**
     * Reconstruct dense registers from byte array
     * @param reg
     *          - unpacked byte array
     */
    public void setHLLDenseRegister(byte[] reg) {
        int i = 0;
        for (byte b : reg) {
            denseRegister.set(i, b);
            i++;
        }
    }

    /**
     * Merge the specified HyperLogLog into the current one. The encoding
     * switches automatically after the merge if the encoding switch threshold
     * is exceeded.
     * @param hll
     *          - hyperloglog to be merged
     * @throws IllegalArgumentException if p or the number of hash bits differ
     */
    public void merge(HyperLogLog hll) {
        if (p != hll.p || chosenHashBits != hll.chosenHashBits) {
            throw new IllegalArgumentException(
                    "HyperLogLog cannot be merged as either p or hashbits are different. Current: " + toString()
                            + " Provided: " + hll.toString());
        }

        EncodingType otherEncoding = hll.getEncoding();

        if (encoding.equals(EncodingType.SPARSE) && otherEncoding.equals(EncodingType.SPARSE)) {
            sparseRegister.merge(hll.getHLLSparseRegister());
            // if after merge the sparse switching threshold is exceeded then change
            // to dense encoding
            if (sparseRegister.getSize() > encodingSwitchThreshold) {
                encoding = EncodingType.DENSE;
                denseRegister = sparseToDenseRegister(sparseRegister);
                sparseRegister = null;
            }
        } else if (encoding.equals(EncodingType.DENSE) && otherEncoding.equals(EncodingType.DENSE)) {
            denseRegister.merge(hll.getHLLDenseRegister());
        } else if (encoding.equals(EncodingType.SPARSE) && otherEncoding.equals(EncodingType.DENSE)) {
            denseRegister = sparseToDenseRegister(sparseRegister);
            denseRegister.merge(hll.getHLLDenseRegister());
            sparseRegister = null;
            encoding = EncodingType.DENSE;
        } else if (encoding.equals(EncodingType.DENSE) && otherEncoding.equals(EncodingType.SPARSE)) {
            HLLDenseRegister otherDenseRegister = sparseToDenseRegister(hll.getHLLSparseRegister());
            denseRegister.merge(otherDenseRegister);
        }

        invalidateCount = true;
    }

    /**
     * Converts sparse to dense hll register
     * @param sparseRegister
     *          - sparse register to be converted
     * @return converted dense register
     */
    private HLLDenseRegister sparseToDenseRegister(HLLSparseRegister sparseRegister) {
        if (sparseRegister == null) {
            return null;
        }
        int p = sparseRegister.getP();
        int pMask = (1 << p) - 1;
        HLLDenseRegister result = new HLLDenseRegister(p, bitPacking);
        for (Map.Entry<Integer, Byte> entry : sparseRegister.getSparseMap().entrySet()) {
            int key = entry.getKey();
            int idx = key & pMask;
            result.set(idx, entry.getValue());
        }
        return result;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Encoding: ");
        sb.append(encoding);
        sb.append(", p: ");
        sb.append(p);
        sb.append(", estimatedCardinality: ");
        sb.append(estimateNumDistinctValues());
        return sb.toString();
    }

    public String toStringExtended() {
        if (encoding.equals(EncodingType.DENSE)) {
            return toString() + ", " + denseRegister.toExtendedString();
        } else if (encoding.equals(EncodingType.SPARSE)) {
            return toString() + ", " + sparseRegister.toExtendedString();
        }

        return toString();
    }

    public int getNumRegisterIndexBits() {
        return p;
    }

    public EncodingType getEncoding() {
        return encoding;
    }

    public void setEncoding(EncodingType encoding) {
        this.encoding = encoding;
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof HyperLogLog)) {
            return false;
        }

        HyperLogLog other = (HyperLogLog) obj;
        long count = estimateNumDistinctValues();
        long otherCount = other.estimateNumDistinctValues();
        boolean result = p == other.p && chosenHashBits == other.chosenHashBits && encoding.equals(other.encoding)
                && count == otherCount;
        if (encoding.equals(EncodingType.DENSE)) {
            result = result && denseRegister.equals(other.getHLLDenseRegister());
        }

        if (encoding.equals(EncodingType.SPARSE)) {
            result = result && sparseRegister.equals(other.getHLLSparseRegister());
        }
        return result;
    }

    @Override
    public int hashCode() {
        int hashcode = 0;
        hashcode += 31 * p;
        hashcode += 31 * chosenHashBits;
        hashcode += encoding.hashCode();
        hashcode += 31 * estimateNumDistinctValues();
        if (encoding.equals(EncodingType.DENSE)) {
            hashcode += 31 * denseRegister.hashCode();
        }

        if (encoding.equals(EncodingType.SPARSE)) {
            hashcode += 31 * sparseRegister.hashCode();
        }
        return hashcode;
    }

    @Override
    public void reset() {
    }

    @Override
    public String serialize() {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        // serialize this HLL into the buffer, then Base64 encode it
        try {
            HyperLogLogUtils.serializeHLL(bos, this);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return Base64.encodeBase64String(bos.toByteArray());
    }

    @Override
    public NumDistinctValueEstimator deserialize(String s) {
        InputStream is = new ByteArrayInputStream(Base64.decodeBase64(s));
        try {
            return HyperLogLogUtils.deserializeHLL(is);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void addToEstimator(long v) {
        addLong(v);
    }

    @Override
    public void addToEstimator(String s) {
        addString(s);
    }

    @Override
    public void addToEstimator(double d) {
        addDouble(d);
    }

    @Override
    public void addToEstimator(HiveDecimal decimal) {
        addDouble(decimal.doubleValue());
    }

    @Override
    public void mergeEstimators(NumDistinctValueEstimator o) {
        merge((HyperLogLog) o);
    }

    @Override
    public int lengthFor(JavaDataModel model) {
        // 5 bytes of header plus 2^p bytes of register storage
        return (5 + (1 << p));
    }

    @Override
    public boolean canMerge(NumDistinctValueEstimator o) {
        return o instanceof HyperLogLog;
    }

}
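
Because two sketches built with the same p can be merged, HyperLogLog works well for partitioned data: build one sketch per partition, then combine them. A short sketch of that pattern, again assuming only the API in the listing above:

    HyperLogLog a = HyperLogLog.builder().build();
    HyperLogLog b = HyperLogLog.builder().build();
    for (int i = 0; i < 50_000; i++) {
      a.addLong(i);          // values 0 .. 49,999
      b.addLong(i + 25_000); // values 25,000 .. 74,999 (half overlap)
    }
    a.merge(b);                    // union of both streams
    System.out.println(a.count()); // ~ 75,000, within the ~0.81% error for p = 14

    // sketches also round-trip through a Base64 string form
    String encoded = a.serialize();
    NumDistinctValueEstimator restored = a.deserialize(encoded);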