Java tutorial
/* * Druid - a distributed column store. * Copyright 2012 - 2015 Metamarkets Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package io.druid.query.aggregation.histogram; import com.fasterxml.jackson.annotation.JsonValue; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.primitives.Floats; import com.google.common.primitives.Ints; import com.google.common.primitives.Longs; import com.google.common.primitives.Shorts; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; public class ApproximateHistogram { public static final int DEFAULT_HISTOGRAM_SIZE = 50; public static final int DEFAULT_BUCKET_SIZE = 7; // max size of the histogram (number of bincount/position pairs) int size; public float[] positions; public long[] bins; // used bincount int binCount; // min value that's been put into histogram float min; float max; // total number of values that have been put into histogram transient long count; // lower limit to maintain resolution // cutoff above which we merge bins is the difference of the limits / (size - 3) // so we'll set size = 203, lower limit = 0, upper limit = 10.00 if we don't want // to merge differences < 0.05 transient float lowerLimit; transient float upperLimit; // use sign bit to indicate approximate bin and remaining bits for bin count private 
static final long APPROX_FLAG_BIT = Long.MIN_VALUE; private static final long COUNT_BITS = Long.MAX_VALUE; @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } ApproximateHistogram that = (ApproximateHistogram) o; if (size != that.size) { return false; } if (binCount != that.binCount) { return false; } if (Float.compare(that.max, max) != 0) { return false; } if (Float.compare(that.min, min) != 0) { return false; } for (int i = 0; i < binCount; ++i) { if (positions[i] != that.positions[i]) { return false; } } for (int i = 0; i < binCount; ++i) { if (bins[i] != that.bins[i]) { return false; } } return true; } @Override public int hashCode() { int result = size; result = 31 * result + (positions != null ? ArrayUtils.hashCode(positions, 0, binCount) : 0); result = 31 * result + (bins != null ? ArrayUtils.hashCode(bins, 0, binCount) : 0); result = 31 * result + binCount; result = 31 * result + (min != +0.0f ? Float.floatToIntBits(min) : 0); result = 31 * result + (max != +0.0f ? 
Float.floatToIntBits(max) : 0); return result; } public ApproximateHistogram(int size, float[] positions, long[] bins, int binCount, float min, float max, long count, float lowerLimit, float upperLimit) { Preconditions.checkArgument(positions.length == bins.length, "position and bin array must have same size"); Preconditions.checkArgument(binCount <= size, "binCount must be less or equal to size"); this.size = size; this.positions = positions; this.bins = bins; this.binCount = binCount; this.min = min; this.max = max; this.count = count; this.lowerLimit = lowerLimit; this.upperLimit = upperLimit; } public ApproximateHistogram() { this(DEFAULT_HISTOGRAM_SIZE); } public ApproximateHistogram(int size) { this(size, //size new float[size], //positions new long[size], //bins 0, //binCount Float.POSITIVE_INFINITY, //min Float.NEGATIVE_INFINITY, //max 0, //count Float.NEGATIVE_INFINITY, //lowerLimit Float.POSITIVE_INFINITY //upperLimit ); } public ApproximateHistogram(int size, float lowerLimit, float upperLimit) { this(size, //size new float[size], //positions new long[size], //bins 0, //binCount Float.POSITIVE_INFINITY, //min Float.NEGATIVE_INFINITY, //max 0, //count lowerLimit, //lowerLimit upperLimit //upperLimit ); } public ApproximateHistogram(int binCount, float[] positions, long[] bins, float min, float max) { this(positions.length, //size positions, //positions bins, //bins binCount, //binCount min, //min max, //max sumBins(bins, binCount), //count Float.NEGATIVE_INFINITY, //lowerLimit Float.POSITIVE_INFINITY //upperLimit ); } public long count() { return count; } public float min() { return min; } public float max() { return max; } public int binCount() { return binCount; } public int capacity() { return size; } public float[] positions() { return Arrays.copyOfRange(positions, 0, binCount); } public long[] bins() { long[] counts = new long[binCount]; for (int i = 0; i < binCount; ++i) { counts[i] = bins[i] & COUNT_BITS; } return counts; } @Override public String 
toString() { return "ApproximateHistogram{" + "size=" + size + ", lowerLimit=" + lowerLimit + ", upperLimit=" + upperLimit + ", positions=" + Arrays.toString(positions()) + ", bins=" + getBinsString() + ", binCount=" + binCount + ", min=" + min + ", max=" + max + ", count=" + count + '}'; } public long getExactCount() { long exactCount = 0; for (int i = 0; i < binCount; ++i) { if ((bins[i] & APPROX_FLAG_BIT) == 0) { exactCount += (bins[i] & COUNT_BITS); } } return exactCount; } public float getMin() { return this.min; } public float getMax() { return this.max; } private static long sumBins(long[] bins, int binCount) { long count = 0; for (int i = 0; i < binCount; ++i) { count += bins[i] & COUNT_BITS; } return count; } /** * @return a string representation of the actual bin counts */ protected String getBinsString() { StringBuilder s = new StringBuilder(); s.append('['); for (int i = 0; i < bins.length; ++i) { if (i > 0) { s.append(", "); } if ((bins[i] & APPROX_FLAG_BIT) != 0) { s.append("*"); } s.append(bins[i] & COUNT_BITS); } s.append(']'); return s.toString(); } public void setLowerLimit(float lowerLimit) { this.lowerLimit = lowerLimit; } public void setUpperLimit(float upperLimit) { this.upperLimit = upperLimit; } /** * Adds the given value to the histogram * * @param value the value to be added */ public void offer(float value) { // update min/max if (value < min) { min = value; } if (value > max) { max = value; } // initial value if (binCount == 0) { positions[0] = value; bins[0] = 1; count++; binCount++; return; } final int index = Arrays.binarySearch(positions, 0, binCount, value); if (index >= 0) { // we have an exact match, simply increase the count, but keep the approximate flag bins[index] = (bins[index] & APPROX_FLAG_BIT) | ((bins[index] & COUNT_BITS) + 1); count++; return; } // otherwise merge the value into a new or existing bin at the following position final int insertAt = -(index + 1); if (binCount < size) { // we have a spare slot, put the value 
into a new bin shiftRight(insertAt, binCount); positions[insertAt] = value; bins[insertAt] = 1; count++; binCount++; return; } // no more slots available merge the new value into and existing bin // or merge existing bins before inserting the new one int minPos = minDeltaIndex(); float minDelta = minPos >= 0 ? positions[minPos + 1] - positions[minPos] : Float.MAX_VALUE; // determine the distance of new value to the nearest bins final float deltaRight = insertAt < binCount ? positions[insertAt] - value : Float.MAX_VALUE; final float deltaLeft = insertAt > 0 ? value - positions[insertAt - 1] : Float.MAX_VALUE; boolean mergeValue = false; if (deltaRight < minDelta) { minDelta = deltaRight; minPos = insertAt; mergeValue = true; } if (deltaLeft < minDelta) { minDelta = deltaLeft; minPos = insertAt - 1; mergeValue = true; } if (mergeValue) { // merge new value into an existing bin and set approximate flag final long k = bins[minPos] & COUNT_BITS; positions[minPos] = (positions[minPos] * k + value) / (k + 1); bins[minPos] = (k + 1) | APPROX_FLAG_BIT; count++; } else { // merge the closest bins together and insert new value as a separate bin mergeInsert(minPos, insertAt, value, 1); } } protected int minDeltaIndex() { // determine minimum distance between existing bins float minDelta = Float.MAX_VALUE; int minPos = -1; for (int i = 0; i < binCount - 1; ++i) { float delta = (positions[i + 1] - positions[i]); if (delta < minDelta) { minDelta = delta; minPos = i; } } return minPos; } /** * Merges the bin in the given position with the next bin * * @param index index of the bin to merge, index must satisfy 0 <= index < binCount - 1 */ protected void merge(final int index) { mergeInsert(index, -1, 0, 0); } /** * Merges the bin in the mergeAt position with the bin in position mergeAt+1 * and simultaneously inserts the given bin (v,c) as a new bin at position insertAt * * @param mergeAt index of the bin to be merged * @param insertAt index to insert the new bin at * @param v bin 
position * @param c bin count */ protected void mergeInsert(final int mergeAt, int insertAt, final float v, final long c) { final long k0 = (bins[mergeAt] & COUNT_BITS); final long k1 = (bins[mergeAt + 1] & COUNT_BITS); final long sum = k0 + k1; // merge bin at given position with the next bin and set approximate flag positions[mergeAt] = (float) (((double) positions[mergeAt] * k0 + (double) positions[mergeAt + 1] * k1) / sum); bins[mergeAt] = sum | APPROX_FLAG_BIT; final int unusedIndex = mergeAt + 1; if (insertAt >= 0) { // use unused slot to shift array left or right and make space for the new bin to insert if (insertAt < unusedIndex) { shiftRight(insertAt, unusedIndex); } else if (insertAt >= unusedIndex) { shiftLeft(unusedIndex, insertAt - 1); insertAt--; } positions[insertAt] = v; bins[insertAt] = c; count++; } else { // simple merging of bins, shift everything left and free up the unused bin shiftLeft(unusedIndex, binCount - 1); binCount--; } } /** * Shifts the given range the histogram bins one slot to the right * * @param start index of the first bin to shift * @param end index of the rightmost bin to shift into */ protected void shiftRight(int start, int end) { float prevVal = positions[start]; long prevCnt = bins[start]; for (int i = start + 1; i <= end; ++i) { float tmpVal = positions[i]; long tmpCnt = bins[i]; positions[i] = prevVal; bins[i] = prevCnt; prevVal = tmpVal; prevCnt = tmpCnt; } } /** * Shifts the given range of histogram bins one slot to the left * * @param start index of the leftmost empty bin to shift into * @param end index of the last bin to shift left */ protected void shiftLeft(int start, int end) { for (int i = start; i < end; ++i) { positions[i] = positions[i + 1]; bins[i] = bins[i + 1]; } } public ApproximateHistogram fold(ApproximateHistogram h) { return fold(h, null, null, null); } public ApproximateHistogram fold(ApproximateHistogram h, float[] mergedPositions, long[] mergedBins, float[] deltas) { if (size == 0) { return 
copy(h); } else { return foldMin(h, mergedPositions, mergedBins, deltas); } } public ApproximateHistogram foldFast(ApproximateHistogram h) { return foldFast(h, null, null); } /** * @param h histogram to be merged into the current histogram * @param mergedPositions temporary buffer of size greater or equal to this.capacity() * @param mergedBins temporary buffer of size greater or equal to this.capacity() * * @return returns this histogram with h folded into it */ public ApproximateHistogram foldFast(ApproximateHistogram h, float[] mergedPositions, long[] mergedBins) { if (size == 0) { return copy(h); } else { return foldRule(h, mergedPositions, mergedBins); } } /** * Copies histogram h into the current histogram. * * @param h ApproximateHistogram to copy * * @return this histogram */ public ApproximateHistogram copy(ApproximateHistogram h) { this.size = h.size; this.positions = new float[size]; this.bins = new long[size]; System.arraycopy(h.positions, 0, this.positions, 0, h.binCount); System.arraycopy(h.bins, 0, this.bins, 0, h.binCount); this.min = h.min; this.max = h.max; this.binCount = h.binCount; this.count = h.count; return this; } //approximate histogram solution using min heap to store location of min deltas protected ApproximateHistogram foldMin(ApproximateHistogram h, float[] mergedPositions, long[] mergedBins, float[] deltas) { // find common min / max float mergedMin = this.min < h.min ? this.min : h.min; float mergedMax = this.max > h.max ? 
this.max : h.max; long mergedCount = this.count + h.count; int maxSize = this.binCount + h.binCount; int[] next = new int[maxSize]; int[] prev = new int[maxSize]; // use preallocated arrays if passed if (mergedPositions == null || mergedBins == null || deltas == null) { mergedPositions = new float[maxSize]; mergedBins = new long[maxSize]; deltas = new float[maxSize]; } else { Preconditions.checkArgument(mergedPositions.length >= maxSize, "temp buffer [mergedPositions] too small: length must be at least [%d], got [%d]", maxSize, mergedPositions.length); Preconditions.checkArgument(mergedBins.length >= maxSize, "temp buffer [mergedBins] too small: length must be at least [%d], got [%d]", maxSize, mergedPositions.length); Preconditions.checkArgument(deltas.length >= maxSize, "temp buffer [deltas] too small: length must be at least [%d], got [%d]", maxSize, mergedPositions.length); } int mergedBinCount = combineBins(this.binCount, this.positions, this.bins, h.binCount, h.positions, h.bins, mergedPositions, mergedBins, deltas); if (mergedBinCount == 0) { return this; } // determine how many bins to merge int numMerge = mergedBinCount - this.size; if (numMerge < 0) { numMerge = 0; } // perform the required number of merges mergeBins(mergedBinCount, mergedPositions, mergedBins, deltas, numMerge, next, prev); // copy merged values int i = 0; int k = 0; while (i < mergedBinCount) { this.positions[k] = mergedPositions[i]; this.bins[k] = mergedBins[i]; ++k; i = next[i]; } this.binCount = mergedBinCount - numMerge; this.min = mergedMin; this.max = mergedMax; this.count = mergedCount; return this; } protected ApproximateHistogram foldRule(ApproximateHistogram h, float[] mergedPositions, long[] mergedBins) { // ruleCombine bins requires at least one bin if (h.binCount == 0) { return this; } // find common min / max float mergedMin = this.min < h.min ? this.min : h.min; float mergedMax = this.max > h.max ? 
this.max : h.max; long mergedCount = this.count + h.count; this.min = mergedMin; this.max = mergedMax; // use preallocated arrays if passed if (mergedPositions == null) { mergedPositions = new float[this.size]; mergedBins = new long[this.size]; } int mergedBinCount; if (this.binCount + h.binCount <= this.size) { // no need to merge bins mergedBinCount = combineBins(this.binCount, this.positions, this.bins, h.binCount, h.positions, h.bins, mergedPositions, mergedBins, null); } else { mergedBinCount = ruleCombineBins(this.binCount, this.positions, this.bins, h.binCount, h.positions, h.bins, mergedPositions, mergedBins); } for (int i = 0; i < mergedBinCount; ++i) { this.positions[i] = mergedPositions[i]; this.bins[i] = mergedBins[i]; } this.binCount = mergedBinCount; this.count = mergedCount; return this; } protected int ruleCombineBins(int leftBinCount, float[] leftPositions, long[] leftBins, int rightBinCount, float[] rightPositions, long[] rightBins, float[] mergedPositions, long[] mergedBins) { final float cutoff; // assumes binCount is greater than one for both histograms // if upper and lower limits are set, we use the first and last used values of the arrays // for information below and above the limits, respectively if (this.upperLimit != Float.POSITIVE_INFINITY && this.lowerLimit != Float.NEGATIVE_INFINITY) { cutoff = (this.upperLimit - this.lowerLimit) / (size - 2 - 1); } else { if (this.upperLimit != Float.POSITIVE_INFINITY) { cutoff = (this.upperLimit - this.min) / (size - 2); } else if (this.lowerLimit != Float.NEGATIVE_INFINITY) { cutoff = (this.max - this.lowerLimit) / (size - 2); } else { cutoff = (this.max - this.min) / (size - 1); } } float lowerPosition = 0f; long lowerBin = 0; float upperPosition = 0f; long upperBin = 0; int j = 0; int k = 0; int pos = 0; // continuously merge the left histogram below the lower limit while (j != leftBinCount) { final float m1 = leftPositions[j]; if (m1 < lowerLimit) { final long k1 = leftBins[j] & COUNT_BITS; float 
delta = (m1 - lowerPosition); final long k0 = lowerBin & COUNT_BITS; final long sum = k0 + k1; final float w = (float) k0 / (float) sum; lowerPosition = -delta * w + m1; // set approximate flag lowerBin = sum | APPROX_FLAG_BIT; ++j; } else { break; } } // continuously merge the right histogram below the lower limit while (k != rightBinCount) { final float m1 = rightPositions[k]; if (m1 < lowerLimit) { final long k1 = rightBins[k] & COUNT_BITS; float delta = (m1 - lowerPosition); final long k0 = lowerBin & COUNT_BITS; final long sum = k0 + k1; final float w = (float) k0 / (float) sum; lowerPosition = -delta * w + m1; // set approximate flag lowerBin = sum | APPROX_FLAG_BIT; ++k; } else { break; } } // if there are values below the lower limit, store them in array position 0 if ((lowerBin & COUNT_BITS) > 0) { mergedPositions[0] = lowerPosition; mergedBins[0] = lowerBin; pos = 1; } // if there are values below the lower limit, fill in array position 1 // else array position 0 while (j != leftBinCount || k != rightBinCount) { if (j != leftBinCount && (k == rightBinCount || leftPositions[j] < rightPositions[k])) { mergedPositions[pos] = leftPositions[j]; mergedBins[pos] = leftBins[j]; ++j; break; } else { mergedPositions[pos] = rightPositions[k]; mergedBins[pos] = rightBins[k]; ++k; break; } } while (j != leftBinCount || k != rightBinCount) { if (j != leftBinCount && (k == rightBinCount || leftPositions[j] < rightPositions[k])) { final float m1 = leftPositions[j]; final long k1 = leftBins[j] & COUNT_BITS; // above the upper limit gets merged continuously in the left histogram if (m1 > upperLimit) { float delta = (m1 - upperPosition); final long k0 = upperBin & COUNT_BITS; final long sum = k0 + k1; final float w = (float) k0 / (float) sum; upperPosition = -delta * w + m1; // set approximate flag upperBin = sum | APPROX_FLAG_BIT; ++j; continue; } final float delta = (m1 - mergedPositions[pos]); if (delta <= cutoff) { final long k0 = mergedBins[pos] & COUNT_BITS; final 
long sum = k0 + k1; final float w = (float) k0 / (float) sum; mergedPositions[pos] = -delta * w + m1; // set approximate flag mergedBins[pos] = sum | APPROX_FLAG_BIT; } else { ++pos; mergedPositions[pos] = m1; mergedBins[pos] = k1; } ++j; } else { final float m1 = rightPositions[k]; final long k1 = rightBins[k] & COUNT_BITS; // above the upper limit gets merged continuously in the right histogram if (m1 > upperLimit) { float delta = (m1 - upperPosition); final long k0 = upperBin & COUNT_BITS; final long sum = k0 + k1; final float w = (float) k0 / (float) sum; upperPosition = -delta * w + m1; // set approximate flag upperBin = sum | APPROX_FLAG_BIT; ++k; continue; } final float delta = (m1 - mergedPositions[pos]); if (delta <= cutoff) { final long k0 = mergedBins[pos] & COUNT_BITS; final long sum = k0 + k1; final float w = (float) k0 / (float) sum; mergedPositions[pos] = -delta * w + m1; mergedBins[pos] = sum | APPROX_FLAG_BIT; } else { ++pos; mergedPositions[pos] = m1; mergedBins[pos] = k1; } ++k; } } if ((upperBin & COUNT_BITS) > 0) { ++pos; mergedPositions[pos] = upperPosition; mergedBins[pos] = upperBin; } return pos + 1; } /** * mergeBins performs the given number of bin merge operations on the given histogram * * It repeatedly merges the two closest bins until it has performed the requested number of merge operations. * Merges are done in-place and unused bins have unknown state * * next / prev maintains a doubly-linked list of valid bin indices into the mergedBins array. * * Fast operation is achieved by building a min-heap of the deltas as opposed to repeatedly * scanning the array of deltas to find the minimum. A reverse index into the heap is maintained * to allow deleting and updating of specific deltas. 
* * next and prev arrays are used to maintain indices to the previous / next valid bin from a given bin index * * Its effect is equivalent to running the following code: * * <pre> * ApproximateHistogram merged = new ApproximateHistogram(mergedBinCount, mergedPositions, mergedBins); * * int targetSize = merged.binCount() - numMerge; * while(merged.binCount() > targetSize) { * merged.merge(merged.minDeltaIndex()); * } * </pre> * * @param mergedBinCount * @param mergedPositions * @param mergedBins * @param deltas * @param numMerge * @param next * @param prev * * @return the last valid index into the mergedPositions and mergedBins arrays */ private static void mergeBins(int mergedBinCount, float[] mergedPositions, long[] mergedBins, float[] deltas, int numMerge, int[] next, int[] prev) { // repeatedly search for two closest bins, merge them and update the corresponding deltas // maintain index to the last valid bin int lastValidIndex = mergedBinCount - 1; // initialize prev / next lookup arrays for (int i = 0; i < mergedBinCount; ++i) { next[i] = i + 1; } for (int i = 0; i < mergedBinCount; ++i) { prev[i] = i - 1; } // initialize min-heap of deltas and the reverse index into the heap int heapSize = mergedBinCount - 1; int[] heap = new int[heapSize]; int[] reverseIndex = new int[heapSize]; for (int i = 0; i < heapSize; ++i) { heap[i] = i; } for (int i = 0; i < heapSize; ++i) { reverseIndex[i] = i; } heapify(heap, reverseIndex, heapSize, deltas); { int i = 0; while (i < numMerge) { // find the smallest delta within the range used for bins // pick minimum delta by scanning array //int currentIndex = minIndex(deltas, lastValidIndex); // pick minimum delta index using min-heap int currentIndex = heap[0]; final int nextIndex = next[currentIndex]; final int prevIndex = prev[currentIndex]; final long k0 = mergedBins[currentIndex] & COUNT_BITS; final long k1 = mergedBins[nextIndex] & COUNT_BITS; final float m0 = mergedPositions[currentIndex]; final float m1 = 
mergedPositions[nextIndex]; final float d1 = deltas[nextIndex]; final long sum = k0 + k1; final float w = (float) k0 / (float) sum; // merge bin at given position with the next bin final float mm0 = (m0 - m1) * w + m1; mergedPositions[currentIndex] = mm0; //mergedPositions[nextIndex] = Float.MAX_VALUE; // for debugging mergedBins[currentIndex] = sum | APPROX_FLAG_BIT; //mergedBins[nextIndex] = -1; // for debugging // update deltas and min-heap if (nextIndex == lastValidIndex) { // merged bin is the last => remove the current bin delta from the heap heapSize = heapDelete(heap, reverseIndex, heapSize, reverseIndex[currentIndex], deltas); //deltas[currentIndex] = Float.MAX_VALUE; // for debugging } else { // merged bin is not the last => remove the merged bin delta from the heap heapSize = heapDelete(heap, reverseIndex, heapSize, reverseIndex[nextIndex], deltas); // updated current delta deltas[currentIndex] = m1 - mm0 + d1; // updated delta is necessarily larger than existing one, therefore we only need to push it down the heap siftDown(heap, reverseIndex, reverseIndex[currentIndex], heapSize - 1, deltas); } if (prevIndex >= 0) { // current bin is not the first, therefore update the previous bin delta deltas[prevIndex] = mm0 - mergedPositions[prevIndex]; // updated previous bin delta is necessarily larger than its existing value => push down the heap siftDown(heap, reverseIndex, reverseIndex[prevIndex], heapSize - 1, deltas); } // mark the merged bin as invalid // deltas[nextIndex] = Float.MAX_VALUE; // for debugging // update last valid index if we merged the last bin if (nextIndex == lastValidIndex) { lastValidIndex = currentIndex; } next[currentIndex] = next[nextIndex]; if (nextIndex < lastValidIndex) { prev[next[nextIndex]] = currentIndex; } ++i; } } } /** * Builds a min-heap and a reverseIndex into the heap from the given array of values * * @param heap min-heap stored as indices into the array of values * @param reverseIndex reverse index from the array of 
values into the heap * @param count current size of the heap * @param values values to be stored in the heap */ private static void heapify(int[] heap, int[] reverseIndex, int count, float[] values) { int start = (count - 2) / 2; while (start >= 0) { siftDown(heap, reverseIndex, start, count - 1, values); start--; } } /** * Rebalances the min-heap by pushing values from the top down and simultaneously updating the reverse index * * @param heap min-heap stored as indices into the array of values * @param reverseIndex reverse index from the array of values into the heap * @param start index to start re-balancing from * @param end index to stop re-balancing at * @param values values stored in the heap */ private static void siftDown(int[] heap, int[] reverseIndex, int start, int end, float[] values) { int root = start; while (root * 2 + 1 <= end) { int child = root * 2 + 1; int swap = root; if (values[heap[swap]] > values[heap[child]]) { swap = child; } if (child + 1 <= end && values[heap[swap]] > values[heap[child + 1]]) { swap = child + 1; } if (swap != root) { // swap int tmp = heap[swap]; heap[swap] = heap[root]; heap[root] = tmp; // heap index from delta index reverseIndex[heap[swap]] = swap; reverseIndex[heap[root]] = root; root = swap; } else { return; } } } /** * Deletes an item from the min-heap and updates the reverse index * * @param heap min-heap stored as indices into the array of values * @param reverseIndex reverse index from the array of values into the heap * @param count current size of the heap * @param heapIndex index of the item to be deleted * @param values values stored in the heap */ private static int heapDelete(int[] heap, int[] reverseIndex, int count, int heapIndex, float[] values) { int end = count - 1; reverseIndex[heap[heapIndex]] = -1; heap[heapIndex] = heap[end]; reverseIndex[heap[heapIndex]] = heapIndex; end--; siftDown(heap, reverseIndex, heapIndex, end, values); return count - 1; } private static int minIndex(float[] deltas, int 
lastValidIndex) { int minIndex = -1; float min = Float.MAX_VALUE; for (int k = 0; k < lastValidIndex; ++k) { float value = deltas[k]; if (value < min) { minIndex = k; min = value; } } return minIndex; } /** * Combines two sets of histogram bins using merge-sort and computes the delta between consecutive bin positions. * Duplicate bins are merged together. * * @param leftBinCount * @param leftPositions * @param leftBins * @param rightBinCount * @param rightPositions * @param rightBins * @param mergedPositions array to store the combined bin positions (size must be at least leftBinCount + rightBinCount) * @param mergedBins array to store the combined bin counts (size must be at least leftBinCount + rightBinCount) * @param deltas deltas between consecutive bin positions in the merged bins (size must be at least leftBinCount + rightBinCount) * * @return the number of combined bins */ private static int combineBins(int leftBinCount, float[] leftPositions, long[] leftBins, int rightBinCount, float[] rightPositions, long[] rightBins, float[] mergedPositions, long[] mergedBins, float[] deltas) { int i = 0; int j = 0; int k = 0; while (j < leftBinCount || k < rightBinCount) { if (j < leftBinCount && (k == rightBinCount || leftPositions[j] < rightPositions[k])) { mergedPositions[i] = leftPositions[j]; mergedBins[i] = leftBins[j]; ++j; } else if (k < rightBinCount && (j == leftBinCount || leftPositions[j] > rightPositions[k])) { mergedPositions[i] = rightPositions[k]; mergedBins[i] = rightBins[k]; ++k; } else { // combine overlapping bins mergedPositions[i] = leftPositions[j]; mergedBins[i] = leftBins[j] + rightBins[k]; ++j; ++k; } if (deltas != null && i > 0) { deltas[i - 1] = mergedPositions[i] - mergedPositions[i - 1]; } ++i; } return i; } /** * Returns a byte-array representation of this ApproximateHistogram object * * @return byte array representation */ @JsonValue public byte[] toBytes() { ByteBuffer buf = ByteBuffer.allocate(getMinStorageSize()); toBytes(buf); return 
buf.array(); } public int getDenseStorageSize() { return Ints.BYTES * 2 + Floats.BYTES * size + Longs.BYTES * size + Floats.BYTES * 2; } public int getSparseStorageSize() { return Ints.BYTES * 2 + Floats.BYTES * binCount + Longs.BYTES * binCount + Floats.BYTES * 2; } public int getCompactStorageSize() { // ensures exactCount and (count - exactCount) can safely be cast to (int) Preconditions.checkState(canStoreCompact(), "Approximate histogram cannot be stored in compact form"); final long exactCount = getExactCount(); if (exactCount == count) { return Shorts.BYTES + 1 + Floats.BYTES * (int) exactCount; } else { return Shorts.BYTES + 1 + Floats.BYTES * (int) exactCount + 1 + Floats.BYTES * (int) (count - exactCount) + Floats.BYTES * 2; } } public int getMaxStorageSize() { return getDenseStorageSize(); } /** * Returns the minimum number of bytes required to store this ApproximateHistogram object * * @return required number of bytes */ public int getMinStorageSize() { // sparse is always small than dense, so no need to check if (canStoreCompact() && getCompactStorageSize() < getSparseStorageSize()) { return getCompactStorageSize(); } else { return getSparseStorageSize(); } } /** * Checks whether this approximate histogram can be stored in a compact form * * @return true if yes, false otherwise */ public boolean canStoreCompact() { final long exactCount = getExactCount(); return (size <= Short.MAX_VALUE && exactCount <= Byte.MAX_VALUE && (count - exactCount) <= Byte.MAX_VALUE); } /** * Writes the representation of this ApproximateHistogram object to the given byte-buffer * * @param buf ByteBuffer to write the ApproximateHistogram to */ public void toBytes(ByteBuffer buf) { if (canStoreCompact() && getCompactStorageSize() < getSparseStorageSize()) { // store compact toBytesCompact(buf); } else { // store sparse toBytesSparse(buf); } } /** * Writes the dense representation of this ApproximateHistogram object to the given byte-buffer * * Requires 16 + 12 * size bytes of 
storage * * @param buf ByteBuffer to write the ApproximateHistogram to */ public void toBytesDense(ByteBuffer buf) { buf.putInt(size); buf.putInt(binCount); buf.asFloatBuffer().put(positions); buf.position(buf.position() + Floats.BYTES * positions.length); buf.asLongBuffer().put(bins); buf.position(buf.position() + Longs.BYTES * bins.length); buf.putFloat(min); buf.putFloat(max); } /** * Writes the sparse representation of this ApproximateHistogram object to the given byte-buffer * * Requires 16 + 12 * binCount bytes of storage * * @param buf ByteBuffer to write the ApproximateHistogram to */ public void toBytesSparse(ByteBuffer buf) { buf.putInt(size); buf.putInt(-1 * binCount); // use negative binCount to indicate sparse storage for (int i = 0; i < binCount; ++i) { buf.putFloat(positions[i]); } for (int i = 0; i < binCount; ++i) { buf.putLong(bins[i]); } buf.putFloat(min); buf.putFloat(max); } /** * Returns a compact byte-buffer representation of this ApproximateHistogram object * storing actual values as opposed to histogram bins * * Requires 3 + 4 * count bytes of storage with count <= 127 * * @param buf ByteBuffer to write the ApproximateHistogram to */ public void toBytesCompact(ByteBuffer buf) { Preconditions.checkState(canStoreCompact(), "Approximate histogram cannot be stored in compact form"); buf.putShort((short) (-1 * size)); // use negative size to indicate compact storage final long exactCount = getExactCount(); if (exactCount != count) { // use negative count to indicate approximate bins buf.put((byte) (-1 * (count - exactCount))); // store actual values instead of bins for (int i = 0; i < binCount; ++i) { // repeat each value bins[i] times for approximate bins if ((bins[i] & APPROX_FLAG_BIT) != 0) { for (int k = 0; k < (bins[i] & COUNT_BITS); ++k) { buf.putFloat(positions[i]); } } } // tack on min and max since they may be lost int the approximate bins buf.putFloat(min); buf.putFloat(max); } buf.put((byte) exactCount); // store actual values instead 
of bins for (int i = 0; i < binCount; ++i) { // repeat each value bins[i] times for exact bins if ((bins[i] & APPROX_FLAG_BIT) == 0) { for (int k = 0; k < (bins[i] & COUNT_BITS); ++k) { buf.putFloat(positions[i]); } } } } /** * Constructs an Approximate Histogram object from the given byte-array representation * * @param bytes byte array to construct an ApproximateHistogram from * * @return ApproximateHistogram constructed from the given byte array */ public static ApproximateHistogram fromBytes(byte[] bytes) { ByteBuffer buf = ByteBuffer.wrap(bytes); return fromBytes(buf); } /** * Constructs an ApproximateHistogram object from the given dense byte-buffer representation * * @param buf ByteBuffer to construct an ApproximateHistogram from * * @return ApproximateHistogram constructed from the given ByteBuffer */ public static ApproximateHistogram fromBytesDense(ByteBuffer buf) { int size = buf.getInt(); int binCount = buf.getInt(); float[] positions = new float[size]; long[] bins = new long[size]; buf.asFloatBuffer().get(positions); buf.position(buf.position() + Floats.BYTES * positions.length); buf.asLongBuffer().get(bins); buf.position(buf.position() + Longs.BYTES * bins.length); float min = buf.getFloat(); float max = buf.getFloat(); return new ApproximateHistogram(binCount, positions, bins, min, max); } /** * Constructs an ApproximateHistogram object from the given dense byte-buffer representation * * @param buf ByteBuffer to construct an ApproximateHistogram from * * @return ApproximateHistogram constructed from the given ByteBuffer */ public static ApproximateHistogram fromBytesSparse(ByteBuffer buf) { int size = buf.getInt(); int binCount = -1 * buf.getInt(); float[] positions = new float[size]; long[] bins = new long[size]; for (int i = 0; i < binCount; ++i) { positions[i] = buf.getFloat(); } for (int i = 0; i < binCount; ++i) { bins[i] = buf.getLong(); } float min = buf.getFloat(); float max = buf.getFloat(); return new ApproximateHistogram(binCount, 
positions, bins, min, max); } /** * Constructs an ApproximateHistogram object from the given compact byte-buffer representation * * @param buf ByteBuffer to construct an ApproximateHistogram from * * @return ApproximateHistogram constructed from the given ByteBuffer */ public static ApproximateHistogram fromBytesCompact(ByteBuffer buf) { short size = (short) (-1 * buf.getShort()); byte count = buf.get(); if (count >= 0) { // only exact bins ApproximateHistogram histogram = new ApproximateHistogram(size); for (int i = 0; i < count; ++i) { histogram.offer(buf.getFloat()); } return histogram; } else { byte approxCount = (byte) (-1 * count); Map<Float, Long> approx = Maps.newHashMap(); for (int i = 0; i < approxCount; ++i) { final float value = buf.getFloat(); if (approx.containsKey(value)) { approx.put(value, approx.get(value) + 1); } else { approx.put(value, 1L); } } float min = buf.getFloat(); float max = buf.getFloat(); byte exactCount = buf.get(); Map<Float, Long> exact = Maps.newHashMap(); for (int i = 0; i < exactCount; ++i) { final float value = buf.getFloat(); if (exact.containsKey(value)) { exact.put(value, exact.get(value) + 1); } else { exact.put(value, 1L); } } int binCount = exact.size() + approx.size(); List<Float> pos = Lists.newArrayList(); pos.addAll(exact.keySet()); pos.addAll(approx.keySet()); Collections.sort(pos); float[] positions = new float[size]; long[] bins = new long[size]; for (int i = 0; i < pos.size(); ++i) { positions[i] = pos.get(i); } for (int i = 0; i < pos.size(); ++i) { final float value = pos.get(i); if (exact.containsKey(value)) { bins[i] = exact.get(value); } else { bins[i] = approx.get(value) | APPROX_FLAG_BIT; } } return new ApproximateHistogram(binCount, positions, bins, min, max); } } /** * Constructs an ApproximateHistogram object from the given byte-buffer representation * * @param buf ByteBuffer to construct an ApproximateHistogram from * * @return ApproximateHistogram constructed from the given ByteBuffer */ public static 
ApproximateHistogram fromBytes(ByteBuffer buf) { ByteBuffer copy = buf.asReadOnlyBuffer(); // negative size indicates compact representation // this works regardless of whether we use int or short for the size since the leftmost bit is the sign bit if (copy.getShort(buf.position()) < 0) { return fromBytesCompact(buf); } else { // ignore size copy.getInt(); // determine if sparse or dense based on sign of binCount if (copy.getInt() < 0) { return fromBytesSparse(buf); } else { return fromBytesDense(buf); } } } /** * Returns the approximate number of items less than or equal to b in the histogram * * @param b the cutoff * * @return the approximate number of items less than or equal to b */ public double sum(final float b) { if (b < min) { return 0; } if (b >= max) { return count; } int index = Arrays.binarySearch(positions, 0, binCount, b); boolean exactMatch = index >= 0; index = exactMatch ? index : -(index + 1); // we want positions[index] <= b < positions[index+1] if (!exactMatch) { index--; } final boolean outerLeft = index < 0; final boolean outerRight = index >= (binCount - 1); final long m0 = outerLeft ? 0 : (bins[index] & COUNT_BITS); final long m1 = outerRight ? 0 : (bins[index + 1] & COUNT_BITS); final double p0 = outerLeft ? min : positions[index]; final double p1 = outerRight ? max : positions[index + 1]; final boolean exact0 = (!outerLeft && (bins[index] & APPROX_FLAG_BIT) == 0); final boolean exact1 = (!outerRight && (bins[index + 1] & APPROX_FLAG_BIT) == 0); // handle case when p0 = p1, which happens if the first bin = min or the last bin = max final double l = (p1 == p0) ? 
0 : (b - p0) / (p1 - p0); // don't include exact counts in the trapezoid calculation long tm0 = m0; long tm1 = m1; if (exact0) { tm0 = 0; } if (exact1) { tm1 = 0; } final double mb = tm0 + (tm1 - tm0) * l; double s = 0.5 * (tm0 + mb) * l; for (int i = 0; i < index; ++i) { s += (bins[i] & COUNT_BITS); } // add full bin count if left bin count is exact if (exact0) { return (s + m0); } // otherwise add only the left half of the bin else { return (s + 0.5 * m0); } } /** * Returns the approximate quantiles corresponding to the given probabilities. * probabilities = [.5f] returns [median] * probabilities = [.25f, .5f, .75f] returns the quartiles, [25%ile, median, 75%ile] * * @param probabilities array of probabilities * * @return an array of length probabilities.length representing the the approximate sample quantiles * corresponding to the given probabilities */ public float[] getQuantiles(float[] probabilities) { for (float p : probabilities) { Preconditions.checkArgument(0 < p & p < 1, "quantile probabilities must be strictly between 0 and 1"); } float[] quantiles = new float[probabilities.length]; Arrays.fill(quantiles, Float.NaN); if (this.count() == 0) { return quantiles; } final long[] bins = this.bins(); for (int j = 0; j < probabilities.length; ++j) { final double s = probabilities[j] * this.count(); int i = 0; int sum = 0; int k = 1; long count = 0; while (k <= this.binCount()) { count = bins[k - 1]; if (sum + count > s) { i = k - 1; break; } else { sum += count; } ++k; } if (i == 0) { quantiles[j] = this.min(); } else { final double d = s - sum; final double c = -2 * d; final long a = bins[i] - bins[i - 1]; final long b = 2 * bins[i - 1]; double z = 0; if (a == 0) { z = -c / b; } else { z = (-b + Math.sqrt(b * b - 4 * a * c)) / (2 * a); } final double uj = this.positions[i - 1] + (this.positions[i] - this.positions[i - 1]) * z; quantiles[j] = (float) uj; } } return quantiles; } /** * Computes a visual representation of the approximate histogram with bins laid 
out according to the given breaks
   *
   * @param breaks breaks defining the histogram bins; must contain at least one element
   *
   * @return visual representation of the histogram
   *
   * @throws IllegalArgumentException if breaks is empty
   */
  public Histogram toHistogram(final float[] breaks)
  {
    // an empty array would otherwise fail below with a NegativeArraySizeException
    Preconditions.checkArgument(breaks.length > 0, "histogram breaks must not be empty");

    final double[] approximateBins = new double[breaks.length - 1];

    // each bin holds the difference between the cumulative sums at its two breaks
    double prev = sum(breaks[0]);
    for (int i = 1; i < breaks.length; ++i) {
      double s = sum(breaks[i]);
      // the difference is narrowed to float precision before being stored
      approximateBins[i - 1] = (float) (s - prev);
      prev = s;
    }

    return new Histogram(breaks, approximateBins);
  }

  /**
   * Computes a visual representation of the approximate histogram with a given number of equal-sized bins
   *
   * @param size number of equal-sized bins to divide the histogram into
   *
   * @return visual representation of the histogram
   *
   * @throws IllegalArgumentException if size is not greater than 1
   */
  public Histogram toHistogram(int size)
  {
    Preconditions.checkArgument(size > 1, "histogram size must be greater than 1");

    float[] breaks = new float[size + 1];
    float delta = (max - min) / (size - 1);

    // the first break sits one bin width below min; the last break is pinned to max
    breaks[0] = min - delta;
    for (int i = 1; i < breaks.length - 1; ++i) {
      breaks[i] = breaks[i - 1] + delta;
    }
    breaks[breaks.length - 1] = max;

    return toHistogram(breaks);
  }

  /** * Computes a visual representation given an initial breakpoint, offset, and a bucket size. 
*
   * Breaks are aligned to offset + k * bucketSize. Only buckets whose approximate count
   * exceeds a cutoff of 0.1 are kept.
   *
   * @param bucketSize the size of each bucket; must be strictly positive
   * @param offset     the location of one breakpoint
   *
   * @return visual representation of the histogram
   *
   * @throws IllegalArgumentException  if bucketSize is not strictly positive
   * @throws IndexOutOfBoundsException if no bucket exceeds the cutoff, so that no break was collected
   */
  public Histogram toHistogram(final float bucketSize, final float offset)
  {
    // a negative bucketSize can keep the loop below from terminating; zero or NaN yields NaN breaks
    Preconditions.checkArgument(bucketSize > 0, "bucketSize must be positive, got %s", bucketSize);

    // snap min / lowerLimit down and max / upperLimit up to the bucket grid
    final float minFloor = (float) Math.floor((min() - offset) / bucketSize) * bucketSize + offset;
    final float lowerLimitFloor = (float) Math.floor((lowerLimit - offset) / bucketSize) * bucketSize + offset;
    final float firstBreak = Math.max(minFloor, lowerLimitFloor);

    final float maxCeil = (float) Math.ceil((max() - offset) / bucketSize) * bucketSize + offset;
    final float upperLimitCeil = (float) Math.ceil((upperLimit - offset) / bucketSize) * bucketSize + offset;
    final float lastBreak = Math.min(maxCeil, upperLimitCeil);

    final float cutoff = 0.1f;

    final List<Float> breaks = new ArrayList<Float>();

    // to deal with left inclusivity when the min is the same as a break
    final float bottomBreak = minFloor - bucketSize;
    if (bottomBreak != firstBreak && (sum(firstBreak) - sum(bottomBreak) > cutoff)) {
      breaks.add(bottomBreak);
    }

    float left = firstBreak;
    // true while the previous bucket was kept, i.e. its right edge is already in breaks
    boolean leftSet = false;

    //the + bucketSize / 10 is because floating point addition is always slightly incorrect and so we need to account for that
    while (left + bucketSize <= lastBreak + (bucketSize / 10)) {
      final float right = left + bucketSize;

      if (sum(right) - sum(left) > cutoff) {
        if (!leftSet) {
          breaks.add(left);
        }
        breaks.add(right);
        leftSet = true;
      } else {
        leftSet = false;
      }

      left = right;
    }

    // NOTE: when nothing was collected above, this lookup throws IndexOutOfBoundsException
    final float lastAdded = breaks.get(breaks.size() - 1);
    if (lastAdded != maxCeil && (sum(maxCeil) - sum(lastAdded) > cutoff)) {
      breaks.add(maxCeil);
    }

    return toHistogram(Floats.toArray(breaks));
  }
}