com.addthis.hydra.data.tree.prop.DataReservoir.java Source code

Java tutorial

Introduction

Here is the source code for com.addthis.hydra.data.tree.prop.DataReservoir.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.data.tree.prop;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import java.math.RoundingMode;

import com.addthis.basis.util.Varint;

import com.addthis.bundle.core.BundleField;
import com.addthis.bundle.value.ValueObject;
import com.addthis.codec.Codec;
import com.addthis.hydra.data.tree.DataTreeNode;
import com.addthis.hydra.data.tree.DataTreeNodeUpdater;
import com.addthis.hydra.data.tree.TreeDataParameters;
import com.addthis.hydra.data.tree.TreeNodeData;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.math.DoubleMath;

import org.apache.commons.math3.distribution.ExponentialDistribution;
import org.apache.commons.math3.distribution.GammaDistribution;
import org.apache.commons.math3.distribution.NormalDistribution;
import org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.netty.buffer.ByteBuf;
import io.netty.buffer.PooledByteBufAllocator;
import io.netty.buffer.Unpooled;

/**
 * Tree node data attachment that keeps a sliding window of per-epoch counters
 * and answers anomaly-detection queries against that window.
 *
 * <p>The counters live in {@link #reservoir}; slot {@code i} holds the count for
 * epoch {@code minEpoch + i}. See {@link Config} for the user-facing description.
 */
public class DataReservoir extends TreeNodeData<DataReservoir.Config> implements Codec.BytesCodable {

    private static final Logger log = LoggerFactory.getLogger(DataReservoir.class);

    private static final ImmutableList<DataTreeNode> EMPTY_LIST = ImmutableList.<DataTreeNode>of();

    private static final byte[] EMPTY_BYTES = new byte[0];

    /**
     * This data attachment <span class="hydra-summary">keeps circular buffer
     * of N counters</span>.
     *
     * <p>The numbers of buckets that are stored is determined by the {@link #size}
     * parameter. The value stored in the {@link #epochField} bundle field determines
     * the current epoch. The counter stored within this epoch is incremented. Older
     * epochs are dropped as newer epochs are encountered.
     *
     * <p>The data attachment is queried with the notation {@code /+%name=epoch=N~sigma=N.N~obs=N}.
     * Epoch determines the epoch to be tested. sigma is the number of standard deviations
     * to use as a threshold. obs specifies how many previous observations to use. All these
     * fields are required. Specifying min=N is an optional parameter for a minimum number
     * of observations that must be detected. The output returned is of the form
     * {@code /delta:+hits/measurement:+hits/mean:+hits/stddev:+hits/threshold:+hits}.
     *
     * <p>Further optional query keys: {@code raw=true} appends an "observations" node
     * with the raw counters, {@code double=true} emits floating point values as their
     * raw long bits instead of rounding them, and {@code mode=modelfit} together with
     * {@code percentile=N} selects the model fitting detector instead of the default
     * {@code mode=sigma}.
     *
     * @user-reference
     * @hydra-name reservoir
     */
    public static final class Config extends TreeDataParameters<DataReservoir> {

        /**
         * Bundle field name from which to draw the epoch.
         * This field is required.
         */
        @Codec.Set(codable = true, required = true)
        private String epochField;

        /**
         * Size of the reservoir. This field is required.
         */
        @Codec.Set(codable = true, required = true)
        private int size;

        @Override
        public DataReservoir newInstance() {
            return new DataReservoir();
        }
    }

    /**
     * Incremental (Welford) accumulator for the mean and the population
     * standard deviation of a sequence of integer observations.
     */
    private static final class RunningStats {

        private int count;
        private double mean;
        private double m2;

        /** Fold one observation into the running mean and sum of squared deviations. */
        void add(int value) {
            count++;
            double delta = value - mean;
            mean += delta / count;
            m2 += delta * (value - mean);
        }

        double mean() {
            return mean;
        }

        /** Population standard deviation; defined as zero for fewer than two observations. */
        double stddev() {
            return (count < 2) ? 0.0 : Math.sqrt(m2 / count);
        }
    }

    /**
     * Per-epoch counters. Slot {@code i} stores the count of epoch {@code minEpoch + i}.
     * Remains {@code null} until the first update or a non-empty decode.
     */
    @Codec.Set(codable = true, required = true)
    private int[] reservoir;

    /**
     * The minEpoch is a monotonically increasing value.
     * An increase in this value is associated with the elimination
     * of state from older epochs. All effort is made to increment the
     * value as little as possible.
     */
    @Codec.Set(codable = true, required = true)
    private long minEpoch;

    /** Lazily resolved accessor for the configured epoch field. */
    private BundleField keyAccess;

    /**
     * Resize the reservoir to the new size.
     * If the reservoir has not yet been allocated then
     * construct it. If the requested length is smaller
     * than the reservoir then discard the oldest values.
     * If the requested length is larger than the reservoir
     * then allocate additional space for the reservoir.
     *
     * @param newsize new size of the reservoir
     */
    private void resize(int newsize) {
        if (reservoir == null) {
            reservoir = new int[newsize];
        } else if (reservoir.length < newsize) {
            // Growing: existing slots keep their position, new (newer) slots start at zero.
            reservoir = Arrays.copyOf(reservoir, newsize);
        } else if (reservoir.length > newsize) {
            // Shrinking: keep the newest values and advance minEpoch past the dropped ones.
            int dropped = reservoir.length - newsize;
            reservoir = Arrays.copyOfRange(reservoir, dropped, reservoir.length);
            minEpoch += dropped;
        }
    }

    /**
     * Shift the minimum epoch to accommodate
     * the new epoch. If the epoch is less than the minimum
     * epoch then do nothing. If the epoch falls within the boundary
     * of the reservoir then do nothing. If the epoch is farther
     * away than one length away from the maximum epoch, then empty
     * out the reservoir and set the maximum epoch to the target
     * epoch. Otherwise shift the reservoir to accommodate the new
     * epoch.
     *
     * @param epoch new epoch to accommodate
     */
    private void shift(long epoch) {
        int length = reservoir.length;
        long delta = epoch - minEpoch;
        if (delta < length) {
            return;
        }
        // Widen to long so that very large reservoirs cannot overflow the comparison.
        if (delta > 2L * (length - 1)) {
            Arrays.fill(reservoir, 0);
            minEpoch = epoch - (length - 1);
        } else {
            int shift = (int) (delta - length + 1);
            System.arraycopy(reservoir, shift, reservoir, 0, length - shift);
            Arrays.fill(reservoir, length - shift, length, 0);
            minEpoch += shift;
        }
    }

    /**
     * Add {@code increment} to {@code current}, clamping the result to the
     * {@code int} range instead of silently wrapping around.
     */
    private static int saturatedAdd(int current, long increment) {
        long sum = current + increment;
        // Same sign on both operands but a different sign on the result means the long overflowed.
        if ((((long) current ^ sum) & (increment ^ sum)) < 0) {
            return (increment < 0) ? Integer.MIN_VALUE : Integer.MAX_VALUE;
        }
        return (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, sum));
    }

    /**
     * Insert the new epoch. Assumes that {@link #shift(long epoch)}
     * has previously been invoked. Epochs older than the minimum epoch
     * are ignored. The counter saturates at the {@code int} bounds.
     *
     * @param epoch new epoch to insert
     * @param count amount to add to the counter of that epoch
     */
    private void update(long epoch, long count) {
        if (epoch >= minEpoch) {
            int slot = (int) (epoch - minEpoch);
            reservoir[slot] = saturatedAdd(reservoir[slot], count);
        }
    }

    /**
     * Update the reservoir with the input epoch and a value of one.
     *
     * @param epoch input time period
     * @param size alters the capacity of the reservoir
     */
    @VisibleForTesting
    void updateReservoir(long epoch, int size) {
        updateReservoir(epoch, size, 1);
    }

    /**
     * Update the reservoir with the input epoch and specified additional count.
     *
     * @param epoch input time period
     * @param size alters the capacity of the reservoir
     * @param count amount to increment the time period
     */
    @VisibleForTesting
    void updateReservoir(long epoch, int size, long count) {
        resize(size);
        shift(epoch);
        update(epoch, count);
    }

    /**
     * Return the count associated with the input epoch,
     * or an error value if the input is out of bounds.
     *
     * @param epoch target epoch
     * @return the non-negative count or -1 if input is less
     *         than minimum epoch or -2 if input is greater
     *         than maximum epoch or -3 if the data structure
     *         has not been initialized.
     */
    @VisibleForTesting
    int retrieveCount(long epoch) {
        if (reservoir == null) {
            return -3;
        } else if (epoch < minEpoch) {
            return -1;
        } else if (epoch >= (minEpoch + reservoir.length)) {
            return -2;
        } else {
            return reservoir[(int) (epoch - minEpoch)];
        }
    }

    /**
     * Read the epoch from the configured bundle field and increment its counter by one.
     *
     * @return true if the reservoir was updated, false if the field was absent
     *         or its value could not be processed (the failure is logged)
     */
    @Override
    public boolean updateChildData(DataTreeNodeUpdater state, DataTreeNode childNode, DataReservoir.Config conf) {
        if (keyAccess == null) {
            keyAccess = state.getBundle().getFormat().getField(conf.epochField);
        }
        ValueObject val = state.getBundle().getValue(keyAccess);
        if (val != null) {
            try {
                long epoch = val.asLong().getLong();
                updateReservoir(epoch, conf.size);
                return true;
            } catch (Exception ex) {
                // Best effort: a bad value must not abort processing of the bundle.
                log.error("Error trying to insert {} into reservoir: ", val, ex);
            }
        }
        return false;
    }

    @Override
    public ValueObject getValue(String key) {
        return null;
    }

    /**
     * Helper method for {@link #getNodes(com.addthis.hydra.data.tree.DataTreeNode, String)}
     * that appends an "observations" node holding the raw counters.
     *
     * <p>The children are, in order: the target epoch, {@code numObservations}
     * preceding epochs (zero for epochs older than the reservoir) and a final
     * "minEpoch" node. A target epoch outside of the reservoir falls back to the
     * newest epoch; a negative or too large {@code numObservations} falls back to
     * the reservoir length minus one. Nothing is added when the reservoir is
     * unallocated or empty.
     */
    private void addRawObservations(List<DataTreeNode> result, long targetEpoch, int numObservations) {
        if (reservoir == null || reservoir.length == 0) {
            return;
        }
        int lastIndex = reservoir.length - 1;
        if (targetEpoch < 0 || targetEpoch < minEpoch || targetEpoch > minEpoch + lastIndex) {
            targetEpoch = minEpoch + lastIndex;
        }
        if (numObservations < 0 || numObservations > lastIndex) {
            numObservations = lastIndex;
        }

        int targetIndex = (int) (targetEpoch - minEpoch);

        /**
         * numObservations elements for the historical value.
         * Add one element to store for the target epoch.
         * Add one element to store the "minEpoch" node.
         */
        VirtualTreeNode[] children = new VirtualTreeNode[numObservations + 2];
        children[0] = new VirtualTreeNode(Long.toString(targetEpoch), reservoir[targetIndex]);
        for (int slot = 1; slot <= numObservations; slot++) {
            int index = targetIndex - slot;
            // Epochs that precede the reservoir are reported with a count of zero.
            long value = (index >= 0) ? reservoir[index] : 0;
            children[slot] = new VirtualTreeNode(Long.toString(minEpoch + index), value);
        }
        children[numObservations + 1] = new VirtualTreeNode("minEpoch", minEpoch);
        result.add(new VirtualTreeNode("observations", 1, children));
    }

    /**
     * Either generate some nodes for debugging purposes or
     * return an empty list.
     *
     * @param raw if true then generate debugging nodes
     * @return list of nodes; empty when raw is false or there is no reservoir to report
     */
    private List<DataTreeNode> makeDefaultNodes(boolean raw, long targetEpoch, int numObservations) {
        if (!raw || reservoir == null || reservoir.length == 0) {
            return EMPTY_LIST;
        }
        List<DataTreeNode> result = new ArrayList<>();
        addRawObservations(result, targetEpoch, numObservations);
        return result;
    }

    /**
     * Convenience method to convert a node into an array of size one.
     */
    private static VirtualTreeNode[] generateSingletonArray(VirtualTreeNode value) {
        return new VirtualTreeNode[]{value};
    }

    /**
     * Create a node with the given name and value whose only child is {@code child}.
     */
    private static VirtualTreeNode wrap(String name, long value, VirtualTreeNode child) {
        return new VirtualTreeNode(name, value, generateSingletonArray(child));
    }

    /**
     * Convert a floating point result into the long stored on a node: either its
     * raw IEEE-754 bits or the value rounded half-up.
     */
    private static long generateValue(double value, boolean doubleToLongBits) {
        if (doubleToLongBits) {
            return Double.doubleToLongBits(value);
        } else {
            return DoubleMath.roundToLong(value, RoundingMode.HALF_UP);
        }
    }

    /**
     * Parse a query of the form {@code key=value~key=value...} and run the
     * selected anomaly detector. Segments that are not a single {@code key=value}
     * pair are ignored.
     *
     * @return the detector output, or null if {@code key} is null
     * @throws IllegalArgumentException on an unknown key or mode
     *         (including {@link NumberFormatException} for malformed numbers)
     */
    @Override
    public List<DataTreeNode> getNodes(DataTreeNode parent, String key) {
        long targetEpoch = -1;
        int numObservations = -1;
        double sigma = Double.POSITIVE_INFINITY;
        int percentile = 0;
        boolean doubleToLongBits = false;
        int minMeasurement = Integer.MIN_VALUE;
        boolean raw = false;
        String mode = "sigma";
        if (key == null) {
            return null;
        }
        String[] kvpairs = key.split("~");
        for (String kvpair : kvpairs) {
            String[] kv = kvpair.split("=");
            if (kv.length == 2) {
                String kvkey = kv[0];
                String kvvalue = kv[1];
                switch (kvkey) {
                case "double":
                    doubleToLongBits = Boolean.parseBoolean(kvvalue);
                    break;
                case "epoch":
                    targetEpoch = Long.parseLong(kvvalue);
                    break;
                case "sigma":
                    sigma = Double.parseDouble(kvvalue);
                    break;
                case "min":
                    minMeasurement = Integer.parseInt(kvvalue);
                    break;
                case "obs":
                    numObservations = Integer.parseInt(kvvalue);
                    break;
                case "raw":
                    raw = Boolean.parseBoolean(kvvalue);
                    break;
                case "percentile":
                    percentile = Integer.parseInt(kvvalue);
                    break;
                case "mode":
                    mode = kvvalue;
                    break;
                default:
                    throw new IllegalArgumentException("Unknown key " + kvkey);
                }
            }
        }
        switch (mode) {
        case "sigma":
            return sigmaAnomalyDetection(targetEpoch, numObservations, doubleToLongBits, raw, sigma,
                    minMeasurement);
        case "modelfit":
            return modelFitAnomalyDetection(targetEpoch, numObservations, doubleToLongBits, raw, percentile);
        default:
            throw new IllegalArgumentException("Unknown mode type '" + mode + "'");
        }
    }

    /**
     * Test whether a detector can be evaluated: the reservoir exists, the target
     * epoch lies inside it and the requested history is positive and no longer
     * than the reservoir length minus one.
     */
    private boolean isQueryable(long targetEpoch, int numObservations) {
        return targetEpoch >= 0
               && numObservations > 0
               && reservoir != null
               && targetEpoch >= minEpoch
               && targetEpoch < minEpoch + reservoir.length
               && numObservations <= (reservoir.length - 1);
    }

    /**
     * Collect the {@code numObservations} counters that precede {@code targetIndex},
     * newest first. Positions that would fall before the start of the reservoir
     * are left at zero.
     */
    private int[] historyWindow(int targetIndex, int numObservations) {
        int[] window = new int[numObservations];
        int available = Math.min(numObservations, targetIndex);
        for (int i = 0; i < available; i++) {
            window[i] = reservoir[targetIndex - 1 - i];
        }
        return window;
    }

    private static void updateFrequencies(Map<Integer, Integer> frequencies, int value) {
        Integer count = frequencies.get(value);
        if (count == null) {
            count = 0;
        }
        frequencies.put(value, count + 1);
    }

    /**
     * Return the most frequent key; ties are resolved in favor of the larger key.
     * Returns -1 for an empty map.
     */
    private static int mostFrequent(Map<Integer, Integer> frequencies) {
        int mode = -1;
        int modeCount = -1;
        for (Map.Entry<Integer, Integer> entry : frequencies.entrySet()) {
            int candidate = entry.getKey();
            int occurrences = entry.getValue();
            if (occurrences > modeCount || (occurrences == modeCount && candidate > mode)) {
                mode = candidate;
                modeCount = occurrences;
            }
        }
        return mode;
    }

    /**
     * Probability mass that a normal distribution with the given parameters
     * assigns to values at or below zero.
     */
    private static double gaussianNegativeProbability(double mean, double stddev) {
        NormalDistribution distribution = new NormalDistribution(mean, stddev);
        return distribution.cumulativeProbability(0.0);
    }

    /**
     * Compute the model-fit threshold. A percentile of zero yields -1 so that any
     * non-negative measurement exceeds it. Degenerate samples use the mean itself.
     * Otherwise a normal distribution (mean above one) or an exponential
     * distribution is fitted and the requested percentile of the probability
     * mass above 1.0 is inverted.
     */
    private static double modelFitThreshold(double mean, double stddev, int percentile) {
        if (percentile == 0) {
            return -1.0;
        } else if (mean == 0.0) {
            return 0.0;
        } else if (stddev == 0.0) {
            return mean;
        } else if (mean > 1.0) {
            NormalDistribution distribution = new NormalDistribution(mean, stddev);
            double badProbability = distribution.cumulativeProbability(1.0);
            double goodProbability = badProbability + (1.0 - badProbability) * (percentile / 100.0);
            return distribution.inverseCumulativeProbability(goodProbability);
        } else {
            ExponentialDistribution distribution = new ExponentialDistribution(mean);
            double badProbability = distribution.cumulativeProbability(1.0);
            double goodProbability = badProbability + (1.0 - badProbability) * (percentile / 100.0);
            return distribution.inverseCumulativeProbability(goodProbability);
        }
    }

    /**
     * Model fitting detector: flags the target epoch when its count exceeds the
     * threshold derived from a distribution fitted to the preceding observations.
     *
     * @return on an anomaly a chain
     *         {@code delta/measurement/mean/stddev/mode/gaussianNegative} (plus the raw
     *         observations if requested); otherwise the default nodes, i.e. the raw
     *         observations if requested or an empty list
     */
    @VisibleForTesting
    List<DataTreeNode> modelFitAnomalyDetection(long targetEpoch, int numObservations, boolean doubleToLongBits,
            boolean raw, int percentile) {
        if (!isQueryable(targetEpoch, numObservations)) {
            return makeDefaultNodes(raw, targetEpoch, numObservations);
        }

        int targetIndex = (int) (targetEpoch - minEpoch);
        int measurement = reservoir[targetIndex];

        RunningStats stats = new RunningStats();
        Map<Integer, Integer> frequencies = new HashMap<>();
        for (int value : historyWindow(targetIndex, numObservations)) {
            stats.add(value);
            updateFrequencies(frequencies, value);
        }

        double mean = stats.mean();
        double stddev = stats.stddev();
        int mode = mostFrequent(frequencies);

        // -1 marks "not applicable" when no gaussian can be fitted.
        double gaussianNegative = -1.0;
        if (mean > 0.0 && stddev > 0.0) {
            gaussianNegative = gaussianNegativeProbability(mean, stddev);
        }

        double threshold = modelFitThreshold(mean, stddev, percentile);

        if (measurement > threshold) {
            List<DataTreeNode> result = new ArrayList<>();
            VirtualTreeNode chain = new VirtualTreeNode("gaussianNegative",
                    generateValue(gaussianNegative, doubleToLongBits));
            chain = wrap("mode", mode, chain);
            chain = wrap("stddev", generateValue(stddev, doubleToLongBits), chain);
            chain = wrap("mean", generateValue(mean, doubleToLongBits), chain);
            chain = wrap("measurement", measurement, chain);
            chain = wrap("delta", generateValue(measurement - threshold, doubleToLongBits), chain);
            result.add(chain);
            if (raw) {
                addRawObservations(result, targetEpoch, numObservations);
            }
            return result;
        } else {
            // Return (not merely compute) the default nodes so raw observations are not lost.
            return makeDefaultNodes(raw, targetEpoch, numObservations);
        }
    }

    /**
     * Sigma detector: flags the target epoch when its count is at least
     * {@code mean + sigma * stddev} of the preceding observations and at least
     * {@code minMeasurement}.
     *
     * @return on an anomaly a chain {@code delta/measurement/mean/stddev/threshold}
     *         (plus the raw observations if requested); otherwise the default nodes
     */
    private List<DataTreeNode> sigmaAnomalyDetection(long targetEpoch, int numObservations,
            boolean doubleToLongBits, boolean raw, double sigma, int minMeasurement) {
        // An infinite sigma is the "not specified" default from getNodes.
        if (sigma == Double.POSITIVE_INFINITY || !isQueryable(targetEpoch, numObservations)) {
            return makeDefaultNodes(raw, targetEpoch, numObservations);
        }

        int targetIndex = (int) (targetEpoch - minEpoch);
        int measurement = reservoir[targetIndex];

        RunningStats stats = new RunningStats();
        for (int value : historyWindow(targetIndex, numObservations)) {
            stats.add(value);
        }

        double mean = stats.mean();
        double stddev = stats.stddev();
        double delta = (measurement - (sigma * stddev + mean));

        if (delta >= 0 && measurement >= minMeasurement) {
            List<DataTreeNode> result = new ArrayList<>();
            VirtualTreeNode chain = new VirtualTreeNode("threshold",
                    generateValue(sigma * stddev + mean, doubleToLongBits));
            chain = wrap("stddev", generateValue(stddev, doubleToLongBits), chain);
            chain = wrap("mean", generateValue(mean, doubleToLongBits), chain);
            chain = wrap("measurement", measurement, chain);
            chain = wrap("delta", generateValue(delta, doubleToLongBits), chain);
            result.add(chain);
            if (raw) {
                addRawObservations(result, targetEpoch, numObservations);
            }
            return result;
        } else {
            return makeDefaultNodes(raw, targetEpoch, numObservations);
        }
    }

    /**
     * Serialize as: minEpoch (unsigned varlong), reservoir length (unsigned varint),
     * then each counter (unsigned varint). An unallocated reservoir is encoded as
     * an empty array.
     */
    @Override
    public byte[] bytesEncode(long version) {
        if (reservoir == null) {
            return EMPTY_BYTES;
        }
        ByteBuf byteBuf = PooledByteBufAllocator.DEFAULT.buffer();
        try {
            Varint.writeUnsignedVarLong(minEpoch, byteBuf);
            Varint.writeUnsignedVarInt(reservoir.length, byteBuf);
            for (int element : reservoir) {
                Varint.writeUnsignedVarInt(element, byteBuf);
            }
            byte[] retBytes = new byte[byteBuf.readableBytes()];
            byteBuf.readBytes(retBytes);
            return retBytes;
        } finally {
            // Always hand the pooled buffer back to the allocator.
            byteBuf.release();
        }
    }

    /**
     * Inverse of {@link #bytesEncode(long)}. An empty array leaves the
     * instance untouched (reservoir stays unallocated).
     */
    @Override
    public void bytesDecode(byte[] b, long version) {
        if (b.length == 0) {
            return;
        }
        ByteBuf byteBuf = Unpooled.wrappedBuffer(b);
        try {
            minEpoch = Varint.readUnsignedVarLong(byteBuf);
            int length = Varint.readUnsignedVarInt(byteBuf);
            reservoir = new int[length];
            for (int i = 0; i < reservoir.length; i++) {
                reservoir[i] = Varint.readUnsignedVarInt(byteBuf);
            }
        } finally {
            byteBuf.release();
        }
    }
}