io.divolte.server.ShortTermDuplicateMemory.java Source code

Introduction

Here is the source code for io.divolte.server.ShortTermDuplicateMemory.java
Source

/*
 * Copyright 2014 GoDataDriven B.V.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.divolte.server;

import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;

import javax.annotation.ParametersAreNonnullByDefault;
import javax.annotation.concurrent.NotThreadSafe;
import java.nio.charset.StandardCharsets;

/**
 * Probabilistic detection of duplicate events in a stream with fixed memory overhead.
 * <p>
 * This class is used to detect duplicates in an event stream. An event
 * is identified by an array of strings that represent characteristics of the
 * event. (The same values indicate the same logical event.) Invoking
 * {@link #isProbableDuplicate(String...)} not only returns whether the event
 * is probably a duplicate or not, but also updates the internal state such
 * that the event has been 'seen'. (A second immediate invocation with the same
 * parameter will always return <code>true</code>.)
 * <p>
 * Because this class is probabilistic it can return both false positives
 * (a unique event is considered to be a duplicate) and false negatives (an event
 * previously seen is not flagged as a duplicate).
 * <p>
 * This class maintains a number of slots as internal state, each of which
 * can store an event signature. The number of slots is specified as a
 * constructor parameter. Duplicate detection works by hashing the event
 * properties to a specific slot, and checking whether the signature stored
 * in that slot matches the event. The signature is independent of the hash
 * used to choose a slot.
 * <p>
 * Duplicate events are missed (false negatives) when multiple different events
 * hash to the same slot. The signature of each such event will replace the
 * signature of the previous such event. When a prior event is repeated its
 * signature is no longer present at the slot location and it is not recognized
 * as a duplicate. For a fixed number of events the proportion of false negatives
 * is:
 * <ul>
 *   <li>Inversely proportional to the number of slots that are configured.
 *     More slots means fewer false negatives.</li>
 *   <li>Proportional to the interval between duplicate events. The further
 *     apart duplicate events occur, the less likely they are to be recognized.</li>
 * </ul>
 * <p>
 * Unique events are incorrectly categorized as duplicates (false positives) when
 * multiple different events hash to the same slot <em>and</em> have the same
 * signature, without an intervening event hashing to the same slot. For a fixed
 * number of events the proportion of false positives is:
 * <ul>
 *   <li>Inversely proportional to the number of slots that are configured.
 *     More slots means fewer false positives.</li>
 *   <li>Proportional to the probability of multiple events in the same
 *     slot having the same signature. Signatures are 64 bits in length, meaning
 *     that the probability of two events having the same signature is 1/(2^64).</li>
 * </ul>
 */
/* TODO: These calculations need revising.
 *
 * The probability of a false positive is equal to the probability of a hash
 * collision with any observation currently in the filter memory. So, if the
 * memory size is 10 million, the chance of a false positive is equal to the
 * chance of the hash code of a new event colliding with one of 10 million
 * arbitrary other hash codes (probability of a hash collision times the
 * probability of the colliding hash code being drawn when drawing 10 million
 * hash codes from the entire space of hash codes).
 *
 * The probability of a false negative is a function of the memory size, the
 * amount of time that passes between two incarnations of the duplicate event
 * and the number of events that occur during that time. A false negative will
 * occur when two equal observations are interleaved with a different observation
 * that maps to the same position in the memory. Given a uniform distribution of
 * event hash codes and a resulting uniform distribution of position
 * assignments, the probability of a false negative for duplicates that occur a
 * given amount of time apart, can be calculated as:
 * time between duplicates / (memory size / throughput rate) / 2.
 * For example:
 * 600 seconds / (10 million / 1 kilohertz) / 2 = 600 seconds / 10000 seconds / 2 = 0.03.
 * This means that at a rate of 1000 events per second, there is a 3% chance of
 * a position in the memory being overwritten during the last 10 minutes.
 *
 * If 99% of duplicate events occur within 2 minutes from each other, we expect
 * to see 120 seconds / (10 million / 1 kilohertz) / 2 = 0.6% of the positions
 * to be overwritten in the last 2 minutes. At a true positive rate of 0.5% for
 * those 99% of events, we expect to see 0.99 * 0.005 * 0.006 = 0.00297% false
 * positives at a rate of 1000 events / second.
 */
@ParametersAreNonnullByDefault
@NotThreadSafe
final class ShortTermDuplicateMemory {
    /** Produces a 128-bit digest; separate parts of it select the slot and form the signature. */
    private static final HashFunction HASHING_FUNCTION = Hashing.murmur3_128();

    /**
     * One stored signature per slot. A freshly allocated array is zero-filled,
     * so a slot that has never been written holds the value 0.
     */
    private final long[] memory;

    /**
     * Construct an instance with a specific number of slots.
     * <p>
     * More slots lowers the probability of events being categorized
     * incorrectly, at the expense of more memory.
     *
     * @param slotCount the number of slots to use for detecting duplicate events;
     *                  must be at least 1.
     * @throws IllegalArgumentException if <code>slotCount</code> is zero or negative.
     */
    public ShortTermDuplicateMemory(final int slotCount) {
        // Fail fast: a zero-length memory would otherwise only surface as a
        // division-by-zero when the first event is checked.
        if (slotCount < 1) {
            throw new IllegalArgumentException("Slot count must be positive: " + slotCount);
        }
        memory = new long[slotCount];
    }

    /**
     * Query whether an event has been seen before or not, based on event properties.
     * <p>
     * As a side effect the event is recorded: its signature overwrites whatever
     * was previously stored in the slot that the event hashes to.
     *
     * @param eventProperties   An array of values that are specific to the event.
     *                          The order and the boundaries between the values
     *                          are significant.
     * @return <code>true</code> if we have probably seen this event previously, or
     *  false otherwise.
     */
    public boolean isProbableDuplicate(final String... eventProperties) {
        final Hasher hasher = HASHING_FUNCTION.newHasher();
        for (final String eventProperty : eventProperties) {
            // Prefix each property with its length so that the boundaries between
            // properties contribute to the digest. Without this, ("ab", "c") and
            // ("a", "bc") would feed identical bytes to the hasher and always be
            // treated as the same event.
            hasher.putInt(eventProperty.length());
            hasher.putString(eventProperty, StandardCharsets.UTF_8);
        }
        return isProbablyDuplicate(hasher.hash());
    }

    /**
     * Check the digest of an event against the memory, and record it.
     *
     * @param eventDigest the 128-bit digest of the event properties.
     * @return <code>true</code> if the slot selected by the digest already
     *  contained the signature of the digest, or false otherwise.
     */
    private boolean isProbablyDuplicate(final HashCode eventDigest) {
        // Our hashing algorithm produces 16 bytes:
        //  0: slot[0]
        //  1: slot[1]
        //  2: slot[2]
        //  3: slot[3]
        //  4: (unused)
        //  5: (unused)
        //  6: (unused)
        //  7: (unused)
        //  8: signature[0]
        //  9:  ..
        // 10:  ..
        // 11:  ..
        // 12:  ..
        // 13:  ..
        // 14:  ..
        // 15: signature[7]
        final byte[] hashBytes = eventDigest.asBytes();

        // The first four bytes select the slot.
        final int slotSelector = Ints.fromBytes(hashBytes[0], hashBytes[1], hashBytes[2], hashBytes[3]);
        // The last eight bytes form the signature; they do not overlap with the slot selector.
        final long signature = Longs.fromBytes(hashBytes[8], hashBytes[9], hashBytes[10], hashBytes[11],
                hashBytes[12], hashBytes[13], hashBytes[14], hashBytes[15]);

        // Clear the sign bit so that the remainder is never negative.
        final int slot = (slotSelector & Integer.MAX_VALUE) % memory.length;
        final boolean result = memory[slot] == signature;
        // Always remember the most recent event for this slot, evicting any previous occupant.
        memory[slot] = signature;
        return result;
    }
}