Java tutorial
/* * Copyright 2014 GoDataDriven B.V. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package io.divolte.server; import com.google.common.hash.HashCode; import com.google.common.hash.HashFunction; import com.google.common.hash.Hasher; import com.google.common.hash.Hashing; import com.google.common.primitives.Ints; import com.google.common.primitives.Longs; import javax.annotation.ParametersAreNonnullByDefault; import javax.annotation.concurrent.NotThreadSafe; import java.nio.charset.StandardCharsets; /** * Probabilistic detection of duplicate events in a stream with fixed memory overhead. * <p> * This class is used to detect duplicates in an event stream. An event * is identified by an array of strings that represent characteristics of the * event. (The same values indicate the same logical event.) Invoking * {@link #isProbableDuplicate(String...)} not only returns whether the event * is probably a duplicate or not, but also updates the internal state such * that the event has been 'seen'. (A second immediate invocation with the same * parameter will always return <code>true</code>.) * <p> * Because this class is probabilistic it can return both false positives * (a unique event is considered to be a duplicate) and false negatives (an event * previously seen is not flagged as a duplicate). * <p> * This class maintains a number of slots as internal state, each of which * can store an event signature. The number of slots is specified as a * constructor parameter. 
Duplicate detection works by hashing the event * properties to a specific slot, and checking whether the signature stored * in that slot matches the event. The signature is independent of the hash * used to choose a slot. * <p> * Duplicate events are missed (false negatives) when multiple different events * hash to the same slot. The signature of each such event will replace the * signature of the previous such event. When a prior event is repeated its * signature is no longer present at the slot location and it is not recognized * as a duplicate. For a fixed number of events the proportion of false negatives * is: * <ul> * <li>Inversely proportional to the number of slots that are configured. * More slots means fewer false negatives.</li> * <li>Proportional to the interval between duplicate events. The further * apart duplicate events occur, the less likely they are to be recognized.</li> * </ul> * <p> * Unique events are incorrectly categorized as duplicates (false positives) when * multiple different events hash to the same slot <em>and</em> have the same * signature, without an intervening event hashing to the same slot. For a fixed * number of events the proportion of false positives is: * <ul> * <li>Inversely proportional to the number of slots that are configured. * More slots means fewer false positives.</li> * <li>Proportional to the probability of multiple events in the same * slot having the same signature. Signatures are 64 bits in length, meaning * that the probability of two events having the same signature is 1/(2^64).</li> * </ul> */ /* TODO: These calculations need revising. * * The probability of a false positive is equal to the probability of a hash * collision with any observation currently in the filter memory. 
So, if the * memory size is 10 million, the chance of a false positive is equal to the * chance of the hash code of a new event colliding with one of 10 million * arbitrary other hash codes (probability of a hash collision times the * probability of the colliding hash code being drawn when drawing 10 million * hash codes from the entire space of hash codes). * * The probability of a false negative is a function of the memory size, the * amount of time that passes between two incarnations of the duplicate event * and the number of events that occur during that time. A false negative will * occur when to equal observations are interleaved with a different observation * that maps to the same position in the memory. Given a uniform distribution of * event hash codes and a resulting uniform distribution of position * assignments, the probability of a false negative for duplicates that occur a * given amount of time apart, can be calculated as: * time between duplicates / (memory size / throughput rate) / 2. * For example: * 600 seconds / (10 million / 1 kilohertz) / 2 = 600 seconds / 10000 seconds / 2 = 0.03. * This means that at a rate of 1000 events per second, there is a 3% chance of * a position in the memory being overwritten during the last 10 minutes. * * If 99% of duplicate events occur within 2 minutes from each other, we expect * to see 120 seconds / (10 million / 1 kilohertz) / 2 = 0.6% of the positions * to be overwritten in the last 2 minutes. At a true positive rate of 0.5% for * those 99% of events, we expect to see 0.99 * 0.005 * 0.006 = 0.00297% false * positives at a rate of 1000 events / second. */ @ParametersAreNonnullByDefault @NotThreadSafe final class ShortTermDuplicateMemory { private static final HashFunction HASHING_FUNCTION = Hashing.murmur3_128(); private final long[] memory; /** * Construct an instance with a specific number of slots. 
* <p> * More slots lowers the probability of events being categorized * incorrectly, at the expense of more memory. * * @param slotCount the number of slots to use for detecting duplicate events. */ public ShortTermDuplicateMemory(final int slotCount) { memory = new long[slotCount]; } /** * Query whether an event has been seen before or not, based on event properties. * @param eventProperties An array of values that are specific to the event. * @return <code>true</code> if we have probably seen this event previously, or * false otherwise. */ public boolean isProbableDuplicate(final String... eventProperties) { final Hasher hasher = HASHING_FUNCTION.newHasher(); for (final String eventProperty : eventProperties) { hasher.putString(eventProperty, StandardCharsets.UTF_8); } return isProbablyDuplicate(hasher.hash()); } private boolean isProbablyDuplicate(final HashCode eventDigest) { // Our hashing algorithm produces 8 bytes: // 0: slot[0] // 1: slot[1] // 2: slot[2] // 3: slot[3] // 4: // 5: // 6: // 7: // 8: signature[0] // 9: .. // 10: .. // 11: .. // 12: .. // 13: .. // 14: .. // 15: signature[7] final byte[] hashBytes = eventDigest.asBytes(); // We use the low int for the slot. final int slotSelector = Ints.fromBytes(hashBytes[0], hashBytes[1], hashBytes[2], hashBytes[3]); // We use the high long for the signature. final long signature = Longs.fromBytes(hashBytes[8], hashBytes[9], hashBytes[10], hashBytes[11], hashBytes[12], hashBytes[13], hashBytes[14], hashBytes[15]); final int slot = (slotSelector & Integer.MAX_VALUE) % memory.length; final boolean result = memory[slot] == signature; memory[slot] = signature; return result; } }