Java tutorial
/* * Copyright (C) 2011 Clearspring Technologies, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.clearspring.analytics.stream.cardinality; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; import java.util.UUID; import com.clearspring.analytics.TestUtils; import com.clearspring.analytics.util.Varint; import org.apache.commons.lang3.RandomStringUtils; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; public class TestHyperLogLogPlus { private static final Logger log = LoggerFactory.getLogger(TestHyperLogLogPlus.class); @Test public void testEquals() { HyperLogLogPlus hll1 = new HyperLogLogPlus(5, 25); HyperLogLogPlus hll2 = new HyperLogLogPlus(5, 25); hll1.offer("A"); hll2.offer("A"); assertEquals(hll1, hll2); hll2.offer("B"); hll2.offer("C"); hll2.offer("D"); assertNotEquals(hll1, hll2); HyperLogLogPlus hll3 = new HyperLogLogPlus(5, 25); for (int i = 0; i < 50000; i++) { hll3.offer("" + i); } assertNotEquals(hll1, hll3); } @Test public void consistentBytes() throws Throwable { int[] NUM_STRINGS = { 30, 50, 100, 200, 300, 500, 1000, 10000, 100000 }; for (int n : NUM_STRINGS) { String[] strings = new String[n]; for (int i = 0; i < n; i++) { strings[i] = RandomStringUtils.randomAlphabetic(20); } HyperLogLogPlus hllpp1 = new HyperLogLogPlus(5, 5); HyperLogLogPlus hllpp2 = new HyperLogLogPlus(5, 5); for (int i = 0; i < n; i++) { hllpp1.offer(strings[i]); hllpp2.offer(strings[n - 1 - i]); } // calling these here ensures their internal state (format type) is stable for the rest of these checks. // (end users have no need for this because they cannot access the format directly anyway) hllpp1.mergeTempList(); hllpp2.mergeTempList(); log.debug("n={} format1={} format2={}", n, hllpp1.format, hllpp2.format); try { if (hllpp1.format == hllpp2.format) { assertEquals(hllpp1, hllpp2); assertEquals(hllpp1.hashCode(), hllpp2.hashCode()); assertArrayEquals(hllpp1.getBytes(), hllpp2.getBytes()); } else { assertNotEquals(hllpp1, hllpp2); } } catch (Throwable any) { log.error("n={} format1={} format2={}", n, hllpp1.format, hllpp2.format, any); throw any; } } } public static void main(final String[] args) throws Throwable { long startTime = System.currentTimeMillis(); int numSets = 10; int setSize = 1 * 1000 * 1000; int repeats = 5; HyperLogLogPlus[] counters = new HyperLogLogPlus[numSets]; for (int i = 0; i < numSets; i++) { counters[i] = new HyperLogLogPlus(15, 15); } for (int i = 0; i < numSets; i++) { for (int j = 0; j < setSize; j++) { String val = UUID.randomUUID().toString(); for (int z = 0; z < repeats; z++) { counters[i].offer(val); } } } ICardinality merged = counters[0]; long sum = merged.cardinality(); for (int i = 1; i < numSets; i++) { sum += counters[i].cardinality(); merged = merged.merge(counters[i]); } long trueSize = numSets * setSize; System.out.println("True Cardinality: " + trueSize); System.out.println("Summed Cardinality: " + sum); System.out.println("Merged Cardinality: " + merged.cardinality()); System.out.println("Merged Error: " + (merged.cardinality() - trueSize) / (float) trueSize); System.out.println("Duration: " + ((System.currentTimeMillis() - startTime) / 1000) + "s"); } @Test public void testComputeCount() { HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(14, 25); int count = 70000; for (int i = 0; i < count; i++) { hyperLogLogPlus.offer("i" + i); } long estimate = hyperLogLogPlus.cardinality(); double se = count * (1.04 / Math.sqrt(Math.pow(2, 14))); long expectedCardinality = count; System.out.println("Expect estimate: " + estimate + " is between " + (expectedCardinality - (3 * se)) + " and " + (expectedCardinality + (3 * se))); assertTrue(estimate >= expectedCardinality - (3 * se)); assertTrue(estimate <= expectedCardinality + (3 * se)); } @Test public void testSmallCardinalityRepeatedInsert() { HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(14, 25); int count = 15000; int maxAttempts = 200; Random r = new Random(); for (int i = 0; i < count; i++) { int n = r.nextInt(maxAttempts) + 1; for (int j = 0; j < n; j++) { hyperLogLogPlus.offer("i" + i); } } long estimate = hyperLogLogPlus.cardinality(); double se = count * (1.04 / Math.sqrt(Math.pow(2, 14))); long expectedCardinality = count; System.out.println("Expect estimate: " + estimate + " is between " + (expectedCardinality - (3 * se)) + " and " + (expectedCardinality + (3 * se))); assertTrue(estimate >= expectedCardinality - (3 * se)); assertTrue(estimate <= expectedCardinality + (3 * se)); } @Test public void testSerialization_Normal() throws IOException { HyperLogLogPlus hll = new HyperLogLogPlus(5, 25); for (int i = 0; i < 100000; i++) { hll.offer("" + i); } System.out.println(hll.cardinality()); HyperLogLogPlus hll2 = HyperLogLogPlus.Builder.build(hll.getBytes()); assertEquals(hll.cardinality(), hll2.cardinality()); } @Test public void testSerialization() throws IOException, ClassNotFoundException { HyperLogLogPlus hll = new HyperLogLogPlus(5, 25); for (int i = 0; i < 100000; i++) { hll.offer("" + i); } System.out.println(hll.cardinality()); HyperLogLogPlus hll2 = (HyperLogLogPlus) TestUtils.deserialize(TestUtils.serialize(hll)); assertEquals(hll.cardinality(), hll2.cardinality()); } @Test public void testSerialization_Sparse() throws IOException { HyperLogLogPlus hll = new HyperLogLogPlus(14, 25); hll.offer("a"); hll.offer("b"); hll.offer("c"); hll.offer("d"); hll.offer("e"); HyperLogLogPlus hll2 = HyperLogLogPlus.Builder.build(hll.getBytes()); assertEquals(hll.cardinality(), hll2.cardinality()); } @Test public void testHighPrecisionInitialization() { for (int sp = 4; sp <= 32; sp++) { int expectedSm = (int) Math.pow(2, sp); for (int p = 4; p <= sp; p++) { int expectedM = (int) Math.pow(2, p); HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(p, sp); assertEquals(expectedM, hyperLogLogPlus.getM()); assertEquals(expectedSm, hyperLogLogPlus.getSm()); } } } @Test public void testHighCardinality() { long start = System.currentTimeMillis(); HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(18, 25); int size = 10000000; for (int i = 0; i < size; i++) { hyperLogLogPlus.offer(TestICardinality.streamElement(i)); } System.out.println("expected: " + size + ", estimate: " + hyperLogLogPlus.cardinality() + ", time: " + (System.currentTimeMillis() - start)); long estimate = hyperLogLogPlus.cardinality(); double err = Math.abs(estimate - size) / (double) size; System.out.println("Percentage error " + err); assertTrue(err < .1); } @Test public void testSortEncodedSet() { int[] testSet = new int[3]; testSet[0] = 655403; testSet[1] = 655416; testSet[2] = 655425; HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(14, 25); testSet = hyperLogLogPlus.sortEncodedSet(testSet, 3); assertEquals(655403, testSet[0]); assertEquals(655425, testSet[1]); assertEquals(655416, testSet[2]); } @Test public void testMergeSelf_forceNormal() throws CardinalityMergeException, IOException { final int[] cardinalities = { 0, 1, 10, 100, 1000, 10000, 100000, 1000000 }; for (int cardinality : cardinalities) { for (int j = 4; j < 24; j++) { System.out.println("p=" + j); HyperLogLogPlus hllPlus = new HyperLogLogPlus(j, 0); for (int l = 0; l < cardinality; l++) { hllPlus.offer(Math.random()); } System.out.println("hllcardinality=" + hllPlus.cardinality() + " cardinality=" + cardinality); HyperLogLogPlus deserialized = HyperLogLogPlus.Builder.build(hllPlus.getBytes()); assertEquals(hllPlus.cardinality(), deserialized.cardinality()); ICardinality merged = hllPlus.merge(deserialized); System.out.println(merged.cardinality() + " : " + hllPlus.cardinality()); assertEquals(hllPlus.cardinality(), merged.cardinality()); } } } @Test public void testMergeSelf() throws CardinalityMergeException, IOException { final int[] cardinalities = { 0, 1, 10, 100, 1000, 10000, 100000 }; final int[] ps = { 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 }; final int[] sps = { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 }; for (int cardinality : cardinalities) { for (int j = 0; j < ps.length; j++) { for (int sp : sps) { if (sp < ps[j]) { continue; } HyperLogLogPlus hllPlus = new HyperLogLogPlus(ps[j], sp); for (int l = 0; l < cardinality; l++) { hllPlus.offer(Math.random()); } HyperLogLogPlus deserialized = HyperLogLogPlus.Builder.build(hllPlus.getBytes()); System.out.println(ps[j] + "-" + sp + ": " + cardinality + " -> " + hllPlus.cardinality()); assertEquals(hllPlus.cardinality(), deserialized.cardinality()); ICardinality merged = hllPlus.merge(deserialized); assertEquals(hllPlus.cardinality(), merged.cardinality()); } } } } @Test public void testOne() throws IOException { HyperLogLogPlus one = new HyperLogLogPlus(8, 25); one.offer("a"); assertEquals(1, one.cardinality()); } @Test public void testSparseSpace() throws IOException { HyperLogLogPlus hllp = new HyperLogLogPlus(14, 14); for (int i = 0; i < 10000; i++) { hllp.offer(i); } System.out.println("Size: " + hllp.getBytes().length); } @Test public void testMerge_Sparse() throws CardinalityMergeException { int numToMerge = 4; int bits = 18; int cardinality = 4000; HyperLogLogPlus[] hyperLogLogs = new HyperLogLogPlus[numToMerge]; HyperLogLogPlus baseline = new HyperLogLogPlus(bits, 25); for (int i = 0; i < numToMerge; i++) { hyperLogLogs[i] = new HyperLogLogPlus(bits, 25); for (int j = 0; j < cardinality; j++) { double val = Math.random(); hyperLogLogs[i].offer(val); baseline.offer(val); } } long expectedCardinality = numToMerge * cardinality; HyperLogLogPlus hll = hyperLogLogs[0]; hyperLogLogs = Arrays.asList(hyperLogLogs).subList(1, hyperLogLogs.length).toArray(new HyperLogLogPlus[0]); long mergedEstimate = hll.merge(hyperLogLogs).cardinality(); double se = expectedCardinality * (1.04 / Math.sqrt(Math.pow(2, bits))); System.out.println("Expect estimate: " + mergedEstimate + " is between " + (expectedCardinality - (3 * se)) + " and " + (expectedCardinality + (3 * se))); double err = Math.abs(mergedEstimate - expectedCardinality) / (double) expectedCardinality; System.out.println("Percentage error " + err); assertTrue(err < .1); assertTrue(mergedEstimate >= expectedCardinality - (3 * se)); assertTrue(mergedEstimate <= expectedCardinality + (3 * se)); } @Test public void testMerge_Normal() throws CardinalityMergeException { int numToMerge = 4; int bits = 18; int cardinality = 5000; HyperLogLogPlus[] hyperLogLogs = new HyperLogLogPlus[numToMerge]; HyperLogLogPlus baseline = new HyperLogLogPlus(bits, 25); for (int i = 0; i < numToMerge; i++) { hyperLogLogs[i] = new HyperLogLogPlus(bits, 25); for (int j = 0; j < cardinality; j++) { double val = Math.random(); hyperLogLogs[i].offer(val); baseline.offer(val); } } long expectedCardinality = numToMerge * cardinality; HyperLogLogPlus hll = hyperLogLogs[0]; hyperLogLogs = Arrays.asList(hyperLogLogs).subList(1, hyperLogLogs.length).toArray(new HyperLogLogPlus[0]); long mergedEstimate = hll.merge(hyperLogLogs).cardinality(); double se = expectedCardinality * (1.04 / Math.sqrt(Math.pow(2, bits))); System.out.println("Expect estimate: " + mergedEstimate + " is between " + (expectedCardinality - (3 * se)) + " and " + (expectedCardinality + (3 * se))); assertTrue(mergedEstimate >= expectedCardinality - (3 * se)); assertTrue(mergedEstimate <= expectedCardinality + (3 * se)); } @Test public void testLegacyCodec_normal() throws IOException { int bits = 18; int cardinality = 1000000; HyperLogLogPlus baseline = new HyperLogLogPlus(bits, 25); for (int j = 0; j < cardinality; j++) { double val = Math.random(); baseline.offer(val); } ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); dos.writeInt(bits); dos.writeInt(25); dos.writeInt(0); dos.writeInt(baseline.getRegisterSet().size * 4); for (int x : baseline.getRegisterSet().readOnlyBits()) { dos.writeInt(x); } byte[] legacyBytes = baos.toByteArray(); // decode legacy HyperLogLogPlus decoded = HyperLogLogPlus.Builder.build(legacyBytes); assertEquals(baseline.cardinality(), decoded.cardinality()); byte[] newBytes = baseline.getBytes(); assertTrue(newBytes.length < legacyBytes.length); } @Test public void testLegacyCodec_sparse() throws IOException { int bits = 18; int cardinality = 5000; HyperLogLogPlus baseline = new HyperLogLogPlus(bits, 25); for (int j = 0; j < cardinality; j++) { double val = Math.random(); baseline.offer(val); } ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); dos.writeInt(bits); dos.writeInt(25); dos.writeInt(1); baseline.mergeTempList(); int[] sparseSet = baseline.getSparseSet(); List<byte[]> sparseBytes = new ArrayList<byte[]>(sparseSet.length); int prevDelta = 0; for (int k : sparseSet) { sparseBytes.add(Varint.writeUnsignedVarInt(k - prevDelta)); prevDelta = k; } for (byte[] bytes : sparseBytes) { dos.writeInt(bytes.length); dos.write(bytes); } dos.writeInt(-1); byte[] legacyBytes = baos.toByteArray(); // decode legacy HyperLogLogPlus decoded = HyperLogLogPlus.Builder.build(legacyBytes); assertEquals(baseline.cardinality(), decoded.cardinality()); byte[] newBytes = baseline.getBytes(); assertTrue(newBytes.length < legacyBytes.length); } @Test public void testMerge_ManySparse() throws CardinalityMergeException { int numToMerge = 20; int bits = 18; int cardinality = 10000; HyperLogLogPlus[] hyperLogLogs = new HyperLogLogPlus[numToMerge]; HyperLogLogPlus baseline = new HyperLogLogPlus(bits, 25); for (int i = 0; i < numToMerge; i++) { hyperLogLogs[i] = new HyperLogLogPlus(bits, 25); for (int j = 0; j < cardinality; j++) { double val = Math.random(); hyperLogLogs[i].offer(val); baseline.offer(val); } } long expectedCardinality = numToMerge * cardinality; HyperLogLogPlus hll = hyperLogLogs[0]; hyperLogLogs = Arrays.asList(hyperLogLogs).subList(1, hyperLogLogs.length).toArray(new HyperLogLogPlus[0]); long mergedEstimate = hll.merge(hyperLogLogs).cardinality(); double se = expectedCardinality * (1.04 / Math.sqrt(Math.pow(2, bits))); System.out.println("Expect estimate: " + mergedEstimate + " is between " + (expectedCardinality - (3 * se)) + " and " + (expectedCardinality + (3 * se))); assertTrue(mergedEstimate >= expectedCardinality - (3 * se)); assertTrue(mergedEstimate <= expectedCardinality + (3 * se)); } @Test public void testMerge_SparseIntersection() throws CardinalityMergeException { HyperLogLogPlus a = new HyperLogLogPlus(11, 16); HyperLogLogPlus b = new HyperLogLogPlus(11, 16); // Note that only one element, 41, is shared amongst the two sets, // and so the number of total unique elements is 14. int[] aInput = { 12, 13, 22, 34, 38, 40, 41, 46, 49 }; int[] bInput = { 2, 6, 19, 29, 41, 48 }; Set<Integer> testSet = new HashSet<Integer>(); for (Integer in : aInput) { testSet.add(in); a.offer(in); } for (Integer in : bInput) { testSet.add(in); b.offer(in); } assertEquals(14, testSet.size()); assertEquals(9, a.cardinality()); assertEquals(6, b.cardinality()); a.addAll(b); assertEquals(14, a.cardinality()); } @Test public void testSerializationWithNewSortMethod() throws IOException { HyperLogLogPlus hll = new HyperLogLogPlus(14, 25); hll.offerHashed(0x0000000000000000l); hll.offerHashed(0x7FFFFFFFFFFFFFFFl); hll.offerHashed(0x8000000000000000l); hll.offerHashed(0xFFFFFFFFFFFFFFFFl); // test against old serialization assertArrayEquals(new byte[] { -1, -1, -1, -2, 14, 25, 1, 4, 25, -27, -1, -1, 15, -101, -128, -128, -16, 7, -27, -1, -1, -97, 8 }, hll.getBytes()); } }