com.facebook.presto.bloomfilter.BloomFilter.java Source code

Java tutorial

Introduction

Here is the source code for com.facebook.presto.bloomfilter.BloomFilter.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.bloomfilter;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.hash.HashCode;
import com.google.common.hash.Hashing;
import com.google.common.io.ByteStreams;
import io.airlift.log.Logger;
import io.airlift.slice.BasicSliceInput;
import io.airlift.slice.DynamicSliceOutput;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import orestes.bloomfilter.FilterBuilder;
import orestes.bloomfilter.HashProvider;
import org.apache.commons.io.IOUtils;
import org.eclipse.jetty.client.api.Request;
import org.eclipse.jetty.client.api.Response;
import org.eclipse.jetty.client.util.InputStreamResponseListener;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

// Layout is <hash>:<size>:<size_pre>:<bf_pre>:<bf>, where
//   hash: is a sha256 hash of the bloom filter
//   size: is an int describing the length of the bf bytes
//   size_pre: is an int describing the length of the pre bf bytes
//   expectedInsertions: is an int describing the amount of expected elements
//   falsePositivePercentage: is a double describing the desired false positive percentage
//   bf_pre: is the serialized bloom filter used for pre-filtering
//   bf: is the serialized bloom filter
public class BloomFilter {
    private static final HashCode HASH_CODE_NOT_FOUND = HashCode.fromInt(0);
    private orestes.bloomfilter.BloomFilter instancePreFilter;
    private orestes.bloomfilter.BloomFilter instance;
    private int expectedInsertions;
    private double falsePositivePercentage;
    private long preMiss = 0;

    private static final boolean USE_PRE_FILTER = true;

    private static final Logger log = Logger.get(BloomFilter.class);

    public static final int DEFAULT_BLOOM_FILTER_EXPECTED_INSERTIONS = 10_000_000;
    public static final double DEFAULT_BLOOM_FILTER_FALSE_POSITIVE_PERCENTAGE = 0.01;

    public static final double BF_MEM_CONSTANT = Math.log(1.0 / (Math.pow(2.0, Math.log(2.0))));

    static {
        try {
            if (!BloomFilterScalarFunctions.HTTP_CLIENT.isStarted()) {
                log.warn("Http client was not started, trying to start");
                BloomFilterScalarFunctions.HTTP_CLIENT.start();
            }
        } catch (Exception ex) {
            log.error("Error starting http client: " + ex.getMessage());
        }
    }

    public int getExpectedInsertions() {
        return expectedInsertions;
    }

    public double getFalsePositivePercentage() {
        return falsePositivePercentage;
    }

    public static BloomFilter newInstance() {
        return new BloomFilter(DEFAULT_BLOOM_FILTER_EXPECTED_INSERTIONS,
                DEFAULT_BLOOM_FILTER_FALSE_POSITIVE_PERCENTAGE);
    }

    public static BloomFilter newInstance(int expectedInsertions, double falsePositivePercentage) {
        return new BloomFilter(expectedInsertions, falsePositivePercentage);
    }

    public static BloomFilter newInstance(int expectedInsertions) {
        return new BloomFilter(expectedInsertions, DEFAULT_BLOOM_FILTER_FALSE_POSITIVE_PERCENTAGE);
    }

    // Construct from serialized string
    public static BloomFilter newInstance(byte[] fromBytes) {
        BloomFilter bf = newInstance();
        byte[] serializedBytes = java.util.Base64.getDecoder().decode(fromBytes);
        bf.load(Slices.wrappedBuffer(serializedBytes));
        return bf;
    }

    public static BloomFilter fromUrl(String url) throws Exception {
        log.info("Loading bloom filter from " + url);

        Request request = BloomFilterScalarFunctions.HTTP_CLIENT.newRequest(url);
        request.method("GET");
        InputStreamResponseListener listener = new InputStreamResponseListener();
        request.send(listener);

        // Wait for the response headers to arrive
        Response response = listener.get(10, TimeUnit.SECONDS);

        // Look at the response
        if (response.getStatus() == 200) {
            // Use try-with-resources to close input stream.
            try (InputStream responseContent = listener.getInputStream()) {
                byte[] bytes = ByteStreams.toByteArray(responseContent);
                return newInstance(bytes);
            }
        }
        log.warn("Non-200 response status " + response.getStatus());
        return null;
    }

    public static BloomFilter newInstance(Slice serialized) {
        BloomFilter bf = newInstance();
        bf.load(serialized);
        return bf;
    }

    private BloomFilter(int expectedInsertions, double falsePositivePercentage) {
        this.expectedInsertions = expectedInsertions;
        this.falsePositivePercentage = falsePositivePercentage;
        initbloomFilters();
    }

    public byte[] toBase64() {
        return java.util.Base64.getEncoder().encode(serialize().getBytes());
    }

    public BloomFilter put(Slice s) {
        if (s == null) {
            return this;
        }
        byte[] b = s.getBytes();
        if (b.length < 1) {
            return this;
        }
        instance.add(b);
        if (USE_PRE_FILTER) {
            instancePreFilter.add(b);
        }
        return this;
    }

    public BloomFilter putAll(BloomFilter other) {
        instance.union(other.instance);
        if (USE_PRE_FILTER) {
            instancePreFilter.union(other.instancePreFilter);
        }
        return this;
    }

    public boolean mightContain(Slice s) {
        byte[] b = s.getBytes();
        if (USE_PRE_FILTER) {
            if (instancePreFilter.contains(b)) {
                return instance.contains(b);
            } else {
                preMiss++;
                return false;
            }
        } else {
            return instance.contains(b);
        }
    }

    @VisibleForTesting
    public long getPreMiss() {
        return preMiss;
    }

    private void load(Slice serialized) {
        BasicSliceInput input = serialized.getInput();

        // Read hash
        byte[] bfHash = new byte[32];
        input.readBytes(bfHash, 0, 32);

        // Get the size of the bloom filter
        int bfSize = input.readInt();

        // Get the size of the bloom filter
        int bfSizePre = input.readInt();

        // Params
        expectedInsertions = input.readInt();
        falsePositivePercentage = input.readDouble();

        // Read the buffer
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            input.readBytes(out, bfSize);
        } catch (IOException ix) {
            log.error(ix);
        }

        // Uncompress
        byte[] uncompressed;
        try {
            uncompressed = decompress(out.toByteArray());
        } catch (IOException ix) {
            log.error(ix);
            uncompressed = new byte[0];
        }

        // Input stream
        ByteArrayInputStream in = new ByteArrayInputStream(uncompressed);

        // Setup bloom filter
        try {
            ObjectInputStream ois = new ObjectInputStream(in);
            instance = (orestes.bloomfilter.BloomFilter) ois.readObject();
            input.close();
        } catch (Exception ix) {
            log.error(ix);
            initbloomFilters();
        }

        // Read the buffer
        ByteArrayOutputStream outPre = new ByteArrayOutputStream();
        try {
            input.readBytes(outPre, bfSizePre);
        } catch (IOException ix) {
            log.error(ix);
        }

        // Uncompress
        byte[] uncompressedPre;
        try {
            uncompressedPre = decompress(outPre.toByteArray());
        } catch (IOException ix) {
            log.error(ix);
            uncompressedPre = new byte[0];
        }

        // Input stream
        ByteArrayInputStream inPre = new ByteArrayInputStream(uncompressedPre);

        // Setup bloom filter
        try {
            ObjectInputStream ois = new ObjectInputStream(inPre);
            instancePreFilter = (orestes.bloomfilter.BloomFilter) ois.readObject();
            input.close();
        } catch (Exception ix) {
            log.error(ix);
            initbloomFilters();
        }
    }

    private void initbloomFilters() {
        instance = newBloomFilter();
        instancePreFilter = newPreBloomFilter();
    }

    private orestes.bloomfilter.BloomFilter newBloomFilter() {
        return new FilterBuilder(expectedInsertions, falsePositivePercentage)
                .hashFunction(HashProvider.HashMethod.Murmur3KirschMitzenmacher).buildBloomFilter();
    }

    private orestes.bloomfilter.BloomFilter newPreBloomFilter() {
        return new FilterBuilder(Math.max(expectedInsertions / 10, 10), Math.min(falsePositivePercentage * 10, 0.5))
                .hashFunction(HashProvider.HashMethod.FNVWithLCG).hashes(1).buildBloomFilter();
    }

    public Slice serialize() {
        byte[] bytes = new byte[0];
        byte[] bytesPre = new byte[0];
        try {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            ObjectOutput output = new ObjectOutputStream(buffer);
            output.writeObject(instance);
            bytes = buffer.toByteArray();

            ByteArrayOutputStream bufferPre = new ByteArrayOutputStream();
            ObjectOutput outputPre = new ObjectOutputStream(bufferPre);
            outputPre.writeObject(instancePreFilter);
            bytesPre = bufferPre.toByteArray();
        } catch (Exception ix) {
            log.error(ix);
        }

        // Create hash
        byte[] bfHash = Hashing.sha256().hashBytes(bytes).asBytes();

        // Compress
        byte[] compressed;
        try {
            compressed = compress(bytes);
        } catch (IOException ix) {
            log.error(ix);
            compressed = new byte[0];
        }
        int size = compressed.length;

        // Compress
        byte[] compressedPre;
        try {
            compressedPre = compress(bytesPre);
        } catch (IOException ix) {
            log.error(ix);
            compressedPre = new byte[0];
        }
        int sizePre = compressedPre.length;

        // To slice
        DynamicSliceOutput output = new DynamicSliceOutput(size);

        // Write hash
        output.writeBytes(bfHash); // 32 bytes

        // Write the length of the bloom filter
        output.appendInt(size);

        // Write the length of the pre bloom filter
        output.appendInt(sizePre);

        // Params
        output.appendInt(expectedInsertions);
        output.appendDouble(falsePositivePercentage);

        // Write the bloom filter
        output.appendBytes(compressed);

        // Write the bloom filter
        output.appendBytes(compressedPre);

        return output.slice();
    }

    public static byte[] compress(byte[] b) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPOutputStream gzip = new GZIPOutputStream(out);
        gzip.write(b);
        gzip.close();
        return out.toByteArray();
    }

    public static byte[] decompress(byte[] b) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        IOUtils.copy(new GZIPInputStream(new ByteArrayInputStream(b)), out);
        return out.toByteArray();
    }

    public int estimatedInMemorySize() {
        // m = ceil((n * log(p)) / log(1.0 / (pow(2.0, log(2.0)))));
        // k = round(log(2.0) * m / n);
        // Source: http://hur.st/bloomfilter
        double m = Math.ceil(((double) expectedInsertions * Math.log(falsePositivePercentage)) / BF_MEM_CONSTANT)
                / 8.0D;
        return (int) Math.round(m);
    }

    public static HashCode readHash(Slice s) {
        if (s == null) {
            return HASH_CODE_NOT_FOUND;
        }
        return HashCode.fromBytes(s.getBytes(0, 32));
    }
}