com.cloudera.util.bloom.BloomSet.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.util.bloom.BloomSet.java

Source

/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.util.bloom;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;

import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

import com.google.common.base.Preconditions;

/**
 * A bloom set: a thin shim that wraps o.a.h.util.bloom.BloomFilter with a more
 * generic API.
 * 
 * This is currently intended only for test code. Several operations (copying,
 * {@link #equals(Object)}, {@link #hashCode()} and {@link #contains(BloomSet)})
 * work by serializing the whole filter to a byte array, so they are woefully
 * inefficient computationally. This should eventually be replaced with a
 * version that does not rely on hadoop's Writable-tied implementation.
 * 
 * Instances are mutable ({@link #addInt(int)} and {@link #and(BloomSet)} change
 * the underlying filter), and both {@code equals} and {@code hashCode} depend
 * on the current contents, so a BloomSet should not be mutated while it is
 * used as a key in a hash-based collection.
 */
public class BloomSet {

    final BloomFilter bloom;
    final static int hashType = Hash.JENKINS_HASH; // just pick a default for now.

    /**
     * Create empty BloomSet.
     * 
     * Wikipedia uses the following size and # of hashes to achieve particular
     * false positives rates.
     * 
     * m bits needed, n inserted elements, p false positive rate, k hash
     * functions, e is max false positive error rate.
     * 
     * k = (m/n) ln 2
     * 
     * m = - n ln p / (ln 2)^2
     * 
     * 1.44 log_2 (1/e) = # of bits per inserted element
     * 
     * @param nbits
     *          size of the underlying bit vector, in bits (m above)
     * @param hashes
     *          number of hash functions to apply per element (k above)
     */
    public BloomSet(int nbits, int hashes) {
        bloom = new BloomFilter(nbits, hashes, hashType);
    }

    /**
     * Copy constructor. The copy is made by serializing {@code src} and
     * deserializing the result, so the new set has its own underlying filter.
     * 
     * @param src
     *          the set to copy; must not be null
     * @throws IllegalArgumentException
     *           if {@code src} is null
     */
    public BloomSet(BloomSet src) {
        Preconditions.checkArgument(src != null);
        bloom = deserialize(src.getBytes());
    }

    /**
     * Instantiate a serialized BloomSet.
     * 
     * @param serialized
     *          bytes previously produced by {@link #getBytes()}; must not be null
     * @throws IllegalArgumentException
     *           if {@code serialized} is null or cannot be read as a filter
     */
    public BloomSet(byte[] serialized) {
        bloom = deserialize(serialized);
    }

    /**
     * Takes an array of bytes and deserializes it into a new BloomFilter. This
     * does not touch the state of the current BloomSet; the caller decides what
     * to do with the returned filter.
     * 
     * Note: this method is invoked from the constructors, so an override must
     * not rely on any state initialized by a subclass constructor.
     * 
     * @param serialized
     *          the serialized filter; must not be null
     * @return a filter populated from the given bytes
     * @throws IllegalArgumentException
     *           if {@code serialized} is null, or if reading it fails (the
     *           underlying IOException is attached as the cause)
     */
    protected BloomFilter deserialize(byte[] serialized) {
        Preconditions.checkArgument(serialized != null);
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(serialized));
        BloomFilter filter = new BloomFilter(); // empty constructor
        try {
            filter.readFields(in);
        } catch (IOException e) {
            // Same exception type and message as before, but keep the cause so the
            // original stack trace is not lost.
            throw new IllegalArgumentException(e.getMessage(), e);
        }
        return filter;
    }

    /**
     * Adds an int to the bloom filter. The int is encoded as the 4 bytes written
     * by {@link ByteBuffer#putInt(int)} (big-endian, the ByteBuffer default) and
     * those bytes are used as the key.
     * 
     * @param i
     *          the value to add
     */
    public void addInt(int i) {
        ByteBuffer buf = ByteBuffer.allocate(4); // an int is 4 bytes
        buf.putInt(i);
        Key k = new Key(buf.array());
        bloom.add(k);
    }

    /**
     * Get a serialized version of the BloomSet, suitable for passing to
     * {@link #BloomSet(byte[])}.
     * 
     * @return a freshly allocated byte array holding the serialized filter;
     *         never null
     * @throws IllegalStateException
     *           if the underlying filter fails to serialize
     */
    public byte[] getBytes() {
        ByteArrayOutputStream bits = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bits);
        try {
            bloom.write(out);
            out.flush();
        } catch (IOException e) {
            // Should never happen when writing to an in-memory stream. Fail loudly
            // rather than returning null: a null result would make equals() treat
            // two unserializable sets as equal and hashCode() silently return 0.
            throw new IllegalStateException("Unable to serialize bloom filter", e);
        }
        return bits.toByteArray();
    }

    /**
     * Hash code derived from the serialized form, consistent with
     * {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        // TODO likely inefficient
        return Arrays.hashCode(getBytes());
    }

    /**
     * Two BloomSets are equal when their serialized forms are byte-for-byte
     * identical.
     */
    @Override
    public boolean equals(Object b) {
        if (this == b) {
            return true; // skip two serializations for the trivial case
        }
        if (!(b instanceof BloomSet)) {
            return false;
        }

        byte[] as = getBytes();
        byte[] bs = ((BloomSet) b).getBytes();
        return Arrays.equals(as, bs);
    }

    /**
     * Applies a bitwise 'and', modifying this bloom set. 'and'ing these actually
     * creates a new physical representation that is equivalent to anding the two
     * sets. The argument is not modified.
     * 
     * @param b
     *          the set to 'and' into this one; must not be null. Presumably it
     *          must also have been built with the same size and number of
     *          hashes -- see BloomFilter.and for what the underlying filter
     *          accepts.
     * @throws IllegalArgumentException
     *           if {@code b} is null
     */
    public void and(BloomSet b) {
        Preconditions.checkArgument(b != null);
        bloom.and(b.bloom);
    }

    /**
     * Rhetorically asks, does the current set contain the specified subset with
     * high probability?
     * 
     * This essentially does an 'and' of the two sets and then verifies that the
     * result is equal to the subset. Neither this set nor {@code subset} is
     * modified; the 'and' is performed on a copy of {@code subset}.
     * 
     * @param subset
     *          the candidate subset; must not be null
     * @return true if every bit set in {@code subset} is also set in this set
     * @throws IllegalArgumentException
     *           if {@code subset} is null
     */
    public boolean contains(BloomSet subset) {
        // examples:
        // super sub => (sub & super) ^ sub == 0
        // 1111 0011 => true
        // 1100 1100 => true
        // 1100 1000 => true
        // 1100 0000 => true
        // 1100 0010 => false
        BloomSet subClone = new BloomSet(subset);

        // if subset, and-ing with the superset leaves the clone unchanged
        subClone.and(this);
        return subClone.equals(subset);
    }
}