Java tutorial
package brickhouse.udf.bloom; /** * Copyright 2012 Klout, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **/ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.HashMap; import java.util.Map; import org.apache.commons.codec.binary.Base64; import org.apache.hadoop.util.bloom.BloomFilter; import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.hash.Hash; import org.apache.log4j.Logger; /** * Utility class for construction and serialization of BloomFilters ... * * */ public class BloomFactory { private static final Logger LOG = Logger.getLogger(BloomFactory.class); private static Map<String, Filter> localBloomMap = new HashMap<String, Filter>(); public static final int DEFAULT_NUM_ELEMENTS = 10000000; public static final double DEFAULT_FALSE_POS_PROB = 0.005; public static final int DEFAULT_HASH_TYPE = Hash.JENKINS_HASH; public static final int NUMBER_OF_BLOOMS = 5; public static Filter NewBloomInstance() { return NewBloomInstance(DEFAULT_NUM_ELEMENTS, DEFAULT_FALSE_POS_PROB); } static Filter NewVesselBloom() { return new BloomFilter(); } public static Filter NewBloomInstance(int expectedNumberOfElements, double falsePositiveProbability) { return NewBloomInstance(Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2))) / Math.log(2), // c = k / ln(2) expectedNumberOfElements, (int) Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2)))); // k = ceil(-log_2(false prob.)) } public static Filter NewBloomInstance(double c, int n, int k) { LOG.info("Creating new Bloom filter C = " + c + " N = " + n + " K = " + k); BloomFilter dbf = new BloomFilter((int) Math.ceil(c * n), k, DEFAULT_HASH_TYPE); return dbf; } /** * Generic method for getting BloomFilter from a string. * First, the local map is checked for a bloom loaded from * the distributed cache. Next the bloom is attempted to be * parsed from UUencoded format. * @param name * @return */ public static Filter GetBloomFilter(String str) { Filter bloom = GetNamedBloomFilter(str); if (bloom == null) { try { bloom = ReadBloomFromString(str); return bloom; } catch (IOException e) { LOG.error(" Unable to get bloom for string " + str); return null; } } else { return bloom; } } public static Filter GetNamedBloomFilter(String name) { return localBloomMap.get(name); } public static void PutNamedBloomFilter(String name, Filter bloom) { localBloomMap.put(name, bloom); } public static Filter ReadBloomFromStream(InputStream stream) throws IOException { /// Need to UUDecode first, /// TODO - read bytes directly when hive handles byte arrays better ByteArrayOutputStream buffer = new ByteArrayOutputStream(); byte[] bufferArr = new byte[4096]; int len = 0; while ((len = stream.read(bufferArr, 0, 4096)) > 0) { buffer.write(bufferArr, 0, len); } if (buffer.size() == 0) { return BloomFactory.NewBloomInstance(); } return ReadBloomFromString(new String(buffer.toByteArray())); } public static void WriteBloomToStream(OutputStream stream, Filter bloom) throws IOException { stream.write(WriteBloomToString(bloom).getBytes()); stream.flush(); } public static Filter ReadBloomFromString(String str) throws IOException { if (str != null) { Filter filter = NewVesselBloom(); byte[] decoded = Base64.decodeBase64(str.getBytes()); DataInputStream dataInput = new DataInputStream(new ByteArrayInputStream(decoded)); filter.readFields(dataInput); return filter; } else { return NewBloomInstance(); } } public static String WriteBloomToString(Filter bloom) throws IOException { if (bloom != null) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); bloom.write(new DataOutputStream(buffer)); byte[] encodedBloom = Base64.encodeBase64(buffer.toByteArray()); return new String(encodedBloom); } else { return null; } } }