com.addthis.hydra.data.MakeBloom.java Source code

Java tutorial

Introduction

Here is the source code for com.addthis.hydra.data.MakeBloom.java

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.data;

import java.io.File;

import com.addthis.basis.util.LessFiles;

import com.addthis.maljson.JSONArray;

import com.clearspring.analytics.stream.membership.BloomFilter;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * <h1>MakeBloom</h1>
 *
 * <p>Create a Bloom Filter out of a string list and export it.</p>
 *
 * <p>Command-line usage: {@code MakeBloom word-list-file [bloom-file]}. The target false-positive
 * rate is taken from the {@code fpRate} system property and defaults to {@code 0.001}.</p>
 */
public class MakeBloom {

    private static final Logger log = LoggerFactory.getLogger(MakeBloom.class);

    /**
     * Text of the {@code fpRate} system property (default {@code "0.001"}). Kept as text because it is
     * embedded verbatim in the default output file name.
     */
    private static final String FP_RATE_TEXT = System.getProperty("fpRate", "0.001");

    /**
     * {@link #FP_RATE_TEXT} parsed as a double. A non-numeric property value makes
     * {@link Double#parseDouble(String)} throw while this class is being initialized.
     */
    private static final double FP_RATE = Double.parseDouble(FP_RATE_TEXT);

    /**
     * Gets all the quoted words from a "frag" file.
     *
     * <p>The file content is decoded as UTF-8, wrapped in {@code [} and {@code ]} and parsed as a JSON
     * array; every element of that array is then read back as a string.</p>
     *
     * @param in the file to read
     * @return the array elements as strings, in array order
     * @throws java.io.IOException if reading the file fails
     * @throws com.addthis.maljson.JSONException if the wrapped content cannot be parsed as a JSON array
     *         or an element cannot be returned as a string
     */
    public static String[] getWords(File in) throws java.io.IOException, com.addthis.maljson.JSONException {
        log.debug("Reading {} bytes from [{}]", in.length(), in);
        // Use the charset constant instead of the "utf8" alias: no runtime lookup by name.
        String content = new String(LessFiles.read(in), java.nio.charset.StandardCharsets.UTF_8);
        JSONArray words = new JSONArray("[" + content + "]");
        int count = words.length();
        log.debug("Read {} words from [{}]", count, in);
        String[] ret = new String[count];
        for (int i = 0; i < count; i++) {
            ret[i] = words.getString(i);
        }
        return ret;
    }

    /**
     * Builds a Bloom filter from a word-list file and writes it, Base64-encoded, to a file.
     *
     * <p>The filter is created for the number of words read and the configured false-positive rate.
     * When no output file is given, the output file is derived from the input file through
     * {@code LessFiles.replaceSuffix} with the suffix {@code -<word count>-<fpRate>.bloom}.</p>
     *
     * @param args {@code word-list-file [bloom-file]}
     * @throws IllegalArgumentException if the number of arguments is not 1 or 2
     * @throws java.io.IOException if reading the word list or writing the filter fails
     * @throws com.addthis.maljson.JSONException if the word list cannot be parsed
     */
    public static void main(String[] args) throws java.io.IOException, com.addthis.maljson.JSONException {
        if (args.length != 1 && args.length != 2) {
            throw new IllegalArgumentException("usage: MakeBloom word-list-file [bloom-file]");
        }

        File in = new File(args[0]);
        String[] words = getWords(in);

        BloomFilter bf = new BloomFilter(words.length, FP_RATE);
        log.debug("Created: BloomFilter({} buckets, {} hashes); FP rate = {}",
                bf.buckets(), bf.getHashCount(), FP_RATE);
        for (String word : words) {
            bf.add(word);
        }
        log.debug("Added words");

        File out = args.length == 2 ? new File(args[1])
                : LessFiles.replaceSuffix(in, "-" + words.length + "-" + FP_RATE_TEXT + ".bloom");
        log.debug("Writing [{}]", out);
        // NOTE(review): the trailing 'false' is presumably the append flag (i.e. overwrite) -- confirm against LessFiles.write.
        LessFiles.write(out, org.apache.commons.codec.binary.Base64.encodeBase64(BloomFilter.serialize(bf)), false);
        log.debug("Wrote {} bytes to [{}]", out.length(), out);
    }
}