it.unimi.dsi.sux4j.mph.VLLcpMonotoneMinimalPerfectHashFunction.java Source code

Introduction

Here is the source code for it.unimi.dsi.sux4j.mph.VLLcpMonotoneMinimalPerfectHashFunction.java.
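
Before the listing, here is a minimal usage sketch (not part of the original source). It assumes a lexicographically sorted, duplicate-free key file named keys.txt and the prefix-free UTF-16 transformation; the class name VLLcpExample, the file name and the encoding are illustrative only, while the constructors and methods used are those shown in the listing below.

import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.sux4j.mph.VLLcpMonotoneMinimalPerfectHashFunction;

public class VLLcpExample {
    public static void main(String[] args) throws java.io.IOException {
        // Hypothetical input: a newline-separated, lexicographically sorted list of keys.
        final FileLinesCollection keys = new FileLinesCollection("keys.txt", "UTF-8");
        // Build the function; keys are turned into prefix-free bit vectors by the transformation.
        final VLLcpMonotoneMinimalPerfectHashFunction<CharSequence> mph =
                new VLLcpMonotoneMinimalPerfectHashFunction<>(keys, TransformationStrategies.prefixFreeUtf16());
        // For a key of the original file, getLong() returns its position in the sorted order.
        System.out.println(mph.getLong(keys.iterator().next()));
        System.out.println(mph.numBits() / (double) mph.size64() + " bits/key");
    }
}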

Source

package it.unimi.dsi.sux4j.mph;

/*       
 * Sux4J: Succinct data structures for Java
 *
 * Copyright (C) 2008-2016 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.BitVectors;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.bits.HuTuckerTransformationStrategy;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.bits.TransformationStrategy;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.ints.IntBigArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.io.OfflineIterable;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.io.ChunkedHashStore;
import it.unimi.dsi.sux4j.util.EliasFanoLongBigList;
import it.unimi.dsi.util.XorShift1024StarRandomGenerator;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;

import org.apache.commons.math3.random.RandomGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;

/** A monotone minimal perfect hash implementation based on fixed-size bucketing that uses
 * longest common prefixes as distributors, and stores their lengths in an {@link EliasFanoLongBigList}
 * indexed by a {@link GOVMinimalPerfectHashFunction}. In theory, this function should use less memory
 * than an {@link LcpMonotoneMinimalPerfectHashFunction} when the lengths of the longest common prefixes vary
 * wildly, but in practice a {@link TwoStepsLcpMonotoneMinimalPerfectHashFunction} is often a better choice.
 */

public class VLLcpMonotoneMinimalPerfectHashFunction<T> extends AbstractHashFunction<T>
        implements Serializable, Size64 {
    public static final long serialVersionUID = 3L;
    private static final Logger LOGGER = LoggerFactory.getLogger(VLLcpMonotoneMinimalPerfectHashFunction.class);
    private static final boolean DEBUG = false;

    /** The number of elements. */
    protected final long n;
    /** The size of a bucket. */
    protected final int bucketSize;
    /** {@link Fast#ceilLog2(int)} of {@link #bucketSize}. */
    protected final int log2BucketSize;
    /** The mask for {@link #log2BucketSize} bits. */
    protected final int bucketSizeMask;
    /** A function mapping each element to a distinct index. */
    protected final GOVMinimalPerfectHashFunction<BitVector> mph;
    /** A list, indexed by {@link #mph}, containing the offset of each element inside its bucket. */
    protected final LongBigList offsets;
    /** A list, indexed by {@link #mph}, containing for each element the length of the longest common prefix of its bucket. */
    protected final EliasFanoLongBigList lcpLengths;
    /** A function mapping each longest common prefix to its bucket. */
    protected final GOV3Function<BitVector> lcp2Bucket;
    /** The transformation strategy. */
    protected final TransformationStrategy<? super T> transform;
    /** The seed to be used when converting keys to triples. */
    private long seed;

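    /** Returns the position of the given key in the original (lexicographically sorted) key
     * sequence: the key is hashed, its index is retrieved through {@link #mph}, the length of its
     * bucket's longest common prefix is read from {@link #lcpLengths}, that prefix is mapped to a
     * bucket number by {@link #lcp2Bucket}, and finally the in-bucket offset stored in
     * {@link #offsets} is added.
     */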
    @SuppressWarnings("unchecked")
    public long getLong(final Object o) {
        if (n == 0)
            return defRetValue;
        final BitVector bitVector = transform.toBitVector((T) o).fast();
        final long[] triple = new long[3];
        Hashes.spooky4(bitVector, seed, triple);
        final long index = mph.getLongByTriple(triple);
        if (index == -1)
            return defRetValue;
        final long prefix = lcpLengths.getLong(index);
        if (prefix == -1 || prefix > bitVector.length())
            return defRetValue;
        return (lcp2Bucket.getLong(bitVector.subVector(0, prefix)) << log2BucketSize) + offsets.getLong(index);
    }

    public VLLcpMonotoneMinimalPerfectHashFunction(final Iterable<? extends T> iterable,
            final TransformationStrategy<? super T> transform) throws IOException {
        this(iterable, -1, transform);
    }

    @SuppressWarnings("unused")
    public VLLcpMonotoneMinimalPerfectHashFunction(final Iterable<? extends T> iterable, final int numElements,
            final TransformationStrategy<? super T> transform) throws IOException {

        final ProgressLogger pl = new ProgressLogger(LOGGER);
        pl.displayLocalSpeed = true;
        pl.displayFreeMemory = true;
        this.transform = transform;
        final RandomGenerator r = new XorShift1024StarRandomGenerator();

        if (numElements == -1) {
            if (iterable instanceof Size64)
                n = ((Size64) iterable).size64();
            else if (iterable instanceof Collection)
                n = ((Collection<?>) iterable).size();
            else {
                long c = 0;
                for (T dummy : iterable)
                    c++;
                n = c;
            }
        } else
            n = numElements;

        if (n == 0) {
            bucketSize = bucketSizeMask = log2BucketSize = 0;
            lcp2Bucket = null;
            offsets = null;
            lcpLengths = null;
            mph = null;
            return;
        }

        defRetValue = -1; // For the very few cases in which we can decide

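        // Round the theoretically sufficient bucket size up to a power of two, so that in-bucket
        // offsets fit in log2BucketSize bits and bucket arithmetic reduces to shifts and masks.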
        int theoreticalBucketSize = (int) Math
                .ceil(1 + GOV3Function.C * Math.log(2) + Math.log(n) - Math.log(1 + Math.log(n)));
        log2BucketSize = Fast.ceilLog2(theoreticalBucketSize);
        bucketSize = 1 << log2BucketSize;
        bucketSizeMask = bucketSize - 1;

        final long numBuckets = (n + bucketSize - 1) / bucketSize;

        LongArrayBitVector prev = LongArrayBitVector.getInstance();
        LongArrayBitVector curr = LongArrayBitVector.getInstance();
        int currLcp = 0;
        int maxLcp = 0, minLcp = Integer.MAX_VALUE;
        long maxLength = 0, totalLength = 0;

        @SuppressWarnings("resource")
        final ChunkedHashStore<BitVector> chunkedHashStore = new ChunkedHashStore<BitVector>(
                TransformationStrategies.identity(), pl);
        chunkedHashStore.reset(r.nextLong());
        @SuppressWarnings("resource")
        OfflineIterable<BitVector, LongArrayBitVector> lcps = new OfflineIterable<BitVector, LongArrayBitVector>(
                BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance());
        pl.expectedUpdates = n;
        pl.start("Scanning collection...");

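        // Scan the keys bucket by bucket: every key is added to the chunked hash store and, for each
        // bucket, the length of the longest common prefix of its keys is computed, checking along the
        // way that the transformed bit vectors are distinct, prefix-free and lexicographically sorted.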
        Iterator<? extends T> iterator = iterable.iterator();
        for (long b = 0; b < numBuckets; b++) {
            prev.replace(transform.toBitVector(iterator.next()));
            chunkedHashStore.add(prev);
            pl.lightUpdate();
            maxLength = Math.max(maxLength, prev.length());
            totalLength += Fast.length(1 + prev.length());
            currLcp = (int) prev.length();
            final int currBucketSize = (int) Math.min(bucketSize, n - b * bucketSize);

            for (int i = 0; i < currBucketSize - 1; i++) {
                curr.replace(transform.toBitVector(iterator.next()));
                chunkedHashStore.add(curr);
                pl.lightUpdate();
                final int prefix = (int) curr.longestCommonPrefixLength(prev);
                if (prefix == prev.length() && prefix == curr.length())
                    throw new IllegalArgumentException("The input bit vectors are not distinct");
                if (prefix == prev.length() || prefix == curr.length())
                    throw new IllegalArgumentException("The input bit vectors are not prefix-free");
                if (prev.getBoolean(prefix))
                    throw new IllegalArgumentException("The input bit vectors are not lexicographically sorted");

                currLcp = Math.min(prefix, currLcp);
                prev.replace(curr);

                maxLength = Math.max(maxLength, prev.length());
                totalLength += Fast.length(1 + prev.length());
            }

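            // Record this bucket's longest common prefix; it will act as the bucket's distributor.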
            lcps.add(prev.subVector(0, currLcp));
            maxLcp = Math.max(maxLcp, currLcp);
            minLcp = Math.min(minLcp, currLcp);
        }

        pl.done();

        // Build function assigning each lcp to its bucket.
        lcp2Bucket = new GOV3Function.Builder<BitVector>().keys(lcps).transform(TransformationStrategies.identity())
                .build();
        final int[][] lcpLength = IntBigArrays.newBigArray(lcps.size64());
        long p = 0;
        for (LongArrayBitVector bv : lcps)
            IntBigArrays.set(lcpLength, p++, (int) bv.length());

        if (DEBUG) {
            for (BitVector v : lcps)
                System.err.println(v + " " + v.length());
            p = 0;
            for (BitVector v : lcps) {
                final long value = lcp2Bucket.getLong(v);
                if (p++ != value) {
                    System.err.println("p: " + (p - 1) + "  value: " + value + " key:" + v);
                    throw new AssertionError();
                }
            }
        }

        lcps.close();

        final Iterable<BitVector> bitVectors = TransformationStrategies.wrap(iterable, transform);
        // Build mph on elements.
        mph = new GOVMinimalPerfectHashFunction.Builder<BitVector>().keys(bitVectors)
                .transform(TransformationStrategies.identity()).store(chunkedHashStore).build();
        this.seed = chunkedHashStore.seed();

        // Build function assigning the lcp length and the bucketing data to each element.
        (offsets = LongArrayBitVector.getInstance().asLongBigList(log2BucketSize)).size(n);
        LongBigList lcpLengthsTemp = LongArrayBitVector.getInstance().asLongBigList(Fast.length(maxLcp));
        lcpLengthsTemp.size(n);

        LOGGER.info("Generating data tables...");

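        // Each chunk entry is a quadruple: the key's hash triple plus its ordinal position in the
        // input. The low bits of the position give the offset inside the bucket, the high bits give
        // the bucket index, which selects the bucket's LCP length computed during the scan.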
        for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {
            for (long[] quadruple : chunk) {
                final long index = mph.getLongByTriple(quadruple);
                offsets.set(index, quadruple[3] & bucketSizeMask);
                lcpLengthsTemp.set(index, IntBigArrays.get(lcpLength, (int) (quadruple[3] >> log2BucketSize)));
            }
        }

        chunkedHashStore.close();

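        // Compress the per-key LCP lengths into an Elias-Fano list, using the minimum LCP length as a lower bound.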
        lcpLengths = new EliasFanoLongBigList(lcpLengthsTemp.iterator(), minLcp, true);

        if (DEBUG) {
            p = 0;
            for (T key : iterable) {
                BitVector bv = transform.toBitVector(key);
                long index = mph.getLong(bv);
                if (p++ != lcp2Bucket.getLong(bv.subVector(0, lcpLengths.getLong(index))) * bucketSize
                        + offsets.getLong(index)) {
                    System.err.println("p: " + (p - 1) + "  Key: " + key + " bucket size: " + bucketSize + " lcp "
                            + transform.toBitVector(key).subVector(0, lcpLengths.getLong(index)) + " lcp length: "
                            + lcpLengths.getLong(index) + " bucket "
                            + lcp2Bucket.getLong(transform.toBitVector(key).subVector(0, lcpLengths.getLong(index)))
                            + " offset: " + offsets.getLong(index));
                    throw new AssertionError();
                }
            }
        }

        LOGGER.debug("Bucket size: " + bucketSize);
        final double avgLength = (double) totalLength / n;
        LOGGER.debug("Forecast bit cost per element: " + (2 * GOV3Function.C + 2 + avgLength + Fast.log2(avgLength)
                + Fast.log2(Math.E) - Fast.log2(Fast.log2(Math.E)) + Fast.log2(1 + Fast.log2(n))));
        LOGGER.info("Actual bit cost per element: " + (double) numBits() / n);
    }

    public long size64() {
        return n;
    }

    /** Returns the number of bits used by this structure.
     * 
     * @return the number of bits used by this structure.
     */
    public long numBits() {
        if (n == 0)
            return 0;
        return offsets.size64() * log2BucketSize + lcpLengths.numBits() + lcp2Bucket.numBits() + mph.numBits()
                + transform.numBits();
    }

    public static void main(final String[] arg) throws NoSuchMethodException, IOException, JSAPException {

        final SimpleJSAP jsap = new SimpleJSAP(VLLcpMonotoneMinimalPerfectHashFunction.class.getName(),
                "Builds a variable-length LCP-based monotone minimal perfect hash function reading a newline-separated list of strings.",
                new Parameter[] {
                        new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8",
                                JSAP.NOT_REQUIRED, 'e', "encoding", "The string file encoding."),
                        new Switch("huTucker", 'h', "hu-tucker", "Use Hu-Tucker coding to reduce string length."),
                        new Switch("iso", 'i', "iso",
                                "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."),
                        new Switch("utf32", JSAP.NO_SHORTFLAG, "utf-32",
                                "Use UTF-32 internally (handles surrogate pairs)."),
                        new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."),
                        new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
                                JSAP.NOT_GREEDY,
                                "The filename for the serialised monotone minimal perfect hash function."),
                        new UnflaggedOption("stringFile", JSAP.STRING_PARSER, "-", JSAP.NOT_REQUIRED,
                                JSAP.NOT_GREEDY,
                                "The name of a file containing a newline-separated list of strings, or - for standard input; in the first case, strings will not be loaded into core memory."), });

        JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted())
            return;

        final String functionName = jsapResult.getString("function");
        final String stringFile = jsapResult.getString("stringFile");
        final Charset encoding = (Charset) jsapResult.getObject("encoding");
        final boolean zipped = jsapResult.getBoolean("zipped");
        final boolean iso = jsapResult.getBoolean("iso");
        final boolean utf32 = jsapResult.getBoolean("utf32");
        final boolean huTucker = jsapResult.getBoolean("huTucker");

        final Collection<MutableString> collection;
        if ("-".equals(stringFile)) {
            final ProgressLogger pl = new ProgressLogger(LOGGER);
            pl.displayLocalSpeed = true;
            pl.displayFreeMemory = true;
            pl.start("Loading strings...");
            collection = new LineIterator(
                    new FastBufferedReader(
                            new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding)),
                    pl).allLines();
            pl.done();
        } else
            collection = new FileLinesCollection(stringFile, encoding.toString(), zipped);
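        // Pick a prefix-free transformation strategy (Hu-Tucker, ISO-8859-1, UTF-32 or UTF-16)
        // according to the command-line switches.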
        final TransformationStrategy<CharSequence> transformationStrategy = huTucker
                ? new HuTuckerTransformationStrategy(collection, true)
                : iso ? TransformationStrategies.prefixFreeIso()
                        : utf32 ? TransformationStrategies.prefixFreeUtf32()
                                : TransformationStrategies.prefixFreeUtf16();

        BinIO.storeObject(
                new VLLcpMonotoneMinimalPerfectHashFunction<CharSequence>(collection, transformationStrategy),
                functionName);
        LOGGER.info("Completed.");
    }
}