it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction.java Source code

Introduction

Here is the source code for it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction.java
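
Before looking at the full source, here is a minimal sketch of how the class can be used through its nested Builder. The key list, the example class name and the output file name are illustrative; the Builder methods, TransformationStrategies.prefixFreeUtf16() and BinIO.storeObject() all appear in the source below.

import java.util.Arrays;
import java.util.List;

import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction;

public class TwoStepsLcpMphExample {
    public static void main(final String[] args) throws Exception {
        // Keys must be distinct and lexicographically sorted; the prefix-free UTF-16
        // transformation below also makes their bit vectors prefix-free, as required.
        final List<String> keys = Arrays.asList("apple", "banana", "cherry", "date");

        final TwoStepsLcpMonotoneMinimalPerfectHashFunction<CharSequence> mph =
                new TwoStepsLcpMonotoneMinimalPerfectHashFunction.Builder<CharSequence>()
                        .keys(keys)
                        .transform(TransformationStrategies.prefixFreeUtf16())
                        .build();

        // Each key is mapped to its rank in lexicographical order.
        for (final String key : keys)
            System.out.println(key + " -> " + mph.getLong(key));

        // The function is Serializable, so it can be stored for later use.
        BinIO.storeObject(mph, "example.mph");
    }
}

Keys outside the original set return essentially arbitrary values; specifying a signature width via Builder.signed(int) lets the function reject most such keys.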

Source

package it.unimi.dsi.sux4j.mph;

/*       
 * Sux4J: Succinct data structures for Java
 *
 * Copyright (C) 2008-2016 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.BitVectors;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.bits.TransformationStrategy;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.ints.IntBigArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.AbstractLongBigList;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.io.OfflineIterable;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.io.ChunkedHashStore;
import it.unimi.dsi.util.XorShift1024StarRandomGenerator;

import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;

import org.apache.commons.math3.random.RandomGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.FileStringParser;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;

/** A monotone minimal perfect hash implementation based on fixed-size bucketing that uses 
 * longest common prefixes as distributors, and stores their lengths using a {@link TwoStepsGOV3Function}.
 * 
 * <p>This implementation should use a few bits less per element than {@link LcpMonotoneMinimalPerfectHashFunction},
 * but it is a bit slower as one or two additional functions must be queried.
 * 
 * <p>See the {@linkplain it.unimi.dsi.sux4j.mph package overview} for a comparison with other implementations.
 * Similarly to a {@link GOV3Function}, an instance of this class may be <em>{@linkplain Builder#signed(int) signed}</em>.
 */

public class TwoStepsLcpMonotoneMinimalPerfectHashFunction<T> extends AbstractHashFunction<T>
        implements Size64, Serializable {
    public static final long serialVersionUID = 4L;
    private static final Logger LOGGER = LoggerFactory
            .getLogger(TwoStepsLcpMonotoneMinimalPerfectHashFunction.class);
    private static final boolean DEBUG = false;
    private static final boolean ASSERTS = false;

    /** A builder class for {@link TwoStepsLcpMonotoneMinimalPerfectHashFunction}. */
    public static class Builder<T> {
        protected Iterable<? extends T> keys;
        protected TransformationStrategy<? super T> transform;
        protected long numKeys = -1;
        protected int signatureWidth;
        protected File tempDir;
        /** Whether {@link #build()} has already been called. */
        protected boolean built;

        /** Specifies the keys to hash.
         * 
         * @param keys the keys to hash.
         * @return this builder.
         */
        public Builder<T> keys(final Iterable<? extends T> keys) {
            this.keys = keys;
            return this;
        }

        /** Specifies the number of keys.
         * 
         * <p>The argument must be equal to the number of keys returned by an iterator
         * generated by {@link #keys(Iterable) the set of keys}. Without this information,
         * a first scan of the key set will be necessary to compute its cardinality,
         * unless the set of keys implements {@link Size64} or {@link Collection}.
         * 
         * @param numKeys the number of keys to hash.
         * @return this builder.
         */
        public Builder<T> numKeys(final long numKeys) {
            this.numKeys = numKeys;
            return this;
        }

        /** Specifies the transformation strategy for the {@linkplain #keys(Iterable) keys to hash}.
         * 
         * @param transform a transformation strategy for the {@linkplain #keys(Iterable) keys to hash}.
         * @return this builder.
         */
        public Builder<T> transform(final TransformationStrategy<? super T> transform) {
            this.transform = transform;
            return this;
        }

        /** Specifies that the resulting {@link TwoStepsLcpMonotoneMinimalPerfectHashFunction} should be signed using a given number of bits per key.
         * 
         * @param signatureWidth a signature width, or 0 for no signature.
         * @return this builder.
         */
        public Builder<T> signed(final int signatureWidth) {
            this.signatureWidth = signatureWidth;
            return this;
        }

        /** Specifies a temporary directory for the {@link ChunkedHashStore}.
         * 
         * @param tempDir a temporary directory for the {@link ChunkedHashStore} files, or {@code null} for the standard temporary directory.
         * @return this builder.
         */
        public Builder<T> tempDir(final File tempDir) {
            this.tempDir = tempDir;
            return this;
        }

        /** Builds a two-steps LCP monotone minimal perfect hash function.
         * 
         * @return a {@link TwoStepsLcpMonotoneMinimalPerfectHashFunction} instance with the specified parameters.
         * @throws IllegalStateException if called more than once.
         */
        public TwoStepsLcpMonotoneMinimalPerfectHashFunction<T> build() throws IOException {
            if (built)
                throw new IllegalStateException("This builder has already been used");
            built = true;
            return new TwoStepsLcpMonotoneMinimalPerfectHashFunction<T>(keys, numKeys, transform, signatureWidth,
                    tempDir);
        }
    }

    /** The number of elements. */
    protected final long n;
    /** The size of a bucket. */
    protected final int bucketSize;
    /** {@link Fast#ceilLog2(int)} of {@link #bucketSize}. */
    protected final int log2BucketSize;
    /** The mask for {@link #log2BucketSize} bits. */
    protected final int bucketSizeMask;
    /** A function mapping each element to the offset inside its bucket. */
    protected final GOV3Function<BitVector> offsets;
    /** A function mapping each element to the length of the longest common prefix of its bucket. */
    protected final TwoStepsGOV3Function<BitVector> lcpLengths;
    /** A function mapping each longest common prefix to its bucket. */
    protected final GOV3Function<BitVector> lcp2Bucket;
    /** The transformation strategy. */
    protected final TransformationStrategy<? super T> transform;
    /** The seed returned by the {@link ChunkedHashStore}. */
    protected final long seed;
    /** The mask to compare signatures, or zero for no signatures. */
    protected final long signatureMask;
    /** The signatures. */
    protected final LongBigList signatures;

    /**
     * Creates a new two-steps LCP monotone minimal perfect hash function for the given keys.
     * 
     * @param keys the keys to hash.
     * @param numKeys the number of keys, or -1 if the number of keys is not known (will be computed).
     * @param transform a transformation strategy for the keys.
     * @param signatureWidth a signature width, or 0 for no signature.
     * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
     */
    @SuppressWarnings("unused")
    protected TwoStepsLcpMonotoneMinimalPerfectHashFunction(final Iterable<? extends T> keys, final long numKeys,
            final TransformationStrategy<? super T> transform, final int signatureWidth, final File tempDir)
            throws IOException {
        final ProgressLogger pl = new ProgressLogger(LOGGER);
        pl.displayLocalSpeed = true;
        pl.displayFreeMemory = true;
        this.transform = transform;
        final RandomGenerator r = new XorShift1024StarRandomGenerator();

        if (numKeys == -1) {
            if (keys instanceof Size64)
                n = ((Size64) keys).size64();
            else if (keys instanceof Collection)
                n = ((Collection<?>) keys).size();
            else {
                long c = 0;
                for (T dummy : keys)
                    c++;
                n = c;
            }
        } else
            n = numKeys;

        defRetValue = -1; // For the very few cases in which we can decide

        if (n == 0) {
            seed = bucketSize = bucketSizeMask = log2BucketSize = 0;
            lcp2Bucket = null;
            offsets = null;
            lcpLengths = null;
            signatureMask = 0;
            signatures = null;
            return;
        }

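        /* Note: the bucket size is derived from n and the GOV3Function constant C and
         * then rounded up to a power of two, so that the offset of a key inside its
         * bucket fits in log2BucketSize bits and can be extracted with bucketSizeMask.
         * (Rough, approximate example: for n around one million and C close to 1.1 the
         * expression below is about 13, giving a bucket size of 16.) */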
        int t = (int) Math.ceil(1 + GOV3Function.C * Math.log(2) + Math.log(n) - Math.log(1 + Math.log(n)));
        log2BucketSize = Fast.ceilLog2(t);
        bucketSize = 1 << log2BucketSize;
        bucketSizeMask = bucketSize - 1;
        LOGGER.debug("Bucket size: " + bucketSize);

        final long numBuckets = (n + bucketSize - 1) / bucketSize;

        LongArrayBitVector prev = LongArrayBitVector.getInstance();
        LongArrayBitVector curr = LongArrayBitVector.getInstance();
        int currLcp = 0;
        @SuppressWarnings("resource")
        final OfflineIterable<BitVector, LongArrayBitVector> lcps = new OfflineIterable<BitVector, LongArrayBitVector>(
                BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance());
        final int[][] lcpLengths = IntBigArrays.newBigArray((n + bucketSize - 1) / bucketSize);
        int maxLcp = 0;
        long maxLength = 0;

        @SuppressWarnings("resource")
        final ChunkedHashStore<BitVector> chunkedHashStore = new ChunkedHashStore<BitVector>(
                TransformationStrategies.identity(), pl);
        chunkedHashStore.reset(r.nextLong());
        pl.expectedUpdates = n;
        pl.start("Scanning collection...");

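        /* Scan the keys bucket by bucket: every key is added to the ChunkedHashStore,
         * the input is checked to be distinct, prefix-free and lexicographically sorted,
         * and for each bucket the longest common prefix of its keys is recorded (the
         * prefix itself goes into the lcps offline iterable, its length into lcpLengths). */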
        Iterator<? extends T> iterator = keys.iterator();
        for (long b = 0; b < numBuckets; b++) {
            prev.replace(transform.toBitVector(iterator.next()));
            chunkedHashStore.add(prev);
            pl.lightUpdate();
            maxLength = Math.max(maxLength, prev.length());
            currLcp = (int) prev.length();
            final int currBucketSize = (int) Math.min(bucketSize, n - b * bucketSize);

            for (int i = 0; i < currBucketSize - 1; i++) {
                curr.replace(transform.toBitVector(iterator.next()));
                chunkedHashStore.add(curr);
                pl.lightUpdate();
                final int prefix = (int) curr.longestCommonPrefixLength(prev);
                if (prefix == prev.length() && prefix == curr.length())
                    throw new IllegalArgumentException("The input bit vectors are not distinct");
                if (prefix == prev.length() || prefix == curr.length())
                    throw new IllegalArgumentException("The input bit vectors are not prefix-free");
                if (prev.getBoolean(prefix))
                    throw new IllegalArgumentException("The input bit vectors are not lexicographically sorted");

                currLcp = Math.min(prefix, currLcp);
                prev.replace(curr);

                maxLength = Math.max(maxLength, prev.length());
            }

            lcps.add(prev.subVector(0, currLcp));
            IntBigArrays.set(lcpLengths, b, currLcp);
            maxLcp = Math.max(maxLcp, currLcp);
        }

        pl.done();

        // We must be sure that both functions are built on the same store.
        chunkedHashStore.checkAndRetry(TransformationStrategies.wrap(keys, transform));
        this.seed = chunkedHashStore.seed();

        if (ASSERTS) {
            ObjectOpenHashSet<BitVector> s = new ObjectOpenHashSet<BitVector>();
            for (LongArrayBitVector bv : lcps)
                s.add(bv.copy());
            assert s.size() == lcps.size() : s.size() + " != " + lcps.size(); // No duplicates.
        }

        // Build function assigning each lcp to its bucket.
        lcp2Bucket = new GOV3Function.Builder<BitVector>().keys(lcps).transform(TransformationStrategies.identity())
                .build();

        if (DEBUG) {
            int p = 0;
            for (BitVector v : lcps)
                System.err.println(v + " " + v.length());
            for (BitVector v : lcps) {
                final long value = lcp2Bucket.getLong(v);
                if (p++ != value) {
                    System.err.println("p: " + (p - 1) + "  value: " + value + " key:" + v);
                    throw new AssertionError();
                }
            }
        }

        lcps.close();

        // Build function assigning the bucket offset to each element.
        offsets = new GOV3Function.Builder<BitVector>().store(chunkedHashStore).values(new AbstractLongBigList() {
            public long getLong(long index) {
                return index & bucketSizeMask;
            }

            public long size64() {
                return n;
            }
        }, log2BucketSize).indirect().build();

        // Build function assigning the lcp length to each element.
        this.lcpLengths = new TwoStepsGOV3Function.Builder<BitVector>().store(chunkedHashStore)
                .values(new AbstractLongBigList() {
                    public long getLong(long index) {
                        return IntBigArrays.get(lcpLengths, index >>> log2BucketSize);
                    }

                    public long size64() {
                        return n;
                    }
                }).build();

        // Forecast the best threshold and the expected bit cost of the two-steps function.
        final double p = 1.0 / (this.lcpLengths.rankMean + 1);
        final double s = s(p, this.lcpLengths.width);

        LOGGER.debug("Forecast best threshold: " + s);

        if (DEBUG) {
            int j = 0;
            for (T key : keys) {
                BitVector bv = transform.toBitVector(key);
                if (j++ != lcp2Bucket.getLong(bv.subVector(0, this.lcpLengths.getLong(bv))) * bucketSize
                        + offsets.getLong(bv)) {
                    System.err.println("p: " + (j - 1) + "  Key: " + key + " bucket size: " + bucketSize + " lcp "
                            + transform.toBitVector(key).subVector(0, this.lcpLengths.getLong(bv)) + " lcp length: "
                            + this.lcpLengths.getLong(bv) + " bucket "
                            + lcp2Bucket
                                    .getLong(transform.toBitVector(key).subVector(0, this.lcpLengths.getLong(bv)))
                            + " offset: " + offsets.getLong(bv));
                    throw new AssertionError();
                }
            }
        }

        double secondFunctionForecastBitsPerElement = (s + GOV3Function.C
                + (Math.pow(2, s) - 1) * this.lcpLengths.width / n
                + (this.lcpLengths.width + GOV3Function.C) * (Math.pow(1 - p, Math.pow(2, s) + 1)));

        LOGGER.debug("Forecast bit cost per element: "
                + (log2BucketSize + GOV3Function.C + secondFunctionForecastBitsPerElement + (Fast.log2(Math.E))));
        LOGGER.info("Actual bit cost per element: " + (double) numBits() / n);

        if (signatureWidth != 0) {
            signatureMask = -1L >>> Long.SIZE - signatureWidth;
            chunkedHashStore.filter(null); // two-steps functions use filtering.
            signatures = chunkedHashStore.signatures(signatureWidth, pl);
        } else {
            signatureMask = 0;
            signatures = null;
        }

        chunkedHashStore.close();
    }

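    /* Helpers used by the constructor to forecast the best threshold and the expected
     * bit cost per element of the second-step function (see the debug logging above). */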
    private static double W(double x) {
        return -Math.log(-1 / x) - Math.log(Math.log(-1 / x));
    }

    private static double s(double p, int r) {
        return Fast.log2(W(1 / (Math.log(2) * (r + GOV3Function.C) * (p - 1))) / Math.log(1 - p));
    }

    public long size64() {
        return n;
    }

    /** Returns the number of bits used by this structure.
     * 
     * @return the number of bits used by this structure.
     */
    public long numBits() {
        if (n == 0)
            return 0;
        return offsets.numBits() + lcpLengths.numBits() + lcp2Bucket.numBits() + transform.numBits();
    }

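    /* Lookup: hash the transformed key to a triple using the store seed, retrieve the
     * LCP length of its bucket, map the corresponding prefix to a bucket index with
     * lcp2Bucket, and add the in-bucket offset; when signatures are present, the result
     * is additionally validated against the stored signature. */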
    @SuppressWarnings("unchecked")
    public long getLong(final Object o) {
        if (n == 0)
            return defRetValue;
        final BitVector bitVector = transform.toBitVector((T) o).fast();
        final long[] triple = new long[3];
        Hashes.spooky4(bitVector, seed, triple);
        final long prefix = lcpLengths.getLongByTriple(triple);
        if (prefix == -1 || prefix > bitVector.length())
            return defRetValue;
        final long result = (lcp2Bucket.getLong(bitVector.subVector(0, prefix)) << log2BucketSize)
                + offsets.getLongByTriple(triple);
        if (signatureMask != 0)
            return result < 0 || result >= n || signatures.getLong(result) != (triple[0] & signatureMask)
                    ? defRetValue
                    : result;
        // Out-of-set strings can generate bizarre 3-hyperedges.
        return result < 0 || result >= n ? defRetValue : result;
    }

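    /* Variant of getLong() for callers that have already transformed the key into a
     * bit vector and computed its hash triple. */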
    public long getLongByBitVectorAndTriple(final BitVector bitVector, final long[] triple) {
        if (n == 0)
            return defRetValue;
        final long prefix = lcpLengths.getLongByTriple(triple);
        if (prefix == -1 || prefix > bitVector.length())
            return defRetValue;
        final long result = (lcp2Bucket.getLong(bitVector.subVector(0, prefix)) << log2BucketSize)
                + offsets.getLongByTriple(triple);
        if (signatureMask != 0)
            return result < 0 || result >= n || signatures.getLong(result) != (triple[0] & signatureMask)
                    ? defRetValue
                    : result;
        // Out-of-set strings can generate bizarre 3-hyperedges.
        return result < 0 || result >= n ? defRetValue : result;
    }

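    /* Command-line entry point. An illustrative invocation (file names are hypothetical):
     *
     *   java it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction \
     *       -z mph.ser strings.gz
     *
     * reads a gzipped, newline-separated, lexicographically sorted list of strings from
     * strings.gz and serialises the resulting function to mph.ser. */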
    public static void main(final String[] arg) throws NoSuchMethodException, IOException, JSAPException {

        final SimpleJSAP jsap = new SimpleJSAP(TwoStepsLcpMonotoneMinimalPerfectHashFunction.class.getName(),
                "Builds a two-steps LCP-based monotone minimal perfect hash function reading a newline-separated list of strings.",
                new Parameter[] {
                        new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8",
                                JSAP.NOT_REQUIRED, 'e', "encoding", "The string file encoding."),
                        new FlaggedOption("tempDir", FileStringParser.getParser(), JSAP.NO_DEFAULT,
                                JSAP.NOT_REQUIRED, 'T', "temp-dir", "A directory for temporary files."),
                        new Switch("huTucker", 'h', "hu-tucker", "Use Hu-Tucker coding to reduce string length."),
                        new Switch("iso", 'i', "iso",
                                "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."),
                        new Switch("utf32", JSAP.NO_SHORTFLAG, "utf-32",
                                "Use UTF-32 internally (handles surrogate pairs)."),
                        new FlaggedOption("signatureWidth", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                                's', "signature-width",
                                "If specified, the signature width in bits; if negative, the generated function will be a dictionary."),
                        new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."),
                        new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
                                JSAP.NOT_GREEDY,
                                "The filename for the serialised monotone minimal perfect hash function."),
                        new UnflaggedOption("stringFile", JSAP.STRING_PARSER, "-", JSAP.NOT_REQUIRED,
                                JSAP.NOT_GREEDY,
                                "The name of a file containing a newline-separated list of strings, or - for standard input; in the first case, strings will not be loaded into core memory."), });

        JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted())
            return;

        final String functionName = jsapResult.getString("function");
        final String stringFile = jsapResult.getString("stringFile");
        final Charset encoding = (Charset) jsapResult.getObject("encoding");
        final File tempDir = jsapResult.getFile("tempDir");
        final boolean zipped = jsapResult.getBoolean("zipped");
        final boolean iso = jsapResult.getBoolean("iso");
        final boolean utf32 = jsapResult.getBoolean("utf32");
        final int signatureWidth = jsapResult.getInt("signatureWidth", 0);

        final Collection<MutableString> collection;
        if ("-".equals(stringFile)) {
            final ProgressLogger pl = new ProgressLogger(LOGGER);
            pl.displayLocalSpeed = true;
            pl.displayFreeMemory = true;
            pl.start("Loading strings...");
            collection = new LineIterator(
                    new FastBufferedReader(
                            new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding)),
                    pl).allLines();
            pl.done();
        } else
            collection = new FileLinesCollection(stringFile, encoding.toString(), zipped);
        final TransformationStrategy<CharSequence> transformationStrategy = iso
                ? TransformationStrategies.prefixFreeIso()
                : utf32 ? TransformationStrategies.prefixFreeUtf32() : TransformationStrategies.prefixFreeUtf16();

        BinIO.storeObject(new TwoStepsLcpMonotoneMinimalPerfectHashFunction<CharSequence>(collection, -1,
                transformationStrategy, signatureWidth, tempDir), functionName);
        LOGGER.info("Completed.");
    }
}