Java tutorial
package it.unimi.dsi.sux4j.mph; /* * Sux4J: Succinct data structures for Java * * Copyright (C) 2008-2016 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ import static it.unimi.dsi.bits.Fast.log2; import static java.lang.Math.E; import static java.lang.Math.log; import it.unimi.dsi.bits.BitVector; import it.unimi.dsi.bits.Fast; import it.unimi.dsi.bits.HuTuckerTransformationStrategy; import it.unimi.dsi.bits.TransformationStrategies; import it.unimi.dsi.bits.TransformationStrategy; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.AbstractLongBigList; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.FileLinesCollection; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.sux4j.io.ChunkedHashStore; import it.unimi.dsi.util.XorShift1024StarRandomGenerator; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.Serializable; import java.nio.charset.Charset; import java.util.Collection; import java.util.zip.GZIPInputStream; import org.apache.commons.math3.random.RandomGenerator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.Switch; import com.martiansoftware.jsap.UnflaggedOption; import com.martiansoftware.jsap.stringparsers.FileStringParser; import com.martiansoftware.jsap.stringparsers.ForNameStringParser; /** A monotone minimal perfect hash implementation based on fixed-size bucketing that uses * a {@linkplain ZFastTrieDistributor z-fast trie} as a distributor. * * <p>See the {@linkplain it.unimi.dsi.sux4j.mph package overview} for a comparison with other implementations. * Similarly to a {@link GOV3Function}, an instance of this class may be <em>{@linkplain Builder#signed(int) signed}</em>. */ public class ZFastTrieDistributorMonotoneMinimalPerfectHashFunction<T> extends AbstractHashFunction<T> implements Serializable { public static final long serialVersionUID = 4L; private static final Logger LOGGER = LoggerFactory .getLogger(ZFastTrieDistributorMonotoneMinimalPerfectHashFunction.class); /** The number of elements. */ private final long size; /** The logarithm of the bucket size. */ private final int log2BucketSize; /** The transformation strategy. */ private final TransformationStrategy<? super T> transform; /** A hollow trie distributor assigning keys to buckets. */ private final ZFastTrieDistributor<BitVector> distributor; /** The offset of each element into his bucket. */ private final GOV3Function<BitVector> offset; /** The seed returned by the {@link ChunkedHashStore}. */ private long seed; /** The mask to compare signatures, or zero for no signatures. */ protected final long signatureMask; /** The signatures. */ protected final LongBigList signatures; /** A builder class for {@link ZFastTrieDistributorMonotoneMinimalPerfectHashFunction}. */ public static class Builder<T> { protected Iterable<? extends T> keys; protected TransformationStrategy<? super T> transform; protected long numKeys = -1; protected int signatureWidth; protected File tempDir; /** Whether {@link #build()} has already been called. */ protected boolean built; /** Specifies the keys to hash. * * @param keys the keys to hash. * @return this builder. */ public Builder<T> keys(final Iterable<? extends T> keys) { this.keys = keys; return this; } /** Specifies the transformation strategy for the {@linkplain #keys(Iterable) keys to hash}. * * @param transform a transformation strategy for the {@linkplain #keys(Iterable) keys to hash}. * @return this builder. */ public Builder<T> transform(final TransformationStrategy<? super T> transform) { this.transform = transform; return this; } /** Specifies that the resulting {@link LcpMonotoneMinimalPerfectHashFunction} should be signed using a given number of bits per key. * * @param signatureWidth a signature width, or 0 for no signature. * @return this builder. */ public Builder<T> signed(final int signatureWidth) { this.signatureWidth = signatureWidth; return this; } /** Specifies a temporary directory for the {@link ChunkedHashStore}. * * @param tempDir a temporary directory for the {@link ChunkedHashStore}. files, or {@code null} for the standard temporary directory. * @return this builder. */ public Builder<T> tempDir(final File tempDir) { this.tempDir = tempDir; return this; } /** Builds a monotone minimal perfect hash function based on a z-fast trie distributor. * * @return a {@link ZFastTrieDistributorMonotoneMinimalPerfectHashFunction} instance with the specified parameters. * @throws IllegalStateException if called more than once. */ public ZFastTrieDistributorMonotoneMinimalPerfectHashFunction<T> build() throws IOException { if (built) throw new IllegalStateException("This builder has been already used"); built = true; return new ZFastTrieDistributorMonotoneMinimalPerfectHashFunction<T>(keys, transform, -1, signatureWidth, tempDir); } } /** Creates a new monotone minimal perfect hash function based on a z-fast trie distributor using the given * keys, transformation strategy and bucket size. * * @param keys the keys among which the trie must be able to rank. * @param transform a transformation strategy that must turn the keys into a list of * distinct, prefix-free, lexicographically increasing (in iteration order) bit vectors. * @param log2BucketSize the logarithm of the bucket size, or -1 for the default value. * @param signatureWidth a signature width, or 0 for no signature. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. */ protected ZFastTrieDistributorMonotoneMinimalPerfectHashFunction(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, final int log2BucketSize, final int signatureWidth, final File tempDir) throws IOException { this.transform = transform; defRetValue = -1; // For the very few cases in which we can decide long maxLength = 0; long totalLength = 0; RandomGenerator r = new XorShift1024StarRandomGenerator(); final ChunkedHashStore<BitVector> chunkedHashStore = new ChunkedHashStore<BitVector>( TransformationStrategies.identity(), tempDir); chunkedHashStore.reset(r.nextLong()); final Iterable<BitVector> bitVectors = TransformationStrategies.wrap(keys, transform); final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; pl.itemsName = "keys"; pl.start("Scanning collection..."); for (BitVector bv : bitVectors) { maxLength = Math.max(maxLength, bv.length()); totalLength += bv.length(); chunkedHashStore.add(bv); pl.lightUpdate(); } pl.done(); chunkedHashStore.checkAndRetry(bitVectors); size = chunkedHashStore.size(); if (size == 0) { this.log2BucketSize = -1; distributor = null; offset = null; signatureMask = 0; signatures = null; chunkedHashStore.close(); return; } final long averageLength = (totalLength + size - 1) / size; final long forecastBucketSize = (long) Math.ceil( 10.5 + 4.05 * log(averageLength) + 2.43 * log(log(size) + 1) + 2.43 * log(log(averageLength) + 1)); this.log2BucketSize = log2BucketSize == -1 ? Fast.mostSignificantBit(forecastBucketSize) : log2BucketSize; LOGGER.debug("Average length: " + averageLength); LOGGER.debug("Max length: " + maxLength); LOGGER.debug("Bucket size: " + (1L << this.log2BucketSize)); LOGGER.info("Computing z-fast trie distributor..."); distributor = new ZFastTrieDistributor<BitVector>(bitVectors, this.log2BucketSize, TransformationStrategies.identity(), chunkedHashStore); LOGGER.info("Computing offsets..."); offset = new GOV3Function.Builder<BitVector>().store(chunkedHashStore).values(new AbstractLongBigList() { final long bucketSizeMask = (1L << ZFastTrieDistributorMonotoneMinimalPerfectHashFunction.this.log2BucketSize) - 1; public long getLong(long index) { return index & bucketSizeMask; } public long size64() { return size; } }, this.log2BucketSize).indirect().build(); seed = chunkedHashStore.seed(); double logU = averageLength * log(2); LOGGER.info("Forecast bit cost per element: " + 1.0 / forecastBucketSize * (-6 * log2(log(2)) + 5 * log2(logU) + 2 * log2(forecastBucketSize) + log2(log(logU) - log(log(2))) + 6 * GOV3Function.C + 3 * log2(E) + 3 * log2(log(3.0 * size)) + 3 + GOV3Function.C * forecastBucketSize + GOV3Function.C * forecastBucketSize * log2(forecastBucketSize))); LOGGER.info("Actual bit cost per element: " + (double) numBits() / size); if (signatureWidth != 0) { signatureMask = -1L >>> Long.SIZE - signatureWidth; signatures = chunkedHashStore.signatures(signatureWidth, pl); } else { signatureMask = 0; signatures = null; } chunkedHashStore.close(); } @SuppressWarnings("unchecked") public long getLong(final Object o) { if (size == 0) return defRetValue; final BitVector bv = transform.toBitVector((T) o).fast(); final long state[] = Hashes.preprocessSpooky4(bv, seed); final long[] triple = new long[3]; Hashes.spooky4(bv, bv.length(), seed, state, triple); final long bucket = distributor.getLongByBitVectorTripleAndState(bv, triple, state); final long result = (bucket << log2BucketSize) + offset.getLongByTriple(triple); if (signatureMask != 0) return result < 0 || result >= size || signatures.getLong(result) != (triple[0] & signatureMask) ? defRetValue : result; // Out-of-set strings can generate bizarre 3-hyperedges. return result < 0 || result >= size ? defRetValue : result; } public long size64() { return size; } public long numBits() { if (size == 0) return 0; return distributor.numBits() + offset.numBits() + transform.numBits(); } public static void main(final String[] arg) throws NoSuchMethodException, IOException, JSAPException { final SimpleJSAP jsap = new SimpleJSAP( ZFastTrieDistributorMonotoneMinimalPerfectHashFunction.class.getName(), "Builds a monotone minimal perfect hash using a probabilistic z-fast trie as a distributor reading a newline-separated list of strings.", new Parameter[] { new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The string file encoding."), new FlaggedOption("tempDir", FileStringParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T', "temp-dir", "A directory for temporary files."), new Switch("huTucker", 'h', "hu-tucker", "Use Hu-Tucker coding to reduce string length."), new Switch("iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."), new Switch("utf32", JSAP.NO_SHORTFLAG, "utf-32", "Use UTF-32 internally (handles surrogate pairs)."), new FlaggedOption("signatureWidth", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "signature-width", "If specified, the signature width in bits; if negative, the generated function will be a dictionary."), new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."), new FlaggedOption("log2bucket", JSAP.INTEGER_PARSER, "-1", JSAP.NOT_REQUIRED, 'b', "log2bucket", "The base 2 logarithm of the bucket size (mainly for testing)."), new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised monotone minimal perfect hash function."), new UnflaggedOption("stringFile", JSAP.STRING_PARSER, "-", JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The name of a file containing a newline-separated list of strings, or - for standard input; in the first case, strings will not be loaded into core memory."), }); JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) return; final String functionName = jsapResult.getString("function"); final String stringFile = jsapResult.getString("stringFile"); final int log2BucketSize = jsapResult.getInt("log2bucket"); final Charset encoding = (Charset) jsapResult.getObject("encoding"); final File tempDir = jsapResult.getFile("tempDir"); final boolean zipped = jsapResult.getBoolean("zipped"); final boolean iso = jsapResult.getBoolean("iso"); final boolean utf32 = jsapResult.getBoolean("utf32"); final boolean huTucker = jsapResult.getBoolean("huTucker"); final int signatureWidth = jsapResult.getInt("signatureWidth", 0); final Collection<MutableString> collection; if ("-".equals(stringFile)) { final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; pl.start("Loading strings..."); collection = new LineIterator( new FastBufferedReader( new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding)), pl).allLines(); pl.done(); } else collection = new FileLinesCollection(stringFile, encoding.toString(), zipped); final TransformationStrategy<CharSequence> transformationStrategy = huTucker ? new HuTuckerTransformationStrategy(collection, true) : iso ? TransformationStrategies.prefixFreeIso() : utf32 ? TransformationStrategies.prefixFreeUtf32() : TransformationStrategies.prefixFreeUtf16(); BinIO.storeObject(new ZFastTrieDistributorMonotoneMinimalPerfectHashFunction<CharSequence>(collection, transformationStrategy, log2BucketSize, signatureWidth, tempDir), functionName); LOGGER.info("Completed."); } }