Java tutorial
package it.unimi.dsi.sux4j.mph; /* * Sux4J: Succinct data structures for Java * * Copyright (C) 2008-2016 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ import it.unimi.dsi.bits.BitVector; import it.unimi.dsi.bits.Fast; import it.unimi.dsi.bits.HuTuckerTransformationStrategy; import it.unimi.dsi.bits.TransformationStrategies; import it.unimi.dsi.bits.TransformationStrategy; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.AbstractLongBigList; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.FileLinesCollection; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.sux4j.io.ChunkedHashStore; import it.unimi.dsi.util.XorShift1024StarRandomGenerator; import java.io.IOException; import java.io.InputStreamReader; import java.io.Serializable; import java.nio.charset.Charset; import java.util.Collection; import java.util.zip.GZIPInputStream; import org.apache.commons.math3.random.RandomGenerator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.Switch; import com.martiansoftware.jsap.UnflaggedOption; import com.martiansoftware.jsap.stringparsers.ForNameStringParser; /** A monotone minimal perfect hash implementation based on fixed-size bucketing that uses * a {@linkplain PaCoTrieDistributor partial compacted binary trie (PaCo trie)} as distributor. */ public class PaCoTrieDistributorMonotoneMinimalPerfectHashFunction<T> extends AbstractHashFunction<T> implements Size64, Serializable { public static final long serialVersionUID = 4L; private static final Logger LOGGER = LoggerFactory .getLogger(PaCoTrieDistributorMonotoneMinimalPerfectHashFunction.class); /** The number of elements. */ private final long size; /** The size of a bucket. */ private final int bucketSize; /** {@link Fast#ceilLog2(int)} of {@link #bucketSize}. */ private final int log2BucketSize; /** The transformation strategy. */ private final TransformationStrategy<? super T> transform; /** A PaCo trie assigning keys to buckets. */ private final PaCoTrieDistributor<BitVector> distributor; /** The offset of each element into his bucket. */ private final GOV3Function<BitVector> offset; @SuppressWarnings("unchecked") public long getLong(final Object o) { if (size == 0) return defRetValue; final BitVector bv = transform.toBitVector((T) o).fast(); final long bucket = distributor.getLong(bv); return (bucket << log2BucketSize) + offset.getLong(bv); } /** Creates a new PaCo-trie-based monotone minimal perfect hash function using the given * elements and transformation strategy. * * @param elements the elements among which the trie must be able to rank. * @param transform a transformation strategy that must turn the elements in <code>elements</code> into a list of * distinct, prefix-free, lexicographically increasing (in iteration order) bit vectors. */ public PaCoTrieDistributorMonotoneMinimalPerfectHashFunction(final Iterable<? extends T> elements, final TransformationStrategy<? super T> transform) throws IOException { this.transform = transform; defRetValue = -1; // For the very few cases in which we can decide long maxLength = 0; long totalLength = 0; BitVector bv; final RandomGenerator random = new XorShift1024StarRandomGenerator(); ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; pl.itemsName = "keys"; pl.start("Creating chunked hash store..."); final ChunkedHashStore<BitVector> chunkedHashStore = new ChunkedHashStore<BitVector>( TransformationStrategies.identity()); chunkedHashStore.reset(random.nextLong()); for (T s : elements) { bv = transform.toBitVector(s); chunkedHashStore.add(bv); maxLength = Math.max(maxLength, bv.length()); totalLength += bv.length(); pl.lightUpdate(); } pl.done(); LOGGER.debug("Maximum length: " + maxLength); LOGGER.debug("Average length: " + totalLength / (double) chunkedHashStore.size()); size = chunkedHashStore.size(); if (size == 0) { bucketSize = log2BucketSize = 0; distributor = null; offset = null; chunkedHashStore.close(); return; } final long averageLength = (totalLength + size - 1) / size; int t = Fast.mostSignificantBit( (int) Math.floor(averageLength - Math.log(size) - Math.log(averageLength - Math.log(size)) - 1)); final int firstbucketSize = 1 << t; LOGGER.debug("First bucket size estimate: " + firstbucketSize); final Iterable<BitVector> bitVectors = TransformationStrategies.wrap(elements, transform); LOGGER.info("Creating distributor..."); PaCoTrieDistributor<BitVector> firstDistributor = new PaCoTrieDistributor<BitVector>(bitVectors, t, TransformationStrategies.identity()); if (firstDistributor.numBits() == 0 || firstbucketSize >= size) log2BucketSize = t; else { // Reassign bucket size based on empirical estimation log2BucketSize = t - Fast.mostSignificantBit((int) Math.ceil(size / (firstDistributor.numBits() * Math.log(2)))); } bucketSize = 1 << log2BucketSize; LOGGER.debug("Second bucket size estimate: " + bucketSize); if (firstbucketSize == bucketSize) distributor = firstDistributor; else { firstDistributor = null; distributor = new PaCoTrieDistributor<BitVector>(bitVectors, log2BucketSize, TransformationStrategies.identity()); } LOGGER.debug("Bucket size: " + bucketSize); final int bucketSizeMask = bucketSize - 1; LOGGER.info("Generating offset function..."); offset = new GOV3Function.Builder<BitVector>().keys(bitVectors) .transform(TransformationStrategies.identity()).store(chunkedHashStore) .values(new AbstractLongBigList() { public long getLong(long index) { return index & bucketSizeMask; } public long size64() { return size; } }, log2BucketSize).indirect().build(); chunkedHashStore.close(); LOGGER.debug("Forecast distributor bit cost: " + (size / bucketSize) * (maxLength + log2BucketSize - Math.log(size))); LOGGER.debug("Actual distributor bit cost: " + distributor.numBits()); LOGGER.debug("Forecast bit cost per element: " + (GOV3Function.C + Fast.log2(Math.E) - Fast.log2(Fast.log2(Math.E)) + Fast.log2(maxLength - Fast.log2(size)))); LOGGER.info("Actual bit cost per element: " + (double) numBits() / size); } public long size64() { return size; } public long numBits() { return distributor.numBits() + offset.numBits() + transform.numBits(); } public static void main(final String[] arg) throws NoSuchMethodException, IOException, JSAPException { final SimpleJSAP jsap = new SimpleJSAP( PaCoTrieDistributorMonotoneMinimalPerfectHashFunction.class.getName(), "Builds an PaCo trie-based monotone minimal perfect hash function reading a newline-separated list of strings.", new Parameter[] { new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The string file encoding."), new Switch("huTucker", 'h', "hu-tucker", "Use Hu-Tucker coding to reduce string length."), new Switch("iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."), new Switch("utf32", JSAP.NO_SHORTFLAG, "utf-32", "Use UTF-32 internally (handles surrogate pairs)."), new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."), new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised monotone minimal perfect hash function."), new UnflaggedOption("stringFile", JSAP.STRING_PARSER, "-", JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The name of a file containing a newline-separated list of strings, or - for standard input; in the first case, strings will not be loaded into core memory."), }); JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) return; final String functionName = jsapResult.getString("function"); final String stringFile = jsapResult.getString("stringFile"); final Charset encoding = (Charset) jsapResult.getObject("encoding"); final boolean zipped = jsapResult.getBoolean("zipped"); final boolean iso = jsapResult.getBoolean("iso"); final boolean utf32 = jsapResult.getBoolean("utf32"); final boolean huTucker = jsapResult.getBoolean("huTucker"); final Collection<MutableString> collection; if ("-".equals(stringFile)) { final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; pl.start("Loading strings..."); collection = new LineIterator( new FastBufferedReader( new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding)), pl).allLines(); pl.done(); } else collection = new FileLinesCollection(stringFile, encoding.toString(), zipped); final TransformationStrategy<CharSequence> transformationStrategy = huTucker ? new HuTuckerTransformationStrategy(collection, true) : iso ? TransformationStrategies.prefixFreeIso() : utf32 ? TransformationStrategies.prefixFreeUtf32() : TransformationStrategies.prefixFreeUtf16(); BinIO.storeObject(new PaCoTrieDistributorMonotoneMinimalPerfectHashFunction<CharSequence>(collection, transformationStrategy), functionName); LOGGER.info("Completed."); } }