Java tutorial
package it.unimi.dsi.sux4j.mph; /* * Sux4J: Succinct data structures for Java * * Copyright (C) 2008-2016 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * */ import it.unimi.dsi.bits.Fast; import it.unimi.dsi.bits.TransformationStrategies; import it.unimi.dsi.bits.TransformationStrategy; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.AbstractLongBigList; import it.unimi.dsi.fastutil.longs.AbstractLongComparator; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import it.unimi.dsi.fastutil.longs.LongArrays; import it.unimi.dsi.fastutil.longs.LongBigArrayBigList; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.longs.LongIterator; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.FileLinesCollection; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.lang.MutableString; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.sux4j.io.ChunkedHashStore; import it.unimi.dsi.util.XorShift1024StarRandomGenerator; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.io.Serializable; import java.nio.charset.Charset; import java.util.Collection; import java.util.zip.GZIPInputStream; import org.apache.commons.collections.Predicate; import org.apache.commons.math3.random.RandomGenerator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.Switch; import com.martiansoftware.jsap.UnflaggedOption; import com.martiansoftware.jsap.stringparsers.FileStringParser; import com.martiansoftware.jsap.stringparsers.ForNameStringParser; /** A function stored using two {@linkplain GOV3Function}s—one for * frequent values, and one for infrequent values. This naive idea turns out to be very effective in reducing the function * size when the distribution of values is skewed (e.g., as it happens in a {@link TwoStepsLcpMonotoneMinimalPerfectHashFunction}). * * <p>To create an instance, we perform a pre-scan of the values to be assigned. If possible, we finds the best possible * <var>r</var> such that the 2<sup><var>r</var></sup> − 1 most frequent values can be stored in a {@link GOV3Function} * and suitably remapped when read. The function uses 2<sup><var>r</var></sup> − 1 as an escape symbol for all other * values, which are stored in a separate function. * * <p><strong>Warning</strong>: during the construction phase, a {@linkplain ChunkedHashStore#filter(Predicate) filter} * will be set on the {@link ChunkedHashStore} used to store the keys. If you are {@linkplain Builder#store(ChunkedHashStore) passing a store}, * you will have to reset it to its previous state. * * @author Sebastiano Vigna * @since 4.0 */ public class TwoStepsGOV3Function<T> extends AbstractHashFunction<T> implements Serializable, Size64 { public static final long serialVersionUID = 0L; private static final Logger LOGGER = LoggerFactory.getLogger(TwoStepsGOV3Function.class); private final static boolean ASSERTS = false; /** A builder class for {@link TwoStepsGOV3Function}. */ public static class Builder<T> { protected Iterable<? extends T> keys; protected TransformationStrategy<? super T> transform; protected File tempDir; protected ChunkedHashStore<T> chunkedHashStore; protected LongBigList values; /** Whether {@link #build()} has already been called. */ protected boolean built; /** Specifies the keys of the function; if you have specified a {@link #store(ChunkedHashStore) ChunkedHashStore}, it can be {@code null}. * * @param keys the keys of the function. * @return this builder. */ public Builder<T> keys(final Iterable<? extends T> keys) { this.keys = keys; return this; } /** Specifies the transformation strategy for the {@linkplain #keys(Iterable) keys of the function}. * * @param transform a transformation strategy for the {@linkplain #keys(Iterable) keys of the function}. * @return this builder. */ public Builder<T> transform(final TransformationStrategy<? super T> transform) { this.transform = transform; return this; } /** Specifies a temporary directory for the {@link #store(ChunkedHashStore) ChunkedHashStore}. * * @param tempDir a temporary directory for the {@link #store(ChunkedHashStore) ChunkedHashStore} files, or {@code null} for the standard temporary directory. * @return this builder. */ public Builder<T> tempDir(final File tempDir) { this.tempDir = tempDir; return this; } /** Specifies a chunked hash store containing the keys associated with their rank. * * <p><strong>Warning</strong>: during the construction phase, a {@linkplain ChunkedHashStore#filter(Predicate) filter} * will be set on the specified {@link ChunkedHashStore}. You will have to reset it to its previous state. * * @param chunkedHashStore a chunked hash store containing the keys associated with their rank, or {@code null}; the store * can be unchecked, but in this case you must specify {@linkplain #keys(Iterable) keys} and a {@linkplain #transform(TransformationStrategy) transform} * (otherwise, in case of a hash collision in the store an {@link IllegalStateException} will be thrown). * @return this builder. */ public Builder<T> store(final ChunkedHashStore<T> chunkedHashStore) { this.chunkedHashStore = chunkedHashStore; return this; } /** Specifies the values assigned to the {@linkplain #keys(Iterable) keys}; the output width of the function will * be the minimum width needed to represent all values. * * @param values values to be assigned to each element, in the same order of the {@linkplain #keys(Iterable) keys}. * @return this builder. */ public Builder<T> values(final LongBigList values) { this.values = values; return this; } /** Builds a new function. * * @return a {@link GOV3Function} instance with the specified parameters. * @throws IllegalStateException if called more than once. */ public TwoStepsGOV3Function<T> build() throws IOException { if (built) throw new IllegalStateException("This builder has been already used"); built = true; if (transform == null) { if (chunkedHashStore != null) transform = chunkedHashStore.transform(); else throw new IllegalArgumentException( "You must specify a TransformationStrategy, either explicitly or via a given ChunkedHashStore"); } return new TwoStepsGOV3Function<T>(keys, transform, values, tempDir, chunkedHashStore); } } /** The number of keys. */ protected final long n; /** The transformation strategy to turn objects of type <code>T</code> into bit vectors. */ protected final TransformationStrategy<? super T> transform; /** The first function, or {@code null}. The special output value {@link #escape} denotes that {@link #secondFunction} * should be queried instead. */ protected final GOV3Function<T> firstFunction; /** The second function. All queries for which {@link #firstFunction} returns * {@link #escape} (or simply all queries, if {@link #firstFunction} is {@code null}) will be rerouted here. */ protected final GOV3Function<T> secondFunction; /** A mapping from values of the first function to actual values, provided that there is a {@linkplain #firstFunction first function}. */ protected final long[] remap; /** The escape value returned by {@link #firstFunction} to suggest that {@link #secondFunction} should be queried instead, provided that there is a {@linkplain #firstFunction first function}. */ protected final int escape; /** The seed to be used when converting keys to triples. */ protected long seed; /** The width of the output of this function, in bits. */ protected final int width; /** The mean of the rank distribution. */ protected final double rankMean; /** Creates a new two-step function for the given keys and values. * * @param keys the keys in the domain of the function. * @param transform a transformation strategy for the keys. * @param values values to be assigned to each key, in the same order of the iterator returned by <code>keys</code>; if {@code null}, the * assigned value will the the ordinal number of each key. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys associated with their rank, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. */ protected TwoStepsGOV3Function(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, final LongBigList values, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException { this.transform = transform; final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator random = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { if (keys == null) throw new IllegalArgumentException( "If you do not provide a chunked hash store, you must provide the keys"); chunkedHashStore = new ChunkedHashStore<T>(transform, pl); chunkedHashStore.reset(random.nextLong()); chunkedHashStore.addAll(keys.iterator()); } n = chunkedHashStore.size(); defRetValue = -1; // For the very few cases in which we can decide if (n == 0) { rankMean = escape = width = 0; firstFunction = secondFunction = null; remap = null; if (!givenChunkedHashStore) chunkedHashStore.close(); return; } // Compute distribution of values and maximum number of bits. int w = 0, size; long v; final Long2LongOpenHashMap counts = new Long2LongOpenHashMap(); counts.defaultReturnValue(-1); for (LongIterator i = values.iterator(); i.hasNext();) { v = i.nextLong(); counts.put(v, counts.get(v) + 1); size = Fast.length(v); if (size > w) w = size; } this.width = w; final int m = counts.size(); LOGGER.debug("Generating two-steps GOV3 function with " + w + " output bits..."); // Sort keys by reverse frequency final long[] keysArray = counts.keySet().toLongArray(new long[m]); LongArrays.quickSort(keysArray, 0, keysArray.length, new AbstractLongComparator() { private static final long serialVersionUID = 1L; public int compare(final long a, final long b) { return Long.signum(counts.get(b) - counts.get(a)); } }); long mean = 0; for (int i = 0; i < keysArray.length; i++) mean += i * counts.get(keysArray[i]); rankMean = (double) mean / n; // Analyze data and choose a threshold long post = n, bestCost = Long.MAX_VALUE; int pos = 0, best = -1; // Examine every possible choice for r. Note that r = 0 implies one function, so we do not need to test the case r == w. for (int r = 0; r < w && pos < m; r++) { /* This cost function is dependent on the implementation of GOV3Function. * Note that for r = 0 we are actually computing the cost of a single function (the first one). */ final long cost = (long) Math.min(GOV3Function.C * n * 1.126 + n * r, GOV3Function.C * n * r) + (long) Math.min(GOV3Function.C * post * 1.126 + post * w, GOV3Function.C * post * w) + pos * Long.SIZE; if (cost < bestCost) { best = r; bestCost = cost; } /* We add to pre and subtract from post the counts of keys from position (1<<r)-1 to position (1<<r+1)-1. */ for (int j = 0; j < (1 << r) && pos < m; j++) { final long c = counts.get(keysArray[pos++]); post -= c; } } if (ASSERTS) assert pos == m; counts.clear(); counts.trim(); // We must keep the remap array small. if (best >= Integer.SIZE) best = Integer.SIZE - 1; LOGGER.debug("Best threshold: " + best); escape = (1 << best) - 1; System.arraycopy(keysArray, 0, remap = new long[escape], 0, remap.length); final Long2LongOpenHashMap map = new Long2LongOpenHashMap(); map.defaultReturnValue(-1); for (int i = 0; i < escape; i++) map.put(remap[i], i); if (best != 0) { firstFunction = new GOV3Function.Builder<T>().keys(keys).transform(transform).store(chunkedHashStore) .values(new AbstractLongBigList() { public long getLong(long index) { long value = map.get(values.getLong(index)); return value == -1 ? escape : value; } public long size64() { return n; } }, best).indirect().build(); LOGGER.debug("Actual bit cost per key of first function: " + (double) firstFunction.numBits() / n); } else firstFunction = null; chunkedHashStore.filter(new Predicate() { public boolean evaluate(Object triple) { return firstFunction == null || firstFunction.getLongByTriple((long[]) triple) == escape; } }); secondFunction = new GOV3Function.Builder<T>().store(chunkedHashStore).values(values, w).indirect().build(); this.seed = chunkedHashStore.seed(); if (!givenChunkedHashStore) chunkedHashStore.close(); LOGGER.debug("Actual bit cost per key of second function: " + (double) secondFunction.numBits() / n); LOGGER.info("Actual bit cost per key: " + (double) numBits() / n); LOGGER.info("Completed."); } @SuppressWarnings("unchecked") public long getLong(final Object o) { if (n == 0) return defRetValue; final long[] triple = new long[3]; Hashes.spooky4(transform.toBitVector((T) o), seed, triple); if (firstFunction != null) { final int firstValue = (int) firstFunction.getLongByTriple(triple); if (firstValue == -1) return defRetValue; if (firstValue != escape) return remap[firstValue]; } return secondFunction.getLongByTriple(triple); } public long getLongByTriple(final long[] triple) { if (firstFunction != null) { final int firstValue = (int) firstFunction.getLongByTriple(triple); if (firstValue == -1) return defRetValue; if (firstValue != escape) return remap[firstValue]; } return secondFunction.getLongByTriple(triple); } public long size64() { return n; } /** Returns the number of bits used by this structure. * * @return the number of bits used by this structure. */ public long numBits() { return (firstFunction != null ? firstFunction.numBits() : 0) + secondFunction.numBits() + transform.numBits() + remap.length * (long) Long.SIZE; } public static void main(final String[] arg) throws NoSuchMethodException, IOException, JSAPException { final SimpleJSAP jsap = new SimpleJSAP(TwoStepsGOV3Function.class.getName(), "Builds a two-steps GOV3 function mapping a newline-separated list of strings to their ordinal position, or to specific values.", new Parameter[] { new FlaggedOption("encoding", ForNameStringParser.getParser(Charset.class), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The string file encoding."), new FlaggedOption("tempDir", FileStringParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T', "temp-dir", "A directory for temporary files."), new Switch("iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."), new Switch("utf32", JSAP.NO_SHORTFLAG, "utf-32", "Use UTF-32 internally (handles surrogate pairs)."), new Switch("zipped", 'z', "zipped", "The string list is compressed in gzip format."), new FlaggedOption("values", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'v', "values", "A binary file in DataInput format containing a long for each string (otherwise, the values will be the ordinal positions of the strings)."), new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised two-steps GOV3 function."), new UnflaggedOption("stringFile", JSAP.STRING_PARSER, "-", JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The name of a file containing a newline-separated list of strings, or - for standard input; in the first case, strings will not be loaded into core memory."), }); JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) return; final String functionName = jsapResult.getString("function"); final String stringFile = jsapResult.getString("stringFile"); final Charset encoding = (Charset) jsapResult.getObject("encoding"); final File tempDir = jsapResult.getFile("tempDir"); final boolean zipped = jsapResult.getBoolean("zipped"); final boolean iso = jsapResult.getBoolean("iso"); final boolean utf32 = jsapResult.getBoolean("utf32"); final Collection<MutableString> collection; if ("-".equals(stringFile)) { final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; pl.start("Loading strings..."); collection = new LineIterator( new FastBufferedReader( new InputStreamReader(zipped ? new GZIPInputStream(System.in) : System.in, encoding)), pl).allLines(); pl.done(); } else collection = new FileLinesCollection(stringFile, encoding.toString(), zipped); final TransformationStrategy<CharSequence> transformationStrategy = iso ? TransformationStrategies.rawIso() : utf32 ? TransformationStrategies.rawUtf32() : TransformationStrategies.rawUtf16(); BinIO.storeObject(new TwoStepsGOV3Function<CharSequence>(collection, transformationStrategy, LongBigArrayBigList.wrap(BinIO.loadLongsBig(jsapResult.getString("values"))), tempDir, null), functionName); LOGGER.info("Completed."); } }