List of usage examples for org.apache.commons.math3.random RandomGenerator nextLong
long nextLong();
Returns the next pseudorandom, uniformly distributed long value from this random number generator's sequence.

From source file:it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction.java
/** * Creates a new minimal perfect hash function for the given keys. * //from ww w .j a v a2 s .c o m * @param keys the keys to hash, or {@code null}. * @param transform a transformation strategy for the keys. * @param signatureWidth a signature width, or 0 for no signature. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. */ protected GOVMinimalPerfectHashFunction(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, final int signatureWidth, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException { this.transform = transform; final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator r = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, pl); chunkedHashStore.reset(r.nextLong()); chunkedHashStore.addAll(keys.iterator()); } n = chunkedHashStore.size(); defRetValue = -1; // For the very few cases in which we can decide int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE)); chunkShift = chunkedHashStore.log2Chunks(log2NumChunks); final int numChunks = 1 << log2NumChunks; LOGGER.debug("Number of chunks: " + numChunks); edgeOffsetAndSeed = new long[numChunks + 1]; bitVector = LongArrayBitVector.getInstance(); (values = bitVector.asLongBigList(2)).size(n * C_TIMES_256 >> 8); array = bitVector.bits(); int duplicates = 0; for (;;) { LOGGER.debug("Generating minimal perfect hash function..."); pl.expectedUpdates = numChunks; pl.itemsName = "chunks"; pl.start("Analysing chunks... 
"); try { int q = 0; long unorientable = 0, unsolvable = 0; for (ChunkedHashStore.Chunk chunk : chunkedHashStore) { edgeOffsetAndSeed[q + 1] = edgeOffsetAndSeed[q] + chunk.size(); long seed = 0; final long off = vertexOffset(edgeOffsetAndSeed[q]); final Linear3SystemSolver<BitVector> solver = new Linear3SystemSolver<BitVector>( (int) (vertexOffset(edgeOffsetAndSeed[q + 1]) - off), chunk.size()); for (;;) { final boolean solved = solver.generateAndSolve(chunk, seed, null); unorientable += solver.unorientable; unsolvable += solver.unsolvable; if (solved) break; seed += SEED_STEP; if (seed == 0) throw new AssertionError("Exhausted local seeds"); } this.edgeOffsetAndSeed[q] |= seed; final long[] solution = solver.solution; for (int i = 0; i < solution.length; i++) values.set(i + off, solution[i]); q++; pl.update(); if (ASSERTS) { final IntOpenHashSet pos = new IntOpenHashSet(); final int[] e = new int[3]; for (long[] triple : chunk) { Linear3SystemSolver.tripleToEquation(triple, seed, (int) (vertexOffset(edgeOffsetAndSeed[q]) - off), e); assert pos .add(e[(int) (values.getLong(off + e[0]) + values.getLong(off + e[1]) + values.getLong(off + e[2])) % 3]) : "<" + e[0] + "," + e[1] + "," + e[2] + ">: " + e[(int) (values.getLong(off + e[0]) + values.getLong(off + e[1]) + values.getLong(off + e[2])) % 3]; } } } LOGGER.info("Unorientable graphs: " + unorientable + "/" + numChunks + " (" + Util.format(100.0 * unorientable / numChunks) + "%)"); LOGGER.info("Unsolvable systems: " + unsolvable + "/" + numChunks + " (" + Util.format(100.0 * unsolvable / numChunks) + "%)"); pl.done(); break; } catch (ChunkedHashStore.DuplicateException e) { if (keys == null) throw new IllegalStateException( "You provided no keys, but the chunked hash store was not checked"); if (duplicates++ > 3) throw new IllegalArgumentException("The input list contains duplicates"); LOGGER.warn("Found duplicate. Recomputing triples..."); chunkedHashStore.reset(r.nextLong()); chunkedHashStore.addAll(keys.iterator()); } } globalSeed = chunkedHashStore.seed(); LOGGER.info("Completed."); LOGGER.debug("Forecast bit cost per key: " + 2 * C + 64. / (1 << LOG2_CHUNK_SIZE)); LOGGER.info("Actual bit cost per key: " + (double) numBits() / n); if (signatureWidth != 0) { signatureMask = -1L >>> Long.SIZE - signatureWidth; (signatures = LongArrayBitVector.getInstance().asLongBigList(signatureWidth)).size(n); pl.expectedUpdates = n; pl.itemsName = "signatures"; pl.start("Signing..."); for (ChunkedHashStore.Chunk chunk : chunkedHashStore) { Iterator<long[]> iterator = chunk.iterator(); for (int i = chunk.size(); i-- != 0;) { final long[] triple = iterator.next(); final int[] e = new int[3]; signatures.set(getLongByTripleNoCheck(triple, e), signatureMask & triple[0]); pl.lightUpdate(); } } pl.done(); } else { signatureMask = 0; signatures = null; } if (!givenChunkedHashStore) chunkedHashStore.close(); }
From source file:it.unimi.dsi.sux4j.mph.TwoStepsGOV3Function.java
/** Creates a new two-step function for the given keys and values. * /*from www . ja v a 2 s.c o m*/ * @param keys the keys in the domain of the function. * @param transform a transformation strategy for the keys. * @param values values to be assigned to each key, in the same order of the iterator returned by <code>keys</code>; if {@code null}, the * assigned value will the the ordinal number of each key. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys associated with their rank, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. */ protected TwoStepsGOV3Function(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, final LongBigList values, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException { this.transform = transform; final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator random = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { if (keys == null) throw new IllegalArgumentException( "If you do not provide a chunked hash store, you must provide the keys"); chunkedHashStore = new ChunkedHashStore<T>(transform, pl); chunkedHashStore.reset(random.nextLong()); chunkedHashStore.addAll(keys.iterator()); } n = chunkedHashStore.size(); defRetValue = -1; // For the very few cases in which we can decide if (n == 0) { rankMean = escape = width = 0; firstFunction = secondFunction = null; remap = null; if (!givenChunkedHashStore) chunkedHashStore.close(); return; } // Compute distribution of values and maximum number of bits. int w = 0, size; long v; final Long2LongOpenHashMap counts = new Long2LongOpenHashMap(); counts.defaultReturnValue(-1); for (LongIterator i = values.iterator(); i.hasNext();) { v = i.nextLong(); counts.put(v, counts.get(v) + 1); size = Fast.length(v); if (size > w) w = size; } this.width = w; final int m = counts.size(); LOGGER.debug("Generating two-steps GOV3 function with " + w + " output bits..."); // Sort keys by reverse frequency final long[] keysArray = counts.keySet().toLongArray(new long[m]); LongArrays.quickSort(keysArray, 0, keysArray.length, new AbstractLongComparator() { private static final long serialVersionUID = 1L; public int compare(final long a, final long b) { return Long.signum(counts.get(b) - counts.get(a)); } }); long mean = 0; for (int i = 0; i < keysArray.length; i++) mean += i * counts.get(keysArray[i]); rankMean = (double) mean / n; // Analyze data and choose a threshold long post = n, bestCost = Long.MAX_VALUE; int pos = 0, best = -1; // Examine every possible choice for r. Note that r = 0 implies one function, so we do not need to test the case r == w. for (int r = 0; r < w && pos < m; r++) { /* This cost function is dependent on the implementation of GOV3Function. * Note that for r = 0 we are actually computing the cost of a single function (the first one). 
*/ final long cost = (long) Math.min(GOV3Function.C * n * 1.126 + n * r, GOV3Function.C * n * r) + (long) Math.min(GOV3Function.C * post * 1.126 + post * w, GOV3Function.C * post * w) + pos * Long.SIZE; if (cost < bestCost) { best = r; bestCost = cost; } /* We add to pre and subtract from post the counts of keys from position (1<<r)-1 to position (1<<r+1)-1. */ for (int j = 0; j < (1 << r) && pos < m; j++) { final long c = counts.get(keysArray[pos++]); post -= c; } } if (ASSERTS) assert pos == m; counts.clear(); counts.trim(); // We must keep the remap array small. if (best >= Integer.SIZE) best = Integer.SIZE - 1; LOGGER.debug("Best threshold: " + best); escape = (1 << best) - 1; System.arraycopy(keysArray, 0, remap = new long[escape], 0, remap.length); final Long2LongOpenHashMap map = new Long2LongOpenHashMap(); map.defaultReturnValue(-1); for (int i = 0; i < escape; i++) map.put(remap[i], i); if (best != 0) { firstFunction = new GOV3Function.Builder<T>().keys(keys).transform(transform).store(chunkedHashStore) .values(new AbstractLongBigList() { public long getLong(long index) { long value = map.get(values.getLong(index)); return value == -1 ? escape : value; } public long size64() { return n; } }, best).indirect().build(); LOGGER.debug("Actual bit cost per key of first function: " + (double) firstFunction.numBits() / n); } else firstFunction = null; chunkedHashStore.filter(new Predicate() { public boolean evaluate(Object triple) { return firstFunction == null || firstFunction.getLongByTriple((long[]) triple) == escape; } }); secondFunction = new GOV3Function.Builder<T>().store(chunkedHashStore).values(values, w).indirect().build(); this.seed = chunkedHashStore.seed(); if (!givenChunkedHashStore) chunkedHashStore.close(); LOGGER.debug("Actual bit cost per key of second function: " + (double) secondFunction.numBits() / n); LOGGER.info("Actual bit cost per key: " + (double) numBits() / n); LOGGER.info("Completed."); }
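Every constructor on this page follows the same idiom: instantiate a RandomGenerator, draw a 64-bit seed with nextLong(), and hand it to ChunkedHashStore.reset(). Stripped of the sux4j machinery, the core call is just this. Below is a minimal sketch using MersenneTwister, one of the RandomGenerator implementations bundled with commons-math3 (the XorShift1024StarRandomGenerator used in the sources above comes from the DSI utilities, not from commons-math3); the class name is illustrative only.

import org.apache.commons.math3.random.MersenneTwister;
import org.apache.commons.math3.random.RandomGenerator;

public class NextLongSeed {
    public static void main(final String[] args) {
        // MersenneTwister is a stock commons-math3 implementation of RandomGenerator.
        final RandomGenerator r = new MersenneTwister();

        // nextLong() draws a uniformly distributed 64-bit value; the constructors
        // on this page use it as the global seed of a ChunkedHashStore.
        final long seed = r.nextLong();
        System.out.println("seed = " + seed);
    }
}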
From source file:it.unimi.dsi.sux4j.mph.TwoStepsMWHCFunction.java
/** Creates a new two-step function for the given keys and values. * //from w w w .j a v a 2s .c om * @param keys the keys in the domain of the function. * @param transform a transformation strategy for the keys. * @param values values to be assigned to each key, in the same order of the iterator returned by <code>keys</code>; if {@code null}, the * assigned value will the the ordinal number of each key. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys associated with their rank, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. */ protected TwoStepsMWHCFunction(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, final LongBigList values, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException { this.transform = transform; final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator random = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { if (keys == null) throw new IllegalArgumentException( "If you do not provide a chunked hash store, you must provide the keys"); chunkedHashStore = new ChunkedHashStore<T>(transform, pl); chunkedHashStore.reset(random.nextLong()); chunkedHashStore.addAll(keys.iterator()); } n = chunkedHashStore.size(); defRetValue = -1; // For the very few cases in which we can decide if (n == 0) { rankMean = escape = width = 0; firstFunction = secondFunction = null; remap = null; if (!givenChunkedHashStore) chunkedHashStore.close(); return; } // Compute distribution of values and maximum number of bits. int w = 0, size; long v; final Long2LongOpenHashMap counts = new Long2LongOpenHashMap(); counts.defaultReturnValue(-1); for (LongIterator i = values.iterator(); i.hasNext();) { v = i.nextLong(); counts.put(v, counts.get(v) + 1); size = Fast.length(v); if (size > w) w = size; } this.width = w; final int m = counts.size(); LOGGER.debug("Generating two-steps MWHC function with " + w + " output bits..."); // Sort keys by reverse frequency final long[] keysArray = counts.keySet().toLongArray(new long[m]); LongArrays.quickSort(keysArray, 0, keysArray.length, new AbstractLongComparator() { private static final long serialVersionUID = 1L; public int compare(final long a, final long b) { return Long.signum(counts.get(b) - counts.get(a)); } }); long mean = 0; for (int i = 0; i < keysArray.length; i++) mean += i * counts.get(keysArray[i]); rankMean = (double) mean / n; // Analyze data and choose a threshold long post = n, bestCost = Long.MAX_VALUE; int pos = 0, best = -1; // Examine every possible choice for r. Note that r = 0 implies one function, so we do not need to test the case r == w. for (int r = 0; r < w && pos < m; r++) { /* This cost function is dependent on the implementation of MWHCFunction. * Note that for r = 0 we are actually computing the cost of a single function (the first one). 
*/ final long cost = (long) Math.min(HypergraphSorter.GAMMA * n * 1.126 + n * r, HypergraphSorter.GAMMA * n * r) + (long) Math.min(HypergraphSorter.GAMMA * post * 1.126 + post * w, HypergraphSorter.GAMMA * post * w) + pos * Long.SIZE; if (cost < bestCost) { best = r; bestCost = cost; } /* We add to pre and subtract from post the counts of keys from position (1<<r)-1 to position (1<<r+1)-1. */ for (int j = 0; j < (1 << r) && pos < m; j++) { final long c = counts.get(keysArray[pos++]); post -= c; } } if (ASSERTS) assert pos == m; counts.clear(); counts.trim(); // We must keep the remap array small. if (best >= Integer.SIZE) best = Integer.SIZE - 1; LOGGER.debug("Best threshold: " + best); escape = (1 << best) - 1; System.arraycopy(keysArray, 0, remap = new long[escape], 0, remap.length); final Long2LongOpenHashMap map = new Long2LongOpenHashMap(); map.defaultReturnValue(-1); for (int i = 0; i < escape; i++) map.put(remap[i], i); if (best != 0) { firstFunction = new MWHCFunction.Builder<T>().keys(keys).transform(transform).store(chunkedHashStore) .values(new AbstractLongBigList() { public long getLong(long index) { long value = map.get(values.getLong(index)); return value == -1 ? escape : value; } public long size64() { return n; } }, best).indirect().build(); LOGGER.debug("Actual bit cost per key of first function: " + (double) firstFunction.numBits() / n); } else firstFunction = null; chunkedHashStore.filter(new Predicate() { public boolean evaluate(Object triple) { return firstFunction == null || firstFunction.getLongByTriple((long[]) triple) == escape; } }); secondFunction = new MWHCFunction.Builder<T>().store(chunkedHashStore).values(values, w).indirect().build(); this.seed = chunkedHashStore.seed(); if (!givenChunkedHashStore) chunkedHashStore.close(); LOGGER.debug("Actual bit cost per key of second function: " + (double) secondFunction.numBits() / n); LOGGER.info("Actual bit cost per key: " + (double) numBits() / n); LOGGER.info("Completed."); }
From source file:it.unimi.dsi.sux4j.mph.MinimalPerfectHashFunction.java
/** * Creates a new minimal perfect hash function for the given keys. * //from w w w.ja v a 2 s . c o m * @param keys the keys to hash, or {@code null}. * @param transform a transformation strategy for the keys. * @param signatureWidth a signature width, or 0 for no signature. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. */ protected MinimalPerfectHashFunction(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, final int signatureWidth, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException { this.transform = transform; final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator r = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, pl); chunkedHashStore.reset(r.nextLong()); chunkedHashStore.addAll(keys.iterator()); } n = chunkedHashStore.size(); defRetValue = -1; // For the very few cases in which we can decide int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE)); chunkShift = chunkedHashStore.log2Chunks(log2NumChunks); final int numChunks = 1 << log2NumChunks; LOGGER.debug("Number of chunks: " + numChunks); seed = new long[numChunks]; offset = new long[numChunks + 1]; bitVector = LongArrayBitVector.getInstance(); (values = bitVector.asLongBigList(2)).size(((long) Math.ceil(n * HypergraphSorter.GAMMA) + 4 * numChunks)); array = bitVector.bits(); int duplicates = 0; for (;;) { LOGGER.debug("Generating minimal perfect hash function..."); long seed = 0; pl.expectedUpdates = numChunks; pl.itemsName = "chunks"; pl.start("Analysing chunks... "); try { int q = 0; for (ChunkedHashStore.Chunk chunk : chunkedHashStore) { final HypergraphSorter<BitVector> sorter = new HypergraphSorter<BitVector>(chunk.size(), false); do { seed = r.nextLong(); } while (!sorter.generateAndSort(chunk.iterator(), seed)); this.seed[q] = seed; offset[q + 1] = offset[q] + sorter.numVertices; /* We assign values. */ int top = chunk.size(), k, v = 0; final int[] stack = sorter.stack; final int[] vertex1 = sorter.vertex1; final int[] vertex2 = sorter.vertex2; final long off = offset[q]; while (top > 0) { v = stack[--top]; k = (v > vertex1[v] ? 1 : 0) + (v > vertex2[v] ? 1 : 0); assert k >= 0 && k < 3 : Integer.toString(k); //System.err.println( "<" + v + ", " + vertex1[v] + ", " + vertex2[ v ]+ "> (" + k + ")" ); final long s = values.getLong(off + vertex1[v]) + values.getLong(off + vertex2[v]); final long value = (k - s + 9) % 3; assert values.getLong(off + v) == 0; values.set(off + v, value == 0 ? 
3 : value); } q++; pl.update(); if (ASSERTS) { final IntOpenHashSet pos = new IntOpenHashSet(); final int[] e = new int[3]; for (long[] triple : chunk) { HypergraphSorter.tripleToEdge(triple, seed, sorter.numVertices, sorter.partSize, e); assert pos.add(e[(int) (values.getLong(off + e[0]) + values.getLong(off + e[1]) + values.getLong(off + e[2])) % 3]); } } } pl.done(); break; } catch (ChunkedHashStore.DuplicateException e) { if (keys == null) throw new IllegalStateException( "You provided no keys, but the chunked hash store was not checked"); if (duplicates++ > 3) throw new IllegalArgumentException("The input list contains duplicates"); LOGGER.warn("Found duplicate. Recomputing triples..."); chunkedHashStore.reset(r.nextLong()); chunkedHashStore.addAll(keys.iterator()); } } globalSeed = chunkedHashStore.seed(); if (n > 0) { long m = values.size64(); final long length = bitVector.length(); final int numWords = (int) ((length + Long.SIZE - 1) / Long.SIZE); final int numCounts = (int) ((length + 32 * Long.SIZE - 1) / (32 * Long.SIZE)) * 2; // Init rank/select structure count = new long[numCounts + 1]; long c = 0; int pos = 0; for (int i = 0; i < numWords; i += WORDS_PER_SUPERBLOCK, pos += 2) { count[pos] = c; for (int j = 0; j < WORDS_PER_SUPERBLOCK; j++) { if (j != 0 && j % 6 == 0) count[pos + 1] |= (i + j <= numWords ? c - count[pos] : 0x7FFL) << 12 * (j / 6 - 1); if (i + j < numWords) c += countNonzeroPairs(array[i + j]); } } count[numCounts] = c; if (ASSERTS) { int k = 0; for (long i = 0; i < m; i++) { assert rank(i) == k : "(" + i + ") " + k + " != " + rank(i); if (values.getLong(i) != 0) k++; assert k <= n; } if (keys != null) { final Iterator<? extends T> iterator = keys.iterator(); for (long i = 0; i < n; i++) assert getLong(iterator.next()) < n; } } } else count = LongArrays.EMPTY_ARRAY; LOGGER.info("Completed."); LOGGER.debug( "Forecast bit cost per key: " + (2 * HypergraphSorter.GAMMA + 2. * Long.SIZE / BITS_PER_BLOCK)); LOGGER.info("Actual bit cost per key: " + (double) numBits() / n); if (signatureWidth != 0) { signatureMask = -1L >>> Long.SIZE - signatureWidth; (signatures = LongArrayBitVector.getInstance().asLongBigList(signatureWidth)).size(n); pl.expectedUpdates = n; pl.itemsName = "signatures"; pl.start("Signing..."); for (ChunkedHashStore.Chunk chunk : chunkedHashStore) { Iterator<long[]> iterator = chunk.iterator(); for (int i = chunk.size(); i-- != 0;) { final long[] triple = iterator.next(); final int[] e = new int[3]; signatures.set(getLongByTripleNoCheck(triple, e), signatureMask & triple[0]); pl.lightUpdate(); } } pl.done(); } else { signatureMask = 0; signatures = null; } if (!givenChunkedHashStore) chunkedHashStore.close(); }
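The value-assignment loop above relies on a small modular trick: the "hinge" index k of each hyperedge is encoded by giving the designated vertex v the 2-bit value (k - s) mod 3, where s is the sum of the values at the other two vertices, so that summing all three values mod 3 recovers k. The source additionally stores a computed 0 as the 2-bit value 3, so every used vertex is nonzero for the ranking structure; since 3 is congruent to 0 mod 3, lookups are unaffected. A minimal numeric check of the identity (variable names are illustrative, not from the source):

public class Mod3Assignment {
    public static void main(final String[] args) {
        final long a = 2, b = 1;      // values already assigned to the other two vertices
        for (int k = 0; k < 3; k++) { // k: index of the designated vertex within the edge
            // Same expression as "(k - s + 9) % 3" in the source; +9 keeps it non-negative.
            final long c = (k - (a + b) + 9) % 3;
            System.out.println("k=" + k + " -> c=" + c + ", recovered=" + ((a + b + c) % 3));
        }
    }
}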
From source file:it.unimi.dsi.sux4j.mph.CHDMinimalPerfectHashFunction.java
/** * Creates a new CHD minimal perfect hash function for the given keys. * //from w w w . jav a 2 s . c o m * @param keys the keys to hash, or {@code null}. * @param transform a transformation strategy for the keys. * @param lambda the average bucket size. * @param loadFactor the load factor. * @param signatureWidth a signature width, or 0 for no signature. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. */ protected CHDMinimalPerfectHashFunction(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, final int lambda, double loadFactor, final int signatureWidth, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException { this.transform = transform; final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator r = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, pl); chunkedHashStore.reset(r.nextLong()); chunkedHashStore.addAll(keys.iterator()); } n = chunkedHashStore.size(); defRetValue = -1; // For the very few cases in which we can decide int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE)); chunkShift = chunkedHashStore.log2Chunks(log2NumChunks); final int numChunks = 1 << log2NumChunks; LOGGER.debug("Number of chunks: " + numChunks); LOGGER.debug("Average chunk size: " + (double) n / numChunks); offsetNumBucketsSeed = new long[(numChunks + 1) * 3 + 2]; int duplicates = 0; final LongArrayList holes = new LongArrayList(); @SuppressWarnings("resource") final OfflineIterable<MutableLong, MutableLong> coefficients = new OfflineIterable<MutableLong, MutableLong>( new Serializer<MutableLong, MutableLong>() { @Override public void write(final MutableLong a, final DataOutput dos) throws IOException { long x = a.longValue(); while ((x & ~0x7FL) != 0) { dos.writeByte((int) (x | 0x80)); x >>>= 7; } dos.writeByte((int) x); } @Override public void read(final DataInput dis, final MutableLong x) throws IOException { byte b = dis.readByte(); long t = b & 0x7F; for (int shift = 7; (b & 0x80) != 0; shift += 7) { b = dis.readByte(); t |= (b & 0x7FL) << shift; } x.setValue(t); } }, new MutableLong()); for (;;) { LOGGER.debug("Generating minimal perfect hash function..."); holes.clear(); coefficients.clear(); pl.expectedUpdates = numChunks; pl.itemsName = "chunks"; pl.start("Analysing chunks... "); try { int chunkNumber = 0; for (ChunkedHashStore.Chunk chunk : chunkedHashStore) { /* We treat a chunk as a single hash function. The number of bins is thus * the first prime larger than the chunk size divided by the load factor. 
*/ final int p = Primes.nextPrime((int) Math.ceil(chunk.size() / loadFactor) + 1); final boolean used[] = new boolean[p]; final int numBuckets = (chunk.size() + lambda - 1) / lambda; numBuckets(chunkNumber + 1, numBuckets(chunkNumber) + numBuckets); final int[] cc0 = new int[numBuckets]; final int[] cc1 = new int[numBuckets]; @SuppressWarnings("unchecked") final ArrayList<long[]>[] bucket = new ArrayList[numBuckets]; for (int i = bucket.length; i-- != 0;) bucket[i] = new ArrayList<long[]>(); tryChunk: for (;;) { for (ArrayList<long[]> b : bucket) b.clear(); Arrays.fill(used, false); /* At each try, the allocation to keys to bucket is randomized differently. */ final long seed = r.nextLong(); // System.err.println( "Number of keys: " + chunk.size() + " Number of bins: " + p + " seed: " + seed ); /* We distribute the keys in this chunks in the buckets. */ for (Iterator<long[]> iterator = chunk.iterator(); iterator.hasNext();) { final long[] triple = iterator.next(); final long[] h = new long[3]; Hashes.spooky4(triple, seed, h); final ArrayList<long[]> b = bucket[(int) ((h[0] >>> 1) % numBuckets)]; h[1] = (int) ((h[1] >>> 1) % p); h[2] = (int) ((h[2] >>> 1) % (p - 1)) + 1; // All elements in a bucket must have either different h[ 1 ] or different h[ 2 ] for (long[] t : b) if (t[1] == h[1] && t[2] == h[2]) { LOGGER.info("Duplicate index" + Arrays.toString(t)); continue tryChunk; } b.add(h); } final int[] perm = Util.identity(bucket.length); IntArrays.quickSort(perm, new AbstractIntComparator() { private static final long serialVersionUID = 1L; @Override public int compare(int a0, int a1) { return Integer.compare(bucket[a1].size(), bucket[a0].size()); } }); for (int i = 0; i < perm.length;) { final LinkedList<Integer> bucketsToDo = new LinkedList<Integer>(); final int size = bucket[perm[i]].size(); //System.err.println( "Bucket size: " + size ); int j; // Gather indices of all buckets with the same size for (j = i; j < perm.length && bucket[perm[j]].size() == size; j++) bucketsToDo.add(Integer.valueOf(perm[j])); // Examine for each pair (c0,c1) the buckets still to do ext: for (int c1 = 0; c1 < p; c1++) for (int c0 = 0; c0 < p; c0++) { //System.err.println( "Testing " + c0 + ", " + c1 + " (to do: " + bucketsToDo.size() + ")" ); for (Iterator<Integer> iterator = bucketsToDo.iterator(); iterator.hasNext();) { final int k = iterator.next().intValue(); final ArrayList<long[]> b = bucket[k]; boolean completed = true; final IntArrayList done = new IntArrayList(); // Try to see whether the necessary entries are not used for (long[] h : b) { //assert k == h[ 0 ]; int pos = (int) ((h[1] + c0 * h[2] + c1) % p); //System.err.println( "Testing pos " + pos + " for " + Arrays.toString( e )); if (used[pos]) { completed = false; break; } else { used[pos] = true; done.add(pos); } } if (completed) { // All positions were free cc0[k] = c0; cc1[k] = c1; iterator.remove(); } else for (int d : done) used[d] = false; } if (bucketsToDo.isEmpty()) break ext; } if (!bucketsToDo.isEmpty()) continue tryChunk; seed(chunkNumber, seed); i = j; } break; } // System.err.println("DONE!"); if (ASSERTS) { final IntOpenHashSet pos = new IntOpenHashSet(); final long h[] = new long[3]; for (Iterator<long[]> iterator = chunk.iterator(); iterator.hasNext();) { final long[] triple = iterator.next(); Hashes.spooky4(triple, seed(chunkNumber), h); h[0] = (h[0] >>> 1) % numBuckets; h[1] = (int) ((h[1] >>> 1) % p); h[2] = (int) ((h[2] >>> 1) % (p - 1)) + 1; //System.err.println( Arrays.toString( e ) ); assert pos.add((int) ((h[1] + 
cc0[(int) (h[0])] * h[2] + cc1[(int) (h[0])]) % p)); } } final MutableLong l = new MutableLong(); for (int i = 0; i < numBuckets; i++) { l.setValue(cc0[i] + cc1[i] * p); coefficients.add(l); } for (int i = 0; i < p; i++) if (!used[i]) holes.add(offset(chunkNumber) + i); offset(chunkNumber + 1, offset(chunkNumber) + p); chunkNumber++; pl.update(); } pl.done(); break; } catch (ChunkedHashStore.DuplicateException e) { if (keys == null) throw new IllegalStateException( "You provided no keys, but the chunked hash store was not checked"); if (duplicates++ > 3) throw new IllegalArgumentException("The input list contains duplicates"); LOGGER.warn("Found duplicate. Recomputing triples..."); chunkedHashStore.reset(r.nextLong()); chunkedHashStore.addAll(keys.iterator()); } } rank = new SparseRank(offset(offsetNumBucketsSeed.length / 3 - 1), holes.size(), holes.iterator()); globalSeed = chunkedHashStore.seed(); this.coefficients = new EliasFanoLongBigList(new AbstractLongIterator() { final OfflineIterator<MutableLong, MutableLong> iterator = coefficients.iterator(); @Override public boolean hasNext() { return iterator.hasNext(); } public long nextLong() { return iterator.next().longValue(); } }, 0, true); coefficients.close(); LOGGER.info("Completed."); LOGGER.info("Actual bit cost per key: " + (double) numBits() / n); if (signatureWidth != 0) { signatureMask = -1L >>> Long.SIZE - signatureWidth; (signatures = LongArrayBitVector.getInstance().asLongBigList(signatureWidth)).size(n); pl.expectedUpdates = n; pl.itemsName = "signatures"; pl.start("Signing..."); for (ChunkedHashStore.Chunk chunk : chunkedHashStore) { Iterator<long[]> iterator = chunk.iterator(); for (int i = chunk.size(); i-- != 0;) { final long[] triple = iterator.next(); long t = getLongByTripleNoCheck(triple); signatures.set(t, signatureMask & triple[0]); pl.lightUpdate(); } } pl.done(); } else { signatureMask = 0; signatures = null; } if (!givenChunkedHashStore) chunkedHashStore.close(); }
From source file:it.unimi.dsi.sux4j.io.ChunkedHashStore.java
/** Checks that this store has no duplicate triples, and tries to rebuild the store if the check fails.
 *
 * @param iterable the elements with which the store will be refilled if there are duplicate triples.
 * @param values the values that will be associated with the elements returned by <code>iterable</code>.
 * @throws IllegalArgumentException if after a few trials the store still contains duplicate triples.
 */
public void checkAndRetry(final Iterable<? extends T> iterable, final LongIterable values) throws IOException {
    final RandomGenerator random = new XorShift1024StarRandomGenerator();
    int duplicates = 0;
    for (;;)
        try {
            check();
            break;
        } catch (DuplicateException e) {
            if (duplicates++ > 3)
                throw new IllegalArgumentException("The input list contains duplicates");
            LOGGER.warn("Found duplicate. Recomputing triples...");
            // Draw a fresh 64-bit seed and rebuild the triples from scratch.
            reset(random.nextLong());
            addAll(iterable.iterator(), values.iterator());
        }
    checkedForDuplicates = true;
}
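checkAndRetry above is the distilled form of the duplicate handling in every constructor on this page: on failure, draw a fresh seed with nextLong() and rebuild, giving up after a few attempts. A self-contained sketch of the same idiom, where buildWithSeed is a hypothetical stand-in for any seeded construction that can fail (the analogue of a DuplicateException):

import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;

public class ReseedAndRetry {
    /** Hypothetical seeded build step that fails for "unlucky" seeds. */
    static long buildWithSeed(final long seed) {
        if ((seed & 3) == 0) throw new IllegalStateException("unlucky seed");
        return seed;
    }

    public static void main(final String[] args) {
        final RandomGenerator random = new Well19937c();
        int failures = 0;
        for (;;) {
            final long seed = random.nextLong(); // fresh 64-bit seed per attempt
            try {
                buildWithSeed(seed);
                System.out.println("succeeded with seed " + seed);
                break;
            } catch (final IllegalStateException e) {
                if (failures++ > 3) throw new IllegalArgumentException("Too many failed attempts");
                System.out.println("seed " + seed + " failed; retrying...");
            }
        }
    }
}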
From source file:it.unimi.dsi.sux4j.mph.GOV4Function.java
/** Creates a new function for the given keys and values. * //from w ww . j ava2s . co m * @param keys the keys in the domain of the function, or {@code null}. * @param transform a transformation strategy for the keys. * @param signatureWidth a positive number for a signature width, 0 for no signature, a negative value for a self-signed function; if nonzero, {@code values} must be {@code null} and {@code width} must be -1. * @param values values to be assigned to each element, in the same order of the iterator returned by <code>keys</code>; if {@code null}, the * assigned value will the the ordinal number of each element. * @param dataWidth the bit width of the <code>values</code>, or -1 if <code>values</code> is {@code null}. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys associated with their ranks (if there are no values, or {@code indirect} is true) * or values, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. * @param indirect if true, <code>chunkedHashStore</code> contains ordinal positions, and <code>values</code> is a {@link LongIterable} that * must be accessed to retrieve the actual values. */ protected GOV4Function(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, int signatureWidth, final LongIterable values, final int dataWidth, final File tempDir, ChunkedHashStore<T> chunkedHashStore, final boolean indirect) throws IOException { this.transform = transform; if (signatureWidth != 0 && values != null) throw new IllegalArgumentException("You cannot sign a function if you specify its values"); if (signatureWidth != 0 && dataWidth != -1) throw new IllegalArgumentException("You cannot specify a signature width and a data width"); final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator r = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { if (keys == null) throw new IllegalArgumentException( "If you do not provide a chunked hash store, you must provide the keys"); chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, -Math.min(signatureWidth, 0), pl); chunkedHashStore.reset(r.nextLong()); if (values == null || indirect) chunkedHashStore.addAll(keys.iterator()); else chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null); } n = chunkedHashStore.size(); defRetValue = signatureWidth < 0 ? 0 : -1; // Self-signed maps get zero as default resturn value. if (n == 0) { m = this.globalSeed = chunkShift = this.width = 0; data = null; offsetAndSeed = null; signatureMask = 0; signatures = null; return; } int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE)); chunkShift = chunkedHashStore.log2Chunks(log2NumChunks); final int numChunks = 1 << log2NumChunks; LOGGER.debug("Number of chunks: " + numChunks); offsetAndSeed = new long[numChunks + 1]; this.width = signatureWidth < 0 ? -signatureWidth : dataWidth == -1 ? Fast.ceilLog2(n) : dataWidth; // Candidate data; might be discarded for compaction. 
@SuppressWarnings("resource") final OfflineIterable<BitVector, LongArrayBitVector> offlineData = new OfflineIterable<BitVector, LongArrayBitVector>( BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance()); int duplicates = 0; for (;;) { LOGGER.debug("Generating GOV function with " + this.width + " output bits..."); pl.expectedUpdates = numChunks; pl.itemsName = "chunks"; pl.start("Analysing chunks... "); try { int q = 0; final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(); final LongBigList data = dataBitVector.asLongBigList(this.width); long unsolvable = 0; for (final ChunkedHashStore.Chunk chunk : chunkedHashStore) { offsetAndSeed[q + 1] = offsetAndSeed[q] + Math.max((C_TIMES_256 * chunk.size() >>> 8), chunk.size() + 1); long seed = 0; final int v = (int) (offsetAndSeed[q + 1] - offsetAndSeed[q]); final Linear4SystemSolver<BitVector> solver = new Linear4SystemSolver<BitVector>(v, chunk.size()); for (;;) { final boolean solved = solver.generateAndSolve(chunk, seed, new AbstractLongBigList() { private final LongBigList valueList = indirect ? (values instanceof LongList ? LongBigLists.asBigList((LongList) values) : (LongBigList) values) : null; @Override public long size64() { return chunk.size(); } @Override public long getLong(final long index) { return indirect ? valueList.getLong(chunk.data(index)) : chunk.data(index); } }); unsolvable += solver.unsolvable; if (solved) break; seed += SEED_STEP; if (seed == 0) throw new AssertionError("Exhausted local seeds"); } this.offsetAndSeed[q] |= seed; dataBitVector.fill(false); data.size(v); q++; /* We assign values. */ final long[] solution = solver.solution; for (int i = 0; i < solution.length; i++) data.set(i, solution[i]); offlineData.add(dataBitVector); pl.update(); } LOGGER.info("Unsolvable systems: " + unsolvable + "/" + numChunks + " (" + Util.format(100.0 * unsolvable / numChunks) + "%)"); pl.done(); break; } catch (ChunkedHashStore.DuplicateException e) { if (keys == null) throw new IllegalStateException( "You provided no keys, but the chunked hash store was not checked"); if (duplicates++ > 3) throw new IllegalArgumentException("The input list contains duplicates"); LOGGER.warn("Found duplicate. Recomputing triples..."); chunkedHashStore.reset(r.nextLong()); pl.itemsName = "keys"; if (values == null || indirect) chunkedHashStore.addAll(keys.iterator()); else chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null); } } if (DEBUG) System.out.println("Offsets: " + Arrays.toString(offsetAndSeed)); globalSeed = chunkedHashStore.seed(); m = offsetAndSeed[offsetAndSeed.length - 1]; final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(m * this.width); this.data = dataBitVector.asLongBigList(this.width); OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator(); while (iterator.hasNext()) dataBitVector.append(iterator.next()); iterator.close(); offlineData.close(); LOGGER.info("Completed."); LOGGER.info("Forecast bit cost per element: " + C * this.width); LOGGER.info("Actual bit cost per element: " + (double) numBits() / n); if (signatureWidth > 0) { signatureMask = -1L >>> Long.SIZE - signatureWidth; signatures = chunkedHashStore.signatures(signatureWidth, pl); } else if (signatureWidth < 0) { signatureMask = -1L >>> Long.SIZE + signatureWidth; signatures = null; } else { signatureMask = 0; signatures = null; } if (!givenChunkedHashStore) chunkedHashStore.close(); }
From source file:it.unimi.dsi.sux4j.mph.MWHCFunction.java
/** Creates a new function for the given keys and values. * // ww w . j av a 2 s .co m * @param keys the keys in the domain of the function, or {@code null}. * @param transform a transformation strategy for the keys. * @param signatureWidth a positive number for a signature width, 0 for no signature, a negative value for a self-signed function; if nonzero, {@code values} must be {@code null} and {@code width} must be -1. * @param values values to be assigned to each element, in the same order of the iterator returned by <code>keys</code>; if {@code null}, the * assigned value will the the ordinal number of each element. * @param dataWidth the bit width of the <code>values</code>, or -1 if <code>values</code> is {@code null}. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys associated with their ranks (if there are no values, or {@code indirect} is true) * or values, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. * @param indirect if true, <code>chunkedHashStore</code> contains ordinal positions, and <code>values</code> is a {@link LongIterable} that * must be accessed to retrieve the actual values. */ protected MWHCFunction(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, int signatureWidth, final LongIterable values, final int dataWidth, final File tempDir, ChunkedHashStore<T> chunkedHashStore, boolean indirect) throws IOException { this.transform = transform; if (signatureWidth != 0 && values != null) throw new IllegalArgumentException("You cannot sign a function if you specify its values"); if (signatureWidth != 0 && dataWidth != -1) throw new IllegalArgumentException("You cannot specify a signature width and a data width"); // If we have no keys, values must be a random-access list of longs. final LongBigList valueList = indirect ? (values instanceof LongList ? LongBigLists.asBigList((LongList) values) : (LongBigList) values) : null; final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator r = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { if (keys == null) throw new IllegalArgumentException( "If you do not provide a chunked hash store, you must provide the keys"); chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, -Math.min(signatureWidth, 0), pl); chunkedHashStore.reset(r.nextLong()); if (values == null || indirect) chunkedHashStore.addAll(keys.iterator()); else chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null); } n = chunkedHashStore.size(); defRetValue = signatureWidth < 0 ? 0 : -1; // Self-signed maps get zero as default resturn value. if (n == 0) { m = this.globalSeed = chunkShift = this.width = 0; data = null; marker = null; rank = null; seed = null; offset = null; signatureMask = 0; signatures = null; return; } int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE)); chunkShift = chunkedHashStore.log2Chunks(log2NumChunks); final int numChunks = 1 << log2NumChunks; LOGGER.debug("Number of chunks: " + numChunks); seed = new long[numChunks]; offset = new long[numChunks + 1]; this.width = signatureWidth < 0 ? -signatureWidth : dataWidth == -1 ? 
Fast.ceilLog2(n) : dataWidth; // Candidate data; might be discarded for compaction. @SuppressWarnings("resource") final OfflineIterable<BitVector, LongArrayBitVector> offlineData = new OfflineIterable<BitVector, LongArrayBitVector>( BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance()); int duplicates = 0; for (;;) { LOGGER.debug("Generating MWHC function with " + this.width + " output bits..."); long seed = 0; pl.expectedUpdates = numChunks; pl.itemsName = "chunks"; pl.start("Analysing chunks... "); try { int q = 0; final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(); final LongBigList data = dataBitVector.asLongBigList(this.width); for (ChunkedHashStore.Chunk chunk : chunkedHashStore) { HypergraphSorter<BitVector> sorter = new HypergraphSorter<BitVector>(chunk.size()); do { seed = r.nextLong(); } while (!sorter.generateAndSort(chunk.iterator(), seed)); this.seed[q] = seed; dataBitVector.fill(false); data.size(sorter.numVertices); offset[q + 1] = offset[q] + sorter.numVertices; /* We assign values. */ int top = chunk.size(), x, k; final int[] stack = sorter.stack; final int[] vertex1 = sorter.vertex1; final int[] vertex2 = sorter.vertex2; final int[] edge = sorter.edge; while (top > 0) { x = stack[--top]; k = edge[x]; final long s = data.getLong(vertex1[x]) ^ data.getLong(vertex2[x]); final long value = indirect ? valueList.getLong(chunk.data(k)) : chunk.data(k); data.set(x, value ^ s); if (ASSERTS) assert (value == (data.getLong(x) ^ data.getLong(vertex1[x]) ^ data.getLong(vertex2[x]))) : "<" + x + "," + vertex1[x] + "," + vertex2[x] + ">: " + value + " != " + (data.getLong(x) ^ data.getLong(vertex1[x]) ^ data.getLong(vertex2[x])); } q++; offlineData.add(dataBitVector); pl.update(); } pl.done(); break; } catch (ChunkedHashStore.DuplicateException e) { if (keys == null) throw new IllegalStateException( "You provided no keys, but the chunked hash store was not checked"); if (duplicates++ > 3) throw new IllegalArgumentException("The input list contains duplicates"); LOGGER.warn("Found duplicate. Recomputing triples..."); chunkedHashStore.reset(r.nextLong()); pl.itemsName = "keys"; if (values == null || indirect) chunkedHashStore.addAll(keys.iterator()); else chunkedHashStore.addAll(keys.iterator(), values != null ? 
values.iterator() : null); } } if (DEBUG) System.out.println("Offsets: " + Arrays.toString(offset)); globalSeed = chunkedHashStore.seed(); // Check for compaction long nonZero = 0; m = offset[offset.length - 1]; { final OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator(); while (iterator.hasNext()) { final LongBigList data = iterator.next().asLongBigList(this.width); for (long i = 0; i < data.size64(); i++) if (data.getLong(i) != 0) nonZero++; } iterator.close(); } // We estimate size using Rank16 if (nonZero * this.width + m * 1.126 < m * this.width) { LOGGER.info("Compacting..."); marker = LongArrayBitVector.ofLength(m); final LongBigList newData = LongArrayBitVector.getInstance().asLongBigList(this.width); newData.size(nonZero); nonZero = 0; final OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator(); long j = 0; while (iterator.hasNext()) { final LongBigList data = iterator.next().asLongBigList(this.width); for (long i = 0; i < data.size64(); i++, j++) { final long value = data.getLong(i); if (value != 0) { marker.set(j); newData.set(nonZero++, value); } } } iterator.close(); rank = new Rank16(marker); if (ASSERTS) { final OfflineIterator<BitVector, LongArrayBitVector> iterator2 = offlineData.iterator(); long k = 0; while (iterator2.hasNext()) { final LongBigList data = iterator2.next().asLongBigList(this.width); for (long i = 0; i < data.size64(); i++, k++) { final long value = data.getLong(i); assert (value != 0) == marker.getBoolean(k); if (value != 0) assert value == newData.getLong(rank.rank(k)) : value + " != " + newData.getLong(rank.rank(k)); } } iterator2.close(); } this.data = newData; } else { final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(m * this.width); this.data = dataBitVector.asLongBigList(this.width); OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator(); while (iterator.hasNext()) dataBitVector.append(iterator.next()); iterator.close(); marker = null; rank = null; } offlineData.close(); LOGGER.info("Completed."); LOGGER.debug("Forecast bit cost per element: " + (marker == null ? HypergraphSorter.GAMMA * this.width : HypergraphSorter.GAMMA + this.width + 0.126)); LOGGER.info("Actual bit cost per element: " + (double) numBits() / n); if (signatureWidth > 0) { signatureMask = -1L >>> Long.SIZE - signatureWidth; signatures = chunkedHashStore.signatures(signatureWidth, pl); } else if (signatureWidth < 0) { signatureMask = -1L >>> Long.SIZE + signatureWidth; signatures = null; } else { signatureMask = 0; signatures = null; } if (!givenChunkedHashStore) chunkedHashStore.close(); }
From source file:it.unimi.dsi.sux4j.mph.GOV3Function.java
/** Creates a new function for the given keys and values. * /* ww w . jav a2s.c om*/ * @param keys the keys in the domain of the function, or {@code null}. * @param transform a transformation strategy for the keys. * @param signatureWidth a positive number for a signature width, 0 for no signature, a negative value for a self-signed function; if nonzero, {@code values} must be {@code null} and {@code width} must be -1. * @param values values to be assigned to each element, in the same order of the iterator returned by <code>keys</code>; if {@code null}, the * assigned value will the the ordinal number of each element. * @param dataWidth the bit width of the <code>values</code>, or -1 if <code>values</code> is {@code null}. * @param indirect if true, <code>chunkedHashStore</code> contains ordinal positions, and <code>values</code> is a {@link LongIterable} that * must be accessed to retrieve the actual values. * @param compacted if true, the coefficients will be compacted. * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory. * @param chunkedHashStore a chunked hash store containing the keys associated with their ranks (if there are no values, or {@code indirect} is true) * or values, or {@code null}; the store * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}. */ protected GOV3Function(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform, int signatureWidth, final LongIterable values, final int dataWidth, final boolean indirect, final boolean compacted, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException { this.transform = transform; if (signatureWidth != 0 && values != null) throw new IllegalArgumentException("You cannot sign a function if you specify its values"); if (signatureWidth != 0 && dataWidth != -1) throw new IllegalArgumentException("You cannot specify a signature width and a data width"); final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true; pl.displayFreeMemory = true; final RandomGenerator r = new XorShift1024StarRandomGenerator(); pl.itemsName = "keys"; final boolean givenChunkedHashStore = chunkedHashStore != null; if (!givenChunkedHashStore) { if (keys == null) throw new IllegalArgumentException( "If you do not provide a chunked hash store, you must provide the keys"); chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, -Math.min(signatureWidth, 0), pl); chunkedHashStore.reset(r.nextLong()); if (values == null || indirect) chunkedHashStore.addAll(keys.iterator()); else chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null); } n = chunkedHashStore.size(); defRetValue = signatureWidth < 0 ? 0 : -1; // Self-signed maps get zero as default resturn value. if (n == 0) { m = this.globalSeed = chunkShift = this.width = 0; data = null; marker = null; rank = null; offsetAndSeed = null; signatureMask = 0; signatures = null; if (!givenChunkedHashStore) chunkedHashStore.close(); return; } int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE)); chunkShift = chunkedHashStore.log2Chunks(log2NumChunks); final int numChunks = 1 << log2NumChunks; LOGGER.debug("Number of chunks: " + numChunks); offsetAndSeed = new long[numChunks + 1]; this.width = signatureWidth < 0 ? -signatureWidth : dataWidth == -1 ? Fast.ceilLog2(n) : dataWidth; // Candidate data; might be discarded for compaction. 
@SuppressWarnings("resource") final OfflineIterable<BitVector, LongArrayBitVector> offlineData = new OfflineIterable<BitVector, LongArrayBitVector>( BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance()); int duplicates = 0; for (;;) { LOGGER.debug("Generating GOV function with " + this.width + " output bits..."); pl.expectedUpdates = numChunks; pl.itemsName = "chunks"; pl.start("Analysing chunks... "); try { int q = 0; final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(); final LongBigList data = dataBitVector.asLongBigList(this.width); long unsolvable = 0; for (final ChunkedHashStore.Chunk chunk : chunkedHashStore) { offsetAndSeed[q + 1] = offsetAndSeed[q] + (C_TIMES_256 * chunk.size() >>> 8); long seed = 0; final int v = (int) (offsetAndSeed[q + 1] - offsetAndSeed[q]); final Linear3SystemSolver<BitVector> solver = new Linear3SystemSolver<BitVector>(v, chunk.size()); for (;;) { final boolean solved = solver.generateAndSolve(chunk, seed, new AbstractLongBigList() { private final LongBigList valueList = indirect ? (values instanceof LongList ? LongBigLists.asBigList((LongList) values) : (LongBigList) values) : null; @Override public long size64() { return chunk.size(); } @Override public long getLong(final long index) { return indirect ? valueList.getLong(chunk.data(index)) : chunk.data(index); } }); unsolvable += solver.unsolvable; if (solved) break; seed += SEED_STEP; if (seed == 0) throw new AssertionError("Exhausted local seeds"); } this.offsetAndSeed[q] |= seed; dataBitVector.fill(false); data.size(v); q++; /* We assign values. */ final long[] solution = solver.solution; for (int i = 0; i < solution.length; i++) data.set(i, solution[i]); offlineData.add(dataBitVector); pl.update(); } LOGGER.info("Unsolvable systems: " + unsolvable + "/" + numChunks + " (" + Util.format(100.0 * unsolvable / numChunks) + "%)"); pl.done(); break; } catch (ChunkedHashStore.DuplicateException e) { if (keys == null) throw new IllegalStateException( "You provided no keys, but the chunked hash store was not checked"); if (duplicates++ > 3) throw new IllegalArgumentException("The input list contains duplicates"); LOGGER.warn("Found duplicate. Recomputing triples..."); chunkedHashStore.reset(r.nextLong()); pl.itemsName = "keys"; if (values == null || indirect) chunkedHashStore.addAll(keys.iterator()); else chunkedHashStore.addAll(keys.iterator(), values != null ? 
values.iterator() : null); } } if (DEBUG) System.out.println("Offsets: " + Arrays.toString(offsetAndSeed)); globalSeed = chunkedHashStore.seed(); // Check for compaction long nonZero = 0; m = offsetAndSeed[offsetAndSeed.length - 1]; { final OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator(); while (iterator.hasNext()) { final LongBigList data = iterator.next().asLongBigList(this.width); for (long i = 0; i < data.size64(); i++) if (data.getLong(i) != 0) nonZero++; } iterator.close(); } if (compacted) { LOGGER.info("Compacting..."); marker = LongArrayBitVector.ofLength(m); final LongBigList newData = LongArrayBitVector.getInstance().asLongBigList(this.width); newData.size(nonZero); nonZero = 0; final OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator(); long j = 0; while (iterator.hasNext()) { final LongBigList data = iterator.next().asLongBigList(this.width); for (long i = 0; i < data.size64(); i++, j++) { final long value = data.getLong(i); if (value != 0) { marker.set(j); newData.set(nonZero++, value); } } } iterator.close(); rank = new Rank16(marker); if (ASSERTS) { final OfflineIterator<BitVector, LongArrayBitVector> iterator2 = offlineData.iterator(); long k = 0; while (iterator2.hasNext()) { final LongBigList data = iterator2.next().asLongBigList(this.width); for (long i = 0; i < data.size64(); i++, k++) { final long value = data.getLong(i); assert (value != 0) == marker.getBoolean(k); if (value != 0) assert value == newData.getLong(rank.rank(k)) : value + " != " + newData.getLong(rank.rank(k)); } } iterator2.close(); } this.data = newData; } else { final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(m * this.width); this.data = dataBitVector.asLongBigList(this.width); OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator(); while (iterator.hasNext()) dataBitVector.append(iterator.next()); iterator.close(); marker = null; rank = null; } offlineData.close(); LOGGER.info("Completed."); LOGGER.debug( "Forecast bit cost per element: " + (marker == null ? C * this.width : C + this.width + 0.126)); LOGGER.info("Actual bit cost per element: " + (double) numBits() / n); if (signatureWidth > 0) { signatureMask = -1L >>> Long.SIZE - signatureWidth; signatures = chunkedHashStore.signatures(signatureWidth, pl); } else if (signatureWidth < 0) { signatureMask = -1L >>> Long.SIZE + signatureWidth; signatures = null; } else { signatureMask = 0; signatures = null; } if (!givenChunkedHashStore) chunkedHashStore.close(); }
From source file:it.unimi.dsi.sux4j.mph.VLLcpMonotoneMinimalPerfectHashFunction.java
@SuppressWarnings("unused") public VLLcpMonotoneMinimalPerfectHashFunction(final Iterable<? extends T> iterable, final int numElements, final TransformationStrategy<? super T> transform) throws IOException { final ProgressLogger pl = new ProgressLogger(LOGGER); pl.displayLocalSpeed = true;//from www .java 2s . c o m pl.displayFreeMemory = true; this.transform = transform; final RandomGenerator r = new XorShift1024StarRandomGenerator(); if (numElements == -1) { if (iterable instanceof Size64) n = ((Size64) iterable).size64(); else if (iterable instanceof Collection) n = ((Collection<?>) iterable).size(); else { long c = 0; for (T dummy : iterable) c++; n = c; } } else n = numElements; if (n == 0) { bucketSize = bucketSizeMask = log2BucketSize = 0; lcp2Bucket = null; offsets = null; lcpLengths = null; mph = null; return; } defRetValue = -1; // For the very few cases in which we can decide int theoreticalBucketSize = (int) Math .ceil(1 + GOV3Function.C * Math.log(2) + Math.log(n) - Math.log(1 + Math.log(n))); log2BucketSize = Fast.ceilLog2(theoreticalBucketSize); bucketSize = 1 << log2BucketSize; bucketSizeMask = bucketSize - 1; final long numBuckets = (n + bucketSize - 1) / bucketSize; LongArrayBitVector prev = LongArrayBitVector.getInstance(); LongArrayBitVector curr = LongArrayBitVector.getInstance(); int currLcp = 0; int maxLcp = 0, minLcp = Integer.MAX_VALUE; long maxLength = 0, totalLength = 0; @SuppressWarnings("resource") final ChunkedHashStore<BitVector> chunkedHashStore = new ChunkedHashStore<BitVector>( TransformationStrategies.identity(), pl); chunkedHashStore.reset(r.nextLong()); @SuppressWarnings("resource") OfflineIterable<BitVector, LongArrayBitVector> lcps = new OfflineIterable<BitVector, LongArrayBitVector>( BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance()); pl.expectedUpdates = n; pl.start("Scanning collection..."); Iterator<? extends T> iterator = iterable.iterator(); for (long b = 0; b < numBuckets; b++) { prev.replace(transform.toBitVector(iterator.next())); chunkedHashStore.add(prev); pl.lightUpdate(); maxLength = Math.max(maxLength, prev.length()); totalLength += Fast.length(1 + prev.length()); currLcp = (int) prev.length(); final int currBucketSize = (int) Math.min(bucketSize, n - b * bucketSize); for (int i = 0; i < currBucketSize - 1; i++) { curr.replace(transform.toBitVector(iterator.next())); chunkedHashStore.add(curr); pl.lightUpdate(); final int prefix = (int) curr.longestCommonPrefixLength(prev); if (prefix == prev.length() && prefix == curr.length()) throw new IllegalArgumentException("The input bit vectors are not distinct"); if (prefix == prev.length() || prefix == curr.length()) throw new IllegalArgumentException("The input bit vectors are not prefix-free"); if (prev.getBoolean(prefix)) throw new IllegalArgumentException("The input bit vectors are not lexicographically sorted"); currLcp = Math.min(prefix, currLcp); prev.replace(curr); maxLength = Math.max(maxLength, prev.length()); totalLength += Fast.length(1 + prev.length()); } lcps.add(prev.subVector(0, currLcp)); maxLcp = Math.max(maxLcp, currLcp); minLcp = Math.min(minLcp, currLcp); } pl.done(); // Build function assigning each lcp to its bucket. 
lcp2Bucket = new GOV3Function.Builder<BitVector>().keys(lcps).transform(TransformationStrategies.identity()) .build(); final int[][] lcpLength = IntBigArrays.newBigArray(lcps.size64()); long p = 0; for (LongArrayBitVector bv : lcps) IntBigArrays.set(lcpLength, p++, (int) bv.length()); if (DEBUG) { for (BitVector v : lcps) System.err.println(v + " " + v.length()); for (BitVector v : lcps) { final long value = lcp2Bucket.getLong(v); if (p++ != value) { System.err.println("p: " + (p - 1) + " value: " + value + " key:" + v); throw new AssertionError(); } } } lcps.close(); final Iterable<BitVector> bitVectors = TransformationStrategies.wrap(iterable, transform); // Build mph on elements. mph = new GOVMinimalPerfectHashFunction.Builder<BitVector>().keys(bitVectors) .transform(TransformationStrategies.identity()).store(chunkedHashStore).build(); this.seed = chunkedHashStore.seed(); // Build function assigning the lcp length and the bucketing data to each element. (offsets = LongArrayBitVector.getInstance().asLongBigList(log2BucketSize)).size(n); LongBigList lcpLengthsTemp = LongArrayBitVector.getInstance().asLongBigList(Fast.length(maxLcp)); lcpLengthsTemp.size(n); LOGGER.info("Generating data tables..."); for (ChunkedHashStore.Chunk chunk : chunkedHashStore) { for (long[] quadruple : chunk) { final long index = mph.getLongByTriple(quadruple); offsets.set(index, quadruple[3] & bucketSizeMask); lcpLengthsTemp.set(index, IntBigArrays.get(lcpLength, (int) (quadruple[3] >> log2BucketSize))); } } chunkedHashStore.close(); lcpLengths = new EliasFanoLongBigList(lcpLengthsTemp.iterator(), minLcp, true); if (DEBUG) { p = 0; for (T key : iterable) { BitVector bv = transform.toBitVector(key); long index = mph.getLong(bv); if (p++ != lcp2Bucket.getLong(bv.subVector(0, lcpLengths.getLong(index))) * bucketSize + offsets.getLong(index)) { System.err.println("p: " + (p - 1) + " Key: " + key + " bucket size: " + bucketSize + " lcp " + transform.toBitVector(key).subVector(0, lcpLengths.getLong(index)) + " lcp length: " + lcpLengths.getLong(index) + " bucket " + lcp2Bucket.getLong(transform.toBitVector(key).subVector(0, lcpLengths.getLong(index))) + " offset: " + offsets.getLong(index)); throw new AssertionError(); } } } LOGGER.debug("Bucket size: " + bucketSize); final double avgLength = (double) totalLength / n; LOGGER.debug("Forecast bit cost per element: " + (2 * GOV3Function.C + 2 + avgLength + Fast.log2(avgLength) + Fast.log2(Math.E) - Fast.log2(Fast.log2(Math.E)) + Fast.log2(1 + Fast.log2(n)))); LOGGER.info("Actual bit cost per element: " + (double) numBits() / n); }
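One practical note on all of the above: because the seeds come from nextLong(), reseeding the generator makes the whole seed stream, and hence a failing run, replayable. The RandomGenerator interface defines setSeed(long), and the sketch below shows the determinism with commons-math3's JDKRandomGenerator; applying this to the constructions above is an assumption, since they also depend on the store state.

import org.apache.commons.math3.random.JDKRandomGenerator;
import org.apache.commons.math3.random.RandomGenerator;

public class ReproducibleSeeds {
    public static void main(final String[] args) {
        // Two identically seeded generators yield the same nextLong() stream.
        final RandomGenerator a = new JDKRandomGenerator();
        final RandomGenerator b = new JDKRandomGenerator();
        a.setSeed(0xDEADBEEFL);
        b.setSeed(0xDEADBEEFL);
        for (int i = 0; i < 3; i++)
            System.out.println(a.nextLong() + " == " + b.nextLong());
    }
}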