Example usage for org.apache.commons.math3.random RandomGenerator nextLong

List of usage examples for org.apache.commons.math3.random RandomGenerator nextLong


On this page you can find usage examples for the nextLong method of org.apache.commons.math3.random RandomGenerator.


long nextLong();

Source Link


Returns the next pseudorandom, uniformly distributed long value from this random number generator's sequence.


From source file:it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction.java

/**
 * Creates a new minimal perfect hash function for the given keys.
 *
 * @param keys the keys to hash, or {@code null}.
 * @param transform a transformation strategy for the keys.
 * @param signatureWidth a signature width, or 0 for no signature.
 * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
 * @param chunkedHashStore a chunked hash store containing the keys, or {@code null}; the store
 * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}.
 */
protected GOVMinimalPerfectHashFunction(final Iterable<? extends T> keys,
        final TransformationStrategy<? super T> transform, final int signatureWidth, final File tempDir,
        ChunkedHashStore<T> chunkedHashStore) throws IOException {
    // Builds the function chunk by chunk from a chunked hash store (either supplied or created here).
    // NOTE(review): this excerpt has lost lines of the original source (closing braces and,
    // apparently, a few statements), so it does not compile as shown. The comments below describe
    // only what the visible lines do.
    this.transform = transform;

    // Progress logger configured to report local speed and free memory.
    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    // NOTE(review): r is never referenced in the visible lines; presumably it is used in omitted lines.
    final RandomGenerator r = new XorShift1024StarRandomGenerator();
    pl.itemsName = "keys";

    // Remember whether the caller supplied a store; if not, one is created here, passing tempDir.
    final boolean givenChunkedHashStore = chunkedHashStore != null;
    if (!givenChunkedHashStore) {
        chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, pl);
    // NOTE(review): the closing brace of the if-block above is missing in this excerpt.
    n = chunkedHashStore.size();

    defRetValue = -1; // For the very few cases in which we can decide

    // The number of chunks is a power of two derived from n >> LOG2_CHUNK_SIZE (at least one chunk).
    int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE));
    chunkShift = chunkedHashStore.log2Chunks(log2NumChunks);
    final int numChunks = 1 << log2NumChunks;

    LOGGER.debug("Number of chunks: " + numChunks);

    // One entry per chunk plus a final one: each entry accumulates the chunk sizes seen so far
    // and later has the chunk's seed OR-ed into it (see the chunk loop below).
    edgeOffsetAndSeed = new long[numChunks + 1];

    // values is a view of bitVector as a list of 2-bit entries, sized n * C_TIMES_256 >> 8;
    // array is taken from bitVector.bits().
    bitVector = LongArrayBitVector.getInstance();
    (values = bitVector.asLongBigList(2)).size(n * C_TIMES_256 >> 8);
    array = bitVector.bits();

    // Counts DuplicateExceptions caught so far; see the catch clause below.
    int duplicates = 0;

    // Each iteration is one construction attempt over all chunks.
    // NOTE(review): no exit from this loop is visible in the excerpt.
    for (;;) {
        LOGGER.debug("Generating minimal perfect hash function...");

        pl.expectedUpdates = numChunks;
        pl.itemsName = "chunks";
        pl.start("Analysing chunks... ");

        try {
            // q indexes the current chunk. NOTE(review): no increment of q is visible in the excerpt.
            int q = 0;
            // Running totals of the solver's unorientable/unsolvable counters, logged after the loop.
            long unorientable = 0, unsolvable = 0;
            for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {

                // Cumulative number of keys up to and including this chunk.
                edgeOffsetAndSeed[q + 1] = edgeOffsetAndSeed[q] + chunk.size();

                long seed = 0;
                // off is where this chunk's entries start in values; the solver is sized with the
                // distance to the next chunk's vertex offset and with the chunk size.
                final long off = vertexOffset(edgeOffsetAndSeed[q]);
                final Linear3SystemSolver<BitVector> solver = new Linear3SystemSolver<BitVector>(
                        (int) (vertexOffset(edgeOffsetAndSeed[q + 1]) - off), chunk.size());

                // Try local seeds 0, SEED_STEP, 2 * SEED_STEP, ...; a seed wrapping back to 0 means
                // every local seed has been tried.
                for (;;) {
                    final boolean solved = solver.generateAndSolve(chunk, seed, null);
                    unorientable += solver.unorientable;
                    unsolvable += solver.unsolvable;
                    // NOTE(review): the statement governed by this if (presumably a break) is missing
                    // in the excerpt; as shown, the if would govern the seed increment instead.
                    if (solved)
                    seed += SEED_STEP;
                    if (seed == 0)
                        throw new AssertionError("Exhausted local seeds");

                // Record the seed in this chunk's entry and copy the solver's solution into values,
                // starting at off.
                this.edgeOffsetAndSeed[q] |= seed;
                final long[] solution = solver.solution;
                for (int i = 0; i < solution.length; i++)
                    values.set(i + off, solution[i]);


                // When ASSERTS is set, check that the vertices selected for the triples of this chunk
                // are pairwise distinct (each pos.add() must report a new element).
                // NOTE(review): the solver above was sized using edgeOffsetAndSeed[q + 1], whereas the
                // call below uses edgeOffsetAndSeed[q] -- confirm which index is intended.
                if (ASSERTS) {
                    final IntOpenHashSet pos = new IntOpenHashSet();
                    final int[] e = new int[3];
                    for (long[] triple : chunk) {
                        Linear3SystemSolver.tripleToEquation(triple, seed,
                                (int) (vertexOffset(edgeOffsetAndSeed[q]) - off), e);

                        assert pos
                                .add(e[(int) (values.getLong(off + e[0]) + values.getLong(off + e[1])
                                        + values.getLong(off + e[2])) % 3]) : "<" + e[0] + "," + e[1] + ","
                                                + e[2] + ">: "
                                                + e[(int) (values.getLong(off + e[0])
                                                        + values.getLong(off + e[1])
                                                        + values.getLong(off + e[2])) % 3];

            // Report the accumulated solver counters relative to the number of chunks.
            LOGGER.info("Unorientable graphs: " + unorientable + "/" + numChunks + " ("
                    + Util.format(100.0 * unorientable / numChunks) + "%)");
            LOGGER.info("Unsolvable systems: " + unsolvable + "/" + numChunks + " ("
                    + Util.format(100.0 * unsolvable / numChunks) + "%)");

        } catch (ChunkedHashStore.DuplicateException e) {
            // Without keys there is nothing to recompute from; otherwise the attempt is repeated.
            // Because of the post-increment, the IllegalArgumentException is thrown on the fifth
            // caught DuplicateException.
            if (keys == null)
                throw new IllegalStateException(
                        "You provided no keys, but the chunked hash store was not checked");
            if (duplicates++ > 3)
                throw new IllegalArgumentException("The input list contains duplicates");
            LOGGER.warn("Found duplicate. Recomputing triples...");

    globalSeed = chunkedHashStore.seed();

    LOGGER.debug("Forecast bit cost per key: " + 2 * C + 64. / (1 << LOG2_CHUNK_SIZE));
    LOGGER.info("Actual bit cost per key: " + (double) numBits() / n);

    // Optional signatures: one signatureWidth-bit entry per key.
    if (signatureWidth != 0) {
        // '-' binds tighter than '>>>', so the shift count is Long.SIZE - signatureWidth,
        // which leaves the signatureWidth lowest bits set.
        signatureMask = -1L >>> Long.SIZE - signatureWidth;
        (signatures = LongArrayBitVector.getInstance().asLongBigList(signatureWidth)).size(n);
        pl.expectedUpdates = n;
        pl.itemsName = "signatures";
        // For every triple, store the masked first component at the index returned by
        // getLongByTripleNoCheck() for that triple.
        for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {
            Iterator<long[]> iterator = chunk.iterator();
            for (int i = chunk.size(); i-- != 0;) {
                final long[] triple = iterator.next();
                final int[] e = new int[3];
                signatures.set(getLongByTripleNoCheck(triple, e), signatureMask & triple[0]);
    } else {
        signatureMask = 0;
        signatures = null;

    // NOTE(review): the statement governed by this if is missing in the excerpt; presumably it
    // releases the store that was created above -- confirm against the original source.
    if (!givenChunkedHashStore)

From source file:it.unimi.dsi.sux4j.mph.TwoStepsGOV3Function.java

/** Creates a new two-step function for the given keys and values.
 *
 * @param keys the keys in the domain of the function.
 * @param transform a transformation strategy for the keys.
 * @param values values to be assigned to each key, in the same order of the iterator returned by <code>keys</code>; if {@code null}, the
 * assigned value will be the ordinal number of each key.
 * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
 * @param chunkedHashStore a chunked hash store containing the keys associated with their rank, or {@code null}; the store
 * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}.
 */
protected TwoStepsGOV3Function(final Iterable<? extends T> keys,
        final TransformationStrategy<? super T> transform, final LongBigList values, final File tempDir,
        ChunkedHashStore<T> chunkedHashStore) throws IOException {
    // Builds a two-step function: a first function for the most frequent values (addressed through
    // remap) and a second function for the keys whose first-function output is the escape value.
    // NOTE(review): this excerpt has lost lines of the original source (closing braces and,
    // apparently, a few statements), so it does not compile as shown. The comments below describe
    // only what the visible lines do.
    this.transform = transform;
    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    // NOTE(review): random is never referenced in the visible lines; presumably used in omitted lines.
    final RandomGenerator random = new XorShift1024StarRandomGenerator();
    pl.itemsName = "keys";

    // Without a caller-supplied store the keys are mandatory and a new store is created.
    // NOTE(review): tempDir is not passed to the store here and is not used anywhere in the
    // visible lines, although it is a documented parameter -- confirm whether that is intended.
    final boolean givenChunkedHashStore = chunkedHashStore != null;
    if (!givenChunkedHashStore) {
        if (keys == null)
            throw new IllegalArgumentException(
                    "If you do not provide a chunked hash store, you must provide the keys");
        chunkedHashStore = new ChunkedHashStore<T>(transform, pl);
    // NOTE(review): the closing brace of the if-block above is missing in this excerpt.
    n = chunkedHashStore.size();
    defRetValue = -1; // For the very few cases in which we can decide

    // Empty key set: all fields are set to zero/null.
    // NOTE(review): the statement governed by the inner if and what follows it (presumably an
    // early return) are missing in the excerpt.
    if (n == 0) {
        rankMean = escape = width = 0;
        firstFunction = secondFunction = null;
        remap = null;
        if (!givenChunkedHashStore)

    // Compute distribution of values and maximum number of bits.
    // counts maps each distinct value to its number of occurrences; w is the largest Fast.length(v).
    // NOTE(review): values is dereferenced unconditionally here although the parameter
    // documentation allows null -- confirm whether omitted lines handle that case.
    int w = 0, size;
    long v;
    final Long2LongOpenHashMap counts = new Long2LongOpenHashMap();
    for (LongIterator i = values.iterator(); i.hasNext();) {
        v = i.nextLong();
        counts.put(v, counts.get(v) + 1);
        size = Fast.length(v);
        if (size > w)
            w = size;

    this.width = w;
    // m is the number of distinct values.
    final int m = counts.size();

    LOGGER.debug("Generating two-steps GOV3 function with " + w + " output bits...");

    // Sort keys by reverse frequency
    // (keysArray holds the distinct values, i.e. the keys of counts, in decreasing order of count).
    final long[] keysArray = counts.keySet().toLongArray(new long[m]);
    LongArrays.quickSort(keysArray, 0, keysArray.length, new AbstractLongComparator() {
        private static final long serialVersionUID = 1L;

        public int compare(final long a, final long b) {
            return Long.signum(counts.get(b) - counts.get(a));

    // rankMean is the average, over all n keys, of the frequency rank of the key's value.
    long mean = 0;
    for (int i = 0; i < keysArray.length; i++)
        mean += i * counts.get(keysArray[i]);
    rankMean = (double) mean / n;

    // Analyze data and choose a threshold
    // post is the number of keys whose value has rank >= pos; best is the r with the lowest cost.
    long post = n, bestCost = Long.MAX_VALUE;
    int pos = 0, best = -1;

    // Examine every possible choice for r. Note that r = 0 implies one function, so we do not need to test the case r == w.
    for (int r = 0; r < w && pos < m; r++) {

        /* This cost function is dependent on the implementation of GOV3Function.
         * Note that for r = 0 we are actually computing the cost of a single function (the first one).
         * The three terms are: a term over all n keys with r bits, a term over the post remaining
         * keys with w bits, and pos * Long.SIZE (presumably the size of the remap array -- TODO confirm). */
        final long cost = (long) Math.min(GOV3Function.C * n * 1.126 + n * r, GOV3Function.C * n * r)
                + (long) Math.min(GOV3Function.C * post * 1.126 + post * w, GOV3Function.C * post * w)
                + pos * Long.SIZE;

        if (cost < bestCost) {
            best = r;
            bestCost = cost;

        /* We add to pre and subtract from post the counts of keys from position (1<<r)-1 to position (1<<r+1)-1.
         * (No pre variable exists in the visible code; only post is updated.) */
        for (int j = 0; j < (1 << r) && pos < m; j++) {
            final long c = counts.get(keysArray[pos++]);
            post -= c;

    if (ASSERTS)
        assert pos == m;


    // We must keep the remap array small.
    if (best >= Integer.SIZE)
        best = Integer.SIZE - 1;

    LOGGER.debug("Best threshold: " + best);
    // escape is the all-ones value on best bits; remap holds the escape most frequent values and
    // map sends each of them to its rank.
    // NOTE(review): if the threshold loop above never runs (w == 0), best is still -1 here; Java
    // masks an int shift count to its low five bits, so 1 << -1 is Integer.MIN_VALUE and escape
    // becomes Integer.MAX_VALUE -- confirm whether omitted lines guard against this.
    escape = (1 << best) - 1;
    System.arraycopy(keysArray, 0, remap = new long[escape], 0, remap.length);
    final Long2LongOpenHashMap map = new Long2LongOpenHashMap();
    for (int i = 0; i < escape; i++)
        map.put(remap[i], i);

    // The first function is built only when best != 0; it is given a list view that returns the
    // rank of each key's value, or escape when the lookup in map yields -1.
    // NOTE(review): the comparison with -1 relies on map's default return value being -1, which is
    // not set in the visible lines -- presumably done in an omitted line; confirm.
    if (best != 0) {
        firstFunction = new GOV3Function.Builder<T>().keys(keys).transform(transform).store(chunkedHashStore)
                .values(new AbstractLongBigList() {
                    public long getLong(long index) {
                        long value = map.get(values.getLong(index));
                        return value == -1 ? escape : value;

                    public long size64() {
                        return n;
                }, best).indirect().build();

        LOGGER.debug("Actual bit cost per key of first function: " + (double) firstFunction.numBits() / n);
    } else
        firstFunction = null;

    // The predicate is true for every triple when there is no first function, and otherwise only
    // for triples that the first function maps to escape.
    chunkedHashStore.filter(new Predicate() {
        public boolean evaluate(Object triple) {
            return firstFunction == null || firstFunction.getLongByTriple((long[]) triple) == escape;

    // The second function is built on the filtered store with the original values and w as arguments.
    secondFunction = new GOV3Function.Builder<T>().store(chunkedHashStore).values(values, w).indirect().build();

    this.seed = chunkedHashStore.seed();
    // NOTE(review): the statement governed by this if is missing in the excerpt; as shown, the if
    // would govern the following log call instead.
    if (!givenChunkedHashStore)

    LOGGER.debug("Actual bit cost per key of second function: " + (double) secondFunction.numBits() / n);

    LOGGER.info("Actual bit cost per key: " + (double) numBits() / n);


From source file:it.unimi.dsi.sux4j.mph.TwoStepsMWHCFunction.java

/** Creates a new two-step function for the given keys and values.
 *
 * @param keys the keys in the domain of the function.
 * @param transform a transformation strategy for the keys.
 * @param values values to be assigned to each key, in the same order of the iterator returned by <code>keys</code>; if {@code null}, the
 * assigned value will be the ordinal number of each key.
 * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
 * @param chunkedHashStore a chunked hash store containing the keys associated with their rank, or {@code null}; the store
 * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}.
 */
protected TwoStepsMWHCFunction(final Iterable<? extends T> keys,
        final TransformationStrategy<? super T> transform, final LongBigList values, final File tempDir,
        ChunkedHashStore<T> chunkedHashStore) throws IOException {
    // Builds a two-step function: a first function for the most frequent values (addressed through
    // remap) and a second function for the keys whose first-function output is the escape value.
    // NOTE(review): this excerpt has lost lines of the original source (closing braces and,
    // apparently, a few statements), so it does not compile as shown. The comments below describe
    // only what the visible lines do.
    this.transform = transform;
    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    // NOTE(review): random is never referenced in the visible lines; presumably used in omitted lines.
    final RandomGenerator random = new XorShift1024StarRandomGenerator();
    pl.itemsName = "keys";

    // Without a caller-supplied store the keys are mandatory and a new store is created.
    // NOTE(review): tempDir is not passed to the store here and is not used anywhere in the
    // visible lines, although it is a documented parameter -- confirm whether that is intended.
    final boolean givenChunkedHashStore = chunkedHashStore != null;
    if (!givenChunkedHashStore) {
        if (keys == null)
            throw new IllegalArgumentException(
                    "If you do not provide a chunked hash store, you must provide the keys");
        chunkedHashStore = new ChunkedHashStore<T>(transform, pl);
    // NOTE(review): the closing brace of the if-block above is missing in this excerpt.
    n = chunkedHashStore.size();
    defRetValue = -1; // For the very few cases in which we can decide

    // Empty key set: all fields are set to zero/null.
    // NOTE(review): the statement governed by the inner if and what follows it (presumably an
    // early return) are missing in the excerpt.
    if (n == 0) {
        rankMean = escape = width = 0;
        firstFunction = secondFunction = null;
        remap = null;
        if (!givenChunkedHashStore)

    // Compute distribution of values and maximum number of bits.
    // counts maps each distinct value to its number of occurrences; w is the largest Fast.length(v).
    // NOTE(review): values is dereferenced unconditionally here although the parameter
    // documentation allows null -- confirm whether omitted lines handle that case.
    int w = 0, size;
    long v;
    final Long2LongOpenHashMap counts = new Long2LongOpenHashMap();
    for (LongIterator i = values.iterator(); i.hasNext();) {
        v = i.nextLong();
        counts.put(v, counts.get(v) + 1);
        size = Fast.length(v);
        if (size > w)
            w = size;

    this.width = w;
    // m is the number of distinct values.
    final int m = counts.size();

    LOGGER.debug("Generating two-steps MWHC function with " + w + " output bits...");

    // Sort keys by reverse frequency
    // (keysArray holds the distinct values, i.e. the keys of counts, in decreasing order of count).
    final long[] keysArray = counts.keySet().toLongArray(new long[m]);
    LongArrays.quickSort(keysArray, 0, keysArray.length, new AbstractLongComparator() {
        private static final long serialVersionUID = 1L;

        public int compare(final long a, final long b) {
            return Long.signum(counts.get(b) - counts.get(a));

    // rankMean is the average, over all n keys, of the frequency rank of the key's value.
    long mean = 0;
    for (int i = 0; i < keysArray.length; i++)
        mean += i * counts.get(keysArray[i]);
    rankMean = (double) mean / n;

    // Analyze data and choose a threshold
    // post is the number of keys whose value has rank >= pos; best is the r with the lowest cost.
    long post = n, bestCost = Long.MAX_VALUE;
    int pos = 0, best = -1;

    // Examine every possible choice for r. Note that r = 0 implies one function, so we do not need to test the case r == w.
    for (int r = 0; r < w && pos < m; r++) {

        /* This cost function is dependent on the implementation of MWHCFunction.
         * Note that for r = 0 we are actually computing the cost of a single function (the first one).
         * The three terms are: a term over all n keys with r bits, a term over the post remaining
         * keys with w bits, and pos * Long.SIZE (presumably the size of the remap array -- TODO confirm). */
        final long cost = (long) Math.min(HypergraphSorter.GAMMA * n * 1.126 + n * r,
                HypergraphSorter.GAMMA * n * r)
                + (long) Math.min(HypergraphSorter.GAMMA * post * 1.126 + post * w,
                        HypergraphSorter.GAMMA * post * w)
                + pos * Long.SIZE;

        if (cost < bestCost) {
            best = r;
            bestCost = cost;

        /* We add to pre and subtract from post the counts of keys from position (1<<r)-1 to position (1<<r+1)-1.
         * (No pre variable exists in the visible code; only post is updated.) */
        for (int j = 0; j < (1 << r) && pos < m; j++) {
            final long c = counts.get(keysArray[pos++]);
            post -= c;

    if (ASSERTS)
        assert pos == m;


    // We must keep the remap array small.
    if (best >= Integer.SIZE)
        best = Integer.SIZE - 1;

    LOGGER.debug("Best threshold: " + best);
    // escape is the all-ones value on best bits; remap holds the escape most frequent values and
    // map sends each of them to its rank.
    // NOTE(review): if the threshold loop above never runs (w == 0), best is still -1 here; Java
    // masks an int shift count to its low five bits, so 1 << -1 is Integer.MIN_VALUE and escape
    // becomes Integer.MAX_VALUE -- confirm whether omitted lines guard against this.
    escape = (1 << best) - 1;
    System.arraycopy(keysArray, 0, remap = new long[escape], 0, remap.length);
    final Long2LongOpenHashMap map = new Long2LongOpenHashMap();
    for (int i = 0; i < escape; i++)
        map.put(remap[i], i);

    // The first function is built only when best != 0; it is given a list view that returns the
    // rank of each key's value, or escape when the lookup in map yields -1.
    // NOTE(review): the comparison with -1 relies on map's default return value being -1, which is
    // not set in the visible lines -- presumably done in an omitted line; confirm.
    if (best != 0) {
        firstFunction = new MWHCFunction.Builder<T>().keys(keys).transform(transform).store(chunkedHashStore)
                .values(new AbstractLongBigList() {
                    public long getLong(long index) {
                        long value = map.get(values.getLong(index));
                        return value == -1 ? escape : value;

                    public long size64() {
                        return n;
                }, best).indirect().build();

        LOGGER.debug("Actual bit cost per key of first function: " + (double) firstFunction.numBits() / n);
    } else
        firstFunction = null;

    // The predicate is true for every triple when there is no first function, and otherwise only
    // for triples that the first function maps to escape.
    chunkedHashStore.filter(new Predicate() {
        public boolean evaluate(Object triple) {
            return firstFunction == null || firstFunction.getLongByTriple((long[]) triple) == escape;

    // The second function is built on the filtered store with the original values and w as arguments.
    secondFunction = new MWHCFunction.Builder<T>().store(chunkedHashStore).values(values, w).indirect().build();

    this.seed = chunkedHashStore.seed();
    // NOTE(review): the statement governed by this if is missing in the excerpt; as shown, the if
    // would govern the following log call instead.
    if (!givenChunkedHashStore)

    LOGGER.debug("Actual bit cost per key of second function: " + (double) secondFunction.numBits() / n);

    LOGGER.info("Actual bit cost per key: " + (double) numBits() / n);


From source file:it.unimi.dsi.sux4j.mph.MinimalPerfectHashFunction.java

/**
 * Creates a new minimal perfect hash function for the given keys.
 *
 * @param keys the keys to hash, or {@code null}.
 * @param transform a transformation strategy for the keys.
 * @param signatureWidth a signature width, or 0 for no signature.
 * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
 * @param chunkedHashStore a chunked hash store containing the keys, or {@code null}; the store
 * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}.
 */
protected MinimalPerfectHashFunction(final Iterable<? extends T> keys,
        final TransformationStrategy<? super T> transform, final int signatureWidth, final File tempDir,
        ChunkedHashStore<T> chunkedHashStore) throws IOException {
    // Builds the function chunk by chunk from a chunked hash store (either supplied or created
    // here), then builds the counting structure over the resulting bit vector.
    // NOTE(review): this excerpt has lost lines of the original source (closing braces and,
    // apparently, a few statements), so it does not compile as shown. The comments below describe
    // only what the visible lines do.
    this.transform = transform;

    // Progress logger configured to report local speed and free memory.
    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    // Source of the per-chunk seeds drawn with r.nextLong() in the chunk loop below.
    final RandomGenerator r = new XorShift1024StarRandomGenerator();
    pl.itemsName = "keys";

    // Remember whether the caller supplied a store; if not, one is created here, passing tempDir.
    final boolean givenChunkedHashStore = chunkedHashStore != null;
    if (!givenChunkedHashStore) {
        chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, pl);
    // NOTE(review): the closing brace of the if-block above is missing in this excerpt.
    n = chunkedHashStore.size();

    defRetValue = -1; // For the very few cases in which we can decide

    // The number of chunks is a power of two derived from n >> LOG2_CHUNK_SIZE (at least one chunk).
    int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE));
    chunkShift = chunkedHashStore.log2Chunks(log2NumChunks);
    final int numChunks = 1 << log2NumChunks;

    LOGGER.debug("Number of chunks: " + numChunks);

    // Per-chunk seed, and cumulative vertex offsets (one extra entry for the end of the last chunk).
    seed = new long[numChunks];
    offset = new long[numChunks + 1];

    // values is a view of bitVector as a list of 2-bit entries; array is taken from bitVector.bits().
    bitVector = LongArrayBitVector.getInstance();
    (values = bitVector.asLongBigList(2)).size(((long) Math.ceil(n * HypergraphSorter.GAMMA) + 4 * numChunks));
    array = bitVector.bits();

    // Counts DuplicateExceptions caught so far; see the catch clause below.
    int duplicates = 0;

    // Each iteration is one construction attempt over all chunks.
    // NOTE(review): no exit from this loop is visible in the excerpt.
    for (;;) {
        LOGGER.debug("Generating minimal perfect hash function...");

        // This local shadows the seed array assigned above, which is accessed as this.seed below.
        long seed = 0;
        pl.expectedUpdates = numChunks;
        pl.itemsName = "chunks";
        pl.start("Analysing chunks... ");

        try {
            // q indexes the current chunk. NOTE(review): no increment of q is visible in the excerpt.
            int q = 0;
            for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {
                // Draw random seeds until generateAndSort() succeeds for this chunk.
                final HypergraphSorter<BitVector> sorter = new HypergraphSorter<BitVector>(chunk.size(), false);
                do {
                    seed = r.nextLong();
                } while (!sorter.generateAndSort(chunk.iterator(), seed));

                this.seed[q] = seed;
                offset[q + 1] = offset[q] + sorter.numVertices;

                /* We assign values. */
                int top = chunk.size(), k, v = 0;
                final int[] stack = sorter.stack;
                final int[] vertex1 = sorter.vertex1;
                final int[] vertex2 = sorter.vertex2;
                final long off = offset[q];

                // Pop vertices from the sorter's stack. k counts how many of vertex1[v] and
                // vertex2[v] are smaller than v (0, 1 or 2). The value stored for v makes the sum
                // of the three values congruent to k modulo 3, and a result of 0 is stored as 3
                // (also congruent to 0), so that every assigned vertex holds a nonzero value.
                while (top > 0) {
                    v = stack[--top];
                    k = (v > vertex1[v] ? 1 : 0) + (v > vertex2[v] ? 1 : 0);
                    assert k >= 0 && k < 3 : Integer.toString(k);
                    //System.err.println( "<" + v + ", " + vertex1[v] + ", " + vertex2[ v ]+ "> (" + k + ")" );
                    final long s = values.getLong(off + vertex1[v]) + values.getLong(off + vertex2[v]);
                    final long value = (k - s + 9) % 3;
                    assert values.getLong(off + v) == 0;
                    values.set(off + v, value == 0 ? 3 : value);


                // When ASSERTS is set, check that the vertices selected for the triples of this chunk
                // are pairwise distinct (each pos.add() must report a new element).
                if (ASSERTS) {
                    final IntOpenHashSet pos = new IntOpenHashSet();
                    final int[] e = new int[3];
                    for (long[] triple : chunk) {
                        HypergraphSorter.tripleToEdge(triple, seed, sorter.numVertices, sorter.partSize, e);
                        assert pos.add(e[(int) (values.getLong(off + e[0]) + values.getLong(off + e[1])
                                + values.getLong(off + e[2])) % 3]);

        } catch (ChunkedHashStore.DuplicateException e) {
            // Without keys there is nothing to recompute from; otherwise the attempt is repeated.
            // Because of the post-increment, the IllegalArgumentException is thrown on the fifth
            // caught DuplicateException.
            if (keys == null)
                throw new IllegalStateException(
                        "You provided no keys, but the chunked hash store was not checked");
            if (duplicates++ > 3)
                throw new IllegalArgumentException("The input list contains duplicates");
            LOGGER.warn("Found duplicate. Recomputing triples...");

    globalSeed = chunkedHashStore.seed();

    // Build the counting structure over the bit vector (skipped for an empty key set).
    if (n > 0) {
        long m = values.size64();

        final long length = bitVector.length();

        // Number of 64-bit words needed for length bits, rounded up.
        final int numWords = (int) ((length + Long.SIZE - 1) / Long.SIZE);

        // Two count entries for every 32 words (rounded up), plus one final entry for the total.
        final int numCounts = (int) ((length + 32 * Long.SIZE - 1) / (32 * Long.SIZE)) * 2;
        // Init rank/select structure
        count = new long[numCounts + 1];

        // c is the running number of nonzero pairs. For each superblock, count[pos] holds c at the
        // start of the superblock and count[pos + 1] packs, for every sixth word, the value of c
        // relative to count[pos] before that word (0x7FF when the position lies beyond numWords),
        // each shifted by a further 12 bits.
        long c = 0;
        int pos = 0;
        for (int i = 0; i < numWords; i += WORDS_PER_SUPERBLOCK, pos += 2) {
            count[pos] = c;

            for (int j = 0; j < WORDS_PER_SUPERBLOCK; j++) {
                if (j != 0 && j % 6 == 0)
                    count[pos + 1] |= (i + j <= numWords ? c - count[pos] : 0x7FFL) << 12 * (j / 6 - 1);
                if (i + j < numWords)
                    c += countNonzeroPairs(array[i + j]);

        count[numCounts] = c;

        // When ASSERTS is set, compare rank(i) with a running counter k, and check that every key
        // maps below n.
        // NOTE(review): the statement governed by "if (values.getLong(i) != 0)" (presumably the
        // increment of k) is missing in the excerpt; as shown, the if would govern the assert.
        if (ASSERTS) {
            int k = 0;
            for (long i = 0; i < m; i++) {
                assert rank(i) == k : "(" + i + ") " + k + " != " + rank(i);
                if (values.getLong(i) != 0)
                assert k <= n;

            if (keys != null) {
                final Iterator<? extends T> iterator = keys.iterator();
                for (long i = 0; i < n; i++)
                    assert getLong(iterator.next()) < n;
    } else
        count = LongArrays.EMPTY_ARRAY;

    // NOTE(review): the line that opens the call ending on the next line (presumably a
    // LOGGER.debug call) is missing in the excerpt.
            "Forecast bit cost per key: " + (2 * HypergraphSorter.GAMMA + 2. * Long.SIZE / BITS_PER_BLOCK));
    LOGGER.info("Actual bit cost per key: " + (double) numBits() / n);

    // Optional signatures: one signatureWidth-bit entry per key.
    if (signatureWidth != 0) {
        // '-' binds tighter than '>>>', so the shift count is Long.SIZE - signatureWidth,
        // which leaves the signatureWidth lowest bits set.
        signatureMask = -1L >>> Long.SIZE - signatureWidth;
        (signatures = LongArrayBitVector.getInstance().asLongBigList(signatureWidth)).size(n);
        pl.expectedUpdates = n;
        pl.itemsName = "signatures";
        // For every triple, store the masked first component at the index returned by
        // getLongByTripleNoCheck() for that triple.
        for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {
            Iterator<long[]> iterator = chunk.iterator();
            for (int i = chunk.size(); i-- != 0;) {
                final long[] triple = iterator.next();
                final int[] e = new int[3];
                signatures.set(getLongByTripleNoCheck(triple, e), signatureMask & triple[0]);
    } else {
        signatureMask = 0;
        signatures = null;

    // NOTE(review): the statement governed by this if is missing in the excerpt; presumably it
    // releases the store that was created above -- confirm against the original source.
    if (!givenChunkedHashStore)

From source file:it.unimi.dsi.sux4j.mph.CHDMinimalPerfectHashFunction.java

/**
 * Creates a new CHD minimal perfect hash function for the given keys.
 *
 * @param keys the keys to hash, or {@code null}.
 * @param transform a transformation strategy for the keys.
 * @param lambda the average bucket size.
 * @param loadFactor the load factor.
 * @param signatureWidth a signature width, or 0 for no signature.
 * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
 * @param chunkedHashStore a chunked hash store containing the keys, or {@code null}; the store
 * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}.
 */
protected CHDMinimalPerfectHashFunction(final Iterable<? extends T> keys,
        final TransformationStrategy<? super T> transform, final int lambda, double loadFactor,
        final int signatureWidth, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException {
    this.transform = transform;

    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    final RandomGenerator r = new XorShift1024StarRandomGenerator();
    pl.itemsName = "keys";

    final boolean givenChunkedHashStore = chunkedHashStore != null;
    if (!givenChunkedHashStore) {
        chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, pl);
    n = chunkedHashStore.size();

    defRetValue = -1; // For the very few cases in which we can decide

    int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE));
    chunkShift = chunkedHashStore.log2Chunks(log2NumChunks);
    final int numChunks = 1 << log2NumChunks;

    LOGGER.debug("Number of chunks: " + numChunks);
    LOGGER.debug("Average chunk size: " + (double) n / numChunks);

    offsetNumBucketsSeed = new long[(numChunks + 1) * 3 + 2];

    int duplicates = 0;
    final LongArrayList holes = new LongArrayList();

    final OfflineIterable<MutableLong, MutableLong> coefficients = new OfflineIterable<MutableLong, MutableLong>(
            new Serializer<MutableLong, MutableLong>() {

                public void write(final MutableLong a, final DataOutput dos) throws IOException {
                    long x = a.longValue();
                    while ((x & ~0x7FL) != 0) {
                        dos.writeByte((int) (x | 0x80));
                        x >>>= 7;
                    dos.writeByte((int) x);

                public void read(final DataInput dis, final MutableLong x) throws IOException {
                    byte b = dis.readByte();
                    long t = b & 0x7F;
                    for (int shift = 7; (b & 0x80) != 0; shift += 7) {
                        b = dis.readByte();
                        t |= (b & 0x7FL) << shift;
            }, new MutableLong());

    for (;;) {
        LOGGER.debug("Generating minimal perfect hash function...");

        pl.expectedUpdates = numChunks;
        pl.itemsName = "chunks";
        pl.start("Analysing chunks... ");

        try {
            int chunkNumber = 0;

            for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {
                /* We treat a chunk as a single hash function. The number of bins is thus
                 * the first prime larger than the chunk size divided by the load factor. */
                final int p = Primes.nextPrime((int) Math.ceil(chunk.size() / loadFactor) + 1);
                final boolean used[] = new boolean[p];

                final int numBuckets = (chunk.size() + lambda - 1) / lambda;
                numBuckets(chunkNumber + 1, numBuckets(chunkNumber) + numBuckets);
                final int[] cc0 = new int[numBuckets];
                final int[] cc1 = new int[numBuckets];
                final ArrayList<long[]>[] bucket = new ArrayList[numBuckets];
                for (int i = bucket.length; i-- != 0;)
                    bucket[i] = new ArrayList<long[]>();

                tryChunk: for (;;) {
                    for (ArrayList<long[]> b : bucket)
                    Arrays.fill(used, false);

                    /* At each try, the allocation to keys to bucket is randomized differently. */
                    final long seed = r.nextLong();
                    // System.err.println( "Number of keys: " + chunk.size()  + " Number of bins: " + p + " seed: " + seed );
                    /* We distribute the keys in this chunks in the buckets. */
                    for (Iterator<long[]> iterator = chunk.iterator(); iterator.hasNext();) {
                        final long[] triple = iterator.next();
                        final long[] h = new long[3];
                        Hashes.spooky4(triple, seed, h);
                        final ArrayList<long[]> b = bucket[(int) ((h[0] >>> 1) % numBuckets)];
                        h[1] = (int) ((h[1] >>> 1) % p);
                        h[2] = (int) ((h[2] >>> 1) % (p - 1)) + 1;

                        // All elements in a bucket must have either different h[ 1 ] or different h[ 2 ]
                        for (long[] t : b)
                            if (t[1] == h[1] && t[2] == h[2]) {
                                LOGGER.info("Duplicate index" + Arrays.toString(t));
                                continue tryChunk;

                    final int[] perm = Util.identity(bucket.length);
                    IntArrays.quickSort(perm, new AbstractIntComparator() {
                        private static final long serialVersionUID = 1L;

                        public int compare(int a0, int a1) {
                            return Integer.compare(bucket[a1].size(), bucket[a0].size());

                    for (int i = 0; i < perm.length;) {
                        final LinkedList<Integer> bucketsToDo = new LinkedList<Integer>();
                        final int size = bucket[perm[i]].size();
                        //System.err.println( "Bucket size: " + size );
                        int j;
                        // Gather indices of all buckets with the same size
                        for (j = i; j < perm.length && bucket[perm[j]].size() == size; j++)

                        // Examine for each pair (c0,c1) the buckets still to do
                        ext: for (int c1 = 0; c1 < p; c1++)
                            for (int c0 = 0; c0 < p; c0++) {
                                //System.err.println( "Testing " + c0 + ", " + c1 + " (to do: " + bucketsToDo.size() + ")" );
                                for (Iterator<Integer> iterator = bucketsToDo.iterator(); iterator.hasNext();) {
                                    final int k = iterator.next().intValue();
                                    final ArrayList<long[]> b = bucket[k];
                                    boolean completed = true;
                                    final IntArrayList done = new IntArrayList();
                                    // Try to see whether the necessary entries are not used
                                    for (long[] h : b) {
                                        //assert k == h[ 0 ];

                                        int pos = (int) ((h[1] + c0 * h[2] + c1) % p);
                                        //System.err.println( "Testing pos " + pos + " for " + Arrays.toString( e  ));
                                        if (used[pos]) {
                                            completed = false;
                                        } else {
                                            used[pos] = true;

                                    if (completed) {
                                        // All positions were free
                                        cc0[k] = c0;
                                        cc1[k] = c1;
                                    } else
                                        for (int d : done)
                                            used[d] = false;
                                if (bucketsToDo.isEmpty())
                                    break ext;
                        if (!bucketsToDo.isEmpty())
                            continue tryChunk;

                        seed(chunkNumber, seed);
                        i = j;

                // System.err.println("DONE!");

                if (ASSERTS) {
                    final IntOpenHashSet pos = new IntOpenHashSet();
                    final long h[] = new long[3];
                    for (Iterator<long[]> iterator = chunk.iterator(); iterator.hasNext();) {
                        final long[] triple = iterator.next();
                        Hashes.spooky4(triple, seed(chunkNumber), h);
                        h[0] = (h[0] >>> 1) % numBuckets;
                        h[1] = (int) ((h[1] >>> 1) % p);
                        h[2] = (int) ((h[2] >>> 1) % (p - 1)) + 1;
                        //System.err.println( Arrays.toString(  e  ) );
                        assert pos.add((int) ((h[1] + cc0[(int) (h[0])] * h[2] + cc1[(int) (h[0])]) % p));

                final MutableLong l = new MutableLong();
                for (int i = 0; i < numBuckets; i++) {
                    l.setValue(cc0[i] + cc1[i] * p);

                for (int i = 0; i < p; i++)
                    if (!used[i])
                        holes.add(offset(chunkNumber) + i);

                offset(chunkNumber + 1, offset(chunkNumber) + p);

        } catch (ChunkedHashStore.DuplicateException e) {
            if (keys == null)
                throw new IllegalStateException(
                        "You provided no keys, but the chunked hash store was not checked");
            if (duplicates++ > 3)
                throw new IllegalArgumentException("The input list contains duplicates");
            LOGGER.warn("Found duplicate. Recomputing triples...");

    rank = new SparseRank(offset(offsetNumBucketsSeed.length / 3 - 1), holes.size(), holes.iterator());

    globalSeed = chunkedHashStore.seed();

    this.coefficients = new EliasFanoLongBigList(new AbstractLongIterator() {
        final OfflineIterator<MutableLong, MutableLong> iterator = coefficients.iterator();

        public boolean hasNext() {
            return iterator.hasNext();

        public long nextLong() {
            return iterator.next().longValue();
    }, 0, true);


    LOGGER.info("Actual bit cost per key: " + (double) numBits() / n);

    if (signatureWidth != 0) {
        signatureMask = -1L >>> Long.SIZE - signatureWidth;
        (signatures = LongArrayBitVector.getInstance().asLongBigList(signatureWidth)).size(n);
        pl.expectedUpdates = n;
        pl.itemsName = "signatures";
        for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {
            Iterator<long[]> iterator = chunk.iterator();
            for (int i = chunk.size(); i-- != 0;) {
                final long[] triple = iterator.next();
                long t = getLongByTripleNoCheck(triple);
                signatures.set(t, signatureMask & triple[0]);
    } else {
        signatureMask = 0;
        signatures = null;

    if (!givenChunkedHashStore)

From source file:it.unimi.dsi.sux4j.io.ChunkedHashStore.java

/** Checks that this store has no duplicate triples, and tries to rebuild it if duplicates are found.
 *
 * @param iterable the elements with which the store will be refilled if there are duplicate triples.
 * @param values the values that will be associated with the elements returned by <code>iterable</code>.
 * @throws IllegalArgumentException if after a few trials the store still contains duplicate triples.
 */
public void checkAndRetry(final Iterable<? extends T> iterable, final LongIterable values) throws IOException {
    // NOTE(review): `random` is never read in the lines visible here; the statement that
    // consumed it appears to have been dropped from this excerpt -- confirm against upstream.
    final RandomGenerator random = new XorShift1024StarRandomGenerator();
    // Number of DuplicateExceptions caught so far.
    int duplicates = 0;

    for (;;)
        // NOTE(review): the try body is empty in this excerpt, so nothing visible can throw
        // DuplicateException or leave the loop; the duplicate check and loop exit are
        // presumably missing -- confirm against upstream.
        try {
        } catch (DuplicateException e) {
            // Post-increment: the first four failures (counter 0..3) trigger a refill,
            // the fifth one aborts with an IllegalArgumentException.
            if (duplicates++ > 3)
                throw new IllegalArgumentException("The input list contains duplicates");
            LOGGER.warn("Found duplicate. Recomputing triples...");
            // Refill the store from the caller-supplied elements and values.
            addAll(iterable.iterator(), values.iterator());

    // Record that the duplicate check has been carried out on this store.
    checkedForDuplicates = true;

From source file:it.unimi.dsi.sux4j.mph.GOV4Function.java

/** Creates a new function for the given keys and values.
 *
 * @param keys the keys in the domain of the function, or {@code null}.
 * @param transform a transformation strategy for the keys.
 * @param signatureWidth a positive number for a signature width, 0 for no signature, a negative value for a self-signed function; if nonzero, {@code values} must be {@code null} and {@code dataWidth} must be -1.
 * @param values values to be assigned to each element, in the same order as the iterator returned by <code>keys</code>; if {@code null}, the
 * assigned value will be the ordinal number of each element.
 * @param dataWidth the bit width of the <code>values</code>, or -1 if <code>values</code> is {@code null}.
 * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
 * @param chunkedHashStore a chunked hash store containing the keys associated with their ranks (if there are no values, or {@code indirect} is true)
 * or values, or {@code null}; the store
 * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}.
 * @param indirect if true, <code>chunkedHashStore</code> contains ordinal positions, and <code>values</code> is a {@link LongIterable} that
 * must be accessed to retrieve the actual values.
 */
protected GOV4Function(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform,
        int signatureWidth, final LongIterable values, final int dataWidth, final File tempDir,
        ChunkedHashStore<T> chunkedHashStore, final boolean indirect) throws IOException {
    this.transform = transform;

    // A signed (or self-signed) function can take neither explicit values nor an explicit data width.
    if (signatureWidth != 0 && values != null)
        throw new IllegalArgumentException("You cannot sign a function if you specify its values");
    if (signatureWidth != 0 && dataWidth != -1)
        throw new IllegalArgumentException("You cannot specify a signature width and a data width");

    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    // NOTE(review): `r` is never read in the lines visible here; its use was presumably
    // dropped from this excerpt -- confirm against upstream.
    final RandomGenerator r = new XorShift1024StarRandomGenerator();
    pl.itemsName = "keys";

    // Remember whether the caller supplied the store; if not, one is built here from the keys.
    final boolean givenChunkedHashStore = chunkedHashStore != null;
    if (!givenChunkedHashStore) {
        if (keys == null)
            throw new IllegalArgumentException(
                    "If you do not provide a chunked hash store, you must provide the keys");
        // -Math.min(signatureWidth, 0) is |signatureWidth| for a negative (self-signed) width and 0 otherwise.
        chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, -Math.min(signatureWidth, 0), pl);
        if (values == null || indirect)
            chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null);
    n = chunkedHashStore.size();
    defRetValue = signatureWidth < 0 ? 0 : -1; // Self-signed maps get zero as default return value.

    // Empty key set: every structure field is zeroed or nulled.
    // NOTE(review): no early exit is visible after this block in the excerpt -- confirm against upstream.
    if (n == 0) {
        m = this.globalSeed = chunkShift = this.width = 0;
        data = null;
        offsetAndSeed = null;
        signatureMask = 0;
        signatures = null;

    int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE));
    chunkShift = chunkedHashStore.log2Chunks(log2NumChunks);
    final int numChunks = 1 << log2NumChunks;

    LOGGER.debug("Number of chunks: " + numChunks);

    // One entry per chunk plus a final entry; the last entry is later read as the total size m.
    offsetAndSeed = new long[numChunks + 1];

    // Output width: |signatureWidth| when self-signed, ceil(log2(n)) when no data width was given, else dataWidth.
    this.width = signatureWidth < 0 ? -signatureWidth : dataWidth == -1 ? Fast.ceilLog2(n) : dataWidth;

    // Candidate data; might be discarded for compaction.
    final OfflineIterable<BitVector, LongArrayBitVector> offlineData = new OfflineIterable<BitVector, LongArrayBitVector>(
            BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance());

    // Number of DuplicateExceptions caught so far.
    int duplicates = 0;

    for (;;) {
        LOGGER.debug("Generating GOV function with " + this.width + " output bits...");

        pl.expectedUpdates = numChunks;
        pl.itemsName = "chunks";
        pl.start("Analysing chunks... ");

        try {
            int q = 0;
            final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance();
            final LongBigList data = dataBitVector.asLongBigList(this.width);
            long unsolvable = 0;
            for (final ChunkedHashStore.Chunk chunk : chunkedHashStore) {

                // Next offset = this offset + max(C_TIMES_256 * size / 256, size + 1); the >>> 8 divides by 256.
                offsetAndSeed[q + 1] = offsetAndSeed[q]
                        + Math.max((C_TIMES_256 * chunk.size() >>> 8), chunk.size() + 1);

                long seed = 0;
                // Number of entries reserved for this chunk.
                final int v = (int) (offsetAndSeed[q + 1] - offsetAndSeed[q]);
                // NOTE(review): the constructor call below is cut off in this excerpt (its remaining arguments are missing).
                final Linear4SystemSolver<BitVector> solver = new Linear4SystemSolver<BitVector>(v,

                for (;;) {
                    // The anonymous list exposes the value for each element of the chunk:
                    // chunk.data(index) directly, or valueList looked up at that position when indirect.
                    final boolean solved = solver.generateAndSolve(chunk, seed, new AbstractLongBigList() {
                        private final LongBigList valueList = indirect
                                ? (values instanceof LongList ? LongBigLists.asBigList((LongList) values)
                                        : (LongBigList) values)
                                : null;

                        public long size64() {
                            return chunk.size();

                        public long getLong(final long index) {
                            return indirect ? valueList.getLong(chunk.data(index)) : chunk.data(index);
                    unsolvable += solver.unsolvable;
                    // NOTE(review): the statement guarded by this `if` is missing from the excerpt
                    // (presumably the loop exit) -- confirm against upstream.
                    if (solved)
                    seed += SEED_STEP;
                    // seed starts at 0 and only grows by SEED_STEP, so (for a nonzero step) it can
                    // be 0 again only after the long arithmetic has wrapped around.
                    if (seed == 0)
                        throw new AssertionError("Exhausted local seeds");

                // The seed is OR-ed into the same long that already holds this chunk's offset.
                this.offsetAndSeed[q] |= seed;


                /* We assign values. */
                final long[] solution = solver.solution;
                for (int i = 0; i < solution.length; i++)
                    data.set(i, solution[i]);


            LOGGER.info("Unsolvable systems: " + unsolvable + "/" + numChunks + " ("
                    + Util.format(100.0 * unsolvable / numChunks) + "%)");

        } catch (ChunkedHashStore.DuplicateException e) {
            // Without the keys the store cannot be refilled, so a duplicate is unrecoverable.
            if (keys == null)
                throw new IllegalStateException(
                        "You provided no keys, but the chunked hash store was not checked");
            // Post-increment: the first four failures trigger a refill, the fifth one aborts.
            if (duplicates++ > 3)
                throw new IllegalArgumentException("The input list contains duplicates");
            LOGGER.warn("Found duplicate. Recomputing triples...");
            pl.itemsName = "keys";
            if (values == null || indirect)
                chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null);

    if (DEBUG)
        System.out.println("Offsets: " + Arrays.toString(offsetAndSeed));

    globalSeed = chunkedHashStore.seed();
    // The last entry of offsetAndSeed is taken as the total number of entries.
    m = offsetAndSeed[offsetAndSeed.length - 1];
    final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(m * this.width);
    this.data = dataBitVector.asLongBigList(this.width);

    // NOTE(review): the body of this loop is missing from the excerpt.
    OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator();
    while (iterator.hasNext())


    LOGGER.info("Forecast bit cost per element: " + C * this.width);
    LOGGER.info("Actual bit cost per element: " + (double) numBits() / n);

    if (signatureWidth > 0) {
        // -1L >>> (64 - signatureWidth): a mask with the lowest signatureWidth bits set.
        signatureMask = -1L >>> Long.SIZE - signatureWidth;
        signatures = chunkedHashStore.signatures(signatureWidth, pl);
    } else if (signatureWidth < 0) {
        // Self-signed: mask of the lowest |signatureWidth| bits, with no separate signature list.
        signatureMask = -1L >>> Long.SIZE + signatureWidth;
        signatures = null;
    } else {
        signatureMask = 0;
        signatures = null;

    // NOTE(review): the statement guarded by this `if` is missing from the excerpt.
    if (!givenChunkedHashStore)

From source file:it.unimi.dsi.sux4j.mph.MWHCFunction.java

/** Creates a new function for the given keys and values.
 *
 * @param keys the keys in the domain of the function, or {@code null}.
 * @param transform a transformation strategy for the keys.
 * @param signatureWidth a positive number for a signature width, 0 for no signature, a negative value for a self-signed function; if nonzero, {@code values} must be {@code null} and {@code dataWidth} must be -1.
 * @param values values to be assigned to each element, in the same order as the iterator returned by <code>keys</code>; if {@code null}, the
 * assigned value will be the ordinal number of each element.
 * @param dataWidth the bit width of the <code>values</code>, or -1 if <code>values</code> is {@code null}.
 * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
 * @param chunkedHashStore a chunked hash store containing the keys associated with their ranks (if there are no values, or {@code indirect} is true)
 * or values, or {@code null}; the store
 * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}.
 * @param indirect if true, <code>chunkedHashStore</code> contains ordinal positions, and <code>values</code> is a {@link LongIterable} that
 * must be accessed to retrieve the actual values.
 */
protected MWHCFunction(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform,
        int signatureWidth, final LongIterable values, final int dataWidth, final File tempDir,
        ChunkedHashStore<T> chunkedHashStore, boolean indirect) throws IOException {
    this.transform = transform;

    // A signed (or self-signed) function can take neither explicit values nor an explicit data width.
    if (signatureWidth != 0 && values != null)
        throw new IllegalArgumentException("You cannot sign a function if you specify its values");
    if (signatureWidth != 0 && dataWidth != -1)
        throw new IllegalArgumentException("You cannot specify a signature width and a data width");

    // If we have no keys, values must be a random-access list of longs.
    final LongBigList valueList = indirect
            ? (values instanceof LongList ? LongBigLists.asBigList((LongList) values) : (LongBigList) values)
            : null;

    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    // Source of the per-chunk seeds drawn in the sorting loop below.
    final RandomGenerator r = new XorShift1024StarRandomGenerator();
    pl.itemsName = "keys";

    // Remember whether the caller supplied the store; if not, one is built here from the keys.
    final boolean givenChunkedHashStore = chunkedHashStore != null;
    if (!givenChunkedHashStore) {
        if (keys == null)
            throw new IllegalArgumentException(
                    "If you do not provide a chunked hash store, you must provide the keys");
        // -Math.min(signatureWidth, 0) is |signatureWidth| for a negative (self-signed) width and 0 otherwise.
        chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, -Math.min(signatureWidth, 0), pl);
        if (values == null || indirect)
            chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null);
    n = chunkedHashStore.size();
    defRetValue = signatureWidth < 0 ? 0 : -1; // Self-signed maps get zero as default return value.

    // Empty key set: every structure field is zeroed or nulled.
    // NOTE(review): no early exit is visible after this block in the excerpt -- confirm against upstream.
    if (n == 0) {
        m = this.globalSeed = chunkShift = this.width = 0;
        data = null;
        marker = null;
        rank = null;
        seed = null;
        offset = null;
        signatureMask = 0;
        signatures = null;

    int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE));
    chunkShift = chunkedHashStore.log2Chunks(log2NumChunks);
    final int numChunks = 1 << log2NumChunks;

    LOGGER.debug("Number of chunks: " + numChunks);

    // One seed per chunk; offsets have one extra entry, whose last value is later read as the total size m.
    seed = new long[numChunks];
    offset = new long[numChunks + 1];

    // Output width: |signatureWidth| when self-signed, ceil(log2(n)) when no data width was given, else dataWidth.
    this.width = signatureWidth < 0 ? -signatureWidth : dataWidth == -1 ? Fast.ceilLog2(n) : dataWidth;

    // Candidate data; might be discarded for compaction.
    final OfflineIterable<BitVector, LongArrayBitVector> offlineData = new OfflineIterable<BitVector, LongArrayBitVector>(
            BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance());

    // Number of DuplicateExceptions caught so far.
    int duplicates = 0;

    for (;;) {
        LOGGER.debug("Generating MWHC function with " + this.width + " output bits...");

        // Local scalar seed; it shadows the per-chunk seed array field, which is reached via this.seed.
        long seed = 0;
        pl.expectedUpdates = numChunks;
        pl.itemsName = "chunks";
        pl.start("Analysing chunks... ");

        try {
            int q = 0;
            final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance();
            final LongBigList data = dataBitVector.asLongBigList(this.width);
            for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {
                HypergraphSorter<BitVector> sorter = new HypergraphSorter<BitVector>(chunk.size());
                // Keep drawing fresh random seeds until generateAndSort() reports success.
                do {
                    seed = r.nextLong();
                } while (!sorter.generateAndSort(chunk.iterator(), seed));

                this.seed[q] = seed;
                offset[q + 1] = offset[q] + sorter.numVertices;

                /* We assign values. */

                int top = chunk.size(), x, k;
                final int[] stack = sorter.stack;
                final int[] vertex1 = sorter.vertex1;
                final int[] vertex2 = sorter.vertex2;
                final int[] edge = sorter.edge;

                // Pop entries off the sorter's stack; each popped x is assigned so that
                // data[x] ^ data[vertex1[x]] ^ data[vertex2[x]] equals the value of edge[x]
                // (this is exactly what the assertion below checks).
                while (top > 0) {
                    x = stack[--top];
                    k = edge[x];
                    final long s = data.getLong(vertex1[x]) ^ data.getLong(vertex2[x]);
                    final long value = indirect ? valueList.getLong(chunk.data(k)) : chunk.data(k);
                    data.set(x, value ^ s);

                    if (ASSERTS)
                        assert (value == (data.getLong(x) ^ data.getLong(vertex1[x])
                                ^ data.getLong(vertex2[x]))) : "<" + x + "," + vertex1[x] + "," + vertex2[x]
                                        + ">: " + value + " != " + (data.getLong(x) ^ data.getLong(vertex1[x])
                                                ^ data.getLong(vertex2[x]));


        } catch (ChunkedHashStore.DuplicateException e) {
            // Without the keys the store cannot be refilled, so a duplicate is unrecoverable.
            if (keys == null)
                throw new IllegalStateException(
                        "You provided no keys, but the chunked hash store was not checked");
            // Post-increment: the first four failures trigger a refill, the fifth one aborts.
            if (duplicates++ > 3)
                throw new IllegalArgumentException("The input list contains duplicates");
            LOGGER.warn("Found duplicate. Recomputing triples...");
            pl.itemsName = "keys";
            if (values == null || indirect)
                chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null);

    if (DEBUG)
        System.out.println("Offsets: " + Arrays.toString(offset));

    globalSeed = chunkedHashStore.seed();

    // Check for compaction
    long nonZero = 0;
    // The last offset is taken as the total number of entries.
    m = offset[offset.length - 1];

        // NOTE(review): the statement guarded by the innermost `if` below is missing from the
        // excerpt (presumably the nonZero counter update) -- confirm against upstream.
        final OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator();
        while (iterator.hasNext()) {
            final LongBigList data = iterator.next().asLongBigList(this.width);
            for (long i = 0; i < data.size64(); i++)
                if (data.getLong(i) != 0)
    // We estimate size using Rank16
    // Compact when nonZero values of `width` bits plus 1.126 bits per entry is less than m values of `width` bits.
    if (nonZero * this.width + m * 1.126 < m * this.width) {
        marker = LongArrayBitVector.ofLength(m);
        final LongBigList newData = LongArrayBitVector.getInstance().asLongBigList(this.width);
        nonZero = 0;

        final OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator();
        // j is the global position across all stored lists; nonZero indexes the compacted list.
        long j = 0;
        while (iterator.hasNext()) {
            final LongBigList data = iterator.next().asLongBigList(this.width);
            for (long i = 0; i < data.size64(); i++, j++) {
                final long value = data.getLong(i);
                if (value != 0) {
                    // NOTE(review): nothing visible here sets a bit of `marker`, yet the ASSERTS
                    // block below expects marker.getBoolean(k) to be true for nonzero values;
                    // that line is presumably missing from the excerpt.
                    newData.set(nonZero++, value);

        rank = new Rank16(marker);

        // Self-check: marker flags exactly the nonzero entries, and rank maps each to its compacted slot.
        if (ASSERTS) {
            final OfflineIterator<BitVector, LongArrayBitVector> iterator2 = offlineData.iterator();
            long k = 0;
            while (iterator2.hasNext()) {
                final LongBigList data = iterator2.next().asLongBigList(this.width);
                for (long i = 0; i < data.size64(); i++, k++) {
                    final long value = data.getLong(i);
                    assert (value != 0) == marker.getBoolean(k);
                    if (value != 0)
                        assert value == newData.getLong(rank.rank(k)) : value + " != "
                                + newData.getLong(rank.rank(k));
        this.data = newData;
    } else {
        // No compaction: allocate m entries of `width` bits each.
        final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(m * this.width);
        this.data = dataBitVector.asLongBigList(this.width);

        // NOTE(review): the body of this loop is missing from the excerpt.
        OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator();
        while (iterator.hasNext())

        marker = null;
        rank = null;


    LOGGER.debug("Forecast bit cost per element: " + (marker == null ? HypergraphSorter.GAMMA * this.width
            : HypergraphSorter.GAMMA + this.width + 0.126));
    LOGGER.info("Actual bit cost per element: " + (double) numBits() / n);

    if (signatureWidth > 0) {
        // -1L >>> (64 - signatureWidth): a mask with the lowest signatureWidth bits set.
        signatureMask = -1L >>> Long.SIZE - signatureWidth;
        signatures = chunkedHashStore.signatures(signatureWidth, pl);
    } else if (signatureWidth < 0) {
        // Self-signed: mask of the lowest |signatureWidth| bits, with no separate signature list.
        signatureMask = -1L >>> Long.SIZE + signatureWidth;
        signatures = null;
    } else {
        signatureMask = 0;
        signatures = null;

    // NOTE(review): the statement guarded by this `if` is missing from the excerpt.
    if (!givenChunkedHashStore)

From source file:it.unimi.dsi.sux4j.mph.GOV3Function.java

/** Creates a new function for the given keys and values.
 *
 * @param keys the keys in the domain of the function, or {@code null}.
 * @param transform a transformation strategy for the keys.
 * @param signatureWidth a positive number for a signature width, 0 for no signature, a negative value for a self-signed function; if nonzero, {@code values} must be {@code null} and {@code dataWidth} must be -1.
 * @param values values to be assigned to each element, in the same order as the iterator returned by <code>keys</code>; if {@code null}, the
 * assigned value will be the ordinal number of each element.
 * @param dataWidth the bit width of the <code>values</code>, or -1 if <code>values</code> is {@code null}.
 * @param indirect if true, <code>chunkedHashStore</code> contains ordinal positions, and <code>values</code> is a {@link LongIterable} that
 * must be accessed to retrieve the actual values.
 * @param compacted if true, the coefficients will be compacted.
 * @param tempDir a temporary directory for the store files, or {@code null} for the standard temporary directory.
 * @param chunkedHashStore a chunked hash store containing the keys associated with their ranks (if there are no values, or {@code indirect} is true)
 * or values, or {@code null}; the store
 * can be unchecked, but in this case <code>keys</code> and <code>transform</code> must be non-{@code null}.
 */
protected GOV3Function(final Iterable<? extends T> keys, final TransformationStrategy<? super T> transform,
        int signatureWidth, final LongIterable values, final int dataWidth, final boolean indirect,
        final boolean compacted, final File tempDir, ChunkedHashStore<T> chunkedHashStore) throws IOException {
    this.transform = transform;

    // A signed (or self-signed) function can take neither explicit values nor an explicit data width.
    if (signatureWidth != 0 && values != null)
        throw new IllegalArgumentException("You cannot sign a function if you specify its values");
    if (signatureWidth != 0 && dataWidth != -1)
        throw new IllegalArgumentException("You cannot specify a signature width and a data width");

    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    // NOTE(review): `r` is never read in the lines visible here; its use was presumably
    // dropped from this excerpt -- confirm against upstream.
    final RandomGenerator r = new XorShift1024StarRandomGenerator();
    pl.itemsName = "keys";

    // Remember whether the caller supplied the store; if not, one is built here from the keys.
    final boolean givenChunkedHashStore = chunkedHashStore != null;
    if (!givenChunkedHashStore) {
        if (keys == null)
            throw new IllegalArgumentException(
                    "If you do not provide a chunked hash store, you must provide the keys");
        // -Math.min(signatureWidth, 0) is |signatureWidth| for a negative (self-signed) width and 0 otherwise.
        chunkedHashStore = new ChunkedHashStore<T>(transform, tempDir, -Math.min(signatureWidth, 0), pl);
        if (values == null || indirect)
            chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null);
    n = chunkedHashStore.size();
    defRetValue = signatureWidth < 0 ? 0 : -1; // Self-signed maps get zero as default return value.

    // Empty key set: every structure field is zeroed or nulled.
    // NOTE(review): the statement guarded by the trailing `if` and any early exit are missing from the excerpt.
    if (n == 0) {
        m = this.globalSeed = chunkShift = this.width = 0;
        data = null;
        marker = null;
        rank = null;
        offsetAndSeed = null;
        signatureMask = 0;
        signatures = null;
        if (!givenChunkedHashStore)

    int log2NumChunks = Math.max(0, Fast.mostSignificantBit(n >> LOG2_CHUNK_SIZE));
    chunkShift = chunkedHashStore.log2Chunks(log2NumChunks);
    final int numChunks = 1 << log2NumChunks;

    LOGGER.debug("Number of chunks: " + numChunks);

    // One entry per chunk plus a final entry; the last entry is later read as the total size m.
    offsetAndSeed = new long[numChunks + 1];

    // Output width: |signatureWidth| when self-signed, ceil(log2(n)) when no data width was given, else dataWidth.
    this.width = signatureWidth < 0 ? -signatureWidth : dataWidth == -1 ? Fast.ceilLog2(n) : dataWidth;

    // Candidate data; might be discarded for compaction.
    final OfflineIterable<BitVector, LongArrayBitVector> offlineData = new OfflineIterable<BitVector, LongArrayBitVector>(
            BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance());

    // Number of DuplicateExceptions caught so far.
    int duplicates = 0;

    for (;;) {
        LOGGER.debug("Generating GOV function with " + this.width + " output bits...");

        pl.expectedUpdates = numChunks;
        pl.itemsName = "chunks";
        pl.start("Analysing chunks... ");

        try {
            int q = 0;
            final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance();
            final LongBigList data = dataBitVector.asLongBigList(this.width);
            long unsolvable = 0;
            for (final ChunkedHashStore.Chunk chunk : chunkedHashStore) {

                // Next offset = this offset + C_TIMES_256 * size / 256; the >>> 8 divides by 256.
                offsetAndSeed[q + 1] = offsetAndSeed[q] + (C_TIMES_256 * chunk.size() >>> 8);

                long seed = 0;
                // Number of entries reserved for this chunk.
                final int v = (int) (offsetAndSeed[q + 1] - offsetAndSeed[q]);
                // NOTE(review): the constructor call below is cut off in this excerpt (its remaining arguments are missing).
                final Linear3SystemSolver<BitVector> solver = new Linear3SystemSolver<BitVector>(v,

                for (;;) {
                    // The anonymous list exposes the value for each element of the chunk:
                    // chunk.data(index) directly, or valueList looked up at that position when indirect.
                    final boolean solved = solver.generateAndSolve(chunk, seed, new AbstractLongBigList() {
                        private final LongBigList valueList = indirect
                                ? (values instanceof LongList ? LongBigLists.asBigList((LongList) values)
                                        : (LongBigList) values)
                                : null;

                        public long size64() {
                            return chunk.size();

                        public long getLong(final long index) {
                            return indirect ? valueList.getLong(chunk.data(index)) : chunk.data(index);
                    unsolvable += solver.unsolvable;
                    // NOTE(review): the statement guarded by this `if` is missing from the excerpt
                    // (presumably the loop exit) -- confirm against upstream.
                    if (solved)
                    seed += SEED_STEP;
                    // seed starts at 0 and only grows by SEED_STEP, so (for a nonzero step) it can
                    // be 0 again only after the long arithmetic has wrapped around.
                    if (seed == 0)
                        throw new AssertionError("Exhausted local seeds");

                // The seed is OR-ed into the same long that already holds this chunk's offset.
                this.offsetAndSeed[q] |= seed;


                /* We assign values. */
                final long[] solution = solver.solution;
                for (int i = 0; i < solution.length; i++)
                    data.set(i, solution[i]);


            LOGGER.info("Unsolvable systems: " + unsolvable + "/" + numChunks + " ("
                    + Util.format(100.0 * unsolvable / numChunks) + "%)");

        } catch (ChunkedHashStore.DuplicateException e) {
            // Without the keys the store cannot be refilled, so a duplicate is unrecoverable.
            if (keys == null)
                throw new IllegalStateException(
                        "You provided no keys, but the chunked hash store was not checked");
            // Post-increment: the first four failures trigger a refill, the fifth one aborts.
            if (duplicates++ > 3)
                throw new IllegalArgumentException("The input list contains duplicates");
            LOGGER.warn("Found duplicate. Recomputing triples...");
            pl.itemsName = "keys";
            if (values == null || indirect)
                chunkedHashStore.addAll(keys.iterator(), values != null ? values.iterator() : null);

    if (DEBUG)
        System.out.println("Offsets: " + Arrays.toString(offsetAndSeed));

    globalSeed = chunkedHashStore.seed();

    // Check for compaction
    long nonZero = 0;
    // The last entry of offsetAndSeed is taken as the total number of entries.
    m = offsetAndSeed[offsetAndSeed.length - 1];

        // NOTE(review): the statement guarded by the innermost `if` below is missing from the
        // excerpt (presumably the nonZero counter update) -- confirm against upstream.
        final OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator();
        while (iterator.hasNext()) {
            final LongBigList data = iterator.next().asLongBigList(this.width);
            for (long i = 0; i < data.size64(); i++)
                if (data.getLong(i) != 0)

    // Here compaction is chosen by the caller's `compacted` flag.
    if (compacted) {
        marker = LongArrayBitVector.ofLength(m);
        final LongBigList newData = LongArrayBitVector.getInstance().asLongBigList(this.width);
        nonZero = 0;

        final OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator();
        // j is the global position across all stored lists; nonZero indexes the compacted list.
        long j = 0;
        while (iterator.hasNext()) {
            final LongBigList data = iterator.next().asLongBigList(this.width);
            for (long i = 0; i < data.size64(); i++, j++) {
                final long value = data.getLong(i);
                if (value != 0) {
                    // NOTE(review): nothing visible here sets a bit of `marker`, yet the ASSERTS
                    // block below expects marker.getBoolean(k) to be true for nonzero values;
                    // that line is presumably missing from the excerpt.
                    newData.set(nonZero++, value);

        rank = new Rank16(marker);

        // Self-check: marker flags exactly the nonzero entries, and rank maps each to its compacted slot.
        if (ASSERTS) {
            final OfflineIterator<BitVector, LongArrayBitVector> iterator2 = offlineData.iterator();
            long k = 0;
            while (iterator2.hasNext()) {
                final LongBigList data = iterator2.next().asLongBigList(this.width);
                for (long i = 0; i < data.size64(); i++, k++) {
                    final long value = data.getLong(i);
                    assert (value != 0) == marker.getBoolean(k);
                    if (value != 0)
                        assert value == newData.getLong(rank.rank(k)) : value + " != "
                                + newData.getLong(rank.rank(k));
        this.data = newData;
    } else {
        // No compaction: allocate m entries of `width` bits each.
        final LongArrayBitVector dataBitVector = LongArrayBitVector.getInstance(m * this.width);
        this.data = dataBitVector.asLongBigList(this.width);

        // NOTE(review): the body of this loop is missing from the excerpt.
        OfflineIterator<BitVector, LongArrayBitVector> iterator = offlineData.iterator();
        while (iterator.hasNext())

        marker = null;
        rank = null;


            // NOTE(review): the opening of the logging call that owns this argument is missing from the excerpt.
            "Forecast bit cost per element: " + (marker == null ? C * this.width : C + this.width + 0.126));
    LOGGER.info("Actual bit cost per element: " + (double) numBits() / n);

    if (signatureWidth > 0) {
        // -1L >>> (64 - signatureWidth): a mask with the lowest signatureWidth bits set.
        signatureMask = -1L >>> Long.SIZE - signatureWidth;
        signatures = chunkedHashStore.signatures(signatureWidth, pl);
    } else if (signatureWidth < 0) {
        // Self-signed: mask of the lowest |signatureWidth| bits, with no separate signature list.
        signatureMask = -1L >>> Long.SIZE + signatureWidth;
        signatures = null;
    } else {
        signatureMask = 0;
        signatures = null;

    // NOTE(review): the statement guarded by this `if` is missing from the excerpt.
    if (!givenChunkedHashStore)

From source file:it.unimi.dsi.sux4j.mph.VLLcpMonotoneMinimalPerfectHashFunction.java

/**
 * Creates a new function from the given keys.
 *
 * <p>The keys are grouped into buckets whose size is a power of two; for each bucket the
 * longest common prefix (lcp) of its bit vectors is recorded, and per-element tables
 * ({@code offsets}, {@code lcpLengths}) are filled using a minimal perfect hash on the keys.
 *
 * @param iterable the keys; this constructor traverses it more than once.
 * @param numElements the number of keys, or -1 to obtain the count from {@code iterable}.
 * @param transform the strategy used to turn each key into a bit vector.
 * @throws IOException declared by the signature; the call that raises it is not identifiable from this excerpt.
 * @throws IllegalArgumentException if two compared bit vectors are equal, one is a prefix of the
 * other, or they are not in lexicographic order.
 */
public VLLcpMonotoneMinimalPerfectHashFunction(final Iterable<? extends T> iterable, final int numElements,
        final TransformationStrategy<? super T> transform) throws IOException {

    final ProgressLogger pl = new ProgressLogger(LOGGER);
    pl.displayLocalSpeed = true;
    pl.displayFreeMemory = true;
    this.transform = transform;
    final RandomGenerator r = new XorShift1024StarRandomGenerator();

    // Determine n: take the explicit count, or derive it from the iterable when numElements is -1.
    if (numElements == -1) {
        if (iterable instanceof Size64)
            n = ((Size64) iterable).size64();
        else if (iterable instanceof Collection)
            n = ((Collection<?>) iterable).size();
        else {
            long c = 0;
            // NOTE(review): no statement incrementing c is visible in this excerpt, and the
            // closing braces are missing -- the listing looks truncated; confirm against upstream.
            for (T dummy : iterable)
            n = c;
    } else
        n = numElements;

    // Empty input: zero/null every derived field.
    // NOTE(review): no early return is visible after this block in the excerpt -- confirm.
    if (n == 0) {
        bucketSize = bucketSizeMask = log2BucketSize = 0;
        lcp2Bucket = null;
        offsets = null;
        lcpLengths = null;
        mph = null;

    defRetValue = -1; // For the very few cases in which we can decide

    // The bucket size is forced to a power of two (1 << log2BucketSize) so that the
    // in-bucket offset and the bucket index can later be split with a mask and a shift.
    int theoreticalBucketSize = (int) Math
            .ceil(1 + GOV3Function.C * Math.log(2) + Math.log(n) - Math.log(1 + Math.log(n)));
    log2BucketSize = Fast.ceilLog2(theoreticalBucketSize);
    bucketSize = 1 << log2BucketSize;
    bucketSizeMask = bucketSize - 1;

    // Ceiling of n / bucketSize.
    final long numBuckets = (n + bucketSize - 1) / bucketSize;

    LongArrayBitVector prev = LongArrayBitVector.getInstance();
    LongArrayBitVector curr = LongArrayBitVector.getInstance();
    int currLcp = 0;
    int maxLcp = 0, minLcp = Integer.MAX_VALUE;
    long maxLength = 0, totalLength = 0;

    final ChunkedHashStore<BitVector> chunkedHashStore = new ChunkedHashStore<BitVector>(
            TransformationStrategies.identity(), pl);
    OfflineIterable<BitVector, LongArrayBitVector> lcps = new OfflineIterable<BitVector, LongArrayBitVector>(
            BitVectors.OFFLINE_SERIALIZER, LongArrayBitVector.getInstance());
    pl.expectedUpdates = n;
    pl.start("Scanning collection...");

    // One pass per bucket: accumulate length statistics and compute the bucket's lcp.
    // NOTE(review): the statements that read the next key from `iterator` into prev/curr and
    // feed chunkedHashStore are not visible in this excerpt -- confirm against upstream.
    Iterator<? extends T> iterator = iterable.iterator();
    for (long b = 0; b < numBuckets; b++) {
        maxLength = Math.max(maxLength, prev.length());
        totalLength += Fast.length(1 + prev.length());
        // Start from the full length of prev; the inner loop can only shrink it.
        currLcp = (int) prev.length();
        // The last bucket may hold fewer than bucketSize elements.
        final int currBucketSize = (int) Math.min(bucketSize, n - b * bucketSize);

        for (int i = 0; i < currBucketSize - 1; i++) {
            final int prefix = (int) curr.longestCommonPrefixLength(prev);
            // Common prefix covers both vectors entirely: they are identical.
            if (prefix == prev.length() && prefix == curr.length())
                throw new IllegalArgumentException("The input bit vectors are not distinct");
            // Common prefix covers exactly one of them: that one is a prefix of the other.
            if (prefix == prev.length() || prefix == curr.length())
                throw new IllegalArgumentException("The input bit vectors are not prefix-free");
            // At the first differing position prev holds a 1, so prev is not smaller than curr.
            if (prev.getBoolean(prefix))
                throw new IllegalArgumentException("The input bit vectors are not lexicographically sorted");

            currLcp = Math.min(prefix, currLcp);

            maxLength = Math.max(maxLength, prev.length());
            totalLength += Fast.length(1 + prev.length());

        // Store the first currLcp bits of prev as this bucket's lcp and update the global extremes.
        lcps.add(prev.subVector(0, currLcp));
        maxLcp = Math.max(maxLcp, currLcp);
        minLcp = Math.min(minLcp, currLcp);


    // Build function assigning each lcp to its bucket.
    // NOTE(review): the builder chain below has no visible terminating call in this excerpt.
    lcp2Bucket = new GOV3Function.Builder<BitVector>().keys(lcps).transform(TransformationStrategies.identity())
    // Record the length of every stored lcp, indexed by its position in lcps.
    final int[][] lcpLength = IntBigArrays.newBigArray(lcps.size64());
    long p = 0;
    for (LongArrayBitVector bv : lcps)
        IntBigArrays.set(lcpLength, p++, (int) bv.length());

    // Debug only: print every lcp and compare lcp2Bucket's value for it with the counter p.
    // NOTE(review): p is not visibly reset to 0 between the loop above and this check -- confirm.
    if (DEBUG) {
        for (BitVector v : lcps)
            System.err.println(v + " " + v.length());
        for (BitVector v : lcps) {
            final long value = lcp2Bucket.getLong(v);
            if (p++ != value) {
                System.err.println("p: " + (p - 1) + "  value: " + value + " key:" + v);
                throw new AssertionError();


    final Iterable<BitVector> bitVectors = TransformationStrategies.wrap(iterable, transform);
    // Build mph on elements.
    // NOTE(review): the builder chain below has no visible terminating call in this excerpt.
    mph = new GOVMinimalPerfectHashFunction.Builder<BitVector>().keys(bitVectors)
    this.seed = chunkedHashStore.seed();

    // Build function assigning the lcp length and the bucketing data to each element.
    (offsets = LongArrayBitVector.getInstance().asLongBigList(log2BucketSize)).size(n);
    LongBigList lcpLengthsTemp = LongArrayBitVector.getInstance().asLongBigList(Fast.length(maxLcp));

    LOGGER.info("Generating data tables...");

    // For every stored quadruple, split quadruple[3] into its low log2BucketSize bits (stored in
    // offsets) and its remaining high bits (used to look up that bucket's lcp length).
    // Presumably quadruple[3] is the element's position in the input order -- TODO confirm.
    for (ChunkedHashStore.Chunk chunk : chunkedHashStore) {
        for (long[] quadruple : chunk) {
            final long index = mph.getLongByTriple(quadruple);
            offsets.set(index, quadruple[3] & bucketSizeMask);
            lcpLengthsTemp.set(index, IntBigArrays.get(lcpLength, (int) (quadruple[3] >> log2BucketSize)));


    // Replace the temporary list with an EliasFanoLongBigList built from its contents.
    // Presumably minLcp acts as a lower bound on the stored values -- TODO confirm.
    lcpLengths = new EliasFanoLongBigList(lcpLengthsTemp.iterator(), minLcp, true);

    // Debug only: for each key, bucket * bucketSize + offset must equal its position in iteration order.
    if (DEBUG) {
        p = 0;
        for (T key : iterable) {
            BitVector bv = transform.toBitVector(key);
            long index = mph.getLong(bv);
            if (p++ != lcp2Bucket.getLong(bv.subVector(0, lcpLengths.getLong(index))) * bucketSize
                    + offsets.getLong(index)) {
                System.err.println("p: " + (p - 1) + "  Key: " + key + " bucket size: " + bucketSize + " lcp "
                        + transform.toBitVector(key).subVector(0, lcpLengths.getLong(index)) + " lcp length: "
                        + lcpLengths.getLong(index) + " bucket "
                        + lcp2Bucket.getLong(transform.toBitVector(key).subVector(0, lcpLengths.getLong(index)))
                        + " offset: " + offsets.getLong(index));
                throw new AssertionError();

    // Report the chosen bucket size and the forecast versus measured space usage per key.
    LOGGER.debug("Bucket size: " + bucketSize);
    final double avgLength = (double) totalLength / n;
    LOGGER.debug("Forecast bit cost per element: " + (2 * GOV3Function.C + 2 + avgLength + Fast.log2(avgLength)
            + Fast.log2(Math.E) - Fast.log2(Fast.log2(Math.E)) + Fast.log2(1 + Fast.log2(n))));
    LOGGER.info("Actual bit cost per element: " + (double) numBits() / n);