package it.unimi.dsi.sux4j.io;

/*
 * Sux4J: Succinct data structures for Java
 *
 * Copyright (C) 2008-2016 Sebastiano Vigna
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 3 of the License, or (at your option)
 * any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.bits.TransformationStrategy;
import it.unimi.dsi.fastutil.Swapper;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.longs.LongIterable;
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
import it.unimi.dsi.io.SafelyCloseable;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.GOV3Function;
import it.unimi.dsi.sux4j.mph.GOV3Function.Builder;
import it.unimi.dsi.sux4j.mph.Hashes;
import it.unimi.dsi.util.XorShift1024StarRandomGenerator;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.SequenceInputStream;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.IteratorEnumeration;
import org.apache.commons.math3.random.RandomGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** A temporary store of hash triples virtually divided into chunks.
 *
 * <p>A chunked hash store accumulates elements (objects of type {@code T})
 * by turning them into bit vectors (using a provided {@link TransformationStrategy})
 * and then hashing such vectors into a triple of longs (i.e., overall we get a hash of 192 bits).
 * Elements can be added {@linkplain #add(Object, long) one by one}
 * or {@linkplain #addAll(Iterator, LongIterator) in batches}.
 * Elements must be distinct, or, more precisely, they must be transformed into distinct bit vectors.
 *
 * <p>Besides the hashes, we store some data associated with each element:
 * if {@linkplain #add(Object) no data is specified}, we store the <em>rank</em> of each element added (the first element added has rank 0,
 * the second one has rank 1, and so on), unless you specified at {@linkplain #ChunkedHashStore(TransformationStrategy, File, int, ProgressLogger) construction time}
 * a nonzero <em>hash width</em>: in that case, the value stored by {@link #add(Object)} will be given by the lowest bits of the first hash of the triple
 * associated with the object (the hash width is the number of bits stored). This feature makes it possible, for example, to implement a static
 * {@linkplain Builder#dictionary(int) dictionary} using a {@link GOV3Function}.
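 *
 * <p>For instance, a store of strings can be set up as follows (a minimal sketch; the choice of
 * {@code TransformationStrategies.utf16()} from the dsiutils package is just an assumption about the key type):
 * <pre>
 * // Keys are turned into bit vectors using their UTF-16 representation
 * ChunkedHashStore&lt;CharSequence&gt; store = new ChunkedHashStore&lt;CharSequence&gt;(TransformationStrategies.utf16());
 * store.add("foo"); // stored with rank 0
 * store.add("bar"); // stored with rank 1
 * </pre>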
 *
 * <p>Once all elements have been added, they can be gathered into <em>chunks</em> whose
 * tentative size can be set by calling {@link #log2Chunks(int)}. More precisely,
 * if the latter method is called with argument <var>k</var>, then each chunk
 * will be formed by grouping triples by the <var>k</var> most significant bits of their first hash.
 *
 * <p>To obtain triples, one calls {@link #iterator()}, which returns chunks one at a time (in their
 * natural order); triples within each chunk are returned by increasing hash. Actually, the iterator
 * provided by a chunk returns a <em>quadruple</em> whose last element is the data associated with the element
 * that generated the triple.
 *
 * <p>It is possible (albeit <em>very</em> unlikely) that different elements generate the same hash. This event is detected
 * during chunk iteration (not while accumulating hashes), and it will throw a {@link ChunkedHashStore.DuplicateException}.
 * At that point, the caller must handle the exception by {@linkplain #reset(long) resetting the store} and trying again
 * from scratch. Note that after a few (say, three) exceptions you can safely assume that there are duplicate elements. If you
 * need to force a check on the whole store you can call {@link #check()}. If all your elements come from an {@link Iterable},
 * {@link #checkAndRetry(Iterable, LongIterable)} will try three times to build a checked chunked hash store.
 *
 * <p>Every {@link #reset(long)} changes the seed used by the store to generate triples. So, if this seed has to be
 * stored, this must happen <em>after</em> the last call to {@link #reset(long)}. To help track this fact, a call to
 * {@link #seed()} will <em>lock</em> the store; any further call to {@link #reset(long)} will throw an {@link IllegalStateException}.
 * In case the store needs to be reused, you can call {@link #clear()}, which will bring the store back to its after-creation state.
 *
 * <p>When you have finished using a chunked hash store, you should {@link #close()} it. This class implements
 * {@link SafelyCloseable}, and thus provides a safety-net finalizer.
 *
 * <h2>Filtering</h2>
 *
 * <p>You can at any time {@linkplain #filter(Predicate) set a predicate} that will filter the triples returned by the store.
 *
 * <h2>Implementation details</h2>
 *
 * <p>Internally, a chunked hash store has a notion of disk chunk: triples are stored on disk using a fixed number of bits.
 * Once the user chooses a chunk size, the store presents the data on disk by grouping disk chunks or splitting them
 * in a suitable way. This process is transparent to the user.
 *
 * <p>An instance of this class will save triples into {@link #DISK_CHUNKS} disk chunks. Triples have to
 * be loaded into memory only chunk by chunk, so that they can be sorted and tested for uniqueness. As long as
 * {@link #DISK_CHUNKS} is larger than eight, the store will need less than one bit per element of main
 * memory. {@link #DISK_CHUNKS} can be increased arbitrarily at compile time, but each store
 * will open {@link #DISK_CHUNKS} files at the same time. (For the same reason, it is
 * <strong>strongly</strong> suggested that you close your stores as soon as you do not need them.)
 *
 * <h2>Intended usage</h2>
 *
 * <p>Chunked hash stores should be built by classes that need to manipulate elements in chunks of approximately a given
 * size without needing access to the elements themselves, but just to their triples, a typical
 * example being {@link GOV3Function}, which uses the triples to compute a 3-hyperedge.
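 *
 * <p>A typical accumulation-and-iteration loop, with duplicate handling, might look as follows
 * (a sketch: {@code elements} is assumed to be an {@link Iterable} over the keys, {@code random}
 * a random generator, and ten an arbitrary choice for the logarithm of the number of chunks):
 * <pre>
 * store.addAll(elements.iterator());
 * final int chunkShift = store.log2Chunks(10); // to be saved by the caller for later use
 * for (;;) {
 *     try {
 *         for (ChunkedHashStore.Chunk chunk : store)
 *             for (long[] quadruple : chunk) {
 *                 // Use quadruple[0..2] (the triple) and quadruple[3] (the associated data)
 *             }
 *         break; // No DuplicateException: the iteration completed successfully
 *     } catch (ChunkedHashStore.DuplicateException e) {
 *         store.reset(random.nextLong()); // Change seed, then refill the store
 *         store.addAll(elements.iterator());
 *     }
 * }
 * </pre>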
 * Once a chunked hash
 * store is built, it can be passed on to further substructures, greatly reducing the computation time (as the original
 * collection need not be scanned again).
 *
 * <p>To compute the chunk corresponding to a given element, use
 * <pre>
 * final long[] h = new long[ 3 ];
 * Hashes.spooky4( transform.toBitVector( key ), seed, h );
 * final int chunk = chunkShift == Long.SIZE ? 0 : (int)( h[ 0 ] >>> chunkShift );
 * </pre>
 * where <code>seed</code> is the store seed, and <code>chunkShift</code>
 * is the return value of {@link #log2Chunks(int)} and should be stored by the caller. Note
 * that you do not need the chunked hash store to compute these data, but just its seed and the chunk shift.
 *
 * @author Sebastiano Vigna
 * @since 1.0.4
 */

public class ChunkedHashStore<T> implements Serializable, SafelyCloseable, Iterable<ChunkedHashStore.Chunk> {
	public static final long serialVersionUID = 1L;
	private static final Logger LOGGER = LoggerFactory.getLogger(ChunkedHashStore.class);
	private static final boolean DEBUG = false;

	/** Denotes that the chunked hash store contains a duplicate hash triple. */
	public static class DuplicateException extends RuntimeException {
		private static final long serialVersionUID = 1L;
	}

	/** The size of the output buffers. */
	public final static int OUTPUT_BUFFER_SIZE = 16 * 1024;
	/** The logarithm of the number of physical disk chunks. */
	public final static int LOG2_DISK_CHUNKS = 8;
	/** The number of physical disk chunks. */
	public final static int DISK_CHUNKS = 1 << LOG2_DISK_CHUNKS;
	/** The shift for physical disk chunks. */
	public final static int DISK_CHUNKS_SHIFT = Long.SIZE - LOG2_DISK_CHUNKS;

	/** The number of elements ever {@linkplain #add(Object) added}. */
	protected long size;
	/** The number of elements that pass the current filter, or -1 if it must be recomputed. */
	protected long filteredSize;
	/** The seed used to generate the hash triples. */
	protected long seed;
	/** The number of triples in each disk chunk. */
	private int[] count;
	/** The number of chunks. */
	private long chunks;
	/** The files containing disk chunks. */
	private File[] file;
	/** The number of disk chunks making up a chunk, or 1 if a chunk is smaller than or equal to a disk chunk. */
	private int diskChunkStep;
	/** The shift to be applied to the first hash to obtain the chunk index, set by {@link #log2Chunks(int)} (watch out: it can be {@link Long#SIZE}). */
	private int chunkShift;
	/** If true, this store has been checked for duplicates. */
	private boolean checkedForDuplicates;
	/** The transformation strategy provided at construction time. */
	private final TransformationStrategy<? super T> transform;
	/** A progress logger. */
	private final ProgressLogger pl;
	/** If nonzero, no associated data is saved in the store: {@link Chunk#data(long)} will return the first of the three hashes associated with the key, masked by this value. */
	private final long hashMask;
	/** The temporary directory for this chunked hash store, or {@code null}. */
	private final File tempDir;
	/** The data output streams for the disk chunks. */
	private DataOutputStream[] dos;
	/** The number of disk chunks divided by {@link #diskChunkStep}. */
	private int virtualDiskChunks;
	/** If not {@code null}, a filter that will be used to select triples. */
	private Predicate filter;
	/** Whether this store is locked. Any attempt to {@link #reset(long)} the store will cause an {@link IllegalStateException} if this variable is true. */
	private boolean locked;
	/** Whether this store has already been closed. */
	private boolean closed;
	/** Creates a chunked hash store with given transformation strategy.
	 *
	 * @param transform a transformation strategy for the elements.
	 */
	public ChunkedHashStore(final TransformationStrategy<? super T> transform) throws IOException {
		this(transform, null, null);
	}

	/** Creates a chunked hash store with given transformation strategy and temporary file directory.
	 *
	 * @param transform a transformation strategy for the elements.
	 * @param tempDir a temporary directory for the store files, or {@code null} for the default temporary directory.
	 */
	public ChunkedHashStore(final TransformationStrategy<? super T> transform, final File tempDir) throws IOException {
		this(transform, tempDir, null);
	}

	/** Creates a chunked hash store with given transformation strategy and progress logger.
	 *
	 * @param transform a transformation strategy for the elements.
	 * @param pl a progress logger, or {@code null}.
	 */
	public ChunkedHashStore(final TransformationStrategy<? super T> transform, final ProgressLogger pl) throws IOException {
		this(transform, null, pl);
	}

	/** Creates a chunked hash store with given transformation strategy, temporary file directory, and progress logger.
	 *
	 * @param transform a transformation strategy for the elements.
	 * @param tempDir a temporary directory for the store files, or {@code null} for the default temporary directory.
	 * @param pl a progress logger, or {@code null}.
	 */
	public ChunkedHashStore(final TransformationStrategy<? super T> transform, final File tempDir, final ProgressLogger pl) throws IOException {
		this(transform, tempDir, 0, pl);
	}

	/** Creates a chunked hash store with given transformation strategy, hash width, and progress logger.
	 *
	 * @param transform a transformation strategy for the elements.
	 * @param tempDir a temporary directory for the store files, or {@code null} for the default temporary directory.
	 * @param hashWidth if nonzero, no associated data is saved in the store: {@link Chunk#data(long)} will return this many lower bits
	 * of the first of the three hashes associated with the key.
	 * @param pl a progress logger, or {@code null}.
	 * @throws IOException if an I/O error occurs while creating the temporary files backing the disk chunks.
	 */
	public ChunkedHashStore(final TransformationStrategy<? super T> transform, final File tempDir, final int hashWidth, final ProgressLogger pl) throws IOException {
		this.transform = transform;
		this.pl = pl;
		this.tempDir = tempDir;
		this.hashMask = hashWidth == 0 ? 0 : -1L >>> Long.SIZE - hashWidth;
		file = new File[DISK_CHUNKS];
		dos = new DataOutputStream[DISK_CHUNKS];
		// Create disk chunks
		for (int i = 0; i < DISK_CHUNKS; i++) {
			dos[i] = new DataOutputStream(new FastBufferedOutputStream(new FileOutputStream(
					file[i] = File.createTempFile(ChunkedHashStore.class.getSimpleName(), String.valueOf(i), tempDir)), OUTPUT_BUFFER_SIZE));
			file[i].deleteOnExit();
		}

		count = new int[DISK_CHUNKS];
	}

	/** Returns the current seed of this chunked hash store. After calling this method, no {@link #reset(long)} will be allowed (unless the store
	 * is {@linkplain #clear() cleared}).
	 *
	 * @return the current seed of this chunked hash store.
	 */
	public long seed() {
		locked = true;
		return seed;
	}

	/** Returns the temporary directory of this chunked hash store, or {@code null}.
	 *
	 * @return the temporary directory of this chunked hash store, or {@code null}.
	 */
	public File tempDir() {
		return tempDir;
	}

	/** Returns the transformation strategy provided at construction time.
	 *
	 * @return the transformation strategy provided at construction time.
	 */
	public TransformationStrategy<? super T> transform() {
		return transform;
	}

	/** Adds an element to this store, associating it with a specified value.
	 *
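	 * <p>For instance (a sketch, assuming a store of character sequences):
	 * <pre>
	 * store.add("foo", 42); // "foo" is now associated with the value 42
	 * </pre>
	 *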
	 * @param o the element to be added.
	 * @param value the associated value.
	 */
	public void add(final T o, final long value) throws IOException {
		final long[] triple = new long[3];
		Hashes.spooky4(transform.toBitVector(o), seed, triple);
		add(triple, value);
	}

	/** Adds an element to this store, associating it with its ordinal position.
	 *
	 * @param o the element to be added.
	 */
	public void add(final T o) throws IOException {
		add(o, filteredSize);
	}

	/** Adds a triple to this store.
	 *
	 * @param triple the triple to be added.
	 * @param value the associated value.
	 */
	private void add(final long[] triple, final long value) throws IOException {
		final int chunk = (int) (triple[0] >>> DISK_CHUNKS_SHIFT);
		count[chunk]++;
		checkedForDuplicates = false;
		if (DEBUG) System.err.println("Adding " + Arrays.toString(triple));
		dos[chunk].writeLong(triple[0]);
		dos[chunk].writeLong(triple[1]);
		dos[chunk].writeLong(triple[2]);
		if (hashMask == 0) dos[chunk].writeLong(value);
		if (filteredSize != -1 && (filter == null || filter.evaluate(triple))) filteredSize++;
		size++;
	}

	/** Adds the elements returned by an iterator to this store, associating them with specified values.
	 *
	 * @param elements an iterator returning elements.
	 * @param values an iterator on values parallel to {@code elements}.
	 */
	public void addAll(final Iterator<? extends T> elements, final LongIterator values) throws IOException {
		if (pl != null) {
			pl.expectedUpdates = -1;
			pl.start("Adding elements...");
		}
		final long[] triple = new long[3];
		while (elements.hasNext()) {
			Hashes.spooky4(transform.toBitVector(elements.next()), seed, triple);
			add(triple, values != null ? values.nextLong() : filteredSize);
			if (pl != null) pl.lightUpdate();
		}
		if (values != null && values.hasNext()) throw new IllegalStateException("The iterator on values contains more entries than the iterator on keys");
		if (pl != null) pl.done();
	}

	/** Adds the elements returned by an iterator to this store, associating them with their ordinal position.
	 *
	 * @param elements an iterator returning elements.
	 */
	public void addAll(final Iterator<? extends T> elements) throws IOException {
		addAll(elements, null);
	}

	/** Returns the size of this store. Note that if you set up
	 * a {@linkplain #filter(Predicate) filter}, the first call to
	 * this method will require a scan of the whole store.
	 *
	 * @return the number of (possibly filtered) triples of this store.
	 */
	public long size() throws IOException {
		if (filter == null) return size;
		if (filteredSize == -1) {
			long c = 0;
			final long[] triple = new long[3];
			// Flush the disk chunks so that all written triples are visible to the scan below
			for (DataOutputStream d : dos) d.flush();
			for (int i = 0; i < DISK_CHUNKS; i++) {
				final DataInputStream dis = new DataInputStream(new FastBufferedInputStream(new FileInputStream(file[i])));
				for (int j = 0; j < count[i]; j++) {
					triple[0] = dis.readLong();
					triple[1] = dis.readLong();
					triple[2] = dis.readLong();
					if (hashMask == 0) dis.readLong(); // Skip the associated data
					if (filter.evaluate(triple)) c++;
				}
				dis.close();
			}

			filteredSize = c;
		}

		return filteredSize;
	}

	/** Clears this store. After a call to this method, the store can be reused. */
	public void clear() {
		locked = false;
		reset(0);
	}

	protected void finalize() throws Throwable {
		try {
			if (!closed) {
				LOGGER.warn("This " + this.getClass().getName() + " [" + toString() + "] should have been closed.");
				close();
			}
		}
		finally {
			super.finalize();
		}
	}

	/** Closes this store, disposing all associated resources.
	 *
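	 * <p>For instance (a sketch), a try/finally block guarantees that the store is closed even on
	 * exceptional paths:
	 * <pre>
	 * ChunkedHashStore&lt;CharSequence&gt; store = new ChunkedHashStore&lt;CharSequence&gt;(transform);
	 * try {
	 *     // Fill and use the store...
	 * } finally {
	 *     store.close();
	 * }
	 * </pre>
	 *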
	 */
	public void close() {
		if (!closed) {
			closed = true;
			for (DataOutputStream d : dos)
				try {
					d.close();
				}
				catch (IOException e) {
					throw new RuntimeException(e);
				}
			for (File f : file) f.delete();
		}
	}

	/** Resets this store using a new seed. All accumulated data are cleared, and a new seed is reinstated.
	 *
	 * @param seed the new seed.
	 * @throws IllegalStateException if this store was locked by a call to {@link #seed()}, and never {@linkplain #clear() cleared} thereafter.
	 */
	public void reset(final long seed) {
		if (locked) throw new IllegalStateException();
		if (DEBUG) System.err.println("RESET(" + seed + ")");
		filteredSize = 0;
		this.seed = seed;
		checkedForDuplicates = false;
		Arrays.fill(count, 0);
		try {
			for (DataOutputStream d : dos) d.close();
			for (int i = 0; i < DISK_CHUNKS; i++)
				dos[i] = new DataOutputStream(new FastBufferedOutputStream(new FileOutputStream(file[i]), OUTPUT_BUFFER_SIZE));
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/** Checks that this store has no duplicate triples, throwing an exception if duplicates are found.
	 *
	 * @throws DuplicateException if this store contains duplicate triples.
	 */
	public void check() throws DuplicateException {
		for (ChunkedHashStore.Chunk b : this) b.iterator();
	}

	/** Checks that this store has no duplicate triples, and tries to rebuild the store if duplicates are found.
	 *
	 * @param iterable the elements with which the store will be refilled if there are duplicate triples.
	 * @param values the values that will be associated with the elements returned by <code>iterable</code>.
	 * @throws IllegalArgumentException if after a few trials the store still contains duplicate triples.
	 */
	public void checkAndRetry(final Iterable<? extends T> iterable, final LongIterable values) throws IOException {
		final RandomGenerator random = new XorShift1024StarRandomGenerator();
		int duplicates = 0;

		for (;;)
			try {
				check();
				break;
			}
			catch (DuplicateException e) {
				if (duplicates++ > 3) throw new IllegalArgumentException("The input list contains duplicates");
				LOGGER.warn("Found duplicate. Recomputing triples...");
				reset(random.nextLong());
				// values may be null (see checkAndRetry(Iterable)): in that case, refill with ranks
				addAll(iterable.iterator(), values == null ? null : values.iterator());
			}

		checkedForDuplicates = true;
	}

	/** Checks that this store has no duplicate triples, and tries to rebuild the store if duplicates are found.
	 *
	 * <p><strong>Warning</strong>: the actions are executed exactly in the specified order: <em>first</em>
	 * check, <em>then</em> retry. If you invoke this method on an empty store you'll get a checked empty store.
	 *
	 * @param iterable the elements with which the store will be refilled if there are duplicate triples.
	 * @throws IllegalArgumentException if after a few trials the store still contains duplicate triples.
	 */
	public void checkAndRetry(final Iterable<? extends T> iterable) throws IOException {
		checkAndRetry(iterable, null);
	}

	/** Generates a list of signatures using the lowest bits of the first hash in this store.
	 *
	 * <p>For this method to work, this store must contain ranks.
	 *
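	 * <p>For instance (a sketch), after all keys have been {@linkplain #add(Object) added} with their
	 * ranks as data, a 64-bit signature for each key can be obtained as
	 * <pre>
	 * LongBigList signatures = store.signatures(64, pl);
	 * </pre>
	 *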
	 * @param signatureWidth the width in bits of the signatures.
	 * @param pl a progress logger.
	 */
	public LongBigList signatures(final int signatureWidth, final ProgressLogger pl) throws IOException {
		final LongBigList signatures = LongArrayBitVector.getInstance().asLongBigList(signatureWidth);
		final long signatureMask = -1L >>> Long.SIZE - signatureWidth;
		signatures.size(size());
		pl.expectedUpdates = size();
		pl.itemsName = "signatures";
		pl.start("Signing...");
		for (ChunkedHashStore.Chunk chunk : this) {
			final Iterator<long[]> chunkIterator = chunk.iterator();
			for (int i = chunk.size(); i-- != 0;) {
				final long[] quadruple = chunkIterator.next();
				// quadruple[3] is the rank of the key, quadruple[0] its first hash
				signatures.set(quadruple[3], signatureMask & quadruple[0]);
				pl.lightUpdate();
			}
		}
		pl.done();
		return signatures;
	}

	/** Sets the number of chunks.
	 *
	 * <p>Once the store is filled, you must call this method to set the number of chunks. The store will take
	 * care of merging or fragmenting disk chunks to get exactly the desired chunks.
	 *
	 * @param log2chunks the base-2 logarithm of the number of chunks.
	 * @return the shift to be applied to the first hash of a triple to get the chunk number (see the {@linkplain ChunkedHashStore introduction}).
	 */
	public int log2Chunks(final int log2chunks) {
		this.chunks = 1L << log2chunks; // Use a long shift, as the number of chunks is a long
		diskChunkStep = (int) Math.max(DISK_CHUNKS / chunks, 1);
		virtualDiskChunks = DISK_CHUNKS / diskChunkStep;

		if (DEBUG) {
			System.err.print("Chunk sizes: ");
			double avg = filteredSize / (double) DISK_CHUNKS;
			double var = 0;
			for (int i = 0; i < DISK_CHUNKS; i++) {
				System.err.print(i + ":" + count[i] + " ");
				var += (count[i] - avg) * (count[i] - avg);
			}
			System.err.println();
			System.err.println("Average: " + avg);
			System.err.println("Variance: " + var / filteredSize);
		}

		chunkShift = Long.SIZE - log2chunks;

		LOGGER.debug("Number of chunks: " + chunks);
		LOGGER.debug("Number of disk chunks: " + DISK_CHUNKS);
		LOGGER.debug("Number of virtual disk chunks: " + virtualDiskChunks);

		return chunkShift;
	}

	/** A chunk returned by a {@link ChunkedHashStore}. */
	public final static class Chunk implements Iterable<long[]> {
		/** The start position of this chunk in the parallel arrays {@link #buffer0}, {@link #buffer1}, {@link #buffer2}, and {@link #data}. */
		private final int start;
		/** The final position (excluded) of this chunk in the parallel arrays {@link #buffer0}, {@link #buffer1}, {@link #buffer2}, and {@link #data}. */
		private final int end;
		private final long[] buffer0;
		private final long[] buffer1;
		private final long[] buffer2;
		private final long[] data;
		private final long hashMask;

		private Chunk(final long[] buffer0, final long[] buffer1, final long[] buffer2, final long[] data, final long hashMask, final int start, final int end) {
			this.start = start;
			this.end = end;
			this.data = data;
			this.hashMask = hashMask;
			this.buffer0 = buffer0;
			this.buffer1 = buffer1;
			this.buffer2 = buffer2;
		}

		/** The number of triples in this chunk.
		 *
		 * @return the number of triples in this chunk.
		 */
		public int size() {
			return end - start;
		}

		/** Returns the data of the <code>k</code>-th triple returned by this chunk.
		 *
		 * <p>This method provides an alternative random access to data (w.r.t. indexing the fourth element of the
		 * quadruples returned by {@link #iterator()}).
		 *
		 * @param k the index (in iteration order) of a triple.
		 * @return the corresponding data.
		 */
		public long data(final long k) {
			return data != null ? data[(int) (start + k)] : (buffer0[(int) (start + k)] & hashMask);
		}

		/** Returns an iterator over the quadruples associated with this chunk; the returned array of longs is reused at each call.
		 *
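		 * <p>For instance (a sketch), the data associated with the triples of a chunk can be summed up as
		 * <pre>
		 * long sum = 0;
		 * for (long[] quadruple : chunk) sum += quadruple[3];
		 * </pre>
		 * Since the quadruple array is reused, it must not be stashed away between calls to {@code next()}.
		 *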
		 * @return an iterator over quadruples formed by a triple (indices 0, 1, 2) and the associated data (index 3).
		 */
		public Iterator<long[]> iterator() {
			return new AbstractObjectIterator<long[]>() {
				private int pos = start;
				private long[] quadruple = new long[4];

				public boolean hasNext() {
					return pos < end;
				}

				public long[] next() {
					if (!hasNext()) throw new NoSuchElementException();
					final long[] quadruple = this.quadruple;
					quadruple[0] = buffer0[pos];
					quadruple[1] = buffer1[pos];
					quadruple[2] = buffer2[pos];
					quadruple[3] = data != null ? data[pos] : buffer0[pos] & hashMask;
					pos++;
					return quadruple;
				}
			};
		}
	}

	/** Sets a filter for this store.
	 *
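	 * <p>For instance (a sketch), the following filter keeps only triples whose first hash is nonnegative:
	 * <pre>
	 * store.filter(new Predicate() {
	 *     public boolean evaluate(final Object triple) {
	 *         return ((long[])triple)[0] >= 0;
	 *     }
	 * });
	 * </pre>
	 *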
	 * @param filter a predicate that will be used to filter triples.
	 */
	public void filter(final Predicate filter) {
		this.filter = filter;
		filteredSize = -1; // Mark the filtered size as stale: it will be recomputed by size()
	}

	/** Returns an iterator over the chunks of this chunked hash store.
	 *
	 * @return an iterator over the chunks of this chunked hash store.
	 */
	public Iterator<Chunk> iterator() {
		if (closed) throw new IllegalStateException("This " + getClass().getSimpleName() + " has been closed");
		for (DataOutputStream d : dos)
			try {
				d.flush();
			}
			catch (IOException e) {
				throw new RuntimeException(e);
			}

		// Compute the maximum number of triples in a virtual disk chunk, to size the in-memory buffers
		int m = 0;
		for (int i = 0; i < virtualDiskChunks; i++) {
			int s = 0;
			for (int j = 0; j < diskChunkStep; j++) s += count[i * diskChunkStep + j];
			if (s > m) m = s;
		}

		final int maxCount = m;

		return new AbstractObjectIterator<Chunk>() {
			private int chunk;
			private FastBufferedInputStream fbis;
			private int last;
			private int chunkSize;
			private final long[] buffer0 = new long[maxCount];
			private final long[] buffer1 = new long[maxCount];
			private final long[] buffer2 = new long[maxCount];
			private final long[] data = hashMask != 0 ? null : new long[maxCount];

			public boolean hasNext() {
				return chunk < chunks;
			}

			@SuppressWarnings("unchecked")
			public Chunk next() {
				if (!hasNext()) throw new NoSuchElementException();
				final long[] buffer0 = this.buffer0;

				if (chunk % (chunks / virtualDiskChunks) == 0) {
					// Load a new virtual disk chunk into the buffers
					final int diskChunk = (int) (chunk / (chunks / virtualDiskChunks));
					final long[] buffer1 = this.buffer1, buffer2 = this.buffer2;

					chunkSize = 0;
					try {
						if (diskChunkStep == 1) {
							fbis = new FastBufferedInputStream(new FileInputStream(file[diskChunk]));
							chunkSize = count[diskChunk];
						}
						else {
							// Concatenate diskChunkStep disk chunks into one virtual disk chunk
							final FileInputStream[] fis = new FileInputStream[diskChunkStep];
							for (int i = 0; i < fis.length; i++) {
								fis[i] = new FileInputStream(file[diskChunk * diskChunkStep + i]);
								chunkSize += count[diskChunk * diskChunkStep + i];
							}
							fbis = new FastBufferedInputStream(new SequenceInputStream(new IteratorEnumeration(Arrays.asList(fis).iterator())));
						}
						final DataInputStream dis = new DataInputStream(fbis);

						final long[] triple = new long[3];
						int count = 0;
						for (int j = 0; j < chunkSize; j++) {
							triple[0] = dis.readLong();
							triple[1] = dis.readLong();
							triple[2] = dis.readLong();

							if (DEBUG) System.err.println("From disk: " + Arrays.toString(triple));

							if (filter == null || filter.evaluate(triple)) {
								buffer0[count] = triple[0];
								buffer1[count] = triple[1];
								buffer2[count] = triple[2];
								if (hashMask == 0) data[count] = dis.readLong();
								count++;
							}
							else if (hashMask == 0) dis.readLong(); // Discard data
						}

						chunkSize = count;
						dis.close();
					}
					catch (IOException e) {
						throw new RuntimeException(e);
					}

					// Sort the triples lexicographically; Long.compare() avoids the overflow that a
					// subtraction-based comparison could incur on arbitrary 64-bit hashes.
					it.unimi.dsi.fastutil.Arrays.quickSort(0, chunkSize, new AbstractIntComparator() {
						private static final long serialVersionUID = 0L;

						public int compare(final int x, final int y) {
							int t = Long.compare(buffer0[x], buffer0[y]);
							if (t != 0) return t;
							t = Long.compare(buffer1[x], buffer1[y]);
							if (t != 0) return t;
							return Long.compare(buffer2[x], buffer2[y]);
						}
					}, new Swapper() {
						public void swap(final int x, final int y) {
							final long e0 = buffer0[x], e1 = buffer1[x], e2 = buffer2[x];
							buffer0[x] = buffer0[y];
							buffer1[x] = buffer1[y];
							buffer2[x] = buffer2[y];
							buffer0[y] = e0;
							buffer1[y] = e1;
							buffer2[y] = e2;

							if (hashMask == 0) {
								final long v = data[x];
								data[x] = data[y];
								data[y] = v;
							}
						}
					});

					if (DEBUG) {
						for (int i = 0; i < chunkSize; i++) System.err.println(buffer0[i] + ", " + buffer1[i] + ", " + buffer2[i]);
					}

					// After sorting, duplicate triples are adjacent
					if (!checkedForDuplicates && chunkSize > 1)
						for (int i = chunkSize - 1; i-- != 0;)
							if (buffer0[i] == buffer0[i + 1] && buffer1[i] == buffer1[i + 1] && buffer2[i] == buffer2[i + 1]) throw new ChunkedHashStore.DuplicateException();
					if (chunk == chunks - 1) checkedForDuplicates = true;
					last = 0;
				}

				final int start = last;
				// Gather the triples whose most significant bits select the current chunk
				while (last < chunkSize && (chunkShift == Long.SIZE ? 0 : buffer0[last] >>> chunkShift) == chunk) last++;
				chunk++;

				return new Chunk(buffer0, buffer1, buffer2, data, hashMask, start, last);
			}
		};
	}
}