com.cinchapi.concourse.server.storage.temp.Buffer.java Source code

Java tutorial

Introduction

Here is the source code for com.cinchapi.concourse.server.storage.temp.Buffer.java

Source

/*
 * Copyright (c) 2013-2016 Cinchapi Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cinchapi.concourse.server.storage.temp;

import java.io.File;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.util.AbstractList;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
import javax.annotation.concurrent.ThreadSafe;

import jsr166e.StampedLock;

import com.cinchapi.common.base.TernaryTruth;
import com.cinchapi.concourse.Tag;
import com.cinchapi.concourse.annotate.Restricted;
import com.cinchapi.concourse.server.GlobalState;
import com.cinchapi.concourse.server.concurrent.ConcourseExecutors;
import com.cinchapi.concourse.server.concurrent.Locks;
import com.cinchapi.concourse.server.concurrent.PriorityReadWriteLock;
import com.cinchapi.concourse.server.io.ByteableCollections;
import com.cinchapi.concourse.server.io.Byteables;
import com.cinchapi.concourse.server.io.FileSystem;
import com.cinchapi.concourse.server.model.PrimaryKey;
import com.cinchapi.concourse.server.model.Text;
import com.cinchapi.concourse.server.model.Value;
import com.cinchapi.concourse.server.storage.Action;
import com.cinchapi.concourse.server.storage.Engine;
import com.cinchapi.concourse.server.storage.Inventory;
import com.cinchapi.concourse.server.storage.InventoryTracker;
import com.cinchapi.concourse.server.storage.PermanentStore;
import com.cinchapi.concourse.server.storage.cache.BloomFilter;
import com.cinchapi.concourse.server.storage.db.Database;
import com.cinchapi.concourse.thrift.Operator;
import com.cinchapi.concourse.thrift.TObject;
import com.cinchapi.concourse.thrift.Type;
import com.cinchapi.concourse.time.Time;
import com.cinchapi.concourse.util.Convert;
import com.cinchapi.concourse.util.Integers;
import com.cinchapi.concourse.util.Logger;
import com.cinchapi.concourse.util.MultimapViews;
import com.cinchapi.concourse.util.NaturalSorter;
import com.cinchapi.concourse.util.ReadOnlyIterator;
import com.cinchapi.concourse.util.TMaps;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import static com.cinchapi.concourse.server.GlobalState.*;
import static com.google.common.collect.Maps.newLinkedHashMap;

/**
 * A {@code Buffer} is a special implementation of {@link Limbo} that aims to
 * quickly accumulate writes in memory before performing a batch flush into some
 * {@link PermanentStore}.
 * <p>
 * A Buffer enforces the durability guarantee because all writes are also
 * immediately flushed to disk. Even though there is some disk I/O, the overhead
 * is minimal and writes are fast because the entire backing store is memory
 * mapped and the writes are always appended.
 * </p>
 * 
 * @author Jeff Nelson
 */
@ThreadSafe
public final class Buffer extends Limbo implements InventoryTracker {

    /**
     * Assuming {@code location} is a valid bufferStore, return an
     * {@link Iterator} to traverse the writes in the Buffer directly from disk
     * without loading the entire Buffer into memory.
     * 
     * @return the iterator
     */
    public static Iterator<Write> onDiskIterator(String location) {
        return new OnDiskIterator(location);
    }

    // NOTE: The Buffer does not ever lock itself because its delegates
    // concurrency control to each individual page. Furthermore, since each
    // Page is append-only, there is no need to ever lock any Page that is not
    // equal to #currentPage. The Buffer does grab the transport readLock for
    // most methods so that we don't end up in situations where a transport
    // happens while we're trying to read.

    /**
     * The average number of bytes used to store an arbitrary Write.
     */
    private static final int AVG_WRITE_SIZE = 30; /* arbitrary */

    /**
     * The number of verifies initiated.
     */
    private AtomicLong numVerifyRequests;

    /**
     * The number of verifies scanning the buffer.
     */
    private AtomicLong numVerifyScans;

    /**
     * The directory where the Buffer pages are stored.
     */
    private final String directory;
    /**
     * The environment that is associated with {@link Engine}.
     */
    private String environment;

    /**
     * The sequence of Pages that make up the Buffer.
     */
    private final List<Page> pages = new AbstractList<Page>() { // This List
                                                                // implementation
                                                                // provides an
                                                                // iterator that
                                                                // has
                                                                // "reloading"
                                                                // functionality
                                                                // such that we
                                                                // aren't halted
                                                                // by a CME that
                                                                // occurs when
                                                                // one thread
                                                                // adds a page
                                                                // to the
                                                                // underlying
                                                                // collection
                                                                // while another
                                                                // thread is
                                                                // using the
                                                                // iterator

        /**
         * The wrapped list that actually stores the data.
         */
        private final List<Page> delegate = Lists.newArrayList();

        @Override
        public void add(int index, Page element) {
            delegate.add(index, element);
        }

        @Override
        public Page remove(int index) {
            return delegate.remove(index);
        }

        @Override
        public Page get(int index) {
            return delegate.get(index);
        }

        @Override
        public int size() {
            return delegate.size();
        }

        @Override
        public Iterator<Page> iterator() {
            return new Iterator<Page>() {

                int index = 0;
                ListIterator<Page> it = delegate.listIterator(index);

                @Override
                public boolean hasNext() {
                    return it.hasNext();
                }

                @Override
                public Page next() {
                    try {
                        index = it.nextIndex();
                        return it.next();
                    } catch (ConcurrentModificationException e) {
                        // CON-75: The exception is thrown because a new page
                        // was adding by another thread while the current thread
                        // (which owns the iterator) was in the middle of the
                        // read. We can ignore this exception, because adding a
                        // new page will not lead to inconsistent results since
                        // all the data we've read so far is still valid. This
                        // just means we have more work to do before finishing
                        // than we originally anticipated.
                        //
                        // It is worth noting that each read method grabs the
                        // transportLock which prevents the case of a page being
                        // removed in the middle of a read.
                        it = delegate.listIterator(index);
                        return next();
                    }
                }

                @Override
                public void remove() {
                    it.remove();
                }

            };
        }

    };

    /**
     * A monitor that is used to make a thread block while waiting for the
     * Buffer to become transportable. The {@link #waitUntilTransportable()}
     * waits for this monitor and the {@link #insert(Write)} method notifies the
     * threads waiting on this monitor whenever there is more than single page
     * worth of data in the Buffer.
     */
    private final Object transportable = new Object();

    /**
     * A pointer to the current Page.
     */
    private Page currentPage;

    /**
     * A flag to indicate if the Buffer is running or not.
     */
    private boolean running = false;

    /**
     * We keep track of the time when the last transport occurred so that the
     * Engine can determine if it should avoid busy waiting in the
     * BufferTransportThread.
     */
    private AtomicLong timeOfLastTransport = new AtomicLong(Time.now());

    /**
     * The number of items to transport to the Database per attempt. There is a
     * tension between transporting and reading data (e.g. reads cannot happen
     * while a transport occurs and vice versa). Transports are most efficient
     * when they can batch up the amount of work per cycle, but that would be
     * reads are blocked longer. So this variable indicates how many items
     * should be transported in a single cycle. Each time a transport happens,
     * this value will increase, but it will be decreased whenever a read
     * occurs. This allows us to be more aggressive with transports when there
     * are no reads happening, and also allows us to scale back transports when
     * reads do occur.
     */
    private int transportRate = 1;

    /**
     * A pointer to the inventory that is used within the Engine.
     */
    private Inventory inventory = null;

    /**
     * A runnable instance that flushes the content the current buffer page to
     * disk.
     */
    private Runnable pageSync = new Runnable() {

        @Override
        public void run() {
            currentPage.content.force();
        }

    };

    /**
     * A runnable that flushes the inventory to disk.
     */
    private Runnable inventorySync = new Runnable() {

        @Override
        public void run() {
            inventory.sync();
        }

    };

    /**
     * The prefix for the threads that are responsible for flushing data to
     * disk. This is normally set by the Engine using the
     * {@link #setThreadNamePrefix(String)} method.
     */
    private String threadNamePrefix;

    /**
     * The maximum number of milliseconds to sleep between transport cycles.
     */
    private static final int MAX_TRANSPORT_THREAD_SLEEP_TIME_IN_MS = 100;

    /**
     * The minimum number of milliseconds to sleep between transport cycles.
     */
    private static final int MIN_TRANSPORT_THREAD_SLEEP_TIME_IN_MS = 5;

    /**
     * The number of milliseconds to sleep between transport cycles.
     */
    private int transportThreadSleepTimeInMs = MAX_TRANSPORT_THREAD_SLEEP_TIME_IN_MS;

    /**
     * The multiplier that is used when increasing the rate of transport.
     */
    protected int transportRateMultiplier = 2; // visible for testing

    /**
     * The structure lock ensures that only a single thread can modify the
     * structure of the Buffer, without affecting any readers.
     */
    private final ReentrantLock structure = new ReentrantLock();

    /**
     * Don't let the transport rate exceed this value.
     */
    private static int MAX_TRANSPORT_RATE = 8192;

    /**
     * The number of slots to put in each Page's bloom filter. We want this
     * small enough to have few hash functions, but large enough so that the
     * bloom filter does not become saturated.
     */
    private static int PER_PAGE_BLOOM_FILTER_CAPACITY = GlobalState.BUFFER_PAGE_SIZE / 10;

    /**
     * Construct a Buffer that is backed by the default location, which is
     * {@link GlobalState#BUFFER_DIRECTORY}.
     * 
     */
    public Buffer() {
        this(BUFFER_DIRECTORY);
    }

    /**
     * 
     * Construct a a Buffer that is backed by {@code backingStore}. Existing
     * content, if available, will be loaded from the file. Otherwise, a new and
     * empty Buffer will be returned.
     * 
     * @param directory - the path to directory where the buffer files should be
     *            stored. If the directory does not exist, it'll be created
     *            automatically
     */
    public Buffer(String directory) {
        FileSystem.mkdirs(directory);
        this.directory = directory;
        this.inventory = Inventory.create(directory + File.separator + "meta" + File.separator + "inventory"); // just incase we are running
                                                                                                               // from a unit test and
                                                                                                               // there is no call to
                                                                                                               // #setInventory
        this.threadNamePrefix = "buffer-" + System.identityHashCode(this);
        this.numVerifyRequests = new AtomicLong(0);
        this.numVerifyScans = new AtomicLong(0);
    }

    @Override
    public Map<Long, String> audit(long record) {
        Map<Long, String> audit = Maps.newTreeMap();
        for (Iterator<Write> it = iterator(record, Time.NONE); it.hasNext();) {
            Write write = it.next();
            audit.put(write.getVersion(), write.toString());
        }
        return audit;
    }

    @Override
    public Map<Long, String> audit(String key, long record) {
        Map<Long, String> audit = Maps.newTreeMap();
        for (Iterator<Write> it = iterator(key, record, Time.NONE); it.hasNext();) {
            Write write = it.next();
            audit.put(write.getVersion(), write.toString());
        }
        return audit;
    }

    @Override
    public Map<String, Set<TObject>> select(long record, long timestamp, Map<String, Set<TObject>> context) {
        for (Iterator<Write> it = iterator(record, timestamp); it.hasNext();) {
            Write write = it.next();
            Set<TObject> values;
            values = context.get(write.getKey().toString());
            if (values == null) {
                values = Sets.newHashSet();
                context.put(write.getKey().toString(), values);
            }
            if (write.getType() == Action.ADD) {
                values.add(write.getValue().getTObject());
            } else {
                values.remove(write.getValue().getTObject());
            }
        }
        return Maps.newTreeMap((SortedMap<String, Set<TObject>>) Maps.filterValues(context, emptySetFilter));
    }

    @Override
    public Map<TObject, Set<Long>> browse(String key, long timestamp, Map<TObject, Set<Long>> context) {
        for (Iterator<Write> it = iterator(key, timestamp); it.hasNext();) {
            Write write = it.next();
            Set<Long> records = context.get(write.getValue().getTObject());
            if (records == null) {
                records = Sets.newLinkedHashSet();
                context.put(write.getValue().getTObject(), records);
            }
            if (write.getType() == Action.ADD) {
                records.add(write.getRecord().longValue());
            } else {
                records.remove(write.getRecord().longValue());
            }
        }
        return Maps.newTreeMap((SortedMap<TObject, Set<Long>>) Maps.filterValues(context, emptySetFilter));
    }

    @Override
    public Map<Long, Set<TObject>> chronologize(String key, long record, long start, long end,
            Map<Long, Set<TObject>> context) {
        Set<TObject> snapshot = Iterables.getLast(context.values(), Sets.<TObject>newLinkedHashSet());
        if (snapshot.isEmpty() && !context.isEmpty()) {
            // CON-474: Empty set is placed in the context if it was the last
            // snapshot know to the database
            context.remove(Time.NONE);
        }
        for (Iterator<Write> it = iterator(key, record, end - 1); it.hasNext();) {
            Write write = it.next();
            long timestamp = write.getVersion();
            Text writtenKey = write.getKey();
            long writtenRecordId = write.getRecord().longValue();
            Action action = write.getType();
            if (writtenKey.toString().equals(key) && writtenRecordId == record) {
                snapshot = Sets.newLinkedHashSet(snapshot);
                Value newValue = write.getValue();
                if (action == Action.ADD) {
                    snapshot.add(newValue.getTObject());
                } else if (action == Action.REMOVE) {
                    snapshot.remove(newValue.getTObject());
                }
                if (timestamp >= start && !snapshot.isEmpty()) {
                    context.put(timestamp, snapshot);
                }
            }
        }
        return context;
    }

    @Override
    public boolean contains(long record) {
        return inventory.contains(record);
    }

    @Override
    public Set<String> describe(long record, long timestamp, Map<String, Set<TObject>> context) {
        for (Iterator<Write> it = iterator(record, timestamp); it.hasNext();) {
            Write write = it.next();
            Set<TObject> values;
            values = context.get(write.getKey().toString());
            if (values == null) {
                values = Sets.newHashSet();
                context.put(write.getKey().toString(), values);
            }
            if (write.getType() == Action.ADD) {
                values.add(write.getValue().getTObject());
            } else {
                values.remove(write.getValue().getTObject());
            }
        }
        return newLinkedHashMap(Maps.filterValues(context, emptySetFilter)).keySet();
    }

    /**
     * Return dumps for all the pages in the Buffer.
     * 
     * @return the dump string
     */
    public String dump() {
        StringBuilder sb = new StringBuilder();
        for (Page page : pages) {
            sb.append(page.dump());
            sb.append("\n");
        }
        return sb.toString();
    }

    @Override
    public Set<TObject> select(String key, long record, long timestamp, Set<TObject> context) {
        for (Iterator<Write> it = iterator(key, record, timestamp); it.hasNext();) {
            Write write = it.next();
            if (write.getType() == Action.ADD) {
                context.add(write.getValue().getTObject());
            } else {
                context.remove(write.getValue().getTObject());
            }
        }
        return context;
    }

    @Override
    public Map<Long, Set<TObject>> explore(Map<Long, Set<TObject>> context, long timestamp, String key,
            Operator operator, TObject... values) {
        for (Iterator<Write> it = iterator(key, timestamp); it.hasNext();) {
            Write write = it.next();
            long record = write.getRecord().longValue();
            if (matches(write.getValue(), operator, values)) {
                if (write.getType() == Action.ADD) {
                    MultimapViews.put(context, record, write.getValue().getTObject());
                } else {
                    MultimapViews.remove(context, record, write.getValue().getTObject());
                }
            }
        }
        return TMaps.asSortedMap(context);
    }

    /**
     * Return the location where the Buffer stores its data.
     * 
     * @return {@link GlobalState#BUFFER_DIRECTORY} or the directory that was
     *         passed to the {@link #Buffer(String)} constructor
     */
    @Restricted
    public String getBackingStore() {
        return directory;
    }

    @Override
    public int getDesiredTransportSleepTimeInMs() {
        return transportThreadSleepTimeInMs;
    }

    @Override
    public Inventory getInventory() {
        return inventory;
    }

    @Override
    public Set<Long> getAllRecords() {
        return inventory.getAll();
    }

    /**
     * Return the timestamp of the most recent data transport from the Buffer.
     * 
     * @return the time of last transport
     */
    @Restricted
    public long getTimeOfLastTransport() {
        return timeOfLastTransport.get();
    }

    @Override
    public boolean insert(Write write, boolean sync) {
        structure.lock();
        try {
            boolean notify = pages.size() == 2 && currentPage.size == 0;
            currentPage.append(write, sync);
            if (notify) {
                synchronized (transportable) {
                    transportable.notify();
                }
            }
            return true;
        } catch (CapacityException e) {
            addPage();
            return insert(write, sync);
        } finally {
            structure.unlock();
        }
    }

    /**
     * Return an iterator over all writes in the buffer that occurred no later
     * than {@code timestamp}.
     * 
     * @param timestamp
     * @return the iterator
     */
    public Iterator<Write> iterator(long timestamp) {
        return new AllSeekingIterator(timestamp);
    }

    /**
     * Return an iterator over all writes in the buffer with the specified
     * {@code record} component and that occurred no later than
     * {@code timestamp}.
     * 
     * @param record
     * @param timestamp
     * @return the iterator
     */
    public Iterator<Write> iterator(long record, long timestamp) {
        return new RecordSeekingIterator(record, timestamp);
    }

    /**
     * Return an iterator over all writes in the buffer with the specified
     * {@code key} component and that occurred no later than {@code timestamp}.
     * 
     * @param key
     * @param timestamp
     * @return the iterator
     */
    public Iterator<Write> iterator(String key, long timestamp) {
        return new KeySeekingIterator(key, timestamp);
    }

    /**
     * Return an iterator over all writes in the buffer with the specified
     * {@code key} and {@code record} components and that occurred no later than
     * {@code timestamp}.
     * 
     * @param timestamp
     * @return the iterator
     */
    public Iterator<Write> iterator(String key, long record, long timestamp) {
        return new KeyInRecordSeekingIterator(key, record, timestamp);
    }

    /**
     * Return an iterator over all writes in the buffer that equal the input
     * {@code write} and that occurred no later than {@code timestamp}.
     * 
     * @param timestamp
     * @return the iterator
     */
    public Iterator<Write> iterator(Write write, long timestamp) {
        return new WriteSeekingIterator(write, timestamp);
    }

    @Override
    public Iterator<Write> iterator() {
        return new AllSeekingIterator(Time.NONE);
    }

    /**
     * <p>
     * <strong>DO NOT CALL!!!</strong>
     * </p>
     * <p>
     * Called by the parent {@link Engine} to set the inventory that the Buffer
     * writes to when new records are added.
     * </p>
     * 
     * @param inventory
     */
    @Restricted
    public void setInventory(Inventory inventory) {
        this.inventory = inventory;
    }

    public void setEnvironment(String environment) {
        this.environment = environment;
    }

    /**
     * <p>
     * <strong>DO NOT CALL!!!</strong>
     * </p>
     * <p>
     * Called by the parent {@link Engine} to set the thread name prefix that
     * the Buffer uses when spawning asynchronous threads.
     * </p>
     * 
     * @param threadNamePrefix
     */
    @Restricted
    public void setThreadNamePrefix(String threadNamePrefix) {
        this.threadNamePrefix = threadNamePrefix;
    }

    @Override
    public void start() {
        if (!running) {
            running = true;
            Logger.info("Buffer configured to store data in {}", directory);
            SortedMap<File, Page> pageSorter = Maps.newTreeMap(NaturalSorter.INSTANCE);
            for (File file : new File(directory).listFiles()) {
                if (!file.isDirectory()) {
                    Page page = new Page(file.getAbsolutePath());
                    pageSorter.put(file, page);
                    Logger.info("Loading Buffer content from {}...", page);
                }
            }
            pages.addAll(pageSorter.values());
            if (pages.isEmpty()) {
                addPage(false);
            } else {
                currentPage = pages.get(pages.size() - 1);
            }
        }
    }

    @Override
    public void stop() {
        if (running) {
            running = false;
            synchronized (transportable) {
                transportable.notifyAll(); // notify to allow any waiting
                                           // threads to terminate
            }
        }
    }

    @Override
    public void sync() {
        ConcourseExecutors.executeAndAwaitTermination(threadNamePrefix, pageSync, inventorySync);
    }

    /**
     * {@inheritDoc}
     * <p>
     * This method will transport at least one write from the buffer, in
     * chronological order.
     * </p>
     */
    @Override
    public void transport(PermanentStore destination, boolean sync) {
        // NOTE: The #sync parameter is ignored because the Database does not
        // support allowing the Buffer to control when syncs happen.
        if (pages.size() > 1) {
            Page page = pages.get(0);
            if (!page.transportLock.writeLock().isHeldByCurrentThread()
                    && page.transportLock.writeLock().tryLock()) {
                try {
                    for (int i = 0; i < transportRate; ++i) {
                        if (page.hasNext()) {
                            destination.accept(page.next());
                            page.remove();
                        } else {
                            ((Database) destination).triggerSync();
                            removePage();
                            break;
                        }
                    }
                    timeOfLastTransport.set(Time.now());
                    transportRate = transportRate >= MAX_TRANSPORT_RATE ? MAX_TRANSPORT_RATE
                            : (transportRate * transportRateMultiplier);
                    --transportThreadSleepTimeInMs;
                    if (transportThreadSleepTimeInMs < MIN_TRANSPORT_THREAD_SLEEP_TIME_IN_MS) {
                        transportThreadSleepTimeInMs = MIN_TRANSPORT_THREAD_SLEEP_TIME_IN_MS;
                    }
                } finally {
                    page.transportLock.writeLock().unlock();
                }
            }
        }
    }

    @Override
    public boolean verify(Write write, long timestamp, boolean exists) {
        numVerifyRequests.incrementAndGet();
        for (Iterator<Write> it = iterator(write, timestamp); it.hasNext();) {
            it.next();
            exists ^= true; // toggle boolean
        }
        return exists;
    }

    @Override
    public TernaryTruth verifyFast(Write write, long timestamp) {
        if (inventory.contains(write.getRecord().longValue())) {
            return super.verifyFast(write, timestamp);
        } else {
            return TernaryTruth.FALSE;
        }
    }

    @Override
    public void waitUntilTransportable() {
        if (pages.size() <= 1) {
            synchronized (transportable) {
                try {
                    transportable.wait();
                } catch (InterruptedException e) {
                    /* ignore */}
            }
        }
    }

    /**
     * Return {@code true} if the Buffer has more than 1 page and the first page
     * has at least one element that can be transported. If this method returns
     * {@code false} it means that the first page is the only page or that the
     * Buffer would need to trigger a Database sync and remove the first page in
     * order to transport.
     * 
     * @return {@code true} if the Buffer can transport a Write.
     */
    @VisibleForTesting
    protected boolean canTransport() { // visible for testing
        return pages.size() > 1 && pages.get(0).hasNext();
    }

    @Override
    protected long getOldestWriteTimestamp() {
        return pages.get(0).getOldestWriteTimestamp();
    }

    @Nullable
    @Override
    protected Action getLastWriteAction(Write write, long timestamp) {
        // TODO: use ReverseSeekingIterator to optimize this
        Iterator<Write> it = iterator(write, timestamp);
        Action action = null;
        while (it.hasNext()) {
            action = it.next().getType();
        }
        return action;
    }

    @Override
    protected Iterator<Write> getSearchIterator(String key) {
        return iterator(key, Time.NONE);
    }

    @Override
    protected boolean isPossibleSearchMatch(String key, Write write, Value value) {
        return value.getType() == Type.STRING;
    }

    /**
     * Add a new Page to the Buffer.
     */
    private void addPage() {
        addPage(true);
    }

    /**
     * Add a new Page to the Buffer and optionally perform a {@code sync}.
     * 
     * @param sync - a flag that determines whether the {@link #sync()} method
     *            should be called to durably persist the current page to disk.
     *            This should only be false when called from the
     *            {@link #start()} method.
     */
    private void addPage(boolean sync) {
        structure.lock();
        try {
            if (sync) {
                sync();
            }
            currentPage = new Page(BUFFER_PAGE_SIZE);
            pages.add(currentPage);
            Logger.debug("Added page {} to Buffer", currentPage);
        } finally {
            structure.unlock();
        }
    }

    /**
     * Determines the percentage within range [0, 1] of verifies that scan
     * the buffer.
     * 
     * @return: decimal percentage of verifies initiated that scanned the
     *          buffer.
     */
    @SuppressWarnings("unused")
    private float getPercentVerifyScans() { // to be used for CON-236
        return ((float) numVerifyScans.get()) / numVerifyRequests.get();
    }

    /**
     * Remove the first page in the Buffer.
     */
    private void removePage() {
        structure.lock();
        try {
            pages.remove(0).delete();
        } finally {
            structure.unlock();
        }
    }

    /**
     * Scale back the number of items that are transported in a single cycle.
     */
    private void scaleBackTransportRate() {
        transportRate = 1;
        transportThreadSleepTimeInMs = MAX_TRANSPORT_THREAD_SLEEP_TIME_IN_MS;
    }

    /**
     * A {@link Page} represents a granular section of the {@link Buffer}. Pages
     * are an append-only iterator over a sequence of {@link Write} objects.
     * Pages differ from other iterators because they do not advance in the
     * sequence until the {@link #remove()} method is called.
     * 
     * @author Jeff Nelson
     */
    private class Page implements Iterator<Write>, Iterable<Write> {

        // NOTE: This class does not define hashCode() and equals() because the
        // defaults are the desired behaviour.

        /**
         * The filename extension.
         */
        private static final String ext = ".buf";

        /**
         * The append-only list of {@link Write} objects on the Page. Elements
         * are never deleted from this list, but are marked as "removed"
         * depending on the location of the {@link #head} index.
         */
        private final Write[] writes;

        /**
         * The append-only buffer that contains the content of the backing file.
         * Data is never deleted from the buffer, until the entire Page is
         * removed.
         */
        private MappedByteBuffer content;

        /**
         * The file that contains the content of the Page.
         */
        private final String filename;

        /**
         * The local lock for read/write access on the page. This is only used
         * when this page is equal to the {@link #currentPage}. In that case,
         * this lock is grabbed before any access is allowed on the page, so
         * subsequent structures that are used need not be thread safe.
         */
        private transient StampedLock accessLock = new StampedLock();

        /**
         * The transportLock makes it possible to append new Writes and
         * transport old writes concurrently while prohibiting reading the Page
         * and transporting writes at the same time.
         */
        private final transient ReentrantReadWriteLock transportLock = PriorityReadWriteLock.prioritizeReads();

        /**
         * Indicates the index in {@link #writes} that constitutes the first
         * element. When writes are "removed", elements are not actually deleted
         * from the list, so it is necessary to keep track of the head element
         * so that the correct next() element can be returned.
         */
        private transient int head = 0;

        /**
         * The total number of elements in the list of {@link #writes}.
         */
        private transient int size = 0;

        /**
         * The upper bound on the number of writes that this page can hold.
         */
        private final transient int sizeUpperBound;

        /**
         * A bloom filter like cache that is used to help determine if it is
         * possible that a key/record exists on the page.
         */
        private final boolean[] keyRecordCache;

        /**
         * A bloom filter like cache that is used to help determine if it
         * possible that a Write exists on the page.
         */
        private final BloomFilter writeCache;

        /**
         * A bloom filter like cache that is used to help determine if it
         * possible that a record exists on the page.
         */
        private final boolean[] recordCache;

        /**
         * A bloom filter like cache that is used to help determine if it
         * possible that a key exists on the page.
         */
        private final boolean[] keyCache;

        /**
         * Construct an empty Page with {@code capacity} bytes.
         * 
         * @param size
         */
        public Page(int capacity) {
            this(directory + File.separator + Time.now() + ext, capacity);
        }

        /**
         * Construct a Page that is backed by {@code filename}. Existing
         * content, if available, will be loaded from the file starting at the
         * position specified in {@link #pos}.
         * <p>
         * Please note that this constructor is designed to deserialize a
         * retired page, so the returned Object will be at capacity and unable
         * to append additional {@link Write} objects.
         * </p>
         * 
         * @param filename
         */
        public Page(String filename) {
            this(filename, FileSystem.getFileSize(filename));
        }

        /**
         * Construct a new instance.
         * 
         * @param filename
         * @param capacity
         */
        private Page(String filename, long capacity) {
            this.filename = filename;
            this.content = FileSystem.map(filename, MapMode.READ_WRITE, 0, capacity);
            this.sizeUpperBound = Math.max(1, (int) ((capacity / AVG_WRITE_SIZE) * 1.2));
            this.writes = new Write[sizeUpperBound];
            this.recordCache = new boolean[sizeUpperBound];
            this.keyCache = new boolean[sizeUpperBound];
            this.keyRecordCache = new boolean[sizeUpperBound];
            this.writeCache = BloomFilter.create(PER_PAGE_BLOOM_FILTER_CAPACITY);
            writeCache.disableThreadSafety();
            Iterator<ByteBuffer> it = ByteableCollections.iterator(content);
            while (it.hasNext()) {
                Write write = Write.fromByteBuffer(it.next());
                index(write);
                Logger.debug("Found existing write '{}' in the Buffer", write);
            }
        }

        /**
         * Append {@code write} to the Page if {@link #content} has enough
         * remaining capacity to store {@code write}. Since all inserts are
         * routed to this method, we grab a writeLock so that we don't have a
         * situation where the currentPage is ever changed in the middle of a
         * read.
         * 
         * @param write the {@link Write} to append
         * @param sync a flag that determines if the page should be fsynced
         *            (or the equivalent) after appending {@code write} so that
         *            the changes are guaranteed to be durably persisted, this
         *            flag should almost always be {@code true} if calling this
         *            method directly. It is set to {@code false} when called
         *            from the context of an atomic operation transporting
         *            writes to this Buffer using GROUP SYNC
         * @throws CapacityException
         *             - if the size of {@code write} is greater than the
         *             remaining capacity of {@link #content}
         */
        public void append(Write write, boolean sync) throws CapacityException {
            Preconditions.checkState(this == currentPage,
                    "Illegal attempt to " + "append a Write to an inactive Page");
            long stamp = accessLock.writeLock();
            try {
                if (content.remaining() >= write.size() + 4) {
                    appendUnsafe(write, sync); /* (authorized) */
                } else if (content.position() == 0) {
                    // Handle corner case where a Write is larger than
                    // BUFFER_PAGE_SIZE by auto expanding the capacity for the
                    // page
                    content = FileSystem.map(filename, MapMode.READ_WRITE, 0, write.size() + 4);
                    appendUnsafe(write, sync); /* (authorized) */
                } else {
                    throw CapacityException.INSTANCE;
                }
            } finally {
                accessLock.unlockWrite(stamp);
            }
        }

        /**
         * Delete the page from disk. The Page object will reside in memory
         * until garbage collection.
         */
        public void delete() {
            FileSystem.deleteFile(filename);
            FileSystem.unmap(content); // CON-163 (authorized)
            Logger.info("Deleting Buffer page {}", filename);
        }

        /**
         * Return the timestamp of the oldest (e.g. first) write on this page,
         * if it exists.
         * 
         * @return the oldest write timestamp
         */
        public long getOldestWriteTimestamp() {
            Write oldestWrite = writes[0];
            // When there is no data on the page return the max possible
            // timestamp so that no query's timestamp is less than this
            // timestamp
            return oldestWrite == null ? Long.MAX_VALUE : oldestWrite.getVersion();
        }

        /**
         * Returns {@code true} if {@link #head} is smaller than the largest
         * occupied index in {@link #writes}. This means that it is possible for
         * calls to this method to initially return {@code false} at t0, but
         * eventually return {@code true} at t1 if an element is added to the
         * Page between t0 and t1.
         */
        @Override
        public boolean hasNext() {
            long stamp = Locks.stampLockReadIfCondition(accessLock, this == currentPage);
            try {
                return head < size;
            } finally {
                Locks.stampUnlockReadIfCondition(accessLock, stamp, this == currentPage);
            }
        }

        /**
         * <p>
         * Returns an iterator that is appropriate for the append-only list of
         * {@link Write} objects that backs the Page. The iterator does not
         * support the {@link Iterator#remove()} method and only throws a
         * {@link ConcurrentModificationException} if an element is removed from
         * the Page using the {@link #remove()} method.
         * </p>
         * <p>
         * While the Page is, itself, an iterator (for transporting Writes), the
         * iterator returned from this method is appropriate for cases when it
         * is necessary to iterate through the page for reading.
         * </p>
         */
        /*
         * (non-Javadoc)
         * This iterator is only used for Limbo reads that
         * traverse the collection of Writes. This iterator differs from the
         * Page (which is also an Iterator over Write objects) by virtue of the
         * fact that it does not allow removes and will detect concurrent
         * modification.
         */
        @Override
        public Iterator<Write> iterator() {

            return new Iterator<Write>() {

                /**
                 * The index of the "next" element in {@link #writes}.
                 */
                private int index = head;

                /**
                 * The distance between the {@link #head} element and the
                 * {@code next} element. This is used to detect for concurrent
                 * modifications.
                 */
                private int distance = 0;

                @Override
                public boolean hasNext() {
                    if (index - head != distance) {
                        throw new ConcurrentModificationException("A write has been removed from the Page");
                    }
                    return index < size;
                }

                @Override
                public Write next() {
                    if (index - head != distance) {
                        throw new ConcurrentModificationException("A write has been removed from the Page");
                    }
                    Write next = writes[index];
                    ++index;
                    ++distance;
                    return next;
                }

                @Override
                public void remove() {
                    throw new UnsupportedOperationException();

                }

            };
        }

        /**
         * Return {@code true} if the Page <em>might</em> have a Write equal to
         * {@code write}. If this function returns true, the caller should check
         * with certainty by calling {@link #doesContain(Write, long)}.
         * 
         * @param write
         * @return {@code true} if the write possibly exists
         */
        public boolean mightContain(Write write) {
            Type valueType = write.getValue().getType();
            if (writeCache.mightContainCached(write.getRecord(), write.getKey(), write.getValue())) {
                return true;
            } else if (valueType == Type.STRING) {
                return writeCache.mightContainCached(write.getRecord(), write.getKey(),
                        Value.wrap(Convert.javaToThrift(Tag.create((String) write.getValue().getObject()))));
            } else if (valueType == Type.TAG) {
                return writeCache.mightContainCached(write.getRecord(), write.getKey(),
                        Value.wrap(Convert.javaToThrift(write.getValue().getObject().toString())));
            } else {
                return false;
            }
        }

        /**
         * Return {@code true} if the Page <em>might</em> have a Write with the
         * specified {@code record} component. If this function returns true,
         * the caller should perform a linear scan using the local
         * {@link #iterator()} .
         * 
         * @param record
         * @return {@code true} if a write within {@code record} possibly exists
         */
        public boolean mightContain(PrimaryKey record) {
            return recordCache[slotify(record.hashCode())];
        }

        /**
         * Return {@code true} if the Page <em>might</em> have a Write with the
         * specified {@code key} component. If this function returns true,
         * the caller should perform a linear scan using the local
         * {@link #iterator()} .
         * 
         * @param key
         * @return {@code true} if a write for {@code key} possibly exists
         */
        public boolean mightContain(Text key) {
            return keyCache[slotify(key.hashCode())];
        }

        /**
         * Return {@code true} if the Page <em>might</em> have a Write with the
         * specified {@code key} and {@code record} components. If this function
         * returns true, the caller should perform a linear scan using the local
         * {@link #iterator()} .
         * 
         * @param key
         * @param record
         * @return {@code true} if a write for {@code key} in {@code record}
         *         possibly exists
         */
        public boolean mightContain(Text key, PrimaryKey record) {
            return keyRecordCache[slotify(key.hashCode(), record.hashCode())];
        }

        /**
         * Returns the Write at index {@link #head} in {@link #writes}.
         * <p>
         * <strong>NOTE:</strong>
         * <em>This method will return the same element on multiple
         * invocations until {@link #remove()} is called.</em>
         * </p>
         */
        @Override
        public Write next() {
            long stamp = Locks.stampLockReadIfCondition(accessLock, this == currentPage);
            try {
                return writes[head];
            } finally {
                Locks.stampUnlockReadIfCondition(accessLock, stamp, this == currentPage);
            }
        }

        /**
         * Simulates the removal of the head Write from the Page. This method
         * only updates the {@link #head} and {@link #pos} metadata and does not
         * actually delete any data, which is a performance optimization.
         */
        @Override
        public void remove() {
            long stamp = Locks.stampLockWriteIfCondition(accessLock, this == currentPage);
            try {
                ++head;
            } finally {
                Locks.stampUnlockWriteIfCondition(accessLock, stamp, this == currentPage);
            }
        }

        @Override
        public String toString() {
            return filename;
        }

        /**
         * Dump the contents of this page.
         * 
         * @return the dump string
         */
        protected String dump() {
            long stamp = Locks.stampLockReadIfCondition(accessLock, this == currentPage);
            try {
                StringBuilder sb = new StringBuilder();
                sb.append("Dump for " + getClass().getSimpleName() + " " + filename);
                sb.append("\n");
                sb.append("------");
                sb.append("\n");
                for (Write write : writes) {
                    if (write == null) {
                        break;
                    } else {
                        sb.append(write);
                        sb.append("\n");
                    }
                }
                sb.append("\n");
                return sb.toString();
            } finally {
                Locks.stampUnlockReadIfCondition(accessLock, stamp, this == currentPage);
            }
        }

        /**
         * Do the work to actually index and append {@code write} (while
         * optionally performing a {@code sync} WITHOUT grabbing any locks
         * (hence this method being UNSAFE) for unauthorized usage.
         * 
         * @param write the {@link Write} to append
         * @param sync a flag that determines if the page should be fsynced
         *            (or the equivalent) after appending {@code write} so that
         *            the changes are guaranteed to be durably persisted, this
         *            flag should almost always be {@code true} if calling this
         *            method directly. It is set to {@code false} when called
         *            from the context of an atomic operation transporting
         *            writes to this Buffer using GROUP SYNC
         */
        @GuardedBy("Buffer.Page#append(Write)")
        private void appendUnsafe(Write write, boolean sync) {
            index(write);
            content.putInt(write.size());
            write.copyTo(content);
            inventory.add(write.getRecord().longValue());
            if (sync) {
                sync();
            }
        }

        /**
         * Insert {@code write} into the list of {@link #writes} and increment
         * the {@link #size} counter.
         * 
         * @param write
         * @throws CapacityException
         */
        @GuardedBy("Buffer.Page#append(Write)")
        private void index(Write write) throws CapacityException {
            if (size < writes.length) {
                writes[size] = write;
                int hashCodeRecord = write.getRecord().hashCode();
                int hashCodeKey = write.getKey().hashCode();
                // The individual Write components are added instead of the
                // entire Write so that version information is not factored into
                // the bloom filter hashing
                writeCache.putCached(write.getRecord(), write.getKey(), write.getValue());
                keyRecordCache[slotify(hashCodeRecord, hashCodeKey)] = true;
                recordCache[slotify(hashCodeRecord)] = true;
                keyCache[slotify(hashCodeKey)] = true;
                ++size;
            } else {
                throw CapacityException.INSTANCE;
            }
        }

        /**
         * Convenience function to return the appropriate slot in one of the
         * Page's filter's between 0 and {@code #sizeUpperBound} for an object
         * with {@code hashCode}.
         * 
         * @param hashCode
         * @return the slot
         */
        private int slotify(int hashCode) {
            return Math.abs(hashCode % sizeUpperBound);
        }

        /**
         * Convenience function to return the appropriate slot in one of the
         * Page's filter's between 0 and {@code #sizeUpperBound} for a group of
         * objects with the {@code hashCodes}.
         * 
         * @param hashCode
         * @return the slot
         */
        private int slotify(int... hashCodes) {
            return Math.abs(Integers.avg(hashCodes) % sizeUpperBound);
        }
    }

    /**
     * An {@link Iterator} over the writes in the buffer that has logic to only
     * return writes that match a certain {@code seek} criteria. The iterator
     * also uses the criteria as a hint to perform more optimal searches over
     * the pages in the Buffer.
     * 
     * @author Jeff Nelson
     */
    private abstract class SeekingIterator implements Iterator<Write> {

        /**
         * The max timestamp for which to seek. If a Write's version is greater
         * than this timestamp, then the iterator ceases to return elements.
         */
        private final long timestamp;

        /**
         * An iterator over all the pages in the Buffer.
         */
        private Iterator<Page> pageIterator = pages.iterator();

        /**
         * The iterator over the writes on the page at which the iterator is
         * currently traversing.
         */
        private Iterator<Write> writeIterator = null;

        /**
         * The next write to return.
         */
        private Write next = null;

        /**
         * A flag that indicates whether we should not perform a timestamp check
         * because we want all the writes up until the present state.
         */
        private boolean ignoreTimestamp = false;

        /**
         * A reference to the page in which the iterator is currently
         * traversing.
         */
        private Page myCurrentPage;

        /**
         * The stamp returned from grabbing the read access lock from
         * {@link #myCurrentPage}.
         */
        private long myAccessStamp = 0L;

        /**
         * A flag that indicates whether this iterator has satisfied
         * preconditions and is useable. If it is not useable, it won't perform
         * any traversals or return any data.
         */
        private final boolean useable;

        /**
         * Construct a new instance.
         * 
         * @param timestamp
         */
        protected SeekingIterator(long timestamp) {
            this.timestamp = timestamp;
            if (timestamp >= getOldestWriteTimestamp()) {
                scaleBackTransportRate();
                this.ignoreTimestamp = timestamp == Long.MAX_VALUE;
                this.next = advance();
                this.useable = true;
            } else {
                this.useable = false;
            }

        }

        @Override
        public boolean hasNext() {
            return next != null;
        }

        @Override
        public Write next() {
            Write next0 = next;
            this.next = advance();
            return next0;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /**
         * Each subclass should call this method after constructing the initial
         * state to turn to the first page and get the first write.
         */
        protected void init() {
            if (useable) {
                flip(true);
                this.next = advance();
            }
        }

        /**
         * Advance to the next write that this iterator should return, if it
         * exists.
         * 
         * @return the next write or {@code null}
         */
        private Write advance() {
            for (;;) {
                if (writeIterator == null) {
                    return null;
                }
                while (writeIterator.hasNext()) {
                    Write write = writeIterator.next();
                    if (!ignoreTimestamp && write.getVersion() > timestamp) {
                        writeIterator = null;
                        pageIterator = null;
                        releaseLocks();
                        return null;
                    } else if (isRelevantWrite(write)) {
                        return write;
                    }
                }
                flip();
            }
        }

        /**
         * Flip to the next page in the Buffer with the option to temporarily
         * skip the timestamp check.
         * 
         * @param skipTsCheck
         */
        private void flip(boolean skipTsCheck) {
            writeIterator = null;
            releaseLocks();
            if (pageIterator.hasNext()) {
                while (pageIterator.hasNext()) {
                    Page next = pageIterator.next();
                    grabLocks(next);
                    if (!skipTsCheck && !ignoreTimestamp && next.getOldestWriteTimestamp() > timestamp) {
                        writeIterator = null;
                        pageIterator = null;
                        releaseLocks();
                        break;
                    }
                    if (pageMightContainRelevantWrites(next)) {
                        writeIterator = next.iterator();
                        break;
                    } else {
                        releaseLocks();
                    }
                }
            }
        }

        /**
         * Flip to the next page in the Buffer.
         */
        private void flip() {
            flip(false);
        }

        /**
         * Grab the necessary locks to protected {@code #page} while it is used
         * in the iterator.
         * 
         * @param page
         */
        private void grabLocks(Page page) {
            myCurrentPage = page;
            myAccessStamp = Locks.stampLockReadIfCondition(myCurrentPage.accessLock, myCurrentPage == currentPage);
            myCurrentPage.transportLock.readLock().lock();
        }

        /**
         * Release the locks for {@link #myCurrentPage}.
         */
        private void releaseLocks() {
            if (myCurrentPage != null) {
                Locks.stampUnlockReadIfCondition(myCurrentPage.accessLock, myAccessStamp,
                        myCurrentPage == currentPage);
                myCurrentPage.transportLock.readLock().unlock();
            }
        }

        /**
         * Call the appropriate function to determine if the {@code page} might
         * contain the kinds of writes that this iterator is seeking.
         * 
         * @param page
         * @return {@code true} if the page can possibly contain relevant data
         */
        protected abstract boolean pageMightContainRelevantWrites(Page page);

        /**
         * Return {@code true} if {@code write} is relevant to what this
         * iterator is seeking.
         * 
         * @param write
         * @return {@code true} if the write is relevant
         */
        protected abstract boolean isRelevantWrite(Write write);

    }

    /**
     * A {@link SeekingIterator} that looks for writes that are equal to a
     * comparison write.
     * 
     * @author Jeff Nelson
     */
    private class WriteSeekingIterator extends SeekingIterator {

        /**
         * The relevant write.
         */
        private final Write write;

        /**
         * A flag to check whether the buffer has already been scanned (to
         * mitigate multiple increments given multiple scans
         * to the same buffer).
         */
        private boolean scanned;

        /**
         * Construct a new instance.
         * 
         * @param timestamp
         */
        protected WriteSeekingIterator(Write write, long timestamp) {
            super(timestamp);
            this.write = write;
            init();
        }

        @Override
        protected boolean pageMightContainRelevantWrites(Page page) {
            boolean mightContain = page.mightContain(write);
            if (!scanned && mightContain) {
                numVerifyScans.incrementAndGet();
            }
            return mightContain;
        }

        @Override
        protected boolean isRelevantWrite(Write write) {
            return write.equals(this.write);
        }
    }

    /**
     * A {@link SeekingIterator} that looks for writes with a particular key and
     * record component.
     * 
     * @author Jeff Nelson
     */
    private class KeyInRecordSeekingIterator extends SeekingIterator {

        /**
         * The relevant key.
         */
        private final Text key;

        /**
         * The relevant record.
         */
        private final PrimaryKey record;

        /**
         * Construct a new instance.
         * 
         * @param timestamp
         */
        protected KeyInRecordSeekingIterator(String key, long record, long timestamp) {
            super(timestamp);
            this.key = Text.wrapCached(key);
            this.record = PrimaryKey.wrap(record);
            init();
        }

        @Override
        protected boolean pageMightContainRelevantWrites(Page page) {
            return page.mightContain(key, record);
        }

        @Override
        protected boolean isRelevantWrite(Write write) {
            return write.getRecord().equals(record) && write.getKey().equals(key);
        }

    }

    /**
     * A {@link SeekingIterator} that looks for writes with a particular key
     * component.
     * 
     * @author Jeff Nelson
     */
    private class KeySeekingIterator extends SeekingIterator {

        /**
         * The relevant key
         */
        private final Text key;

        /**
         * Construct a new instance.
         * 
         * @param timestamp
         */
        protected KeySeekingIterator(String key, long timestamp) {
            super(timestamp);
            this.key = Text.wrapCached(key);
            init();
        }

        @Override
        protected boolean pageMightContainRelevantWrites(Page page) {
            return page.mightContain(key);
        }

        @Override
        protected boolean isRelevantWrite(Write write) {
            return write.getKey().equals(key);
        }

    }

    /**
     * A {@link SeekingIterator} that looks for writes with a particular record
     * component.
     * 
     * @author Jeff Nelson
     */
    private class RecordSeekingIterator extends SeekingIterator {

        /**
         * The relevant record.
         */
        private final PrimaryKey record;

        /**
         * Construct a new instance.
         * 
         * @param timestamp
         */
        /**
         * Construct a new instance.
         * 
         * @param record
         * @param timestamp
         */
        protected RecordSeekingIterator(long record, long timestamp) {
            super(timestamp);
            this.record = PrimaryKey.wrap(record);
            init();
        }

        @Override
        protected boolean pageMightContainRelevantWrites(Page page) {
            return page.mightContain(record);
        }

        @Override
        protected boolean isRelevantWrite(Write write) {
            return write.getRecord().equals(record);
        }

    }

    /**
     * A {@link SeekingIterator} for all the writes in the buffer.
     * 
     * @author Jeff Nelson
     */
    private class AllSeekingIterator extends SeekingIterator {

        /**
         * Construct a new instance.
         * 
         * @param timestamp
         */
        protected AllSeekingIterator(long timestamp) {
            super(timestamp);
            init();
        }

        @Override
        protected boolean pageMightContainRelevantWrites(Page page) {
            return true;
        }

        @Override
        protected boolean isRelevantWrite(Write write) {
            return true;
        }

    }

    /**
     * An {@link Iterator} that can traverse Writes directly from disk for a
     * Buffer that uses {@code location} as a store. Call
     * {@link Buffer#onDiskIterator(String)} to instantiate one of these. This
     * should only be used in cases where it is necessary (and safe) to iterate
     * through a Buffer's writes while the Buffer is offline.
     * 
     * @author Jeff Nelson
     */
    private static class OnDiskIterator extends ReadOnlyIterator<Write> {

        /**
         * An {@link Iterator} over all the files in the input directory.
         */
        private final Iterator<String> fileIt;

        /**
         * An {@link Iterator} over the data chunks in the current file.
         */
        private Iterator<ByteBuffer> it = null;

        /**
         * Construct a new instance.
         * 
         * @param location
         */
        private OnDiskIterator(String location) {
            this.fileIt = FileSystem.fileOnlyIterator(location);
            flip();
        }

        @Override
        public boolean hasNext() {
            if (it == null) {
                return false;
            } else if (!it.hasNext() && fileIt.hasNext()) {
                flip();
                return hasNext();
            } else if (!it.hasNext()) {
                return false;
            } else {
                return true;
            }
        }

        @Override
        public Write next() {
            if (hasNext()) {
                return Byteables.readStatic(it.next(), Write.class);
            } else {
                return null;
            }
        }

        /**
         * Flip to the next page in the iterator.
         */
        private void flip() {
            if (fileIt.hasNext()) {
                ByteBuffer bytes = FileSystem.readBytes(fileIt.next());
                it = ByteableCollections.iterator(bytes);
            }
        }

    }
}