de.hpi.fgis.hdrs.node.Node.java Source code

Java tutorial

Introduction

Here is the source code for de.hpi.fgis.hdrs.node.Node.java

Source

/**
 * Copyright 2011 Daniel Hefenbrock
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.hpi.fgis.hdrs.node;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import javax.imageio.stream.FileImageInputStream;
import javax.imageio.stream.FileImageOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.Server;

import de.hpi.fgis.hdrs.Configuration;
import de.hpi.fgis.hdrs.LogFormatUtil;
import de.hpi.fgis.hdrs.Peer;
import de.hpi.fgis.hdrs.StoreInfo;
import de.hpi.fgis.hdrs.Triple;
import de.hpi.fgis.hdrs.client.Client;
import de.hpi.fgis.hdrs.compression.Compression;
import de.hpi.fgis.hdrs.ipc.MultiSegmentWrite;
import de.hpi.fgis.hdrs.ipc.NodeProtocol;
import de.hpi.fgis.hdrs.ipc.OrderedTriplesWritable;
import de.hpi.fgis.hdrs.ipc.ProtocolVersion;
import de.hpi.fgis.hdrs.ipc.ScanResult;
import de.hpi.fgis.hdrs.ipc.SegmentCatalog;
import de.hpi.fgis.hdrs.ipc.SegmentCatalogUpdates;
import de.hpi.fgis.hdrs.ipc.TransactionState;
import de.hpi.fgis.hdrs.node.Index.SegmentSplit;
import de.hpi.fgis.hdrs.node.SegmentCompactorThread.CompactionRequest;
import de.hpi.fgis.hdrs.routing.PeerView;
import de.hpi.fgis.hdrs.routing.Router;
import de.hpi.fgis.hdrs.routing.SegmentInfo;
import de.hpi.fgis.hdrs.segment.Segment;
import de.hpi.fgis.hdrs.segment.SegmentConfiguration;
import de.hpi.fgis.hdrs.segment.SegmentExport;
import de.hpi.fgis.hdrs.tio.TripleScanner;
import de.hpi.fgis.hdrs.tio.TripleSink;
import de.hpi.fgis.hdrs.tio.TripleSourceWithSize;
import de.hpi.fgis.hdrs.triplefile.FileBlock;
import de.hpi.fgis.hdrs.triplefile.FileImport;
import de.hpi.fgis.hdrs.triplefile.FileStreamer;

public class Node implements Runnable, NodeProtocol, SegmentIdGenerator, SegmentServer {

    static final Log LOG = LogFactory.getLog(Node.class);

    final Configuration conf;

    final SegmentConfiguration segmentConf;

    /**
     * Main thread running this node.
     */
    Thread mainThread;

    Server rpcServer;

    private final Router router;

    SegmentFlusherThread flushThread = null;

    SegmentCompactorThread compactorThread = null;

    SegmentSplitThread splitThread = null;

    AntiEntropyThread antiEntropyThread = null;

    ExecutorService committerPool = null;

    ExecutorService prefetcherPool = null;

    volatile boolean online = false;

    /**
     * Quit flag
     */
    volatile boolean shutDown = false;

    /**
     * Kill flag
     */
    volatile boolean kill = false;

    /**
     * Restart flag
     */
    volatile boolean restart = false;

    /**
     * There are up to 6 indexes (SPO, POS, ...).  Note that not all
     * indexes need to be present in the store.
     */
    final Map<Triple.COLLATION, Index> indexes = new EnumMap<Triple.COLLATION, Index>(Triple.COLLATION.class);

    int segmentIdCounter = 0;

    /**
     * <p> Segments served by this node that are currently open and online 
     * (readable and writable).  Contains segments for all indexes.
     * 
     * <p> This map is initially empty; segments are loaded on demand.
     */
    final Map<Long, SegmentDescriptor> segments;
    final ReadWriteLock segmentsLock = new ReentrantReadWriteLock();

    public static enum SegmentStatus {
        OFFLINE, // files closed
        OPEN, // files open
        ONLINE, // files open & buffer open

        CLOSED, // closed (cannot be opened)
        CLOSE_PENDING, SPLIT, // closed because split
        SPLIT_PENDING, // split pending

        TRANSFER_IN, TRANSFER_OUT;
    }

    final Map<Long, FileImport> fileImports;

    /*
     *   LOCK ORDER:
     *   
     *   1. SegmentDescriptor
     *   2. Transaction
     *   3. SegmentDescriptor.transactions
     * 
     */

    /**
     * Holds and synchronizes access to runtime information of a segment.
     * (status, transactions, scanners, etc.)
     */
    private class SegmentDescriptor {

        volatile SegmentStatus status;
        volatile boolean compacting = false;
        final SegmentInfo info;
        final Index index;
        Segment segment = null;
        final Map<Long, Transaction> transactions;
        long timestamp = System.currentTimeMillis();

        SegmentDescriptor(SegmentStatus status, SegmentInfo info, Index index) {
            this.status = status;
            this.info = info;
            this.index = index;
            transactions = new HashMap<Long, Transaction>();
        }

        void touch() {
            timestamp = System.currentTimeMillis();
        }

        /**
         * @return  the time since last triples were written to this segment
         */
        long getTimestamp() {
            return timestamp;
        }

        /**
         * This MUST NOT be called while holding a Transaction monitor lock!
         * @param takeOnline
         * @return
         * @throws IOException
         */
        synchronized boolean open(boolean takeOnline) throws IOException {
            switch (status) {
            case OFFLINE:
                segment = Segment.openSegment(segmentConf, index, info.getSegmentId());
                segment.setSegmentFlusher(flushThread);
                segment.setSegmentCompactor(compactorThread);
                if (takeOnline) {
                    segment.takeOnline();
                    status = SegmentStatus.ONLINE;
                } else {
                    status = SegmentStatus.OPEN;
                }
                return true;
            case TRANSFER_IN:
                status = SegmentStatus.OPEN;
            case OPEN:
                if (takeOnline) {
                    segment.takeOnline();
                    status = SegmentStatus.ONLINE;
                }
                return true;
            case ONLINE:
                return true;
            default:
                return false;
            }
        }

        // only for OFFLINE -> OPEN, triggered by user (shell).
        synchronized boolean open() throws IOException {
            if (SegmentStatus.OFFLINE == status) {
                return open(false);
            }
            return false;
        }

        synchronized boolean close() throws IOException {
            switch (status) {
            case OFFLINE:
                status = SegmentStatus.CLOSED;
                return true;
            case OPEN:
                status = SegmentStatus.CLOSED;
                segment.close();
                return true;
            case ONLINE:
            case CLOSE_PENDING:
            case SPLIT_PENDING:
                if (!compacting && abortAllTransactions()) {
                    status = SegmentStatus.CLOSED;
                    segment.close();
                    return true;
                }
                status = SegmentStatus.CLOSE_PENDING;
                return false;
            case TRANSFER_OUT:
            case TRANSFER_IN:
                return false;
            default: // CLOSED, SPLIT
                return true;
            }
        }

        boolean isOnline() {
            return status == SegmentStatus.ONLINE;
        }

        boolean isOpen() {
            return status == SegmentStatus.OPEN;
        }

        boolean splitPending() {
            return status == SegmentStatus.SPLIT_PENDING;
        }

        boolean closePending() {
            return status == SegmentStatus.CLOSE_PENDING;
        }

        boolean isAvailable() {
            return SegmentStatus.ONLINE == status || SegmentStatus.OPEN == status
                    || SegmentStatus.OFFLINE == status;
        }

        void addTransaction(Transaction tx) {
            synchronized (transactions) {
                transactions.put(tx.getId(), tx);
            }
        }

        void removeTransaction(Transaction tx) {
            synchronized (transactions) {
                transactions.remove(tx.getId());
            }
        }

        private boolean abortAllTransactions() {
            // make a copy, otherwise we'll get concurrent mod exceptions...
            List<Transaction> transactions;
            synchronized (this.transactions) {
                transactions = new ArrayList<Transaction>(this.transactions.values());
            }
            // try to abort transactions
            boolean success = true;
            for (Transaction tx : transactions) {
                if (!tx.abort()) {
                    success = false;
                }
            }
            return success;
        }

        synchronized boolean shouldBeSplit(boolean scatter) {
            switch (status) {
            case OPEN:
            case ONLINE:
                return segment.shouldBeSplit(scatter);
            }
            return false;
        }

        synchronized boolean canBeSplit() {
            switch (status) {
            case SPLIT_PENDING:
                return true;
            case OPEN:
            case ONLINE:
                return segment.canBeSplit();
            }
            return false;
        }

        synchronized boolean prepareSplit() throws IOException {
            switch (status) {
            case OFFLINE:
                open(false);
                status = SegmentStatus.SPLIT;
                return true;
            case OPEN:
                status = SegmentStatus.SPLIT;
                return true;
            case ONLINE:
            case SPLIT_PENDING:
                if (!compacting && abortAllTransactions()) {
                    // very well; all transactions dead now.
                    status = SegmentStatus.SPLIT;
                    return true;
                }
                status = SegmentStatus.SPLIT_PENDING;
                return false;
            case SPLIT:
                throw new IllegalStateException("Segment already split");
            default: // CLOSE, CLOSE_PENDING
                return false;
            }
        }

        synchronized void prepareImport() throws IOException {
            if (SegmentStatus.OFFLINE != status) {
                throw new IllegalStateException("Segment transfer not possible in state " + status);
            }
            open(false);
            status = SegmentStatus.TRANSFER_IN;
        }

        synchronized void prepareExport() throws IOException {
            if (SegmentStatus.OFFLINE != status) {
                throw new IllegalStateException("Segment transfer not possible in state " + status);
            }
            open(false);
            status = SegmentStatus.TRANSFER_OUT;
        }

        synchronized void finishExport() throws IOException {
            if (SegmentStatus.TRANSFER_OUT != status) {
                throw new IllegalStateException("Invalid segment state " + status);
            }
            status = SegmentStatus.OPEN;
            close();
        }

        void compact(boolean major) throws IOException {
            synchronized (this) {
                if (!compacting && (SegmentStatus.OPEN == status || SegmentStatus.ONLINE == status)) {
                    compacting = true;
                } else {
                    LOG.info("Skipping compaction for " + this);
                    return;
                }
            }
            LOG.info("Compacting " + this);
            try {
                segment.compact(major);
            } finally {
                compacting = false;
            }
        }

        TripleScanner openScanner() throws IOException {
            synchronized (this) {
                if (!open(false)) {
                    return null;
                }
            }
            return segment.getScanner();
        }

        TripleScanner openScanner(Triple pattern) throws IOException {
            synchronized (this) {
                if (!open(false)) {
                    return null;
                }
            }
            return segment.getScanner(pattern);
        }

        @Override
        public String toString() {
            return "Segment (id = " + info.getSegmentId() + ", status = " + status + ")";
        }

    }

    final Map<Long, Transaction> transactions;

    /**
     * Memory pool for transactions.
     */
    final private Semaphore transactionMemPool;

    /**
     * <p> A write transaction.  A transactions writes triples to a number of segments
     * located on this node.
     * 
     * <p> This transaction class implements the 2-Phase Commit Protocol (2PC).
     * 
     * <p> Triples of a transaction are buffered and only written on commit.
     * 
     * <p> A transaction that is not in 'prepared' state can be aborted anytime by
     * the remote transaction manager or by the node, e.g. when a segment is split.
     * 
     * @author daniel.hefenbrock
     *
     */
    static class Transaction {

        static enum State {
            OPEN, PREPARED, COMMITTED, ABORTED;
        }

        volatile State state;
        final long id;
        Map<SegmentDescriptor, List<TripleSourceWithSize>> segments;

        final Semaphore memPool;
        final int bufferSize;
        int freeBuffer;

        public Transaction(long id, Semaphore memPool, int bufferSize) {
            this.state = State.OPEN;
            this.id = id;
            this.segments = new HashMap<SegmentDescriptor, List<TripleSourceWithSize>>();
            this.memPool = memPool;
            this.bufferSize = bufferSize;
            this.freeBuffer = bufferSize;
        }

        public long getId() {
            return id;
        }

        /**
         * Abort this transaction if possible.  A transaction cannot be aborted if 
         * it is in 'prepared' state.
         * @return
         */
        synchronized boolean abort() {
            if (State.PREPARED == state) {
                return false;
            } else if (State.COMMITTED == state) {
                // tell a lie, it doesn't matter.
                return true;
            }
            return clientAbort();
        }

        /**
         * Client aborts... the client can abort a transaction even in 'prepared'
         * state!  (not that 'Client' in this case is just another node (aka the
         * transaction manager).
         * @return
         */
        synchronized boolean clientAbort() {
            if (State.COMMITTED == state) {
                return false;
            } else if (State.ABORTED == state) {
                return true;
            }
            state = State.ABORTED;
            // remove this transactions from all segments
            for (SegmentDescriptor sd : segments.keySet()) {
                sd.removeTransaction(this);
            }
            // release resources
            segments = null;
            // release memory from transaction memory pool
            memPool.release(bufferSize);
            return true;
        }

        /**
         * <p> The main task of prepare is to make sure that all segments contained
         * in this transaction are writable (online).
         * 
         * <p> ONLY called by RPC handler thread (NOT thread safe).
         * @throws IOException 
         */
        boolean prepare() throws IOException {
            if (State.OPEN == state) {
                Set<SegmentDescriptor> segments;
                synchronized (this) {
                    segments = this.segments.keySet();
                }
                if (null == segments) {
                    // someone else aborted this transaction.
                    return false;
                }
                // pre-check
                for (SegmentDescriptor sd : segments) {
                    if (!sd.open(true)) {
                        // segment is not writable
                        abort();
                        return false;
                    }
                }
                synchronized (this) {
                    // post-check... still not aborted?
                    if (State.OPEN == state) {
                        // ... segments still online?
                        // We MUST do this in this weird way in order to maintain the locking
                        // order (1. SegmentDescriptor, 2. Transaction).  Otherwise deadlock.
                        for (SegmentDescriptor sd : segments) {
                            if (!sd.isOnline()) {
                                // segment not writable... abort.
                                abort();
                                return false;
                            }
                        }
                        // we're good to go.
                        state = State.PREPARED;
                        return true; // SUCCESS
                    }
                }
            }
            if (State.ABORTED == state) {
                // someone else aborted this transaction.
                return false;
            }
            throw new IllegalStateException("cannot prepare transaction in state " + state);
        }

        /**
         * 
         * <p> ONLY called by RPC handler thread (NOT thread safe).
         * @throws IOException
         */
        int commit() throws IOException {
            if (State.PREPARED != state) {
                throw new IllegalStateException("cannot commit transaction in state " + state);
            }
            int nBytesAdded = 0;
            for (Map.Entry<SegmentDescriptor, List<TripleSourceWithSize>> entry : segments.entrySet()) {
                SegmentDescriptor sd = entry.getKey();
                if (!sd.isOnline() && !sd.splitPending() && !sd.closePending()) {
                    throw new IOException("segment is offline in state 'prepared'");
                }
                for (TripleSourceWithSize triples : entry.getValue()) {
                    TripleScanner scanner = triples.getScanner();
                    nBytesAdded += sd.segment.batchAdd(scanner);
                    if (scanner.next()) {
                        throw new IOException("partial write");
                    }
                    scanner.close();
                }
                sd.removeTransaction(this);
                sd.touch(); // update last written timestamp
            }
            // we're done.
            state = State.COMMITTED;
            // release memory and resources
            memPool.release(bufferSize);
            segments = null;
            return nBytesAdded;
        }

        /**
         * 
         * @throws IOException 
         */
        boolean addTriples(SegmentDescriptor sd, TripleSourceWithSize triples) throws IOException {
            if (State.ABORTED == state) {
                return false;
            }
            freeBuffer -= triples.getSizeBytes();
            if (0 > freeBuffer) {
                // transaction buffer size exceeded!  do abort.
                LOG.info("Transaction buffer size exceeded.  Aborting transaction " + id);
                abort();
                return false;
            }
            synchronized (sd) {
                if (!sd.isAvailable()) {
                    // segment was or is about to be split.  cannot be written.
                    LOG.info("Transaction aborted.  Segment cannot be written in state " + sd.status);
                    abort();
                    return false;
                }
                synchronized (this) {
                    if (State.OPEN == state) {
                        List<TripleSourceWithSize> writes = segments.get(sd);
                        if (null == writes) {
                            writes = new ArrayList<TripleSourceWithSize>();
                            segments.put(sd, writes);
                            sd.addTransaction(this);
                        }
                        writes.add(triples);
                        return true;
                    } else if (State.ABORTED == state) {
                        return false;
                    }
                    throw new IllegalStateException("cannot add triples to transaction in state " + state);
                }
            }
        }

    }

    final File rootDir;

    final Set<Triple.COLLATION> orders;

    final AtomicLong bufferSize = new AtomicLong(0L);

    Node(Configuration conf, Set<Triple.COLLATION> orders, ArrayList<Peer> peers, Peer localPeer)
            throws IOException {
        this.conf = conf;
        this.segmentConf = new SegmentConfiguration(conf);
        this.orders = orders;
        this.router = new Router(conf, peers, localPeer, this);

        segments = new HashMap<Long, SegmentDescriptor>();

        fileImports = new HashMap<Long, FileImport>();

        transactions = new HashMap<Long, Transaction>();
        transactionMemPool = new Semaphore(conf.getInt(Configuration.KEY_NODE_TRANSACTION_BUFFER,
                Configuration.DEFAULT_NODE_TRANSACTION_BUFFER));

        rootDir = new File(conf.get(Configuration.KEY_ROOT_DIR));
        if (!rootDir.isDirectory()) {
            throw new IOException("hdrs root dir does not exist: " + rootDir);
        }
    }

    Router getRouter() {
        return router;
    }

    SegmentCatalog getLocalSegmentCatalog() {
        return router.getLocalCatalog();
    }

    SegmentDescriptor getSegmentDescriptor(long segmentId) throws IOException {
        segmentsLock.readLock().lock();
        SegmentDescriptor sd = segments.get(segmentId);
        segmentsLock.readLock().unlock();
        if (null == sd) {
            throw new IOException("segment " + segmentId + " not found");
        }
        return sd;
    }

    @Override
    public boolean splitSegment(long segmentId) throws IOException {
        SegmentDescriptor sd = getSegmentDescriptor(segmentId);
        if (!sd.canBeSplit()) {
            LOG.info("Segment cannot be split because it contains at least one "
                    + "half-file.  Need compaction first.");
            return true; // true = don't try again
        }
        if (!sd.prepareSplit()) {
            // there is at least one transaction in 'prepared' state.  we must
            // wait until it is committed before offlining this segment.
            LOG.info("Postponing split for segment " + sd);
            return false;
        }
        splitSegment(sd);
        return true;
    }

    /**
     * This assumes sd.prepareSplit() succeeded!
     */
    private void splitSegment(SegmentDescriptor sd) throws IOException {

        SegmentSplit split = sd.index.splitSegment(sd.info, sd.segment);
        if (null == split) {
            LOG.warn("couldn't split segment " + sd.info);
            return;
        }

        SegmentDescriptor sdTop = new SegmentDescriptor(SegmentStatus.OFFLINE, split.top, sd.index);
        SegmentDescriptor sdBottom = new SegmentDescriptor(SegmentStatus.OFFLINE, split.bottom, sd.index);

        segmentsLock.writeLock().lock();
        segments.put(split.top.getSegmentId(), sdTop);
        segments.put(split.bottom.getSegmentId(), sdBottom);
        segmentsLock.writeLock().unlock();

        Peer tpeer = router.locateSegment(sd.index.getOrder(), split.top);
        SegmentInfo add1 = null;
        if (router.getLocalPeer() == tpeer) {
            // segment stays at this node
            add1 = split.top;
        } else {
            // export this segment
            startSegmentTransfer(tpeer, sdTop);
        }

        Peer bpeer = router.locateSegment(sd.index.getOrder(), split.bottom);
        SegmentInfo add2 = null;
        if (router.getLocalPeer() == bpeer) {
            // segment stays at this node
            add2 = split.bottom;
        } else {
            // export this segment
            startSegmentTransfer(bpeer, sdBottom);
        }

        LOG.info("Segment split: " + sd.info + " split into " + sdTop.info + " (Node " + tpeer + ") and "
                + sdBottom.info + " (Node " + bpeer + ")");

        // update index.  this is actually the commit point.  if we fail
        // before the split won't be visible but we'll have garbage in
        // the segment directory.
        sd.index.updateSegments(sd.info, add1, add2);

        // indicate segment catalog update is needed
        router.localCatalogUpdate();

        // remove descriptor
        segmentsLock.writeLock().lock();
        segments.remove(sd.info.getSegmentId());
        segmentsLock.writeLock().unlock();
    }

    private void startSegmentTransfer(Peer target, SegmentDescriptor sd) throws IOException {
        sd.prepareExport();

        SegmentTransferThread transferThread = new SegmentTransferThread(target, sd);

        transferThread.start();
    }

    public static final int FILE_TRANSFER_BATCH_SIZE = 1024 * 1024; // 1 MB

    private void transferSegment(Peer peer, SegmentDescriptor sd) throws IOException {
        LOG.info("Starting transfer of segment " + sd);

        SegmentExport export = sd.segment.export();
        NodeProtocol proxy = peer.getProxy(conf);

        long nBytes = 0;
        long time = System.currentTimeMillis();

        proxy.startSegmentTransfer(sd.info, sd.index.getOrder().getCode());
        FileStreamer file;
        while (null != (file = export.nextFile())) {
            proxy.startFileTransfer(sd.info.getSegmentId(), file.getCompression().getCode(), file.getDataSize(),
                    file.getNumberOfTriples());

            while (true) {
                int batchSize = 0;
                List<FileBlock> blocks = new ArrayList<FileBlock>();

                FileBlock block;
                while (null != (block = file.nextBlock())) {
                    blocks.add(block);
                    batchSize += block.size();
                    if (FILE_TRANSFER_BATCH_SIZE < batchSize) {
                        break;
                    }
                }
                if (0 == batchSize) {
                    break;
                }
                proxy.transferFileBlocks(sd.info.getSegmentId(), blocks.toArray(new FileBlock[blocks.size()]));

                nBytes += batchSize;
            }
            // copy to make sure its not a ScatteredTriple (RPC can't handle that...)
            proxy.finishFileTransfer(sd.info.getSegmentId(), file.getLastTriple().copy());
        }
        proxy.finishSegmentTransfer(sd.info.getSegmentId());
        RPC.stopProxy(proxy);

        time = System.currentTimeMillis() - time;
        LOG.info(String.format("Segment %d transferred to peer %s. (%.2f MB, %.2f MB/s)", sd.info.getSegmentId(),
                peer, LogFormatUtil.MB(nBytes), LogFormatUtil.MBperSec(nBytes, time)));

        //  This is for benchmarking segment transfers.
        //    System.out.println(String.format("%.2f\t%.2f",
        //        time/1000.0,
        //        LogFormatUtil.MB(nBytes))
        //        );

        // transfer is complete.  now get rid of the segment
        sd.finishExport();
        // remove descriptor
        segmentsLock.writeLock().lock();
        segments.remove(sd.info.getSegmentId());
        segmentsLock.writeLock().unlock();

        sd.segment.delete();

        antiEntropyThread.run(peer);
    }

    // called by router
    public void runAntiEntropy(Peer peer) {
        antiEntropyThread.run(peer);
    }

    @Override
    public boolean flushSegment(long segmentId) throws IOException {
        SegmentDescriptor sd = null;
        segmentsLock.readLock().lock();
        sd = segments.get(segmentId);
        segmentsLock.readLock().unlock();
        if (null != sd && (sd.isOnline() || sd.splitPending() || sd.closePending())) {
            LOG.info("Flushing " + sd);
            // force flush if split or close is pending.  otherwise there is possibility of deadlock.
            int nBytesFlushed = sd.segment.flushBuffer(sd.splitPending() || sd.closePending());
            if (0 < nBytesFlushed) {
                // flush succeeded
                bufferSize.addAndGet(-nBytesFlushed);
                return true;
            } else if (0 == nBytesFlushed) {
                // there wasn't anything to flush... fair enough. (but buffer could be closed now)
                return true;
            }
            // too many files!  try again later.
            LOG.info("Flush postponed for " + sd + ".  Too many triple files!");
            return false;
        }
        // don't try again..
        return true;
    }

    @Override
    public int compactSegment(Set<CompactionRequest> requests) throws IOException {
        CompactionRequest r = null;
        SegmentDescriptor sd = null;
        int removed = 0;
        // LOCK ORDER: 1. segmentsLock, 2. requests.... NOT THE OTHER WAY AROUND!!
        segmentsLock.readLock().lock();
        int maxFiles = 0;
        synchronized (requests) {
            Iterator<CompactionRequest> it = requests.iterator();
            while (it.hasNext()) {
                CompactionRequest request = it.next();
                SegmentDescriptor rsd = segments.get(request.getSegmentId());
                if (null == rsd || !rsd.isAvailable() || null == rsd.segment) {
                    it.remove();
                    removed++;
                    continue;
                }
                if (rsd.segment.getNumberOfFiles() > maxFiles) {
                    r = request;
                    sd = rsd;
                    maxFiles = rsd.segment.getNumberOfFiles();
                }
            }
            if (null == r) {
                segmentsLock.readLock().unlock();
                return removed;
            }
            requests.remove(r);
            removed++;
        }
        segmentsLock.readLock().unlock();
        sd.compact(r.isMajor());
        // check split.. should we split it?
        if (sd.shouldBeSplit(scatterSegments())) {
            // yes, can we split it right now?
            if (sd.prepareSplit()) {
                // yep, do it.
                splitSegment(sd);
            } else {
                // status is now SPLIT_PENDING
                // try splitting it later
                splitThread.request(r.getSegmentId(), SegmentSplitThread.SPLIT_RETRY_DELAY);
                LOG.info("Postponing split for segment " + sd);
            }
        }
        return removed;
    }

    // when the store is still small, we want to increase the number
    // of segments fast in order to leverage all nodes
    private boolean scatterSegments() {
        return router.getNumberOfSegments() < router.getPeers().size()
                * conf.getFloat(Configuration.KEY_SCATTER_FACTOR, Configuration.DEFAULT_SCATTER_FACTOR);
    }

    @Override
    public synchronized long generateSegmentId() {
        return (((long) router.getLocalPeer().getId()) << 31) + this.segmentIdCounter++;
    }

    private static final String SEGMENT_ID_COUNTER_FILE = "storemeta";

    private void writeSegmentIdCounter() {
        try {
            File f = new File(rootDir.getAbsolutePath() + File.separator + SEGMENT_ID_COUNTER_FILE);
            if (f.isFile()) {
                if (!f.delete()) {
                    throw new IOException("Cannot delete old index meta file");
                }
            }
            FileImageOutputStream out = new FileImageOutputStream(f);
            out.writeInt(segmentIdCounter);
            out.close();
        } catch (IOException ex) {
            LOG.error("Error writing store meta file", ex);
        }
    }

    boolean initialize() {
        // read segment id counter
        File storemeta = new File(rootDir.getAbsolutePath() + File.separator + SEGMENT_ID_COUNTER_FILE);
        if (storemeta.isFile()) {
            FileImageInputStream in;
            try {
                in = new FileImageInputStream(storemeta);
                segmentIdCounter = in.readInt();
                in.close();
            } catch (IOException ex) {
                LOG.fatal("error while reading store meta file ", ex);
                return false;
            }
        }

        LOG.info("Using segment configuration: " + segmentConf);

        // init indexes
        for (Triple.COLLATION order : orders) {
            try {
                File indexRoot = new File(rootDir.getAbsolutePath() + File.separator + order);

                Index index;
                if (indexRoot.isDirectory()) {
                    LOG.info("Opening index " + order);
                    index = Index.openIndex(indexRoot, order, this);
                } else {
                    LOG.info("Creating index " + order);
                    index = Index.createIndex(indexRoot, order, this);
                    if (router.getLocalPeer() == router.locateSegment(order, SegmentInfo.getSeed(0))) {
                        index.createSeedSegment();
                    }
                }

                indexes.put(order, index);
                router.initialize(order);

            } catch (IOException ex) {
                LOG.fatal("error while opening index " + order, ex);
                return false;
            }
        }

        router.syncLocalCatalog();

        // create segment descriptors
        for (Index index : indexes.values()) {
            for (SegmentInfo info : router.getLocalCatalog().getSegments(index.getOrder())) {
                segments.put(Long.valueOf(info.getSegmentId()),
                        new SegmentDescriptor(SegmentStatus.OFFLINE, info, index));
            }
        }

        LOG.info("Segment catalog initialized: " + router.getLocalCatalog());
        return true;
    }

    void startThreads() {
        flushThread = new SegmentFlusherThread(this);
        compactorThread = new SegmentCompactorThread(this);
        splitThread = new SegmentSplitThread(this);
        antiEntropyThread = new AntiEntropyThread(router);
        flushThread.start();
        compactorThread.start();
        splitThread.start();
        antiEntropyThread.start();

        committerPool = Executors.newFixedThreadPool(
                conf.getInt(Configuration.KEY_COMMIT_POOL_SIZE, Configuration.DEFAULT_COMMIT_POOL_SIZE));
        prefetcherPool = Executors.newFixedThreadPool(
                conf.getInt(Configuration.KEY_PREFTECH_POOL_SIZE, Configuration.DEFAULT_PREFETCH_POOL_SIZE));
    }

    void stopThreads() {

        committerPool.shutdown();
        try {
            while (!committerPool.awaitTermination(10, TimeUnit.SECONDS)) {
                LOG.info("Waiting for committer thread pool to shutdown...");
            }
        } catch (InterruptedException e) {
            LOG.warn("Committer thread pool didn't shutdown properly.");
        }
        prefetcherPool.shutdown();
        try {
            while (!prefetcherPool.awaitTermination(10, TimeUnit.SECONDS)) {
                LOG.info("Waiting for prefetcher thread pool to shutdown...");
            }
        } catch (InterruptedException e) {
            LOG.warn("Prefetcher thread pool didn't shutdown properly.");
        }

        if (null != flushThread) {
            flushThread.quitJoin();
        }
        if (null != compactorThread) {
            compactorThread.quitJoin();
        }
        if (null != splitThread) {
            splitThread.quitJoin();
        }
        if (null != antiEntropyThread) {
            antiEntropyThread.quitJoin();
        }
    }

    void close() {
        for (SegmentDescriptor sd : segments.values()) {
            try {
                while (!sd.close() && !kill) {
                    LOG.info("Waiting for " + sd + " to close...");
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        LOG.warn(sd + " wasn't closed properly.");
                    }
                }
            } catch (IOException ex) {
                LOG.error("Error while trying to close segment ", ex);
            }
        }
    }

    // for test clean ups
    public void delete() throws IOException {
        for (Index index : indexes.values()) {
            index.delete();
        }
    }

    // for testing
    public Client getClient(Configuration conf) throws IOException {
        // we MUST make a copy since peers store routing
        // information 
        ArrayList<Peer> peers = new ArrayList<Peer>();
        for (Peer peer : router.getPeers()) {
            peers.add(new Peer(peer));
        }
        return new Client(1, conf, orders, peers);
    }

    @Override
    public synchronized void stop(Throwable cause) {
        if (cause instanceof OutOfMemoryError) {
            LOG.error("Segment buffer size: " + bufferSize.get());
        }
        if (shutDown) {
            return;
        }
        shutDown = true;
        mainThread.interrupt();
    }

    @Override
    public void run() {
        mainThread = Thread.currentThread();
        LOG.info("Node " + router.getLocalPeer().toString() + " starting up...");

        do {
            if (restart) {
                LOG.info("Node " + router.getLocalPeer().toString() + " restarting...");
                restart = false;
                kill = false;
                shutDown = false;
                reset();
            }

            startThreads();
            if (!initialize()) {
                return;
            }

            // start rpc server
            try {
                /*   HADOOP 0.21.0:
                rpcServer = RPC.getServer(NodeProtocol.class, this, 
                    router.getLocalPeer().getAddress(), 
                    router.getLocalPeer().getPort(),
                    conf.getInt(Configuration.KEY_RPC_HANDLER_COUNT, Configuration.DEFAULT_RPC_HANDLER_COUNT),
                    false, conf, null);*/
                rpcServer = RPC.getServer(this, router.getLocalPeer().getAddress(), router.getLocalPeer().getPort(),
                        conf.getInt(Configuration.KEY_RPC_HANDLER_COUNT, Configuration.DEFAULT_RPC_HANDLER_COUNT),
                        false, conf);
                rpcServer.start();
            } catch (IOException ex) {
                LOG.fatal("Error during rpc server creation.", ex);
                return;
            }

            // now we're going online
            online = true;

            // main loop
            while (!shutDown) {
                try {
                    Thread.sleep(5 * 60 * 1000); // 5 minutes
                } catch (InterruptedException ex) {
                    // ignore
                }
                // close expired scanners, transactions
                maintenance();
            }

            LOG.info("Node " + router.getLocalPeer().toString() + " shutting down...");
            online = false;

            stopThreads();
            close();

            writeSegmentIdCounter();

            // wait 1 sec to allow stopper to disconnect
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // ignore
            }
            rpcServer.stop();
            try {
                rpcServer.join();
            } catch (InterruptedException ex) {
                // ignore
            }

        } while (restart);

    }

    // reset all central data structures
    private void reset() {
        router.reset();
        indexes.clear();
        segments.clear();
        transactions.clear();
        fileImports.clear();
        scanners.clear();

        transactionMemPool.drainPermits();
        transactionMemPool.release(conf.getInt(Configuration.KEY_NODE_TRANSACTION_BUFFER,
                Configuration.DEFAULT_NODE_TRANSACTION_BUFFER));

        bufferSize.set(0L);
    }

    @Override
    public void shutDown(boolean force) {
        LOG.info("Triggering node shut down for node " + router.getLocalPeer().toString() + ", force = " + force);
        kill = force;
        shutDown = true;
        mainThread.interrupt();
    }

    @Override
    public void restart(boolean force) {
        LOG.info("Triggering restart for node " + router.getLocalPeer().toString() + ", force = " + force);
        restart = true;
        kill = force;
        shutDown = true;
        mainThread.interrupt();
    }

    // for testing to wait until the node is online.
    public void waitOnline() {
        while (!online) {
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                return;
            }
        }
    }

    // for testing
    public void waitDone() {
        try {
            mainThread.join();
        } catch (InterruptedException e) {
            LOG.warn("Interrupted while waiting for node main thread to finish");
        }
    }

    private void maintenance() {

        long timeout = conf.getLong(Configuration.KEY_SCANNER_TIMEOUT, Configuration.DEFAULT_SCANNER_TIMEOUT);
        long now = System.currentTimeMillis();

        synchronized (scanners) {
            List<SegmentScanner> remove = new ArrayList<SegmentScanner>();
            for (SegmentScanner scanner : scanners.values()) {
                if (now > scanner.getTimestamp() + timeout) {
                    scanner.close();
                    remove.add(scanner);
                }
            }
            for (SegmentScanner scanner : remove) {
                scanners.remove(scanner);
            }
            if (!remove.isEmpty()) {
                LOG.info("Node maintenance: " + remove.size() + " expired scanners closed");
            }
        }

    }

    /**
     * Called by the flush thread after some time of idling.
     */
    @Override
    public void flushMaintenance() {

        long delay = conf.getLong(Configuration.KEY_SEGMENT_FLUSH_DELAY, Configuration.DEFAULT_SEGMENT_FLUSH_DELAY);
        long now = System.currentTimeMillis();

        int cnt = 0;
        segmentsLock.readLock().lock();
        for (SegmentDescriptor sd : segments.values()) {
            if (sd.isOnline() && 0 < sd.segment.getBufferSize() && now > sd.getTimestamp() + delay) {
                flushThread.request(sd.info.getSegmentId());
                cnt++;
            }
        }
        segmentsLock.readLock().unlock();
        LOG.info("Flush maintenance: scheduled " + cnt + " segments to be flushed");
    }

    // NOT NEEDED IN HADOOP 0.21.0
    @Override
    public long getProtocolVersion(String protocol, long clientVersion) throws IOException {
        return ProtocolVersion.versionID;
    }

    public SegmentCatalog getLocalSegmentCatalog(long version) {
        Map<Triple.COLLATION, List<SegmentInfo>> lists = new EnumMap<Triple.COLLATION, List<SegmentInfo>>(
                Triple.COLLATION.class);
        for (Index index : indexes.values()) {
            lists.put(index.getOrder(), index.getSegments());
        }
        return new SegmentCatalog(version, lists);
    }

    @Override
    public SegmentCatalogUpdates antiEntropy(long[] versionVector) throws IOException {
        //LOG.info("Received anti-entropy request from " + Server.getRemoteAddress());    
        return router.antiEntropy(versionVector);
    }

    Transaction getTransaction(long transactionId) throws IOException {
        synchronized (transactions) {
            Transaction tx = transactions.get(Long.valueOf(transactionId));
            if (null == tx) {
                throw new IOException("unknown transaction id " + transactionId);
            }
            return tx;
        }
    }

    boolean removeTransaction(long transactionId) {
        synchronized (transactions) {
            return null != transactions.remove(Long.valueOf(transactionId));
        }
    }

    class CommitTask implements Runnable {
        private final Transaction transaction;

        CommitTask(Transaction transaction) {
            this.transaction = transaction;
        }

        @Override
        public void run() {
            int nBytesAdded;
            try {
                nBytesAdded = transaction.commit();
                removeTransaction(transaction.getId());
                bufferSize.addAndGet(nBytesAdded);
            } catch (Throwable e) {
                LOG.fatal("error while committing transaction" + transaction.getId(), e);
                stop(e);
            }
        }
    }

    @Override
    public void commitTransaction(long transactionId) throws IOException {
        Transaction tx = getTransaction(transactionId);
        CommitTask commitTask = new CommitTask(tx);
        if (null == committerPool) {
            // looks like we have to do it ourselves...  (might be the case during testing)
            commitTask.run();
        } else {
            try {
                committerPool.execute(commitTask);
            } catch (RejectedExecutionException ex) {
                LOG.warn("Commit rejected by thread pool... running blocking commit.");
                commitTask.run();
            }
        }
    }

    @Override
    public TransactionState prepareTransaction(long transactionId) throws IOException {
        Transaction tx = getTransaction(transactionId);
        if (!tx.prepare()) {
            // transaction failed
            removeTransaction(transactionId);
            return TransactionState.ABORTED;
        }
        return TransactionState.OK;
    }

    @Override
    public BooleanWritable startTransaction(long transactionId, int bufferSize, long timeout) throws IOException {
        // can we allocate memory from the transaction pool?
        try {
            if (!transactionMemPool.tryAcquire(bufferSize, timeout, TimeUnit.MILLISECONDS)) {
                // nope...
                LOG.info("Unable to allocate " + LogFormatUtil.MB(bufferSize) + " MB for transaction "
                        + transactionId);
                return FALSE;
            }
        } catch (InterruptedException e) {
            return FALSE;
        }
        Transaction tx = new Transaction(transactionId, transactionMemPool, bufferSize);
        synchronized (transactions) {
            if (null != transactions.put(Long.valueOf(transactionId), tx)) {
                transactionMemPool.release(bufferSize);
                throw new IOException("transaction id collision");
            }
        }
        return TRUE;
    }

    @Override
    public TransactionState writeTriples(long transactionId, MultiSegmentWrite writes) throws IOException {

        Transaction tx = getTransaction(transactionId);

        for (long segmentId : writes.getSegments()) {

            SegmentDescriptor sd = null;
            segmentsLock.readLock().lock();
            sd = segments.get(segmentId);
            segmentsLock.readLock().unlock();
            if (null == sd) {
                LOG.info("Segment " + segmentId + " not found.  Aborting transaction");
                // segment not found!
                tx.abort();
                removeTransaction(transactionId);
                return TransactionState.aborted(segmentId);
            }

            if (!tx.addTriples(sd, writes.getTriples(segmentId))) {
                // transaction failed
                removeTransaction(transactionId);
                return TransactionState.aborted(segmentId);
            }

        }

        return TransactionState.OK;
    }

    @Override
    public TransactionState writeAndPrepare(long transactionId, MultiSegmentWrite writes) throws IOException {
        TransactionState writeState = writeTriples(transactionId, writes);
        if (writeState.isOk()) {
            return prepareTransaction(transactionId);
        }
        return writeState;
    }

    @Override
    public void abortTransaction(long transactionId) throws IOException {
        synchronized (transactions) {
            Transaction tx = transactions.get(Long.valueOf(transactionId));
            if (null != tx && !tx.clientAbort()) {
                throw new IOException("Cannot abort transaction in state COMMITTED");
            }
        }
        removeTransaction(transactionId);
    }

    @Override
    public boolean finishSegmentTransfer(long segmentId) throws IOException {
        SegmentDescriptor sd = getSegmentDescriptor(segmentId);
        sd.index.updateSegments(null, sd.info, null);

        LOG.info("Segment transfer-in successful (" + sd + ")");

        sd.open(false);

        // indicate segment catalog update is needed
        router.localCatalogUpdate();

        return true;
    }

    @Override
    public void finishFileTransfer(long segmentId, Triple lastTriple) throws IOException {
        FileImport fileImport = getFileImport(segmentId);
        if (fileImport.close(lastTriple)) {
            //LOG.info("File transfer finished");
        } else {
            //LOG.info("File transfer finished with no blocks (file deleted)");
        }
        synchronized (fileImports) {
            fileImports.remove(segmentId);
        }
    }

    @Override
    public void startFileTransfer(long segmentId, byte compressionAlgorithm, long dataSize, long nTriples)
            throws IOException {

        //LOG.info("Starting file transfer-in for segment " + segmentId);

        SegmentDescriptor sd = getSegmentDescriptor(segmentId);
        if (sd.status != SegmentStatus.TRANSFER_IN) {
            throw new IOException("Cannot import file in state " + sd.status);
        }
        FileImport fileImport = sd.segment.fileImport(Compression.ALGORITHM.decode(compressionAlgorithm), dataSize,
                nTriples);
        synchronized (fileImports) {
            if (null != fileImports.put(Long.valueOf(segmentId), fileImport)) {
                throw new IOException("Cannot import more than one file at a time.");
            }
        }
    }

    @Override
    public void startSegmentTransfer(SegmentInfo segment, byte collation) throws IOException {

        LOG.info("Starting transfer-in of segment " + segment);

        Triple.COLLATION order = Triple.COLLATION.decode(collation);

        Index index = indexes.get(order);
        index.createSegment(segment.getSegmentId());

        SegmentDescriptor sd;
        segmentsLock.writeLock().lock();
        try {
            sd = new SegmentDescriptor(SegmentStatus.OFFLINE, segment, index);
            if (null != segments.put(segment.getSegmentId(), sd)) {
                throw new IOException("Segment id collision for segment " + segment.getSegmentId());
            }
        } finally {
            segmentsLock.writeLock().unlock();
        }
        sd.prepareImport();
    }

    @Override
    public void transferFileBlocks(long segmentId, FileBlock blocks[]) throws IOException {
        FileImport fileImport = getFileImport(segmentId);
        fileImport.addBlocks(blocks);
        //LOG.info("Received " + blocks.length + " file blocks");
    }

    private FileImport getFileImport(long segmentId) throws IOException {
        FileImport fileImport;
        synchronized (fileImports) {
            fileImport = fileImports.get(segmentId);
        }
        if (null == fileImport) {
            throw new IOException("Cannot find file import for segment " + segmentId);
        }
        return fileImport;
    }

    private class SegmentTransferThread extends Thread {

        final Peer peer;
        final SegmentDescriptor sd;

        SegmentTransferThread(Peer peer, SegmentDescriptor sd) {
            this.peer = peer;
            this.sd = sd;
        }

        @Override
        public void run() {
            try {
                transferSegment(peer, sd);
            } catch (Throwable ex) {
                LOG.fatal("Segment transfer failed: " + sd, ex);
                Node.this.stop(ex);
            }
        }

    }

    /*
     * Node Management
     */

    @Override
    public SegmentSummary[] getIndexSummary(byte collation) throws IOException {
        Triple.COLLATION order = Triple.COLLATION.decode(collation);
        SegmentInfo[] index = router.getIndex(order);
        if (null == index) {
            return null;
        }
        SegmentSummary[] summary = new SegmentSummary[index.length];
        for (int i = 0; i < index.length; ++i) {
            summary[i] = new SegmentSummary(index[i], router.locateSegment(order, index[i]).getId(), order);
        }
        return summary;
    }

    @Override
    public SegmentSummary[] getPeerSummary() {
        segmentsLock.readLock().lock();
        try {
            ArrayList<SegmentSummary> segments = new ArrayList<SegmentSummary>();
            for (SegmentDescriptor sd : this.segments.values()) {
                Segment segment = sd.segment;
                segments.add(
                        new SegmentSummary(sd.info, sd.status, null == segment ? null : segment.getSegmentSize(),
                                router.getLocalPeer().getId(), sd.index.getOrder(), sd.compacting, null));
            }
            return segments.toArray(new SegmentSummary[segments.size()]);
        } finally {
            segmentsLock.readLock().unlock();
        }
    }

    @Override
    public SegmentSummary getSegmentSummary(long segmentId) throws IOException {
        SegmentDescriptor sd = null;
        segmentsLock.readLock().lock();
        sd = segments.get(segmentId);
        segmentsLock.readLock().unlock();
        if (null == sd) {
            return null;
        }
        sd.open();
        Segment segment = sd.segment;
        if (null == segment) {
            return null;
        }
        return new SegmentSummary(sd.info, sd.status, segment.getSegmentSize(), router.getLocalPeer().getId(),
                sd.index.getOrder(), sd.compacting, segment.getFileInfo());
    }

    @Override
    public PeerView[] getView() {
        return router.getView();
    }

    @Override
    public boolean openSegment(long segmentId) throws IOException {
        if (0 == segmentId) {
            segmentsLock.readLock().lock();
            try {
                for (SegmentDescriptor sd : segments.values()) {
                    sd.open();
                }
            } finally {
                segmentsLock.readLock().unlock();
            }
            return true;
        } else {
            SegmentDescriptor sd = null;
            segmentsLock.readLock().lock();
            sd = segments.get(segmentId);
            segmentsLock.readLock().unlock();
            if (null == sd) {
                return false;
            }
            return sd.open();
        }
    }

    @Override
    public boolean requestCompaction(long segmentId) throws IOException {
        if (0 == segmentId) {
            segmentsLock.readLock().lock();
            try {
                for (SegmentDescriptor sd : segments.values()) {
                    sd.open();
                    compactorThread.request(sd.info.getSegmentId(), true);
                }
            } finally {
                segmentsLock.readLock().unlock();
            }
            return true;
        } else {
            SegmentDescriptor sd = null;
            segmentsLock.readLock().lock();
            sd = segments.get(segmentId);
            segmentsLock.readLock().unlock();
            if (null == sd) {
                return false;
            }
            sd.open();
            sd.segment.requestMajorCompaction();
            return true;
        }
    }

    @Override
    public boolean requestFlush(long segmentId) throws IOException {
        if (0 == segmentId) {
            segmentsLock.readLock().lock();
            try {
                for (SegmentDescriptor sd : segments.values()) {
                    sd.open();
                    flushThread.request(sd.info.getSegmentId());
                }
            } finally {
                segmentsLock.readLock().unlock();
            }
            return true;
        } else {
            SegmentDescriptor sd = null;
            segmentsLock.readLock().lock();
            sd = segments.get(segmentId);
            segmentsLock.readLock().unlock();
            if (null == sd) {
                return false;
            }
            sd.open();
            flushThread.request(sd.info.getSegmentId());
            return true;
        }
    }

    @Override
    public boolean requestSplit(long segmentId) throws IOException {
        SegmentDescriptor sd = null;
        segmentsLock.readLock().lock();
        sd = segments.get(segmentId);
        segmentsLock.readLock().unlock();
        if (null == sd) {
            return false;
        }
        sd.open();
        splitThread.request(sd.info.getSegmentId());
        return true;
    }

    @Override
    public NodeStatus getNodeStatus() {
        Runtime rt = Runtime.getRuntime();
        return new NodeStatus(rt.freeMemory() + (rt.maxMemory() - rt.totalMemory()), bufferSize.get(),
                conf.getInt(Configuration.KEY_NODE_TRANSACTION_BUFFER,
                        Configuration.DEFAULT_NODE_TRANSACTION_BUFFER) - transactionMemPool.availablePermits(),
                segments.size(), transactions.size(), scanners.size());
    }

    /*
     * 
     *  Reading / Scanning
     * 
     */

    public static final int SCAN_BATCH_SIZE = 1024 * 1024;

    private final AtomicLong scannerIdCounter = new AtomicLong(0L);

    private final Map<Long, SegmentScanner> scanners = new HashMap<Long, SegmentScanner>();

    private long createScannerId() {
        return scannerIdCounter.incrementAndGet();
    }

    static class SegmentScanner {

        private final long id;
        private final SegmentDescriptor sd;
        private TripleScanner scanner;
        private final Triple pattern;
        private final boolean filterDeletes;

        private volatile long timestamp = System.currentTimeMillis();

        private Future<ScanResult> prefetch = null;

        SegmentScanner(long id, SegmentDescriptor sd, TripleScanner scanner, boolean filterDeletes) {
            this(id, sd, scanner, filterDeletes, null);
        }

        SegmentScanner(long id, SegmentDescriptor sd, TripleScanner scanner, boolean filterDeletes,
                Triple pattern) {
            this.id = id;
            this.sd = sd;
            this.scanner = scanner;
            this.filterDeletes = filterDeletes;
            this.pattern = pattern;
        }

        Long getId() {
            return id;
        }

        /**
         * Update this scanner's timestamp.
         */
        void touch() {
            timestamp = System.currentTimeMillis();
        }

        long getTimestamp() {
            return timestamp;
        }

        ScanResult next(int limit) throws IOException {
            return next(limit, false);
        }

        ScanResult next(int limit, boolean isPrefetch) throws IOException {

            if (!isPrefetch && null != prefetch) {
                try {
                    ScanResult result = prefetch.get();
                    prefetch = null;
                    return result;
                } catch (InterruptedException e) {
                    throw new IOException("interrupted", e);
                } catch (ExecutionException e) {
                    throw new IOException("prefetch error", e);
                }
            }

            OrderedTriplesWritable triples = new OrderedTriplesWritable(sd.index.getOrder(), SCAN_BATCH_SIZE);
            ScanResult result = new ScanResult(id, triples);

            TripleSink out = triples.getWriter();

            int count = 0;

            synchronized (this) { // prevent close(), concurrent next()
                while (true) {

                    if (!scanner.next()) {
                        if (!sd.isOnline() && !sd.isOpen()) {
                            result.setAborted();
                        } else {
                            scanner.close();
                            result.setEnd();
                        }
                        break;
                    }

                    Triple t = scanner.peek();

                    if (null != pattern && !sd.index.getOrder().comparator().match(t, pattern)) {
                        scanner.close();
                        result.setDone();
                        break;
                    }

                    if (filterDeletes) {
                        if (t.isDelete()) {
                            scanner.pop();
                            continue;
                        }
                    }

                    if (!out.add(t)) {
                        // buffer is full
                        if (0 == count) {
                            // large triple
                            return new ScanResult(id, scanner.pop(), sd.index.getOrder());
                        }
                        break;
                    }
                    scanner.pop();

                    count++;
                    if (0 < limit && count == limit) {
                        scanner.close();
                        result.setAborted();
                        break;
                    }
                }
            } // synchronized (this)

            return result;
        }

        void setPrefetch(Future<ScanResult> prefetch) {
            this.prefetch = prefetch;
        }

        synchronized void close() {
            try {
                scanner.close();
            } catch (IOException ex) {
                LOG.error("Error closing segment scanner: ", ex);
            }
            scanner = null;
        }

        boolean isClosed() {
            return null == scanner;
        }

    }

    static class PrefetchTask implements Callable<ScanResult> {
        private final SegmentScanner scanner;

        PrefetchTask(SegmentScanner scanner) {
            this.scanner = scanner;
        }

        @Override
        public ScanResult call() throws Exception {
            return scanner.next(0, true);
        }
    }

    void prefetch(SegmentScanner scanner) {
        PrefetchTask task = new PrefetchTask(scanner);
        try {
            Future<ScanResult> prefetch = prefetcherPool.submit(task);
            scanner.setPrefetch(prefetch);
        } catch (RejectedExecutionException ex) {
            // skip prefetch in this case
            LOG.warn("Prefetch rejected");
        }
    }

    SegmentScanner getScanner(long scannerId) throws IOException {
        SegmentScanner scanner;
        synchronized (scanners) {
            scanner = scanners.get(Long.valueOf(scannerId));
        }
        if (null != scanner) {
            scanner.touch();
            if (scanner.isClosed()) {
                // the maintenance thread just closed it...
                scanner = null;
            }
        }
        return scanner;
    }

    @Override
    public void close(long scannerId) throws IOException {
        SegmentScanner scanner = getScanner(scannerId);
        if (null != scanner) {
            scanner.close();
            synchronized (scanners) {
                scanners.remove(scanner.getId());
            }
        }
    }

    @Override
    public ScanResult next(long scannerId, int limit) throws IOException {
        SegmentScanner scanner = getScanner(scannerId);
        if (null == scanner) {
            return ScanResult.EXPIRED;
        }
        ScanResult result = scanner.next(limit);
        if (result.isClosed()) {
            synchronized (scanners) {
                scanners.remove(scanner.getId());
            }
        } else {
            prefetch(scanner);
        }
        return result;
    }

    @Override
    public ScanResult openScanner(long segmentId, int limit) throws IOException {
        return openScanner(segmentId, true, limit, null);
    }

    @Override
    public ScanResult openScanner(long segmentId, boolean filterDeletes, int limit) throws IOException {
        return openScanner(segmentId, filterDeletes, limit, null);
    }

    @Override
    public ScanResult openScanner(long segmentId, boolean filterDeletes, int limit, Triple pattern)
            throws IOException {
        return openScanner(segmentId, filterDeletes, limit, pattern, pattern);
    }

    @Override
    public ScanResult openScanner(long segmentId, boolean filterDeletes, int limit, Triple pattern, Triple seek)
            throws IOException {
        SegmentDescriptor sd = null;
        segmentsLock.readLock().lock();
        sd = segments.get(segmentId);
        segmentsLock.readLock().unlock();

        if (null == sd) {
            return ScanResult.INVALID;
        }

        if (null != seek && seek.isMagic()) {
            // Segment can't handle the magic triple
            seek = null;
        }

        TripleScanner tripleScanner = null == seek ? sd.openScanner() : sd.openScanner(seek);
        if (null == tripleScanner) {
            return ScanResult.EMPTY;
        }

        SegmentScanner scanner = new SegmentScanner(createScannerId(), sd, tripleScanner, filterDeletes, pattern);
        ScanResult result = scanner.next(limit);

        if (!result.isClosed()) {
            synchronized (scanners) {
                scanners.put(scanner.getId(), scanner);
            }
        } else {
            prefetch(scanner);
        }
        return result;
    }

    private int clientIdCounter = 0;

    @Override
    public synchronized StoreInfo getStoreInfo() {
        return new StoreInfo(router.getCollations(), router.getPeers(), clientIdCounter++);
    }

}