org.commoncrawl.service.listcrawler.CacheManager.java Source code

Introduction

Here is the source code for org.commoncrawl.service.listcrawler.CacheManager.java
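
Before the listing, a minimal usage sketch. This is an illustration only: the CacheManagerExample class, the paths, and the URL are hypothetical, while the CacheManager constructor, initialize(int), checkCacheForItem, and the CacheItemCheckCallback interface all appear in the source below.

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.service.listcrawler.CacheManager;

public class CacheManagerExample {
    public static void main(String[] args) throws Exception {
        // event loop that drives async callbacks and the cache flush timer
        EventLoop eventLoop = new EventLoop();
        eventLoop.start();
        // remote file system that backs the flushed cache segments
        FileSystem fs = FileSystem.get(new Configuration());
        // local directory that holds the active cache log
        CacheManager manager = new CacheManager(fs, new File("/tmp/proxy/localData"), eventLoop);
        // pass 0 for a full init, or OR together the INIT_FLAG_SKIP_* constants
        manager.initialize(0);
        // async lookup - exactly one of the two callbacks fires later
        manager.checkCacheForItem("http://example.com/", new CacheManager.CacheItemCheckCallback() {
            public void cacheItemAvailable(String url, CacheItem item) {
                System.out.println("cache hit for " + url);
            }

            public void cacheItemNotFound(String url) {
                System.out.println("cache miss for " + url);
            }
        });
    }
}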

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.listcrawler;

import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.nio.ByteBuffer;
import java.rmi.server.UID;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeSet;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.async.ConcurrentTask;
import org.commoncrawl.async.ConcurrentTask.CompletionCallback;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.service.listcrawler.HDFSFlusherThread.IndexDataFileTriple;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.SessionIDURLNormalizer;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.ArcFileItemUtils;
import org.commoncrawl.util.CCStringUtils;
import org.junit.Assert;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;

/** 
 * Class that encapsulates most of the serving side of the crawler
 * 
 * @author rana
 *
 */
public class CacheManager {

    private HashMultimap<Long, Long> _fingerprintToLocalLogPos = HashMultimap.create();
    private LinkedList<CacheItem> _writeQueue = new LinkedList<CacheItem>();
    private LinkedBlockingQueue<CacheWriteRequest> _writeRequestQueue = new LinkedBlockingQueue<CacheWriteRequest>();
    private LinkedBlockingQueue<CacheFlushRequest> _hdfsFlushRequestQueue = new LinkedBlockingQueue<CacheFlushRequest>();
    private Vector<HDFSFileIndex> _hdfsIndexList = new Vector<HDFSFileIndex>();

    // private LinkedBlockingQueue<CacheLoadRequest> _loadRequestQueue = new LinkedBlockingQueue<CacheLoadRequest>();
    private static final int DEFAULT_DISK_READER_THREADS = 8 * 4;
    private static final int HDFS_READER_THREADS = 3 * 8 * 4;
    private static final int LOG_ACCESS_SEMAPHORE_COUNT = 100 + 1;
    static final int ITEM_RECORD_TRAILING_BYTES = 4;
    // flush the local cache to hdfs once the local cache document counts exceeds this number 
    public static final int LOCAL_CACHE_FLUSH_THRESHOLD = 10000;
    // proxy cache location in hdfs 
    private static final String PROXY_CACHE_LOCATION = "crawl/proxy/cache";
    static final String PROXY_CACHE_FILE_DATA_PREFIX = "cacheData";
    static final String PROXY_CACHE_FILE_INDEX_PREFIX = "cacheIndex";

    private static final int CACHE_POLL_TIMER_INTERVAL = 10000;

    public static final Log LOG = LogFactory.getLog(CacheManager.class);

    /** log file header **/
    private LocalLogFileHeader _header = new LocalLogFileHeader();

    /** local data directory **/
    File _localDataDirectory;

    /** remote data directory **/
    Path _remoteDataDirectory;

    /** event loop **/
    EventLoop _eventLoop;

    /** reader thread pool **/
    ExecutorService _cacheLoadThreadPool = null;
    /** hdfs loader thread pool **/
    ExecutorService _hdfsLoaderPool = null;
    /** number of cache writer threads **/
    public static final int CACHE_WRITER_THREADS = 8;
    /** local cache writer thread(S) **/
    Thread _writerThreads[] = null;
    /** hdfs flusher thread **/
    Thread _hdfsFlusherThread = null;
    /** hdfs flusher active indicator **/
    boolean _hdfsFlusherActive = false;
    /** file system object **/
    FileSystem _remoteFileSystem = null;
    /** local log virtual offset **/
    long _localLogStartOffset = 0;
    /** local log access semaphore (readers take one permit; the flusher takes all) **/
    Semaphore _localLogAccessSempahore = new Semaphore(LOG_ACCESS_SEMAPHORE_COUNT);
    /** local log write mutex **/
    Semaphore _localLogWriteAccessSemaphore = new Semaphore(1);
    /** session id normalizer **/
    SessionIDURLNormalizer _sessionIdNormalizer = new SessionIDURLNormalizer();
    // cache flush timer
    Timer _cacheFlushTimer;
    /** cache flush threshold **/
    int _cacheFlushThreshold = LOCAL_CACHE_FLUSH_THRESHOLD;

    /** internal constructor for test purposes  
     * 
     */
    private CacheManager(EventLoop eventLoop) {
        _localDataDirectory = new File("/tmp/proxy/localData");
        _localDataDirectory.mkdirs();
        _remoteDataDirectory = new Path("/tmp/proxy/remoteData");

        if (eventLoop == null) {
            _eventLoop = new EventLoop();
            _eventLoop.start();
        } else {
            _eventLoop = eventLoop;
        }
        Configuration conf = new Configuration();

        conf.addResource("nutch-default.xml");
        conf.addResource("nutch-site.xml");
        conf.addResource("hadoop-default.xml");
        conf.addResource("hadoop-site.xml");
        conf.addResource("commoncrawl-default.xml");
        conf.addResource("commoncrawl-site.xml");

        CrawlEnvironment.setHadoopConfig(conf);

        try {
            _remoteFileSystem = FileSystem.getLocal(conf);
            _remoteFileSystem.mkdirs(_remoteDataDirectory);
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            throw new RuntimeException("Could not initialize hdfs connection");
        }
    }

    /** DocumentCache - proper constructor
     * 
     */
    public CacheManager(FileSystem remoteFileSystem, File dataDirectory, EventLoop eventLoop) {
        _localDataDirectory = dataDirectory;
        _eventLoop = eventLoop;
        _remoteFileSystem = remoteFileSystem;
        _remoteDataDirectory = new Path(PROXY_CACHE_LOCATION);

    }

    public static final int INIT_FLAG_SKIP_CACHE_WRITER_INIT = 1;
    public static final int INIT_FLAG_SKIP_HDFS_WRITER_INIT = 2;
    public static final int INIT_FLAG_SKIP_INDEX_LOAD = 4;

    /** 
     * set cache flush threshold
     *  best to call this before initialize
     */
    public void setCacheFlushThreshold(int cacheFlushThreshold) {
        _cacheFlushThreshold = cacheFlushThreshold;
    }

    public int getCacheFlushThreshold() {
        return _cacheFlushThreshold;
    }

    /**
     * initialize the DocumentCache
     * 
     * @throws IOException
     */
    public void initialize(int initFlags) throws IOException {
        initializeActiveLog();
        if ((initFlags & INIT_FLAG_SKIP_INDEX_LOAD) == 0) {
            loadHDFSIndexFiles();
        }
        if ((initFlags & INIT_FLAG_SKIP_CACHE_WRITER_INIT) == 0) {
            startCacheWriterThread();
        }
        if ((initFlags & INIT_FLAG_SKIP_HDFS_WRITER_INIT) == 0) {
            startHDFSFlusherThread();
        }
        _cacheLoadThreadPool = Executors.newFixedThreadPool(DEFAULT_DISK_READER_THREADS);
        _hdfsLoaderPool = Executors.newFixedThreadPool(HDFS_READER_THREADS);
        _cacheFlushTimer = new Timer(CACHE_POLL_TIMER_INTERVAL, true, new Timer.Callback() {

            @Override
            public void timerFired(Timer timer) {
                if (!_hdfsFlusherActive) {
                    synchronized (_header) {
                        // check status ...
                        if (_header._itemCount >= _cacheFlushThreshold) {
                            LOG.info("Local Cache Item Count:" + _header._itemCount
                                    + " Exceeded Cache Flush Threshold. Scheduling HDFS Flush");
                            CacheFlushRequest flushRequest = new CacheFlushRequest(
                                    _header._fileSize - LocalLogFileHeader.SIZE, (int) _header._itemCount);
                            _hdfsFlushRequestQueue.add(flushRequest);
                            _hdfsFlusherActive = true;
                        }
                    }
                }
            }

        });
        _eventLoop.setTimer(_cacheFlushTimer);

        LOG.info("Initialization Complete");
    }

    /**
     * shut down the DocumentCache manager
     * 
     */
    public void shutdown() {
        if (_cacheFlushTimer != null) {
            _eventLoop.cancelTimer(_cacheFlushTimer);
            _cacheFlushTimer = null;
        }
        if (_writerThreads != null) {
            LOG.info("Shuting down write threads");
            for (int i = 0; i < CACHE_WRITER_THREADS; ++i)
                _writeRequestQueue.add(new CacheWriteRequest());
            for (int i = 0; i < CACHE_WRITER_THREADS; ++i) {
                try {
                    _writerThreads[i].join();
                } catch (InterruptedException e1) {
                }
            }
            _writerThreads = null;
        }
        if (_cacheLoadThreadPool != null) {
            LOG.info("write thread terminated. shuting down reader threads");
            _cacheLoadThreadPool.shutdown();
            try {
                while (!_cacheLoadThreadPool.awaitTermination(5000, TimeUnit.MILLISECONDS)) {
                    LOG.info("waiting ... ");
                }
            } catch (InterruptedException e) {
            }
            LOG.info("reader thread terminated");
            _cacheLoadThreadPool = null;
        }

        if (_hdfsLoaderPool != null) {
            LOG.info("shuting down hdfs loader threads");
            _hdfsLoaderPool.shutdown();
            try {
                while (!_hdfsLoaderPool.awaitTermination(5000, TimeUnit.MILLISECONDS)) {
                    LOG.info("waiting ... ");
                }
            } catch (InterruptedException e) {
            }
            LOG.info("hdfs loads threads terminated");
            _hdfsLoaderPool = null;
        }
    }

    /********************************************************************************************************/
    // CacheItemCheckCallback
    /********************************************************************************************************/

    public static interface CacheItemCheckCallback {
        public void cacheItemAvailable(String url, CacheItem item);

        public void cacheItemNotFound(String url);
    }

    /**
     * check whether the specified url is present in the cache (async)
     * 
     * @param url      the url to look up
     * @param callback receives cacheItemAvailable or cacheItemNotFound
     */
    public void checkCacheForItem(final String url, final CacheItemCheckCallback callback) {
        try {
            // get the normalized url 
            String normalizedURL = normalizeURL(url);
            // get the fingerprint 
            long urlfp = URLFingerprint.generate64BitURLFPrint(normalizedURL);
            // delegate to properly qualified method
            checkCacheForItem(normalizedURL, urlfp, callback);
        } catch (MalformedURLException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            queueCacheItemNotFoundCallback(callback, url);
        }
    }

    /**
     * check whether the specified url is present in the cache, blocking the
     * calling worker thread until the lookup completes
     * 
     * @param url the url to look up
     * @return the cached item, or null if not found (or the url is malformed)
     */
    public CacheItem checkCacheForItemInWorkerThread(final String url) {
        try {
            // get the normalized url 
            String normalizedURL = normalizeURL(url);
            // get the fingerprint 
            long urlfp = URLFingerprint.generate64BitURLFPrint(normalizedURL);
            // delegate to properly qualified method
            return checkCacheForItemInWorkerThread(normalizedURL, urlfp);
        } catch (MalformedURLException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            return null;
        }
    }

    private static long dateFromCacheItem(CacheItem cacheItem) {
        NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(cacheItem.getHeaderItems());
        CrawlURLMetadata metadata = new CrawlURLMetadata();
        try {
            HttpHeaderInfoExtractor.parseHeaders(headers, metadata);
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            return 0;
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPDATE)) {
            return metadata.getHttpDate();
        } else if (metadata.isFieldDirty(CrawlURLMetadata.Field_LASTMODIFIEDTIME)) {
            return metadata.getLastModifiedTime();
        } else if (metadata.isFieldDirty(CrawlURLMetadata.Field_EXPIRES)) {
            return metadata.getExpires();
        }
        return 0;
    }

    /** check cache via fingerprint - this call blocks and should not be used in an async context
     * 
     * @param urlFingerprint the 64-bit url fingerprint to look up
     * @param returnDate if true, return the matching document's best-guess date
     * @return 0 if no matching document exists; otherwise 1, or, if returnDate
     *         is set, the document date in milliseconds (approximated by the
     *         current time when no header date is available)
     */
    public long checkCacheForFingerprint(long urlFingerprint, boolean returnDate) {

        synchronized (this) {
            for (CacheItem item : _writeQueue) {
                if (item.getUrlFingerprint() == urlFingerprint) {
                    if (returnDate) {
                        long dateOut = dateFromCacheItem(item);
                        // if no date found, use the current date as an approximation...
                        return (dateOut != 0) ? dateOut : System.currentTimeMillis();
                    } else
                        return 1;
                }
            }
        }

        synchronized (this) {
            if (_fingerprintToLocalLogPos.get(urlFingerprint).size() != 0) {
                // assume a recent date as an approximation 
                return System.currentTimeMillis();
            }
        }

        // now check hdfs indexes 
        ImmutableList<HDFSFileIndex> indexList = null;

        synchronized (CacheManager.this) {
            indexList = ImmutableList.copyOf(_hdfsIndexList);
        }

        // collect candidate dates from hdfs index hits ...
        TreeSet<Long> cachedItems = new TreeSet<Long>();

        for (HDFSFileIndex index : Lists.reverse(indexList)) {
            try {
                CacheItem itemFound = index.findItem(urlFingerprint, !returnDate);
                if (itemFound != null) {
                    if (returnDate) {
                        // get item date from headers .
                        long itemDate = dateFromCacheItem(itemFound);
                        if (itemDate == 0) {
                            itemDate = index.getIndexTimestamp();
                            // if item date still 0, this is BAD !!!
                            if (itemDate == 0) {
                                LOG.error("!!!!!!UNABLE TO PARSE INDEX TIMESTAMP:" + index.getIndexDataPath());
                                itemDate = 1L;
                            }
                        }
                        // ok add it to the map ... 
                        cachedItems.add(itemDate);
                    } else {
                        return 1;
                    }
                }
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
        }
        if (returnDate && cachedItems.size() != 0) {
            return cachedItems.last();
        }
        return 0;
    }
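
    // Example (sketch, hypothetical caller) - the long return value encodes both
    // lookup modes described in the javadoc above:
    //   boolean exists = manager.checkCacheForFingerprint(fp, false) != 0;
    //   long approxDate = manager.checkCacheForFingerprint(fp, true); // 0 == miss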

    /**
     * check cache for url by fingerprint (async)
     * 
     * @param normalizedURL  the canonicalized url
     * @param urlFingerprint the url's 64-bit fingerprint
     * @param callback       receives cacheItemAvailable or cacheItemNotFound
     */
    public void checkCacheForItem(final String normalizedURL, final long urlFingerprint,
            final CacheItemCheckCallback callback) {

        // first check local item cache ...
        CacheItem cachedItemOut = null;

        synchronized (this) {
            for (CacheItem item : _writeQueue) {
                if (item.getUrlFingerprint() == urlFingerprint) {
                    cachedItemOut = item;
                    break;
                }
            }
        }
        // if found initiate immediate callback 
        if (cachedItemOut != null) {
            queueCacheItemFoundCallback(callback, cachedItemOut);
        } else {
            Long[] fpToItemCache = new Long[0];
            synchronized (this) {
                // now check local cache first ... 
                fpToItemCache = _fingerprintToLocalLogPos.get(urlFingerprint).toArray(fpToItemCache);
            }

            if (fpToItemCache.length != 0) {
                queueLocalCacheLoadRequest(new CacheLoadRequest(normalizedURL, fpToItemCache, callback));
            } else {
                // if not found ... check hdfs cache ... 
                queueHDFSCacheLoadRequest(new CacheLoadRequest(normalizedURL, urlFingerprint, callback));
            }
        }
    }

    /**
     * check cache for url by fingerprint, blocking the calling worker thread
     * 
     * @param normalizedURL  the canonicalized url
     * @param urlFingerprint the url's 64-bit fingerprint
     * @return the cached item, or null if not found
     */
    public CacheItem checkCacheForItemInWorkerThread(final String normalizedURL, final long urlFingerprint) {

        // first check local item cache ...
        CacheItem cachedItemOut = null;

        synchronized (this) {
            for (CacheItem item : _writeQueue) {
                if (item.getUrlFingerprint() == urlFingerprint) {
                    cachedItemOut = item;
                    break;
                }
            }
        }
        // if found, return it immediately 
        if (cachedItemOut != null) {
            // callback.cacheItemAvailable(cachedItemOut.getUrl(), cachedItemOut);
            return cachedItemOut;
        } else {
            Long[] fpToItemCache = new Long[0];
            synchronized (this) {
                // now check local cache first ... 
                fpToItemCache = _fingerprintToLocalLogPos.get(urlFingerprint).toArray(fpToItemCache);
            }

            if (fpToItemCache.length != 0) {
                return queueLocalCacheLoadRequestInWorkerThread(
                        new CacheLoadRequest(normalizedURL, fpToItemCache, null));
            } else {
                // if not found ... check hdfs cache ... 
                return queueHDFSCacheLoadRequestInWorkerThread(
                        new CacheLoadRequest(normalizedURL, urlFingerprint, null));
            }
        }
    }

    /** 
     * cache this item
     * 
     * @param item - the item to cache 
     * @param optionalSemaphore an optional semaphore that is released once the
     *        underlying io operations complete (successfully or not)
     */
    public void cacheItem(CacheItem item, Semaphore optionalSemaphore) {
        synchronized (this) {
            _writeQueue.add(item);
            _writeRequestQueue.add(new CacheWriteRequest(item.getUrlFingerprint(), item, optionalSemaphore));
        }
    }
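
    // Example (sketch, hypothetical caller): to block until the write completes,
    // hand in a zero-permit semaphore, which writeRequestComplete /
    // writeRequestFailed release when the io finishes:
    //   Semaphore done = new Semaphore(0);
    //   manager.cacheItem(item, done);
    //   done.acquireUninterruptibly();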

    EventLoop getEventLoop() {
        return _eventLoop;
    }

    long getLocalLogFilePos() {
        long filePosOut = 0;
        synchronized (_header) {
            filePosOut = _header._fileSize;
        }
        return filePosOut;
    }

    byte[] getLocalLogSyncBytes() {
        return _header._sync;
    }

    LinkedBlockingQueue<CacheWriteRequest> getWriteRequestQueue() {
        return _writeRequestQueue;
    }

    File getLocalDataDirectory() {
        return _localDataDirectory;
    }

    Path getRemoteDataDirectory() {
        return _remoteDataDirectory;
    }

    /** startCacheWriterThread
     * 
     */
    private void startCacheWriterThread() {

        _writerThreads = new Thread[CACHE_WRITER_THREADS];
        for (int i = 0; i < CACHE_WRITER_THREADS; ++i) {
            _writerThreads[i] = new Thread(new CacheWriterThread(this));
            _writerThreads[i].start();
        }
    }

    /** startHDFSFlusherThread
     * 
     */
    private void startHDFSFlusherThread() {

        _hdfsFlusherThread = new Thread(new HDFSFlusherThread(this));
        _hdfsFlusherThread.start();
    }

    /**  load hdfs indexes 
     * 
     */
    private synchronized void loadHDFSIndexFiles() throws IOException {
        //scan remote file system for index files ... 
        FileStatus indexFiles[] = getRemoteFileSystem()
                .globStatus(new Path(getRemoteDataDirectory(), PROXY_CACHE_FILE_INDEX_PREFIX + "*"));
        // iterate files 
        for (FileStatus indexFile : indexFiles) {
            LOG.info("Found Remote Index File:" + indexFile.getPath() + " Scanning for valid local copy");
            File localPath = new File(getLocalDataDirectory(), indexFile.getPath().getName());
            if (!localPath.exists() || localPath.length() != indexFile.getLen()) {
                LOG.info("Local Index File:" + localPath.getAbsolutePath() + " Not Found. Copying...");
                getRemoteFileSystem().copyToLocalFile(indexFile.getPath(), new Path(localPath.getAbsolutePath()));
                LOG.info("Remote Index File:" + indexFile.getPath() + " copied to:" + localPath.getAbsolutePath());
            }
            // extract timestamp from index name 
            long indexTimestamp = Long
                    .parseLong(indexFile.getPath().getName().substring(PROXY_CACHE_FILE_INDEX_PREFIX.length() + 1));
            // construct data file path 
            Path remoteDataPath = new Path(getRemoteDataDirectory(),
                    PROXY_CACHE_FILE_DATA_PREFIX + "-" + indexTimestamp);
            // now load the index ...
            LOG.info("Loading Index from:" + localPath.getAbsolutePath() + " Data Path:" + remoteDataPath);
            HDFSFileIndex indexObject = new HDFSFileIndex(getRemoteFileSystem(), localPath, remoteDataPath);
            LOG.info("Loaded Index from:" + localPath.getAbsolutePath());
            _hdfsIndexList.add(indexObject);
        }
    }

    // shrink the log file by the desired amount and update the header 
    private final void flushLocalLog(final long bytesToRemove, final int itemsToRemove,
            final List<FingerprintAndOffsetTuple> flushedTupleList,
            final ArrayList<IndexDataFileTriple> tempFileTriples) {

        LOG.info("Acquiring Log Access Semaphores");
        // first boost this thread's priority ... 
        int originalThreadPriority = Thread.currentThread().getPriority();
        Thread.currentThread().setPriority(Thread.MAX_PRIORITY);
        // next acquire all permits to the local access log ... block until we get there ... 
        getLocalLogAccessSemaphore().acquireUninterruptibly(LOG_ACCESS_SEMAPHORE_COUNT);
        // now that we have all the semaphores we need, reduce the thread's priority to normal
        Thread.currentThread().setPriority(originalThreadPriority);
        LOG.info("Acquired ALL Log Access Semaphores");

        long timeStart = System.currentTimeMillis();

        // now we have exclusive access to the local transaction log ... 
        File activeLogFilePath = getActiveLogFilePath();
        File checkpointLogFilePath = getCheckpointLogFilePath();
        try {
            // delete checkpoint file if it existed ... 
            checkpointLogFilePath.delete();
            // now rename activelog to checkpoint path 
            activeLogFilePath.renameTo(checkpointLogFilePath);

            long logFileConsolidationStartTime = System.currentTimeMillis();
            // now trap for exceptions in case something fails 
            try {
                // fix up the header ... 
                _header._fileSize -= bytesToRemove;
                _header._itemCount -= itemsToRemove;

                // open a old file and new file 
                RandomAccessFile newFile = new RandomAccessFile(activeLogFilePath, "rw");
                RandomAccessFile oldFile = new RandomAccessFile(checkpointLogFilePath, "r");

                LOG.info("Opened new and old files. New Header FileSize is:" + _header._fileSize + " ItemCount:"
                        + _header._itemCount);
                try {
                    // write out header ...
                    long bytesRemainingInLogFile = _header._fileSize;

                    LOG.info("Writing Header to New File. Bytes Remaining for Data are:" + bytesRemainingInLogFile);
                    // write header to new file ... 
                    _header.writeHeader(newFile);
                    // decrement bytes available ... 
                    bytesRemainingInLogFile -= LocalLogFileHeader.SIZE;

                    if (bytesRemainingInLogFile != 0) {
                        byte transferBuffer[] = new byte[(1 << 20) * 16];
                        LOG.info("Seeking old file past flushed data (pos:" + LocalLogFileHeader.SIZE
                                + bytesToRemove + ")");
                        // seek past old data ... 
                        oldFile.seek(LocalLogFileHeader.SIZE + bytesToRemove);
                        // and copy across remaining data 
                        while (bytesRemainingInLogFile != 0) {
                            int bytesToReadWriteThisIteration = Math.min((int) bytesRemainingInLogFile,
                                    transferBuffer.length);
                            // use readFully - a plain read() may return fewer bytes than requested
                            oldFile.readFully(transferBuffer, 0, bytesToReadWriteThisIteration);
                            newFile.write(transferBuffer, 0, bytesToReadWriteThisIteration);
                            LOG.info("Copied " + bytesToReadWriteThisIteration + " from Old to New");
                            bytesRemainingInLogFile -= bytesToReadWriteThisIteration;
                        }
                    }
                } finally {
                    if (newFile != null) {
                        newFile.close();
                    }
                    if (oldFile != null) {
                        oldFile.close();
                    }
                }
                // if we reached here then the checkpoint was successful ... 
                LOG.info("Checkpoint - Log Consolidation Successful! TOOK:"
                        + (System.currentTimeMillis() - logFileConsolidationStartTime));

                LOG.info("Loading Index Files");
                for (IndexDataFileTriple triple : tempFileTriples) {
                    LOG.info("Loading Index File:" + triple._localIndexFilePath);
                    final HDFSFileIndex fileIndex = new HDFSFileIndex(_remoteFileSystem, triple._localIndexFilePath,
                            triple._dataFilePath);
                    LOG.info("Loaded Index File");
                    // update hdfs index list ... 
                    synchronized (CacheManager.this) {
                        LOG.info("Adding HDFS Index to list");
                        _hdfsIndexList.addElement(fileIndex);
                    }
                }

                // create a semaphore to wait on 
                final Semaphore semaphore = new Semaphore(0);

                LOG.info("Scheduling Async Event");
                // now we need to schedule an async call to main thread to update data structures safely ... 
                _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

                    @Override
                    public void timerFired(Timer timer) {
                        LOG.info("Cleaning Map");

                        synchronized (CacheManager.this) {
                            // walk tuples 
                            for (FingerprintAndOffsetTuple tuple : flushedTupleList) {
                                //TODO: HACK!
                                // remove from collection ... 
                                _fingerprintToLocalLogPos.removeAll(tuple._fingerprint);
                            }
                        }
                        LOG.info("Increment Offset Info");
                        // finally increment locallog offset by bytes removed ... 
                        _localLogStartOffset += bytesToRemove;

                        LOG.info("Releasing Wait Semaphore");
                        //release wait sempahore 
                        semaphore.release();
                    }
                }));

                LOG.info("Waiting for Async Event to Complete");
                //wait for async operation to complete ...
                semaphore.acquireUninterruptibly();

                LOG.info("Async Event to Completed");
            } catch (IOException e) {
                LOG.error("Checkpoint Failed with Exception:" + CCStringUtils.stringifyException(e));
                // delete new file ... 
                activeLogFilePath.delete();
                // and rename checkpoint file to active file ... 
                checkpointLogFilePath.renameTo(activeLogFilePath);
            }
        } finally {
            LOG.info("Releasing ALL Log Access Semaphores. HELD FOR:" + (System.currentTimeMillis() - timeStart));
            getLocalLogAccessSemaphore().release(LOG_ACCESS_SEMAPHORE_COUNT);
        }
    }

    /** called by hdfs log flusher thread when a cache flush is complete 
     * 
     * @param request
     * @param tupleListOut
     * @param localIndexFileName
     * @param remoteDataFileName
     * @throws IOException
     */
    void hdfsCacheFlushRequestComplete(CacheFlushRequest request, List<FingerprintAndOffsetTuple> tupleListOut,
            ArrayList<IndexDataFileTriple> tempFileList) throws IOException {
        // ok we have been called from the hdfs worker thread, and it is in the middle of a flush transaction
        // and it wants us to atomically update our cache based on its recent successful flush operation
        flushLocalLog(request._bytesToFlush, request._itemsToFlush, tupleListOut, tempFileList);

        _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

            @Override
            public void timerFired(Timer timer) {
                // reset hdfs flusher thread variable 
                _hdfsFlusherActive = false;
            }
        }));

    }

    /** called by hdfs log flusher thread if a cache flush request fails
     * 
     * @param request
     */
    public void hdfsCacheFlushRequestFailed(CacheFlushRequest request) {

        LOG.error("HDFS Cache Flush Failed");

        _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

            @Override
            public void timerFired(Timer timer) {
                // reset hdfs flusher thread variable 
                _hdfsFlusherActive = false;
            }
        }));
    }

    Semaphore getLocalLogAccessSemaphore() {
        return _localLogAccessSempahore;
    }

    Semaphore getLocalLogWriteAccessSemaphore() {
        return _localLogWriteAccessSemaphore;
    }

    FileSystem getRemoteFileSystem() {
        return _remoteFileSystem;
    }

    LinkedBlockingQueue<CacheFlushRequest> getHDFSFlushRequestQueue() {
        return _hdfsFlushRequestQueue;
    }

    void writeRequestFailed(final CacheWriteRequest request, final IOException e) {
        LOG.error("Failed to complete write request for Item:+ " + request._item.getUrl() + " with Exception:"
                + CCStringUtils.stringifyException(e));
        synchronized (CacheManager.this) {
            // remove this item from the write queue; the write failed, so there is nothing to index ...
            _writeQueue.remove(request._item);
        }
        // ok finally, if completion semaphore is set... release it  
        if (request._optionalSemaphore != null) {
            request._optionalSemaphore.release();
        }
    }

    void writeRequestComplete(final CacheWriteRequest request, final long absoluteFilePosition) {
        synchronized (CacheManager.this) {
            // ok time to find this item in the write queue and move it to the long term position queue ...
            _writeQueue.remove(request._item);
            // now ...  push it into long term lookup map ... 
            _fingerprintToLocalLogPos.put(request._itemFingerprint, _localLogStartOffset + absoluteFilePosition);
        }
        // ok finally, if completion semaphore is set... release it  
        if (request._optionalSemaphore != null) {
            request._optionalSemaphore.release();
        }
    }

    /**
     * queueCacheItemFoundCallback - helper method used to dispatch async callback
     * 
     * @param callback
     * @param itemFound
     */
    private void queueCacheItemFoundCallback(final CacheItemCheckCallback callback, final CacheItem itemFound) {
        _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

            @Override
            public void timerFired(Timer timer) {
                callback.cacheItemAvailable(itemFound.getUrl(), itemFound);
            }
        }));
    }

    /**
     * queueCacheItemNotFoundCallback - helper method used to dispatch async callback
     * 
     * @param callback
     * @param url
     */

    private void queueCacheItemNotFoundCallback(final CacheItemCheckCallback callback, final String url) {
        _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {

            @Override
            public void timerFired(Timer timer) {
                callback.cacheItemNotFound(url);
            }
        }));
    }

    /**
     * loadCacheItemFromDisk - load a single cache item from disk 
     * 
     * @param file
     * @param optTargetURL
     * @param location
     * @return the deserialized item, or null if validation fails
     * @throws IOException
     */
    private CacheItem loadCacheItemFromDisk(FileInputStream file, String optTargetURL, long location)
            throws IOException {

        long timeStart = System.currentTimeMillis();

        // and read out the Item Header ...  
        CacheItemHeader itemHeader = new CacheItemHeader();
        itemHeader.readHeader(new DataInputStream(file));
        // see if it is valid ... 
        if (!Arrays.equals(itemHeader._sync, _header._sync)) {
            LOG.error("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                    + " failed - corrupt sync bytes detected!!!");
        } else {
            CRC32 crc32 = new CRC32();
            // ok deserialize the bytes ... 
            CacheItem item = new CacheItem();
            CheckedInputStream checkedStream = new CheckedInputStream(file, crc32);
            DataInputStream itemStream = new DataInputStream(checkedStream);
            item.readFields(itemStream);
            // read the content buffer length 
            int contentBufferLen = itemStream.readInt();
            if (contentBufferLen != 0) {
                byte data[] = new byte[contentBufferLen];
                // readFully - a plain read() may not fill the buffer in one call
                itemStream.readFully(data);
                item.setContent(new Buffer(data));
            }

            // capture the computed crc 
            long crcValueComputed = crc32.getValue();
            // read disk crc 
            long crcValueOnDisk = itemStream.readLong();
            // validate 
            if (crcValueComputed == crcValueOnDisk) {
                String canonicalURL = URLUtils.canonicalizeURL(item.getUrl(), true);
                if (optTargetURL.length() == 0 || optTargetURL.equals(canonicalURL)) {
                    if (isValidCacheItem(item)) {
                        LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                                + " completed in:" + (System.currentTimeMillis() - timeStart));
                        return item;
                    } else {
                        LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                                + " failed with invalid result code");
                    }

                } else {
                    LOG.info("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                            + " failed with url mismatch. record url:" + item.getUrl());
                }
            } else {
                LOG.error("### Item Lookup for URL:" + optTargetURL + " Record at:" + location
                        + " failed - crc mismatch!!!");
            }
        }
        return null;
    }

    /** is this a valid cache item for servicing a query request 
     * 
     */
    private static boolean isValidCacheItem(CacheItem item) {

        if ((item.getFlags()
                & (CacheItem.Flags.Flag_IsPermanentRedirect | CacheItem.Flags.Flag_IsTemporaryRedirect)) != 0) {
            return true;
        }
        // parse response code in headers ... 
        CrawlURLMetadata metadata = new CrawlURLMetadata();

        HttpHeaderInfoExtractor.parseStatusLine(item.getHeaderItems().get(0).getItemValue(), metadata);

        if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPRESULTCODE)) {
            if (metadata
                    .getHttpResultCode() == 200 /*|| (metadata.getHttpResultCode()>=400 && metadata.getHttpResultCode() <500)*/) {
                return true;
            } else {
                LOG.info("Rejecting Cache Item:" + item.getUrl() + " With Invalid Result Code:"
                        + metadata.getHttpResultCode() + " ActualValue:"
                        + item.getHeaderItems().get(0).getItemValue());
            }
        }
        return false;
    }

    /** queue up an hdfs cache load request
     * 
     */
    private void queueHDFSCacheLoadRequest(final CacheLoadRequest loadRequest) {

        _hdfsLoaderPool.submit(new ConcurrentTask<CacheItem>(_eventLoop, new Callable<CacheItem>() {

            @Override
            public CacheItem call() throws Exception {

                LOG.info("Executing HDFS Index Search Thread for URL:" + loadRequest._targetURL);

                ImmutableList<HDFSFileIndex> indexList = null;

                synchronized (CacheManager.this) {
                    long timeStart = System.currentTimeMillis();
                    indexList = ImmutableList.copyOf(_hdfsIndexList);
                    long timeEnd = System.currentTimeMillis();
                    // LOG.info("#### TIMER - indexList Copy Took:" + (timeEnd-timeStart));
                }
                long timeStart = System.currentTimeMillis();
                LOG.info("Starting Search of:" + indexList.size() + " hdfs indexes for fp:"
                        + loadRequest._fingerprint);
                for (HDFSFileIndex index : Lists.reverse(indexList)) {
                    CacheItem item = index.findItem(loadRequest._fingerprint, false);

                    if (item != null) {
                        LOG.info("Found Hit for fingerprint:" + loadRequest._fingerprint + " URL:" + item.getUrl()
                                + " IN:" + (System.currentTimeMillis() - timeStart));
                        return item;
                    }
                }
                LOG.info("FAILED TO FIND Hit during for fingerprint:" + loadRequest._fingerprint + " IN:"
                        + (System.currentTimeMillis() - timeStart));
                return null;
            }

        }, new CompletionCallback<CacheItem>() {

            @Override
            public void taskComplete(CacheItem loadResult) {
                if (loadResult != null && isValidCacheItem(loadResult)) {
                    LOG.info("### Item Load Request for URL:" + loadRequest._targetURL
                            + " Succeeded. Initiating Callback");
                    // initiate success callback 
                    loadRequest._callback.cacheItemAvailable(loadRequest._targetURL, loadResult);
                } else {
                    LOG.info("### Item Load Request for URL:" + loadRequest._targetURL
                            + " Failed with No-Item-Found");
                    // initiate failure callback 
                    loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
                }
            }

            @Override
            public void taskFailed(Exception e) {
                LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with FAILURE Reason:"
                        + CCStringUtils.stringifyException(e));
                // initiate failure callback 
                loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
            }

        }));
    }

    /** queue up an hdfs cache load request
     * 
     */
    private CacheItem queueHDFSCacheLoadRequestInWorkerThread(final CacheLoadRequest loadRequest) {

        // LOG.info("#### ENTERING queueHDFSCacheLoadRequestInWorkerThread");

        try {
            CacheItem loadResult = null;

            // LOG.info("Executing HDFS Index Search Thread for URL:" + loadRequest._targetURL);

            ImmutableList<HDFSFileIndex> indexList = null;

            synchronized (CacheManager.this) {
                indexList = ImmutableList.copyOf(_hdfsIndexList);
            }
            long timeStart = System.currentTimeMillis();

            // LOG.info("Starting Search of:" + indexList.size() + " hdfs indexes for fp:" + loadRequest._fingerprint);        
            for (HDFSFileIndex index : Lists.reverse(indexList)) {
                CacheItem item = index.findItem(loadRequest._fingerprint, false);

                if (item != null) {
                    LOG.info("Found Hit for fingerprint:" + loadRequest._fingerprint + " URL:" + item.getUrl()
                            + " IN:" + (System.currentTimeMillis() - timeStart));
                    loadResult = item;
                    break;
                }
            }
            LOG.info("FAILED TO FIND Hit during for fingerprint:" + loadRequest._fingerprint + " IN:"
                    + (System.currentTimeMillis() - timeStart));

            if (loadResult != null && isValidCacheItem(loadResult)) {
                // LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Succeeded. Initiating Callback");
                // reset pending to zero so no other load requests satisfy callback 
                // loadRequest._callback.cacheItemAvailable(loadRequest._targetURL,loadResult);
                return loadResult;
            } else {
                // LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with No-Item-Found");
                // if pending zero ... initiate failure callback 
                //loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
            }
        } catch (Exception e) {
            LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with FAILURE Reason:"
                    + CCStringUtils.stringifyException(e));
            // if pending zero ... initiate failure callback 
            // loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
        } finally {
            //LOG.info("#### EXITING queueHDFSCacheLoadRequestInWorkerThread");
        }
        return null;
    }

    /**
     * queue a cache load request via a background thread 
     * 
     * @param loadRequest
     */
    private void queueLocalCacheLoadRequest(final CacheLoadRequest loadRequest) {
        // queue up requests into the thread pool executor (for now)
        for (final Long location : loadRequest._loacations) {

            _cacheLoadThreadPool.submit(new ConcurrentTask<CacheItem>(_eventLoop, new Callable<CacheItem>() {

                @Override
                public CacheItem call() throws Exception {

                    //LOG.info("### Local Cache Loader Called. Acquiring Semaphore");
                    getLocalLogAccessSemaphore().acquireUninterruptibly();
                    //LOG.info("### Local Cache Loader Called. Acquired Semaphore");

                    // now set up an exception handler block to ensure that we release the semaphore
                    try {
                        //LOG.info("### Item Loading Item for URL:" + loadRequest._targetURL + " at Pos:" + location.longValue());

                        // now that we have acquired the semaphore ... validate position against current log file offset ...
                        if (location < _localLogStartOffset) {
                            LOG.error("### Load Request for Potentially Flushed Item. Location Request:" + location
                                    + " Current LogStartOffset:" + _localLogStartOffset);
                        } else {
                            long timeStart = System.currentTimeMillis();

                            // we got a location ... initiate a disk read to fetch the serialized CacheItem
                            FileInputStream file = new FileInputStream(getActiveLogFilePath());

                            try {
                                // seek to item location ...
                                file.skip(location.longValue() - _localLogStartOffset);
                                return loadCacheItemFromDisk(file, loadRequest._targetURL, location.longValue());
                            } catch (IOException e) {
                                LOG.error(CCStringUtils.stringifyException(e));
                            } finally {
                                // file.getFD().sync();
                                file.close();
                            }
                        }
                        return null;
                    } finally {
                        //LOG.info("### Local Cache Loader Releasing Semaphore");
                        getLocalLogAccessSemaphore().release();
                    }
                }

            }, new CompletionCallback<CacheItem>() {

                @Override
                public void taskComplete(CacheItem loadResult) {
                    if (loadResult != null) {
                        // LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Succeeded. Initiating Callback");
                        // reset pending to zero so no other load requests satisfy callback 
                        loadRequest._pendingItemCount = 0;
                        loadRequest._callback.cacheItemAvailable(loadRequest._targetURL, loadResult);
                    } else {
                        // on failure reduce pending count ... 
                        loadRequest._pendingItemCount--;

                        if (loadRequest._pendingItemCount == 0) {
                            // LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with No-Item-Found");
                            // if pending zero ... initiate failure callback 
                            loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
                        }
                    }
                }

                @Override
                public void taskFailed(Exception e) {
                    // on failure reduce pending count ... 
                    loadRequest._pendingItemCount--;

                    if (loadRequest._pendingItemCount == 0) {
                        // LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with FAILURE Reason:" + CCStringUtils.stringifyException(e));
                        // if pending zero ... initiate failure callback 
                        loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
                    }
                }

            }));

        }
    }

    /**
     * execute a cache load request synchronously on the calling worker thread 
     * 
     * @param loadRequest
     * @return the cached item, or null if not found
     */
    private CacheItem queueLocalCacheLoadRequestInWorkerThread(final CacheLoadRequest loadRequest) {
        // LOG.info("#### ENTERING queueLocalCacheLoadRequestInWorkerThread");
        try {
            CacheItem loadResult = null;

            // probe each candidate log location synchronously
            for (final Long location : loadRequest._loacations) {

                LOG.info("### Local Cache Loader Called. Acquiring Semaphore");
                getLocalLogAccessSemaphore().acquireUninterruptibly();
                LOG.info("### Local Cache Loader Called. Acquired Semaphore");

                // now set up an exception handler block to ensure that we release the semaphore
                try {
                    // LOG.info("### Item Loading Item for URL:" + loadRequest._targetURL + " at Pos:" + location.longValue());

                    // now that we have acquired the semaphore ... validate position against current log file offset ...
                    if (location < _localLogStartOffset) {
                        LOG.error("### Load Request for Potentially Flushed Item. Location Request:" + location
                                + " Current LogStartOffset:" + _localLogStartOffset);
                    } else {
                        long timeStart = System.currentTimeMillis();

                        // we got a location ... initiate a disk read to fetch the serialized CacheItem
                        FileInputStream file = new FileInputStream(getActiveLogFilePath());

                        try {
                            // seek to item location ...
                            file.skip(location.longValue() - _localLogStartOffset);
                            loadResult = loadCacheItemFromDisk(file, loadRequest._targetURL, location.longValue());
                            if (loadResult != null) {
                                break;
                            }
                        } catch (IOException e) {
                            LOG.error(CCStringUtils.stringifyException(e));
                        } finally {
                            // file.getFD().sync();
                            file.close();
                        }
                    }
                } finally {
                    LOG.info("### Local Cache Loader Releasing Semaphore");
                    getLocalLogAccessSemaphore().release();
                }
            }

            if (loadResult != null) {
                LOG.info("### Item Load Request for URL:" + loadRequest._targetURL
                        + " Succeeded. Initiating Callback");
                // reset pending to zero so no other load requests satisfy callback 
                loadRequest._pendingItemCount = 0;
                // loadRequest._callback.cacheItemAvailable(loadRequest._targetURL,loadResult);
                return loadResult;
            } else {
                // on failure reduce pending count ... 
                loadRequest._pendingItemCount--;

                if (loadRequest._pendingItemCount == 0) {
                    LOG.info("### Item Load Request for URL:" + loadRequest._targetURL
                            + " Failed with No-Item-Found");
                    // if pending zero ... initiate failure callback 
                    // loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
                    return null;
                }
            }
        } catch (Exception e) {
            LOG.info("### Item Load Request for URL:" + loadRequest._targetURL + " Failed with FAILURE Reason:"
                    + CCStringUtils.stringifyException(e));
            // if pending zero ... initiate failure callback 
            //loadRequest._callback.cacheItemNotFound(loadRequest._targetURL);
        } finally {
            //LOG.info("#### EXITING queueLocalCacheLoadRequestInWorkerThread");
        }
        return null;
    }

    /** 
     * normalize a url to be canonical 
     * 
     * @param url
     * @return
     * @throws MalformedURLException
     */
    public static String normalizeURL(String url) throws MalformedURLException {
        return URLUtils.canonicalizeURL(url, true);
    }

    /** 
     * get fingerprint given url 
     */
    public static long getFingerprintGivenURL(String url) throws MalformedURLException {
        // get the normalized url 
        String normalizedURL = normalizeURL(url);
        // get the fingerprint 
        return URLFingerprint.generate64BitURLFPrint(normalizedURL);
    }
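
    // Example (sketch, hypothetical URL): both helpers canonicalize via
    // URLUtils.canonicalizeURL before fingerprinting:
    //   long fp = CacheManager.getFingerprintGivenURL("http://example.com/");
    //   boolean present = manager.checkCacheForFingerprint(fp, false) != 0;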

    /** 
     * getActiveLogFilePath - get active log file path 
     * 
     * @return the active log file under the local data directory
     */
    File getActiveLogFilePath() {
        return new File(_localDataDirectory, "ActiveLog");
    }

    File getCheckpointLogFilePath() {
        return new File(_localDataDirectory, "Checkpoint");
    }

    /**
     * updateLogFileHeader - update the log file header 
     *    called via the log file writer thread ...
     * @throws IOException
     */
    void updateLogFileHeader(File logFileName, long newlyAddedItemsCount, long newItemsFileSize)
            throws IOException {

        RandomAccessFile file = new RandomAccessFile(logFileName, "rw");

        try {

            synchronized (_header) {
                // update cached header ... 
                _header._fileSize += newItemsFileSize;
                _header._itemCount += newlyAddedItemsCount;
                // set the position at zero .. 
                file.seek(0);
                // and write header to disk ... 
                _header.writeHeader(file);
            }
        } finally {
            // major bottleneck .. 
            // file.getFD().sync();
            file.close();
        }
    }

    /**
     * initializeEmptyLogFile - init an empty log file header
     * 
     * @param stream
     * @return
     * @throws IOException
     */
    private static LocalLogFileHeader initializeEmptyLogFile(DataOutput stream) throws IOException {

        LocalLogFileHeader header = new LocalLogFileHeader();
        header.writeHeader(stream);

        return header;
    }

    /**
     * readLogFileHeader - from File
     * 
     * @param logFileName
     * @return
     * @throws IOException
     */
    private static LocalLogFileHeader readLogFileHeader(File logFileName) throws IOException {

        LocalLogFileHeader headerOut = new LocalLogFileHeader();
        RandomAccessFile file = new RandomAccessFile(logFileName, "r");
        try {
            headerOut = readLogFileHeader(file);
        } finally {
            file.close();
        }
        return headerOut;
    }

    /**
     * readLogFileHeader - from Stream
     * 
     * @param reader
     * @return
     * @throws IOException
     */
    private static LocalLogFileHeader readLogFileHeader(DataInput reader) throws IOException {

        LocalLogFileHeader headerOut = new LocalLogFileHeader();

        headerOut.readHeader(reader);

        return headerOut;
    }

    /** initializeActiveLog - init local cache log 
     * 
     **/
    private void initializeActiveLog() throws IOException {

        File activeLogPath = getActiveLogFilePath();

        if (!activeLogPath.exists()) {
            DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(activeLogPath));
            try {
                _header = initializeEmptyLogFile(outputStream);
            } finally {
                outputStream.close();
            }
        } else {
            _header = new LocalLogFileHeader();

            DataInputStream inputStream = new DataInputStream(new FileInputStream(activeLogPath));

            try {
                _header.readHeader(inputStream);
            } finally {
                inputStream.close();
            }

            if (_header._itemCount != 0) {
                loadCache(activeLogPath, _header);
            }
        }
    }

    /**
     * loadCache - load local cache from disk 
     * @param activeLogPath
     * @param logFileHeader
     * @throws IOException
     */
    private synchronized void loadCache(File activeLogPath, LocalLogFileHeader logFileHeader) throws IOException {

        RandomAccessFile file = new RandomAccessFile(activeLogPath, "r");

        byte[] syncCheck = new byte[_header._sync.length];

        try {

            long lastValidPos = LocalLogFileHeader.SIZE;
            long currentPos = lastValidPos;
            long endPos = file.length();

            CacheItemHeader itemHeader = new CacheItemHeader();

            // start read 
            while (currentPos < endPos) {

                if ((endPos - currentPos) < LocalLogFileHeader.SYNC_BYTES_SIZE)
                    break;

                // seek to current position ... 
                file.seek(currentPos);

                boolean headerLoadFailed = false;

                try {
                    // read the item header ... assuming things are good so far ... 
                    itemHeader.readHeader(file);
                } catch (IOException e) {
                    LOG.error("### Item Header Load Failed With Exception:" + CCStringUtils.stringifyException(e));
                    headerLoadFailed = true;
                }

                if (headerLoadFailed) {
                    LOG.error("### Item Header read failed at position:" + currentPos + " Advancing past sync bytes");
                    currentPos += LocalLogFileHeader.SYNC_BYTES_SIZE;
                }

                // if header sync bytes don't match .. then seek to next sync position ... 
                if (headerLoadFailed || !Arrays.equals(itemHeader._sync, _header._sync)) {

                    LOG.error("### Item File Corrupt at position:" + currentPos + " Seeking Next Sync Point");

                    // reseek to current pos 
                    file.seek(currentPos);
                    // read in a sync.length buffer amount 
                    file.readFully(syncCheck);

                    int syncLen = _header._sync.length;

                    // start scan for next sync position ...
                    for (int i = 0; file.getFilePointer() < endPos; i++) {
                        int j = 0;
                        for (; j < syncLen; j++) {
                            if (_header._sync[j] != syncCheck[(i + j) % syncLen])
                                break;
                        }
                        if (j == syncLen) {
                            file.seek(file.getFilePointer() - LocalLogFileHeader.SYNC_BYTES_SIZE); // position before sync
                            break;
                        }
                        syncCheck[i % syncLen] = file.readByte();
                    }
                    // whichever case occurred, the file pointer now marks the resume position 
                    currentPos = file.getFilePointer();

                    if (currentPos < endPos) {
                        LOG.info("### Item Loader Found another sync point at:" + currentPos);
                    } else {
                        LOG.error("### No more sync points found!");
                    }
                } else {
                    // ok figure out next steps based on header ... 
                    // for now, just add item to our list ... 
                    _fingerprintToLocalLogPos.put(itemHeader._fingerprint, _localLogStartOffset + currentPos);
                    // now seek past data
                    currentPos += CacheItemHeader.SIZE + itemHeader._dataLength + ITEM_RECORD_TRAILING_BYTES;
                }
            }
        } finally {
            file.close();
        }
    }
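
    // --- Illustrative sketch (not part of the original class) ---
    // The resynchronization scan in loadCache above uses a rolling circular
    // buffer comparison, the same technique Hadoop's SequenceFile reader uses
    // to skip past corrupt records. This in-memory variant shows the core idea
    // on a plain byte array; the method name and parameters are hypothetical.
    static int findSyncMarker(byte[] data, int fromIndex, byte[] sync) {
        for (int pos = fromIndex; pos + sync.length <= data.length; pos++) {
            int matched = 0;
            // compare the candidate window against the sync marker ...
            while (matched < sync.length && data[pos + matched] == sync[matched]) {
                matched++;
            }
            if (matched == sync.length) {
                return pos; // offset of the first complete sync marker
            }
        }
        return -1; // no sync marker before end of data
    }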

    /**
     * readVLongFromByteBuffer - decode a long written in Hadoop's
     * variable-length VLong format from the buffer's current position
     */
    public static long readVLongFromByteBuffer(ByteBuffer source) {
        byte firstByte = source.get();
        int len = WritableUtils.decodeVIntSize(firstByte);
        if (len == 1) {
            return firstByte;
        }
        long i = 0;
        for (int idx = 0; idx < len - 1; idx++) {
            byte b = source.get();
            i = i << 8;
            i = i | (b & 0xFF);
        }
        return (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1L) : i);
    }

    /**
     * writeVLongToByteBuffer - encode a long in Hadoop's variable-length
     * VLong format at the buffer's current position
     */
    public static void writeVLongToByteBuffer(ByteBuffer stream, long i) throws IOException {
        if (i >= -112 && i <= 127) {
            stream.put((byte) i);
            return;
        }

        int len = -112;
        if (i < 0) {
            i ^= -1L; // take one's complement
            len = -120;
        }

        long tmp = i;
        while (tmp != 0) {
            tmp = tmp >> 8;
            len--;
        }

        stream.put((byte) len);

        len = (len < -120) ? -(len + 120) : -(len + 112);

        for (int idx = len; idx != 0; idx--) {
            int shiftbits = (idx - 1) * 8;
            long mask = 0xFFL << shiftbits;
            stream.put((byte) ((i & mask) >> shiftbits));
        }
    }
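
    // --- Illustrative sketch (not part of the original class) ---
    // Round-trips a value through the two VLong helpers above; the buffer size
    // is an assumption (the encoding needs at most 9 bytes per value), and the
    // method name is hypothetical.
    static void vlongRoundTripDemo() throws IOException {
        ByteBuffer buffer = ByteBuffer.allocate(16);
        long original = -123456789L;
        // encode at the buffer's current position ...
        writeVLongToByteBuffer(buffer, original);
        // flip so the same bytes can be read back ...
        buffer.flip();
        long decoded = readVLongFromByteBuffer(buffer);
        Assert.assertTrue(original == decoded);
    }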

    /********************************************************************************************************/
    // TEST CODE 
    /********************************************************************************************************/

    public static void main(String[] args) {

        final EventLoop eventLoop = new EventLoop();
        eventLoop.start();

        final CacheManager manager = new CacheManager(eventLoop);
        // delete active log if it exists ... 
        manager.getActiveLogFilePath().delete();
        try {
            manager.initialize(INIT_FLAG_SKIP_CACHE_WRITER_INIT | INIT_FLAG_SKIP_HDFS_WRITER_INIT);
        } catch (IOException e1) {
            LOG.error(CCStringUtils.stringifyException(e1));
            return;
        }

        MessageDigest digester;
        try {
            digester = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e1) {
            LOG.error(CCStringUtils.stringifyException(e1));
            return;
        }

        final byte[] randomBytes = new byte[1 << 15];
        LOG.info("Building Random Digest");
        for (int i = 0; i < randomBytes.length; i += 16) {
            long time = System.nanoTime();
            digester.update((new UID() + "@" + time).getBytes());
            System.arraycopy(digester.digest(), 0, randomBytes, i, 16);
        }

        final Semaphore semaphore = new Semaphore(0);

        if (args[0].equals("populate")) {

            manager.startCacheWriterThread();
            manager.startHDFSFlusherThread();

            try {

                LOG.info("Done Building Random Digest");

                LOG.info("Writing Items To Disk");
                for (int i = 0; i < 1000000; ++i) {

                    if (i % 1000 == 0) {
                        LOG.info("Wrote:" + i + " entries");
                    }

                    final CacheItem item1 = new CacheItem();
                    item1.setUrl(manager.normalizeURL("http://www.domain.com/foobar/" + i));
                    item1.setContent(new Buffer(randomBytes));
                    item1.setUrlFingerprint(URLFingerprint.generate64BitURLFPrint(item1.getUrl()));
                    manager.cacheItem(item1, null);
                    Thread.sleep(1);

                    if (i != 0 && i % 10000 == 0) {
                        LOG.info("Hit 10000 items.. sleeping for 20 seconds");
                        Thread.sleep(20 * 1000);
                    }
                }

                Thread.sleep(30000);

                for (int i = 0; i < 1000000; ++i) {

                    final String url = "http://www.domain.com/foobar/" + i;
                    manager.checkCacheForItem(url, new CacheItemCheckCallback() {

                        @Override
                        public void cacheItemAvailable(String url, CacheItem item) {
                            Assert.assertTrue(item.getUrl().equals(url));
                            String itemIndex = url.substring("http://www.domain.com/foobar/".length());
                            int itemNumber = Integer.parseInt(itemIndex);
                            if (itemNumber == 999999) {
                                semaphore.release();
                            }
                        }

                        @Override
                        public void cacheItemNotFound(String url) {
                            Assert.assertTrue(false);
                        }
                    });
                }
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            } catch (InterruptedException e2) {
                // interrupted while populating ... fall through and exit 
            }
        } else if (args[0].equals("read")) {

            try {
                final CacheItem item1 = new CacheItem();
                item1.setUrl(manager.normalizeURL("http://www.domain.com/barz/"));
                item1.setUrlFingerprint(URLFingerprint.generate64BitURLFPrint(item1.getUrl()));
                item1.setContent(new Buffer(randomBytes));
                manager.cacheItem(item1, null);

                // queue up cache load requests .... 
                for (int i = 0; i < 10000; ++i) {

                    final String url = "http://www.domain.com/foobar/" + i;

                    eventLoop.setTimer(new Timer(1, false, new Timer.Callback() {

                        @Override
                        public void timerFired(Timer timer) {
                            manager.checkCacheForItem(url, new CacheItemCheckCallback() {

                                @Override
                                public void cacheItemAvailable(String url, CacheItem item) {
                                    LOG.info("FOUND Item for URL:" + url + " ContentSize:"
                                            + item.getContent().getCount());
                                }

                                @Override
                                public void cacheItemNotFound(String url) {
                                    LOG.info("DIDNOT Find Item for URL:" + url);
                                }

                            });
                        }
                    }));
                }

                eventLoop.setTimer(new Timer(1, false, new Timer.Callback() {

                    @Override
                    public void timerFired(Timer timer) {
                        manager.checkCacheForItem(item1.getUrl(), new CacheItemCheckCallback() {

                            @Override
                            public void cacheItemAvailable(String url, CacheItem item) {
                                LOG.info("FOUND Item for URL:" + url + " ContentSize:"
                                        + item.getContent().getCount());
                            }

                            @Override
                            public void cacheItemNotFound(String url) {
                                LOG.info("DIDNOT Find Item for URL:" + url);
                            }

                        });
                    }

                }));
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
        }
        // block until the verification pass in populate mode releases the semaphore
        // (in read mode nothing releases it, so the process waits while timers fire)
        semaphore.acquireUninterruptibly();

    }

}