org.commoncrawl.service.listcrawler.CrawlList.java Source code

Introduction

Here is the source code for org.commoncrawl.service.listcrawler.CrawlList.java
Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.listcrawler;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.WritableUtils;
import org.apache.log4j.BasicConfigurator;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.mapred.ProxyCrawlHistoryItem;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.commoncrawl.service.crawler.util.URLFPBloomFilter;
import org.commoncrawl.service.listcrawler.CrawlListDomainItem;
import org.commoncrawl.service.listcrawler.CrawlListMetadata;
import org.commoncrawl.service.listcrawler.CrawlHistoryManager.ItemUpdater;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CRC16;
import org.commoncrawl.util.FileUtils;
import org.junit.Assert;

import com.google.gson.stream.JsonWriter;

/** 
 * A list of urls that need to be crawled 
 * @author rana
 *
 */
public final class CrawlList implements ItemUpdater {

    // default refresh interval, expressed in seconds: 86400 seconds per day * 60 days
    public static final int DEFAULT_REFRESH_INTERVAL_IN_SECS = 86400 * 60;

    /** 
     * events generated by the CrawlList 
     * 
     * @author rana
     *
     */
    public static interface CrawlListEvents {
        public void itemUpdated(URLFP itemFingerprint);
    }

    // class-wide logger
    public static final Log LOG = LogFactory.getLog(CrawlList.class);

    // bit flag occupying bit 0
    // NOTE(review): presumably marks a stored value as carrying redirect data - usage is not visible here, confirm
    public static final int ValueFlag_HasRedirect = 1 << 0;

    // file holding the source urls for this list (the source file is renamed to this path on initial load)
    File _listURLDataFile = null;
    // file holding the fixed size on-disk item records (updated in place by updateItemState)
    File _fixedDataFile = null;
    // file holding the variable length string data (appended to by updateItemState)
    File _variableDataFile = null;
    // file the bloom filter is serialized to / loaded from
    File _bloomFilterData = null;
    // NOTE(review): presumably the file backing _metadata (see writeMetadataToDisk / loadMetadataFromDisk) - confirm
    File _listMetadataFile = null;
    // also used as the monitor guarding sub domain metadata updates in updateItemState
    File _subDomainMetadataFile = null;
    // membership filter over the url fingerprints contained in this list
    URLFPBloomFilter _bloomFilter = null;
    // id of this list
    long _listId;
    // storage backend (supplies the local data directory and the history sync)
    CrawlHistoryStorage _manager;
    // list level metadata; the object itself is used as the lock guarding it
    CrawlListMetadata _metadata = new CrawlListMetadata();
    // optional listener notified after an item update (accessed under synchronized(this))
    CrawlListEvents _eventListener;
    // in-memory image of the fixed data, only non-null during the initial load;
    // while non-null, updateItemState writes into it instead of the fixed data file
    byte[] _tempFixedDataBuffer = null;
    // number of valid bytes in _tempFixedDataBuffer
    int _tempFixedDataBufferSize = 0;
    // scratch buffer used to serialize a single on-disk item
    DataOutputBuffer _tempOutputBuffer = new DataOutputBuffer(OnDiskCrawlHistoryItem.ON_DISK_SIZE);
    // NOTE(review): presumably per sub domain stats keyed by a domain hash - usage is not visible here, confirm
    TreeMap<Long, CrawlListMetadata> _transientSubDomainStats = new TreeMap<Long, CrawlListMetadata>();
    // NOTE(review): usage is not visible in this part of the file
    DataOutputBuffer _offsetLookupTable = null;
    // last exception recorded for a list in the ERROR state (see getLastException)
    Exception _exception;

    /** Load lifecycle of a crawl list. */
    public enum LoadState {
        /** initial value of a list's state field */
        UNINITIALIZED,
        /** list was created by createListLoadingInLoadingState and awaits loading */
        QUEUED_FOR_LOADING,
        /** list is being built from its source url file */
        REALLY_LOADING,
        /** list finished loading (see isListLoaded) */
        LOADED,
        /** loading failed */
        ERROR
    }

    // current load state; starts out UNINITIALIZED and is changed by the constructors and markListAsReallyLoading()
    LoadState _listState = LoadState.UNINITIALIZED;

    /**
     * Queueing lifecycle of a crawl list.
     * NOTE(review): the state transitions are not visible in this part of the file.
     */
    public enum QueueState {
        WAITING,
        QUEUEING,
        QUEUED,
        ERROR
    }

    // current queue state, initially WAITING
    QueueState _queueState = QueueState.WAITING;

    /**
     * Internal factory constructor: creates a list object in the given load state
     * without loading any data.
     *
     * @param manager storage backend this list belongs to
     * @param listId  id of the list
     * @param state   initial load state
     */
    private CrawlList(CrawlHistoryStorage manager, long listId, LoadState state) {

        _manager = manager;
        // assign the id BEFORE establishing file names, exactly as both public
        // constructors do (NOTE(review): initializeListFileNames() is defined
        // elsewhere and presumably derives the names from _listId - confirm)
        _listId = listId;
        _listState = state;

        //establish file names 
        initializeListFileNames();
    }

    /**
     * Internal factory constructor: creates a list object in the ERROR state,
     * remembering the exception that caused the failure.
     *
     * @param manager storage backend this list belongs to
     * @param listId  id of the list
     * @param e       the exception to report via getLastException()
     */
    private CrawlList(CrawlHistoryStorage manager, long listId, Exception e) {

        _manager = manager;
        // assign the id BEFORE establishing file names, exactly as both public
        // constructors do (NOTE(review): initializeListFileNames() is defined
        // elsewhere and presumably derives the names from _listId - confirm)
        _listId = listId;
        _listState = LoadState.ERROR;
        _exception = e;

        //establish file names 
        initializeListFileNames();
    }

    /**
     * Tells whether this list has finished loading.
     *
     * @return true only when the load state is LOADED
     */
    public boolean isListLoaded() {
        return LoadState.LOADED == this._listState;
    }

    /**
     * Switches the load state of this list to REALLY_LOADING.
     */
    public void markListAsReallyLoading() {
        this._listState = LoadState.REALLY_LOADING;
    }

    /**
     * @return the current load state of this list
     */
    public LoadState getLoadState() {
        return this._listState;
    }

    /**
     * @return the exception recorded for this list (set when the list was
     *         created in the error state), or null if none was recorded
     */
    public Exception getLastException() {
        return this._exception;
    }

    /**
     * @return the id of this list
     */
    public long getListId() {
        return this._listId;
    }

    /**
     * Installs the event listener hook (replacing any previous one).
     *
     * @param eventListener listener to notify after item updates, may be null
     */
    public void setEventListener(CrawlListEvents eventListener) {
        // same monitor as a synchronized instance method
        synchronized (this) {
            this._eventListener = eventListener;
        }
    }

    /**
     * @return the currently installed event listener, or null if none is set
     */
    public CrawlListEvents getEventListener() {
        // same monitor as a synchronized instance method
        synchronized (this) {
            return this._eventListener;
        }
    }

    /**
     * Get a snapshot of the list metadata.
     *
     * The returned object is a clone taken under the metadata lock, so changes
     * made to it are NOT reflected in this list.
     *
     * @return a copy of the metadata, or null if the metadata could not be cloned
     */
    public CrawlListMetadata getMetadata() {
        synchronized (_metadata) {
            try {
                return (CrawlListMetadata) _metadata.clone();
            } catch (CloneNotSupportedException e) {
                // not expected; report it instead of swallowing it silently
                LOG.error("*** LIST:" + getListId() + " Failed to clone metadata:"
                        + CCStringUtils.stringifyException(e));
                return null;
            }
        }
    }

    /**
     * Accessor for the url data file of this list.
     *
     * @return the path to the url data file (source for the urls in this list)
     */
    public File getListURLDataFile() {
        return this._listURLDataFile;
    }

    /**
     * Factory for a CrawlList that is in the error state from the start.
     *
     * @param manager storage backend the list belongs to
     * @param listId  id of the list
     * @param e       the exception describing the load failure
     * @return a list whose load state is ERROR
     */
    public static CrawlList createListWithLoadErrorState(CrawlHistoryStorage manager, long listId, Exception e) {
        final CrawlList failedList = new CrawlList(manager, listId, e);
        return failedList;
    }

    /**
     * Initialize a CrawlList in a loading (QUEUED_FOR_LOADING) state ..
     *
     * @param manager         storage backend the list belongs to
     * @param listId          id of the list
     * @param dataFile        file containing the urls of the list
     * @param refreshInterval refresh interval to record in the list metadata
     * @return a list queued for loading
     */
    public static CrawlList createListLoadingInLoadingState(CrawlHistoryStorage manager, long listId, File dataFile,
            int refreshInterval) {
        CrawlList listOut = new CrawlList(manager, listId, LoadState.QUEUED_FOR_LOADING);

        // getMetadata() hands out a clone, so the interval has to be written to
        // the list's own metadata object or it would be lost
        synchronized (listOut._metadata) {
            listOut._metadata.setRefreshInterval(refreshInterval);
        }
        listOut._listURLDataFile = dataFile;

        return listOut;
    }

    /**
     * Re-create a CrawlList from state previously stored on disk.
     *
     * Loads the bloom filter, then the list metadata (resetting its queued item
     * count and writing it back), then the sub domain metadata (resetting its
     * counts), and finally marks the list as LOADED.
     *
     * @param storage - reference to the crawl history storage 
     * @param listId  - the id of the list whose disk state should be loaded
     * @throws IOException if any of the stored state cannot be read or written
     */
    public CrawlList(CrawlHistoryStorage storage, long listId) throws IOException {
        this._listId = listId;
        this._manager = storage;
        // derive the on-disk file names for this list
        initializeListFileNames();

        LOG.info("Initilaizing pre-existing List with Id:" + listId);
        LOG.info("Loading BloomFilterData for List:" + listId);

        // bloom filter comes first ...
        final FileInputStream filterInput = new FileInputStream(_bloomFilterData);
        try {
            this._bloomFilter = URLFPBloomFilter.load(filterInput);
        } finally {
            filterInput.close();
        }

        // ... then the list metadata, whose queued count starts over at zero
        loadMetadataFromDisk();
        this._metadata.setQueuedItemCount(0);
        writeMetadataToDisk();

        // ... then the sub domain metadata, whose counts are reset as well
        loadSubDomainMetadataFromDisk();
        resetSubDomainCounts();

        this._listState = LoadState.LOADED;
    }

    /**
     * Initialize a new CrawlList object from a file containing urls (one per line).
     * 
     * The source file is renamed to the list's url data file and read as UTF-8;
     * empty lines and lines starting with '#' are skipped. Every distinct url
     * fingerprint is spilled through a merge sorter into a temporary sequence
     * file, a bloom filter over all fingerprints is serialized to disk, and the
     * sorted items are written out as fixed size records (kept in memory until
     * the sync with the history log has completed) plus string data.
     * 
     * @param manager         - reference to the crawl history storage 
     * @param listId          - the id to assign to this list 
     * @param sourceURLFile   - the file containing the list of urls that we should add to this list ... 
     * @param refreshInterval - the refresh interval to record in the list metadata 
     * @throws IOException if loading fails; when the failure happens after the url data file was
     *                     opened, the fixed data, variable data and bloom filter files are deleted
     *                     and the list state is set to ERROR
     */
    public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval)
            throws IOException {

        _manager = manager;

        _listState = LoadState.REALLY_LOADING;

        // initialize a new list id 
        _listId = listId;

        LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());

        //establish file names 
        initializeListFileNames();

        // move the source file into place; a failed rename is reported here and, if the
        // target does not exist, surfaces as an exception when the file is opened below
        if (!sourceURLFile.renameTo(_listURLDataFile)) {
            LOG.warn("*** LIST:" + getListId() + " Failed to rename source file:"
                    + sourceURLFile.getAbsolutePath() + " to:" + _listURLDataFile.getAbsolutePath());
        }

        FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);

        // temp file receiving the merged (sorted) items; only needed while this
        // constructor runs, so it is removed in the finally block below
        File spillOutputFile = null;

        try {

            // set we will use to hold all fingerprints generated 
            TreeSet<URLFP> urlSet = new TreeSet<URLFP>();

            // create temp files ...
            spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));

            // create mergesortspillwriter 
            SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                    FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(),
                    new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null,
                    false);

            try {

                MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                        CrawlEnvironment.getHadoopConfig(), spillwriter,
                        FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                        new Path(manager.getLocalDataDir().getAbsolutePath()), null,
                        new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {

                            DataInputBuffer _key1Buffer = new DataInputBuffer();
                            DataInputBuffer _key2Buffer = new DataInputBuffer();

                            // orders raw (serialized) keys by domain hash first, then by url fingerprint
                            @Override
                            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                    int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                    int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                    throws IOException {

                                _key1Buffer.reset(key1Data, key1Offset, key1Length);
                                _key2Buffer.reset(key2Data, key2Offset, key2Length);

                                _key1Buffer.skip(2); // skip version, and 1 byte id 
                                _key2Buffer.skip(2); // skip version, and 1 byte id 

                                int domainHash1 = WritableUtils.readVInt(_key1Buffer);
                                int domainHash2 = WritableUtils.readVInt(_key2Buffer);

                                _key1Buffer.skip(1); // skip 1 byte id 
                                _key2Buffer.skip(1); // skip 1 byte id 

                                long fingerprint1 = WritableUtils.readVLong(_key1Buffer);
                                long fingerprint2 = WritableUtils.readVLong(_key2Buffer);

                                int result = ((Integer) domainHash1).compareTo(domainHash2);

                                if (result == 0) {
                                    result = ((Long) fingerprint1).compareTo(fingerprint2);
                                }

                                return result;
                            }

                            @Override
                            public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2,
                                    ProxyCrawlHistoryItem value2) {
                                return key1.compareTo(key2);
                            }
                        }, URLFP.class, ProxyCrawlHistoryItem.class, false, null);

                try {

                    LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
                    BufferedReader reader = new BufferedReader(
                            new InputStreamReader(urlInputStream, Charset.forName("UTF-8")));

                    String line = null;
                    int lineNumber = 0;
                    ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
                    while ((line = reader.readLine()) != null) {
                        ++lineNumber;
                        // skip blank lines and comment lines 
                        if (line.length() != 0 && !line.startsWith("#")) {
                            URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);

                            if (fingerprint != null) {

                                // add() reports whether the fingerprint is new, so duplicates
                                // are dropped with a single set lookup 
                                if (urlSet.add(fingerprint)) {
                                    // initialize item 
                                    item.clear();
                                    item.setOriginalURL(line);
                                    // and spill to merger / sorter .. 
                                    merger.spillRecord(fingerprint, item);
                                }
                            } else {
                                LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:"
                                        + lineNumber + " URL" + line);
                            }
                        }
                    }
                    LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
                } finally {
                    merger.close();
                }
            } finally {
                if (spillwriter != null)
                    spillwriter.close();
            }
            LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
            // generate bloom filter ...  
            _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10);

            for (URLFP fingerprint : urlSet) {
                _bloomFilter.add(fingerprint);
            }
            LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
            // serialize it
            FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
            try {
                _bloomFilter.serialize(bloomFilterStream);
            } finally {
                bloomFilterStream.flush();
                bloomFilterStream.close();
            }

            LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
            // now initialize value map and string maps based on output sequence file ... 
            SequenceFile.Reader reader = new SequenceFile.Reader(
                    FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                    new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

            LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:"
                    + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
            // OK, Allocate room for fixed data file upfront 
            DataOutputBuffer valueStream = new DataOutputBuffer(
                    urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
            LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");

            try {

                RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");

                try {
                    URLFP urlFP = new URLFP();
                    ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();

                    // read fingerprints ... 
                    while (reader.next(urlFP, item)) {
                        // write out fixed data structure and strings 
                        writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream);
                    }
                } finally {
                    stringsStream.close();
                }
            } finally {
                reader.close();
            }
            LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");

            LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength()
                    + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
            // every url must have produced exactly one fixed size record 
            if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
                throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength()
                        + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
            }
            // initialize temp data buffer variables (item updates go to this buffer until it is flushed) 
            _tempFixedDataBuffer = valueStream.getData();
            _tempFixedDataBufferSize = valueStream.getLength();

            // update metadata 
            _metadata.setRefreshInterval(refreshInterval);
            _metadata.setUrlCount(urlSet.size());

            // setup version 
            _metadata.setVersion(1);

            // and write to disk 
            writeMetadataToDisk();

            // mark state as loaded ... 
            _listState = LoadState.LOADED;

            LOG.info("*** LIST:" + getListId() + " SYNCING");
            // reconcile with history log
            _manager.syncList(this.getListId(), urlSet, this);
            LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");

            // write metadata to disk again 
            writeMetadataToDisk();

            LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");

            // and finally flush fixed data to disk 
            FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);

            try {
                // hold the lock so that updateItemState cannot touch the buffer while it is flushed 
                synchronized (this) {
                    int blockSize = 1 << 20;
                    long bytesCopied = 0;
                    for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) {
                        int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset);
                        finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
                        bytesCopied += bytesToCopy;
                    }
                    // validate bytes copied 
                    if (bytesCopied != _tempFixedDataBufferSize) {
                        throw new IOException("Buffer Size:" + _tempFixedDataBufferSize
                                + " Does not Match BytesCopied:" + bytesCopied);
                    }

                    // ok release the buffer 
                    _tempFixedDataBuffer = null;
                    _tempFixedDataBufferSize = 0;

                    LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
                }

            } finally {
                finalDataStream.flush();
                finalDataStream.close();
            }

            // load sub domain metadata from disk ... 
            loadSubDomainMetadataFromDisk();

        } catch (IOException e) {
            LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:"
                    + CCStringUtils.stringifyException(e));

            // remove the partially written state 
            _fixedDataFile.delete();
            _variableDataFile.delete();
            _bloomFilterData.delete();

            _listState = LoadState.ERROR;

            throw e;
        } finally {
            try {
                urlInputStream.close();
            } finally {
                // the merged spill file is not referenced after construction; do not leak it 
                if (spillOutputFile != null && !spillOutputFile.delete()) {
                    LOG.warn("*** LIST:" + getListId() + " Failed to delete temp spill file:"
                            + spillOutputFile.getAbsolutePath());
                }
            }
        }

    }

    /**
     * update list state of a recently crawled item 
     * 
     * Nothing happens unless the list is LOADED and the bloom filter reports the
     * fingerprint as present. The stored item is then loaded and compared with an
     * item built from the new data; if they differ, changed string data is
     * appended to the variable data file, the fixed size record is rewritten
     * (into the in-memory buffer while the initial load is still in progress,
     * otherwise in place in the fixed data file under a file region lock), the
     * list / sub domain metadata is updated and the event listener (if any) is
     * notified.
     * 
     * @param fingerprint - the fingerprint of the updated item 
     * @param newData         - the updated crawl history data for the given item    
     * @throws IOException if reading or writing the list files fails
     */
    @Override
    public void updateItemState(URLFP fingerprint, ProxyCrawlHistoryItem newData) throws IOException {

        if (_listState == LoadState.LOADED) {
            // check for membership ... 
            if (_bloomFilter.isPresent(fingerprint)) {

                // extract existing item from disk 
                OnDiskCrawlHistoryItem originalItem = loadOnDiskItemForURLFP(fingerprint);

                //if present (null if false cache hit) 
                if (originalItem != null) {

                    // build an on disk item data structure for any potential changes ... 
                    OnDiskCrawlHistoryItem newItem = onDiskItemFromHistoryItem(fingerprint, newData);

                    // set initial offset information 
                    newItem._fileOffset = originalItem._fileOffset;
                    newItem._stringsOffset = originalItem._stringsOffset;

                    // compare the two items ... 
                    if (!newItem.equals(originalItem)) {
                        // ok items do not match ... figure out if strings are different ... 
                        if (newItem._stringsCRC != originalItem._stringsCRC) {
                            RandomAccessFile stringsFile = new RandomAccessFile(_variableDataFile, "rw");

                            try {
                                // seek to end 
                                stringsFile.seek(stringsFile.length());
                                // update offset info 
                                newItem._stringsOffset = stringsFile.length();
                                // NOTE(review): _stringBuffer1 is declared elsewhere; presumably it was
                                // filled by onDiskItemFromHistoryItem above - confirm
                                // write out string data length 
                                WritableUtils.writeVInt(stringsFile, _stringBuffer1.getLength());
                                // write strings to log file
                                stringsFile.write(_stringBuffer1.getData(), 0, _stringBuffer1.getLength());
                            } finally {
                                stringsFile.close();
                            }
                        }
                        // otherwise take the offset from old item 
                        else {
                            newItem._stringsOffset = originalItem._stringsOffset;
                        }

                        // ok, different paths depending on whether this is an in memory update or not ... 
                        boolean wroteToMemory = false;
                        synchronized (this) {
                            if (_tempFixedDataBuffer != null) {
                                wroteToMemory = true;
                                // reset output buffer 
                                _tempOutputBuffer.reset();
                                // serialize to output buffer 
                                newItem.serialize(_tempOutputBuffer);
                                // copy to appropriate location 
                                System.arraycopy(_tempOutputBuffer.getData(), 0, _tempFixedDataBuffer,
                                        (int) originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE);
                            }
                        }

                        if (!wroteToMemory) {
                            // write to disk 
                            RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw");

                            try {

                                while (true) {
                                    try {
                                        FileLock lock = file.getChannel().tryLock(originalItem._fileOffset,
                                                OnDiskCrawlHistoryItem.ON_DISK_SIZE, false);

                                        // tryLock returns null (instead of throwing) when another
                                        // program holds an overlapping lock; retry rather than write
                                        // unlocked and fail on the release below 
                                        if (lock == null) {
                                            LOG.error("###LockConflict(RETRY): region locked by another process at offset:"
                                                    + originalItem._fileOffset);
                                            Thread.yield();
                                            continue;
                                        }

                                        try {
                                            file.seek(originalItem._fileOffset);
                                            newItem.serialize(file);
                                            break;
                                        } finally {
                                            lock.release();
                                        }
                                    } catch (OverlappingFileLockException e) {
                                        // region is locked by another thread of this JVM ... try again 
                                        LOG.error("###LockConflict(RETRY):" + CCStringUtils.stringifyException(e));
                                    }
                                }
                            } finally {
                                file.close();
                            }
                        }

                        // ok now update metadata ... 
                        synchronized (_metadata) {

                            int updateFlags = calculateUpdateFlags(originalItem, newItem);

                            if (updateFlags != 0) {

                                int metadataDirtyFlags = updateMetadata(newItem, _metadata, 0);

                                // only write metadata to disk if temp data buffer is null
                                if (metadataDirtyFlags != 0 && !wroteToMemory) {
                                    if ((metadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) {
                                        _metadata.setQueuedItemCount(_metadata.getQueuedItemCount() - 1);
                                    }
                                    writeMetadataToDisk();
                                }

                                // if not writing to memory then update subdomain metadata 
                                if (!wroteToMemory) {

                                    synchronized (_subDomainMetadataFile) {
                                        CrawlListMetadata subDomainMetadata = getSubDomainMetadataByURL(
                                                newData.getOriginalURL());

                                        int subDomainMetadataDirtyFlags = updateMetadata(newItem, subDomainMetadata,
                                                processFileOffsets);

                                        // wroteToMemory is known to be false in this branch 
                                        if (subDomainMetadataDirtyFlags != 0) {
                                            if ((subDomainMetadataDirtyFlags
                                                    & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) {
                                                subDomainMetadata.setQueuedItemCount(
                                                        subDomainMetadata.getQueuedItemCount() - 1);
                                            }
                                            writeSubDomainMetadataToDisk(subDomainMetadata);
                                        }
                                    }
                                }
                            }
                        }

                        // tell the listener (if any) that the item changed 
                        synchronized (this) {
                            if (_eventListener != null) {
                                _eventListener.itemUpdated(fingerprint);
                            }
                        }
                    }
                }
            }
        }
    }

    // bit flags describing which parts of an item update need processing;
    // the first four are produced by calculateUpdateFlags
    private static final int processOrignalStatus = 1 << 0;
    private static final int processOriginalResult = 1 << 1;
    private static final int processRedirectStatus = 1 << 2;
    private static final int processRedirectResult = 1 << 3;
    // passed to updateMetadata for the sub domain metadata in updateItemState
    private static final int processFileOffsets = 1 << 4;
    // every non-sign bit set, i.e. all of the flags above
    private static final int processAllItems = Integer.MAX_VALUE;

    /**
     * Work out which status / result fields are present on the new item but
     * were absent on the original item.
     *
     * @param originalItem the item as currently stored
     * @param newItem      the item built from the new crawl data
     * @return bitwise OR of the process* flags, one for every field that is
     *         flagged on newItem and not flagged on originalItem (0 if none)
     */
    private static int calculateUpdateFlags(OnDiskCrawlHistoryItem originalItem, OnDiskCrawlHistoryItem newItem) {
        final boolean gainedCrawlStatus = newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)
                && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);
        final boolean gainedOriginalResult = newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE)
                && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE);
        final boolean gainedRedirectStatus = newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)
                && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS);
        final boolean gainedRedirectResult = newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE)
                && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE);

        int flags = 0;
        flags |= gainedCrawlStatus ? processOrignalStatus : 0;
        flags |= gainedOriginalResult ? processOriginalResult : 0;
        flags |= gainedRedirectStatus ? processRedirectStatus : 0;
        flags |= gainedRedirectResult ? processRedirectResult : 0;
        return flags;
    }

    // dirty flags returned by updateMetadata; each flag needs its own bit so that
    // callers can tell the individual kinds of modification apart
    private static final int MetadataUpdateFlag_ModifiedCrawlStatus = 1 << 0;
    private static final int MetadataUpdateFlag_ModifiedRedirectStatus = 1 << 1;
    // was 1 << 1, which collided with MetadataUpdateFlag_ModifiedRedirectStatus
    private static final int MetadataUpdateFlag_ModifiedOffsets = 1 << 2;

    /**
     * Folds the crawl outcome stored in {@code newItem} into the counters of {@code metadata}.
     *
     * When the item carries a redirect status (FLAG_HAS_REDIRECT_STATUS) the redirect
     * status / redirect http result are counted, otherwise the original crawl status /
     * http result code are counted. A non-zero status increments one of the failure
     * counters; a zero status increments one of the http result counters.
     *
     * @param newItem     the on-disk item whose outcome should be counted
     * @param metadata    the metadata record whose counters are incremented in place
     * @param updateFlags only the processFileOffsets bit is consulted; when it is set the
     *                    first / last record offsets in {@code metadata} are widened to
     *                    include {@code newItem._fileOffset}
     * @return a bitmask of MetadataUpdateFlag_* values describing what was modified
     */
    private static int updateMetadata(OnDiskCrawlHistoryItem newItem, CrawlListMetadata metadata, int updateFlags) {

        int metadataDirtyFlags = 0;

        // pick the (status, http result) pair that describes the final outcome of the fetch
        final int status;
        final int httpResult;
        if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
            status = newItem._redirectStatus;
            httpResult = newItem._redirectHttpResult;
        } else {
            status = newItem._crawlStatus;
            httpResult = newItem._httpResultCode;
        }

        if (status != 0) {
            incrementFailureCount(metadata, status);
        } else {
            incrementHttpResultCount(metadata, httpResult);
        }
        metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus;

        if ((updateFlags & processFileOffsets) != 0) {
            if (!metadata.isFieldDirty(CrawlListMetadata.Field_FIRSTRECORDOFFSET)
                    || metadata.getFirstRecordOffset() > newItem._fileOffset) {
                metadata.setFirstRecordOffset(newItem._fileOffset);
                // OR the flag in; a plain assignment here used to discard the crawl-status bit
                metadataDirtyFlags |= MetadataUpdateFlag_ModifiedOffsets;
            }

            if (!metadata.isFieldDirty(CrawlListMetadata.Field_LASTRECORDOFFSET)
                    || metadata.getLastRecordOffset() < newItem._fileOffset) {
                metadata.setLastRecordOffset(newItem._fileOffset);
                metadataDirtyFlags |= MetadataUpdateFlag_ModifiedOffsets;
            }
        }

        return metadataDirtyFlags;
    }

    /** Increments the failure counter in {@code metadata} that matches a non-zero crawl/redirect status. */
    private static void incrementFailureCount(CrawlListMetadata metadata, int failureReason) {
        switch (failureReason) {
        case CrawlURL.FailureReason.RobotsExcluded:
            metadata.setRobotsExcludedCount(metadata.getRobotsExcludedCount() + 1);
            break;
        case CrawlURL.FailureReason.Timeout:
            metadata.setTimeoutErrorCount(metadata.getTimeoutErrorCount() + 1);
            break;
        case CrawlURL.FailureReason.IOException:
            metadata.setIOExceptionCount(metadata.getIOExceptionCount() + 1);
            break;
        case CrawlURL.FailureReason.DNSFailure:
            metadata.setDNSErrorCount(metadata.getDNSErrorCount() + 1);
            break;
        default:
            metadata.setOtherErrorCount(metadata.getOtherErrorCount() + 1);
        }
    }

    /**
     * Increments the http result counter in {@code metadata} that matches {@code httpResult}.
     * Original and redirect results share the same counters; the redirect path used to read
     * one counter and write another for the 5xx and "other" buckets.
     */
    private static void incrementHttpResultCount(CrawlListMetadata metadata, int httpResult) {
        if (httpResult == 200) {
            metadata.setHttp200Count(metadata.getHttp200Count() + 1);
        } else if (httpResult == 301) {
            metadata.setHttp301Count(metadata.getHttp301Count() + 1);
        } else if (httpResult == 403) {
            metadata.setHttp403Count(metadata.getHttp403Count() + 1);
        } else if (httpResult == 404) {
            metadata.setHttp404Count(metadata.getHttp404Count() + 1);
        } else if (httpResult >= 500 && httpResult < 600) {
            metadata.setHttp500Count(metadata.getHttp500Count() + 1);
        } else if (httpResult >= 600) {
            metadata.setHttpOtherCount(metadata.getHttpOtherCount() + 1);
        }
        // NOTE(review): codes below 500 other than 200/301/403/404 (e.g. 302) land in no
        // bucket, exactly as before -- confirm whether they belong in HttpOtherCount.
    }

    /**
     * Reports the queueing state of this list, i.e. whether its urls have been
     * queued for crawling or not.
     *
     * @return the current {@link QueueState} of this list
     */
    public QueueState getQueuedState() {
        return this._queueState;
    }

    // Scratch state shared by updateSubDomainMetadataForItemDuringLoad and
    // flushCachedSubDomainMetadata while list records are scanned one after another.

    // domain hash of the item that last triggered a root-domain lookup; reset to -1 by a flush
    private int lastDomainHash = -1;
    // root domain whose counters are currently being accumulated (null when nothing is cached)
    private String lastRootDomainName = null;
    // in-memory counters for lastRootDomainName that have not yet been merged into the on-disk copy
    private CrawlListMetadata lastRootDomainMetadata = null;
    // value handed to updateSubDomainQueueStatus; NOTE(review): not modified in the nearby code --
    // confirm where (or whether) it is maintained
    private int domainQueuedCount = 0;

    /**
     * Accumulates per-root-domain statistics for one item while the list is being scanned.
     *
     * Counters are collected in {@code lastRootDomainMetadata} and merged into the on-disk
     * sub-domain metadata (via flushCachedSubDomainMetadata) whenever the root domain changes.
     *
     * Fixes relative to the previous version:
     * - root domain names are compared by value instead of by reference, so consecutive
     *   items of the same root domain no longer force a flush each;
     * - every item is counted, not only the item that happened to change the domain hash;
     * - lastDomainHash is recorded after the flush, because the flush resets it to -1.
     *
     * @param item     the on-disk record being processed
     * @param itemURL  the record's original url (used to derive the root domain name)
     * @param itemFP   the record's fingerprint (currently unused)
     * @param isQueued true if the item was just queued for crawling, false if its stored
     *                 crawl result should be counted instead
     * @throws IOException if flushing or updating the on-disk sub-domain metadata fails
     */
    private void updateSubDomainMetadataForItemDuringLoad(OnDiskCrawlHistoryItem item, String itemURL, URLFP itemFP,
            boolean isQueued) throws IOException {

        final boolean domainChanged = (item._domainHash != lastDomainHash);

        if (domainChanged) {
            // the (relatively expensive) root domain extraction is only needed on a hash change
            GoogleURL urlObject = new GoogleURL(itemURL);
            String rootDomainName = URLUtils.extractRootDomainName(urlObject.getHost());

            // null-safe value comparison
            final boolean rootDomainChanged = (rootDomainName == null) ? (lastRootDomainName != null)
                    : !rootDomainName.equals(lastRootDomainName);

            if (rootDomainChanged) {
                // merge whatever was accumulated for the previous root domain into the on-disk copy
                flushCachedSubDomainMetadata();
                // and start accumulating for the new one
                if (rootDomainName != null) {
                    lastRootDomainName = rootDomainName;
                    lastRootDomainMetadata = new CrawlListMetadata();
                }
            }
            // must happen after the flush, which resets lastDomainHash
            lastDomainHash = item._domainHash;
        }

        // count this item against the currently cached root domain (if any)
        if (lastRootDomainMetadata != null) {
            if (isQueued) {
                lastRootDomainMetadata.setQueuedItemCount(lastRootDomainMetadata.getQueuedItemCount() + 1);
            } else {
                updateMetadata(item, lastRootDomainMetadata, 0);
            }
        }

        // kept where the original placed it: once per domain-hash change
        if (domainChanged && lastRootDomainName != null) {
            updateSubDomainQueueStatus(lastRootDomainName, domainQueuedCount);
        }
    }

    /**
     * Merges the counters accumulated in {@code lastRootDomainMetadata} into the on-disk
     * sub-domain metadata for {@code lastRootDomainName}, then clears the cached state
     * (metadata, root domain name and last domain hash). Does nothing when no counters
     * are cached.
     *
     * @throws IOException if reading or writing the sub-domain metadata fails
     */
    private void flushCachedSubDomainMetadata() throws IOException {
        if (lastRootDomainMetadata == null) {
            return;
        }

        final CrawlListMetadata pending = lastRootDomainMetadata;

        // read-modify-write of the on-disk record happens under the metadata file lock
        synchronized (_subDomainMetadataFile) {
            final CrawlListMetadata persisted = getSubDomainMetadataByRootDomain(lastRootDomainName);

            // http result buckets
            persisted.setHttp200Count(persisted.getHttp200Count() + pending.getHttp200Count());
            persisted.setHttp301Count(persisted.getHttp301Count() + pending.getHttp301Count());
            persisted.setHttp403Count(persisted.getHttp403Count() + pending.getHttp403Count());
            persisted.setHttp404Count(persisted.getHttp404Count() + pending.getHttp404Count());
            persisted.setHttp500Count(persisted.getHttp500Count() + pending.getHttp500Count());
            persisted.setHttpOtherCount(persisted.getHttpOtherCount() + pending.getHttpOtherCount());

            // failure buckets
            persisted.setRobotsExcludedCount(persisted.getRobotsExcludedCount() + pending.getRobotsExcludedCount());
            persisted.setTimeoutErrorCount(persisted.getTimeoutErrorCount() + pending.getTimeoutErrorCount());
            persisted.setIOExceptionCount(persisted.getIOExceptionCount() + pending.getIOExceptionCount());
            persisted.setDNSErrorCount(persisted.getDNSErrorCount() + pending.getDNSErrorCount());
            persisted.setOtherErrorCount(persisted.getOtherErrorCount() + pending.getOtherErrorCount());

            // queued items
            persisted.setQueuedItemCount(persisted.getQueuedItemCount() + pending.getQueuedItemCount());

            writeSubDomainMetadataToDisk(persisted);
        }

        // nothing is cached any more
        lastRootDomainMetadata = null;
        lastRootDomainName = null;
        lastDomainHash = -1;
    }

    /**
     * Queues uncrawled (or stale) urls via the CrawlQueueLoader.
     *
     * Resets the list metadata (keeping only the url count), then walks every record of the
     * fixed data file. A record is queued when it has never been crawled, or -- for lists
     * with metadata version >= 1 -- when its update timestamp is older than the refresh
     * interval. Records that are not queued have their stored result counted into the list
     * metadata instead. Sub-domain statistics are updated for every record.
     *
     * I/O errors raised while scanning are logged and reflected as QueueState.ERROR rather
     * than rethrown; failures opening or closing the data files propagate to the caller.
     *
     * @param loader the loader that receives the urls to crawl; it is flushed on success
     * @throws IOException if the data files cannot be opened or closed
     */
    public void queueUnCrawledItems(CrawlQueueLoader loader) throws IOException {
        _queueState = QueueState.QUEUEING;

        int metadataVersion = getMetadata().getVersion();

        synchronized (_metadata) {
            // reset metadata PERIOD
            int urlCount = _metadata.getUrlCount();
            _metadata.clear();
            _metadata.setUrlCount(urlCount);
        }

        RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
        try {
            // nested try blocks so that the first file is closed even if the second one
            // fails to open or to close
            RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
            try {

                OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
                URLFP fingerprint = new URLFP();

                while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

                    long position = fixedDataReader.getFilePointer();

                    // spin until the record could be read under a lock on its byte range
                    while (true) {
                        try {
                            FileLock lock = fixedDataReader.getChannel().tryLock(position,
                                    OnDiskCrawlHistoryItem.ON_DISK_SIZE, false);

                            if (lock == null) {
                                // tryLock returns null (instead of throwing) when another
                                // process holds an overlapping lock; retry instead of reading
                                // unlocked and failing on release
                                LOG.error("*** LOCK CONTENTION AT:" + position + " (lock held by another process)");
                                continue;
                            }

                            try {
                                item.deserialize(fixedDataReader);
                                break;
                            } finally {
                                lock.release();
                            }
                        } catch (OverlappingFileLockException e) {
                            LOG.error("*** LOCK CONTENTION AT:" + position + " Exception:"
                                    + CCStringUtils.stringifyException(e));
                        }
                    }

                    // seek to string data
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string
                    String url = stringDataReader.readUTF();
                    // setup fingerprint
                    fingerprint.setDomainHash(item._domainHash);
                    fingerprint.setUrlHash(item._urlFingerprint);

                    // first, if it has not been crawled ever, crawl it no matter what ...
                    boolean crawlItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);

                    // if it has been crawled ... check list metadata version ...
                    if (!crawlItem && metadataVersion >= 1) {
                        // newer version of the list: honor the refresh interval if specified
                        int refreshIntervalInSeconds = DEFAULT_REFRESH_INTERVAL_IN_SECS;

                        if (getMetadata().getRefreshInterval() != 0) {
                            refreshIntervalInSeconds = getMetadata().getRefreshInterval();
                        }

                        if (item._updateTimestamp > 0) {
                            long lastCrawlTime = item._updateTimestamp;
                            // 1000L forces long arithmetic; int math overflows for
                            // intervals longer than ~24.8 days
                            if (System.currentTimeMillis() - lastCrawlTime >= (refreshIntervalInSeconds * 1000L)) {
                                crawlItem = true;
                            }
                        }
                    }

                    if (crawlItem) {

                        loader.queueURL(fingerprint, url);

                        synchronized (_metadata) {
                            // update queued item count
                            _metadata.setQueuedItemCount(_metadata.getQueuedItemCount() + 1);
                        }
                    } else {
                        updateMetadata(item, _metadata, 0);
                    }
                    // ok update subdomain stats
                    updateSubDomainMetadataForItemDuringLoad(item, url, fingerprint, crawlItem);
                }

                flushCachedSubDomainMetadata();

                loader.flush();

                _queueState = QueueState.QUEUED;
            } catch (IOException e) {
                LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                        + CCStringUtils.stringifyException(e));
                _queueState = QueueState.ERROR;
            } finally {
                stringDataReader.close();
            }
        } finally {
            fixedDataReader.close();
        }
    }

    /**
     * Resubmits failed items.
     *
     * Walks every record of the fixed data file and re-queues each crawled item whose final
     * outcome (redirect outcome when a redirect status is present, original outcome
     * otherwise) is a failure status, or an http result other than 200 / 404.
     *
     * The queue state is QUEUEING while the scan runs and is returned to QUEUED when the
     * scan ends for any reason; previously it was only reset on the error path and stayed
     * QUEUEING after a successful pass. I/O errors raised while scanning are logged, not
     * rethrown; failures opening or closing the data files propagate to the caller.
     *
     * NOTE(review): unlike queueUnCrawledItems this method does not call loader.flush() --
     * confirm that callers flush the loader themselves.
     *
     * @param loader the loader that receives the urls to re-crawl
     * @throws IOException if the data files cannot be opened or closed
     */
    public void requeueFailedItems(CrawlQueueLoader loader) throws IOException {
        synchronized (this) {
            _queueState = QueueState.QUEUEING;
        }
        try {
            RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
            try {
                // nested so the first file is closed even if the second fails to open/close
                RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");
                try {

                    OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
                    URLFP fingerprint = new URLFP();

                    while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
                        item.deserialize(fixedDataReader);

                        // only items that have been crawled at least once can have failed
                        if (!item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
                            continue;
                        }

                        // judge the item by its redirect outcome when it has one
                        final boolean viaRedirect = item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS);
                        final int status = viaRedirect ? item._redirectStatus : item._crawlStatus;
                        final int httpResult = viaRedirect ? item._redirectHttpResult : item._httpResultCode;

                        final boolean queueItem = (status != 0) || (httpResult != 200 && httpResult != 404);

                        if (queueItem) {
                            // seek to string data
                            stringDataReader.seek(item._stringsOffset);
                            // and skip buffer length
                            WritableUtils.readVInt(stringDataReader);
                            // and read primary string
                            String url = stringDataReader.readUTF();
                            // and spill
                            fingerprint.setDomainHash(item._domainHash);
                            fingerprint.setUrlHash(item._urlFingerprint);

                            loader.queueURL(fingerprint, url);
                        }
                    }
                } catch (IOException e) {
                    LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                            + CCStringUtils.stringifyException(e));
                } finally {
                    stringDataReader.close();
                }
            } finally {
                fixedDataReader.close();
            }
        } finally {
            // the requeue pass is over (successfully or not): never leave the list in QUEUEING
            synchronized (this) {
                _queueState = QueueState.QUEUED;
            }
        }
    }

    /**
     * Checks whether the url data, value map, string map and bloom filter files of a list
     * all exist in the given directory.
     *
     * @param localLogFileDir the directory holding the list's data files
     * @param listId          the id of the list (appended to each file-name prefix)
     * @return true if all four files exist, false as soon as one is missing
     */
    public static boolean allFilesPresent(File localLogFileDir, long listId) {
        final String listIdSuffix = Long.toString(listId);

        // checked in this order: url data, fixed data, variable data, bloom filter
        final File[] requiredFiles = {
                new File(localLogFileDir, LIST_URL_DATA_PREFIX + listIdSuffix),
                new File(localLogFileDir, LIST_VALUE_MAP_PREFIX + listIdSuffix),
                new File(localLogFileDir, LIST_STRING_MAP_PREFIX + listIdSuffix),
                new File(localLogFileDir, LIST_BLOOM_DATA_PREFIX + listIdSuffix) };

        for (File requiredFile : requiredFiles) {
            if (!requiredFile.exists()) {
                return false;
            }
        }
        return true;
    }

    // File-name prefixes of the per-list data files. The list id is appended to a prefix to
    // form the file name (see allFilesPresent and initializeListFileNames).
    public static final String LIST_URL_DATA_PREFIX = "listURLS-";
    public static final String LIST_VALUE_MAP_PREFIX = "listValueMap-"; // fixed-size record file
    public static final String LIST_STRING_MAP_PREFIX = "listStringMap-"; // variable-length string file
    public static final String LIST_BLOOM_DATA_PREFIX = "listBloomFilter-";
    public static final String LIST_METADATA_PREFIX = "listMetadata-";
    public static final String LIST_SUBDOMAIN_METADATA_PREFIX = "listSubDomainMetadata-";

    /**
     * Derives the locations of all per-list files (url data, fixed data, variable data,
     * bloom filter, list metadata and sub-domain metadata) from the manager's local data
     * directory and this list's id.
     */
    private void initializeListFileNames() {
        final String listIdSuffix = Long.toString(_listId);

        _listURLDataFile = new File(_manager.getLocalDataDir(), LIST_URL_DATA_PREFIX + listIdSuffix);
        _fixedDataFile = new File(_manager.getLocalDataDir(), LIST_VALUE_MAP_PREFIX + listIdSuffix);
        _variableDataFile = new File(_manager.getLocalDataDir(), LIST_STRING_MAP_PREFIX + listIdSuffix);
        _bloomFilterData = new File(_manager.getLocalDataDir(), LIST_BLOOM_DATA_PREFIX + listIdSuffix);
        _listMetadataFile = new File(_manager.getLocalDataDir(), LIST_METADATA_PREFIX + listIdSuffix);
        _subDomainMetadataFile = new File(_manager.getLocalDataDir(), LIST_SUBDOMAIN_METADATA_PREFIX + listIdSuffix);
    }

    /**
     * Fixed-size (ON_DISK_SIZE bytes) record describing the crawl state of one url.
     *
     * serialize / deserialize write and read the fields in declaration order, excluding
     * {@code _fileOffset}, which is in-memory only (it is set by the lookup code to the
     * record's position in the fixed data file).
     */
    private static class OnDiskCrawlHistoryItem {
        public long _fileOffset = -1; // not serialized
        int _domainHash = -1; // 4
        long _urlFingerprint = -1; // 8
        int _stringsCRC = -1; // 4
        long _stringsOffset = -1; // 8
        byte _flags = 0; // 1
        byte _crawlStatus = -1; // 1
        short _httpResultCode = -1; // 2
        byte _redirectStatus = -1; // 1
        short _redirectHttpResult = -1; // 2
        long _updateTimestamp = -1; // 8
        //__
        // 39 bytes

        public static final int ON_DISK_SIZE = 39;

        // bit values stored in _flags
        public static final int FLAG_HAS_CRAWL_STATUS = 1;
        public static final int FLAG_HAS_ORIGINAL_RESULT_CODE = 2;
        public static final int FLAG_HAS_REDIRECT_URL = 4;
        public static final int FLAG_HAS_REDIRECT_STATUS = 8;
        public static final int FLAG_HAS_REDIRECT_RESULT_CODE = 16;
        public static final int FLAG_HAS_LASTMODIFIED_TIME = 32;

        /**
         * Orders this record against a fingerprint: by domain hash first, then by url hash.
         *
         * @return -1, 0 or 1 if this record sorts before, equal to or after {@code fp}
         */
        public int compareFingerprints(URLFP fp) {
            // primitive comparisons; the boxed compareTo allocated on every probe of a search
            final int otherDomainHash = fp.getDomainHash();
            if (_domainHash != otherDomainHash) {
                return (_domainHash < otherDomainHash) ? -1 : 1;
            }
            final long otherUrlHash = fp.getUrlHash();
            if (_urlFingerprint != otherUrlHash) {
                return (_urlFingerprint < otherUrlHash) ? -1 : 1;
            }
            return 0;
        }

        /**
         * Two records are equal when their fingerprint, string CRC, flags and status /
         * result fields match. The file offset, strings offset and update timestamp are
         * not compared.
         */
        @Override
        public boolean equals(Object obj) {
            if (obj instanceof OnDiskCrawlHistoryItem) {
                OnDiskCrawlHistoryItem other = (OnDiskCrawlHistoryItem) obj;
                if (_domainHash == other._domainHash && _urlFingerprint == other._urlFingerprint
                        && _stringsCRC == other._stringsCRC && _flags == other._flags
                        && _crawlStatus == other._crawlStatus && _httpResultCode == other._httpResultCode
                        && _redirectStatus == other._redirectStatus
                        && _redirectHttpResult == other._redirectHttpResult) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Hash over exactly the fields compared by {@link #equals(Object)}, so that equal
         * records hash alike. The fields are mutable: do not modify a record while it is
         * used as a key in a hash-based collection.
         */
        @Override
        public int hashCode() {
            int result = _domainHash;
            result = 31 * result + (int) (_urlFingerprint ^ (_urlFingerprint >>> 32));
            result = 31 * result + _stringsCRC;
            result = 31 * result + _flags;
            result = 31 * result + _crawlStatus;
            result = 31 * result + _httpResultCode;
            result = 31 * result + _redirectStatus;
            result = 31 * result + _redirectHttpResult;
            return result;
        }

        /** Sets the given FLAG_* bit(s) in {@code _flags}. */
        public void setFlag(int flag) {
            _flags |= flag;
        }

        /** @return true if any of the given FLAG_* bit(s) is set in {@code _flags} */
        public boolean isFlagSet(int flag) {
            return ((_flags & flag) != 0);
        }

        /** Writes the ON_DISK_SIZE byte representation of this record to {@code out}. */
        public void serialize(DataOutput out) throws IOException {
            out.writeInt(_domainHash);
            out.writeLong(_urlFingerprint);
            out.writeInt(_stringsCRC);
            out.writeLong(_stringsOffset);
            out.write(_flags);
            out.writeByte(_crawlStatus);
            out.writeShort(_httpResultCode);
            out.writeByte(_redirectStatus);
            out.writeShort(_redirectHttpResult);
            out.writeLong(_updateTimestamp);
        }

        /** Reads a record from {@code in}, in the layout produced by serialize. */
        public void deserialize(DataInput in) throws IOException {
            _domainHash = in.readInt();
            _urlFingerprint = in.readLong();
            _stringsCRC = in.readInt();
            _stringsOffset = in.readLong();
            _flags = in.readByte();
            _crawlStatus = in.readByte();
            _httpResultCode = in.readShort();
            _redirectStatus = in.readByte();
            _redirectHttpResult = in.readShort();
            _updateTimestamp = in.readLong();
        }

    }

    // Reusable scratch state for string encoding.
    // _stringBuffer1 is filled by calculateStringCRC (called from onDiskItemFromHistoryItem)
    // and its contents are then written out by writeInitialOnDiskItem.
    DataOutputBuffer _stringBuffer1 = new DataOutputBuffer();
    // NOTE(review): not referenced in the nearby code -- presumably used by update paths
    // elsewhere in this file
    DataOutputBuffer _stringBuffer2 = new DataOutputBuffer();
    // CRC accumulator that calculateStringCRC resets and reuses on every call
    CRC16 _stringCRC = new CRC16();

    /**
     * Builds the fixed-size on-disk representation of a history item.
     *
     * Each field that is dirty in {@code item} is copied over and its matching FLAG_HAS_*
     * bit is set. As a side effect the item's strings are encoded into
     * {@code _stringBuffer1} (by calculateStringCRC). The strings offset of the returned
     * record is left at its default and must be filled in by the caller.
     *
     * @param fingerprint the url fingerprint of the item
     * @param item        the history item to convert
     * @return a new on-disk record
     * @throws IOException if encoding the item's strings fails
     */
    private OnDiskCrawlHistoryItem onDiskItemFromHistoryItem(URLFP fingerprint, ProxyCrawlHistoryItem item)
            throws IOException {

        final OnDiskCrawlHistoryItem diskItem = new OnDiskCrawlHistoryItem();

        // identity
        diskItem._domainHash = fingerprint.getDomainHash();
        diskItem._urlFingerprint = fingerprint.getUrlHash();
        diskItem._stringsCRC = calculateStringCRC(item, _stringBuffer1);

        // original fetch outcome
        if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS)) {
            diskItem.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);
            diskItem._crawlStatus = (byte) item.getCrawlStatus();
        }
        if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE)) {
            diskItem.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE);
            diskItem._httpResultCode = (short) item.getHttpResultCode();
        }

        // redirect outcome (the redirect url itself lives in the string data)
        if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) {
            diskItem.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL);
        }
        if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTSTATUS)) {
            diskItem.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS);
            diskItem._redirectStatus = (byte) item.getRedirectStatus();
        }
        if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT)) {
            diskItem.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE);
            diskItem._redirectHttpResult = (short) item.getRedirectHttpResult();
        }

        // last modified time, only when present and positive
        if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_LASTMODIFIEDTIME)) {
            if (item.getLastModifiedTime() > 0) {
                diskItem._updateTimestamp = Math.max(diskItem._updateTimestamp, item.getLastModifiedTime());
                diskItem.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME);
            }
        }

        return diskItem;
    }

    /**
     * Encodes the item's original url (and its redirect url, when that field is dirty) into
     * {@code stringBuffer} and returns the CRC of the encoded bytes.
     *
     * The buffer is reset first and holds the encoded strings on return; the shared
     * {@code _stringCRC} instance is reset and reused for the computation.
     *
     * @param item         the history item whose strings are encoded
     * @param stringBuffer the buffer that receives the encoded strings
     * @return the CRC value of the encoded strings, narrowed to int
     * @throws IOException if writing to the buffer fails
     */
    private int calculateStringCRC(ProxyCrawlHistoryItem item, DataOutputBuffer stringBuffer) throws IOException {
        // encode: original url, optionally followed by the redirect url
        stringBuffer.reset();
        stringBuffer.writeUTF(item.getOriginalURL());
        final boolean hasRedirectURL = item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL);
        if (hasRedirectURL) {
            stringBuffer.writeUTF(item.getRedirectURL());
        }

        // checksum exactly the bytes written above
        final byte[] encoded = stringBuffer.getData();
        final int encodedLength = stringBuffer.getLength();
        _stringCRC.reset();
        _stringCRC.update(encoded, 0, encodedLength);

        final long crc = _stringCRC.getValue();
        return (int) crc;
    }

    /**
     * Writes the initial on-disk state of a history item: its length-prefixed strings are
     * written to {@code stringStream} and the fixed-size record, pointing at those strings,
     * is serialized to {@code valueStreamOut}. The record's update timestamp is forced to
     * -1 before it is written.
     *
     * @param fp             the url fingerprint of the item
     * @param historyItem    the history item to persist
     * @param valueStreamOut the stream receiving the fixed-size record
     * @param stringStream   the file receiving the string data; its length at the time of
     *                       the call is recorded as the record's strings offset
     * @throws IOException if either write fails
     */
    private void writeInitialOnDiskItem(URLFP fp, ProxyCrawlHistoryItem historyItem,
            DataOutputStream valueStreamOut, RandomAccessFile stringStream) throws IOException {

        // conversion also leaves the encoded strings in _stringBuffer1
        final OnDiskCrawlHistoryItem diskItem = onDiskItemFromHistoryItem(fp, historyItem);

        // the strings offset is the string file's length before this entry is written
        diskItem._stringsOffset = stringStream.length();

        // string entry: vint length followed by the encoded bytes
        final int stringBytes = _stringBuffer1.getLength();
        WritableUtils.writeVInt(stringStream, stringBytes);
        stringStream.write(_stringBuffer1.getData(), 0, stringBytes);

        // a freshly written item has no update timestamp
        diskItem._updateTimestamp = -1;

        diskItem.serialize(valueStreamOut);
    }

    /**
     * Debug helper: logs the index, domain hash and url fingerprint of every record in the
     * fixed data file. I/O errors are logged, not thrown.
     */
    private void dumpFixedDataFile() {
        try {
            final RandomAccessFile reader = new RandomAccessFile(_fixedDataFile, "rw");
            try {
                final OnDiskCrawlHistoryItem current = new OnDiskCrawlHistoryItem();
                // read records back to back until the end of the file is reached
                for (int index = 0; reader.getFilePointer() != reader.length(); ++index) {
                    current.deserialize(reader);
                    LOG.info("Item at Index:" + index + " Domain:" + current._domainHash + " URLFP:"
                            + current._urlFingerprint);
                }
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
    }

    /**
     * Looks up the on-disk record for a fingerprint using a binary search over the
     * fixed-size records, which the search assumes are ordered by
     * {@link OnDiskCrawlHistoryItem#compareFingerprints}.
     *
     * When {@code _tempFixedDataBuffer} is present the search runs against that in-memory
     * image (under this object's monitor) and does not fall back to disk on a miss;
     * otherwise the fixed data file is searched.
     *
     * Record offsets are computed with long arithmetic: the previous int multiplication
     * overflowed for fixed data files larger than 2GB, corrupting both the seek position
     * and the returned {@code _fileOffset}.
     *
     * @param fingerprint the fingerprint to look up
     * @return the matching record with {@code _fileOffset} set to its position, or null if
     *         no record matches
     * @throws IOException if reading the fixed data fails
     */
    private OnDiskCrawlHistoryItem loadOnDiskItemForURLFP(URLFP fingerprint) throws IOException {

        // see if state is cached in memory ...
        synchronized (this) {
            if (_tempFixedDataBuffer != null) {

                int low = 0;
                int high = (int) (_tempFixedDataBufferSize / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1;

                OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
                DataInputBuffer inputBuffer = new DataInputBuffer();

                while (low <= high) {

                    int mid = low + ((high - low) / 2);

                    // position the reader at record #mid
                    inputBuffer.reset(_tempFixedDataBuffer, 0, _tempFixedDataBufferSize);
                    inputBuffer.skip(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);

                    itemOut.deserialize(inputBuffer);

                    // now compare it against desired hash value ...
                    int comparisonResult = itemOut.compareFingerprints(fingerprint);

                    if (comparisonResult > 0) {
                        high = mid - 1;
                    } else if (comparisonResult < 0) {
                        low = mid + 1;
                    } else {
                        // cache offset
                        itemOut._fileOffset = (long) mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
                        return itemOut;
                    }
                }
                // a miss against the in-memory image is final (no disk fallback, as before)
                return null;
            }
        }

        // not cached: search the file on disk
        RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw");

        // allocate buffer upfront
        byte[] onDiskItemBuffer = new byte[OnDiskCrawlHistoryItem.ON_DISK_SIZE];
        DataInputBuffer inputStream = new DataInputBuffer();

        try {

            int low = 0;
            int high = (int) (file.length() / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1;

            OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();

            while (low <= high) {

                int mid = low + ((high - low) / 2);

                // byte position of record #mid; long math so large files do not overflow
                final long recordOffset = (long) mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;

                // seek to proper location
                file.seek(recordOffset);
                // read the data structure
                file.readFully(onDiskItemBuffer, 0, onDiskItemBuffer.length);
                inputStream.reset(onDiskItemBuffer, 0, OnDiskCrawlHistoryItem.ON_DISK_SIZE);

                itemOut.deserialize(inputStream);

                // now compare it against desired hash value ...
                int comparisonResult = itemOut.compareFingerprints(fingerprint);

                if (comparisonResult > 0) {
                    high = mid - 1;
                } else if (comparisonResult < 0) {
                    low = mid + 1;
                } else {
                    // cache offset
                    itemOut._fileOffset = recordOffset;
                    return itemOut;
                }
            }

            //DEBUG ONLY !
            // dumpFixedDataFile();
        } finally {
            file.close();
        }
        return null;
    }

    /**
     * Loads the history item stored for a fingerprint.
     *
     * @param fingerprint the fingerprint to look up
     * @return the populated history item, or null if no record exists for the fingerprint
     * @throws IOException if reading the list data fails
     */
    private ProxyCrawlHistoryItem getHistoryItemFromURLFP(URLFP fingerprint) throws IOException {
        final OnDiskCrawlHistoryItem diskItem = loadOnDiskItemForURLFP(fingerprint);
        return (diskItem == null) ? null : getHistoryItemFromOnDiskItem(diskItem);
    }

    /**
     * Rebuilds a history item from its on-disk record: every field whose FLAG_HAS_* bit is
     * set is copied over, and the original url (plus the redirect url, when flagged) is
     * read from the variable data file at the record's strings offset.
     *
     * @param item the on-disk record
     * @return a new, populated history item
     * @throws IOException if reading the string data fails
     */
    private ProxyCrawlHistoryItem getHistoryItemFromOnDiskItem(OnDiskCrawlHistoryItem item) throws IOException {

        final ProxyCrawlHistoryItem historyItem = new ProxyCrawlHistoryItem();

        // fixed-size fields, each guarded by its presence flag
        if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
            historyItem.setCrawlStatus(item._crawlStatus);
        }
        if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE)) {
            historyItem.setHttpResultCode(item._httpResultCode);
        }
        if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
            historyItem.setRedirectStatus(item._redirectStatus);
        }
        if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE)) {
            historyItem.setRedirectHttpResult(item._redirectHttpResult);
        }
        if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME)) {
            historyItem.setLastModifiedTime(item._updateTimestamp);
        }

        // string fields: [vint length][original url][optional redirect url]
        final RandomAccessFile stringsIn = new RandomAccessFile(_variableDataFile, "rw");
        try {
            stringsIn.seek(item._stringsOffset);
            // the length prefix is not needed here, only skipped
            WritableUtils.readVInt(stringsIn);
            historyItem.setOriginalURL(stringsIn.readUTF());
            if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL)) {
                historyItem.setRedirectURL(stringsIn.readUTF());
            }
        } finally {
            stringsIn.close();
        }
        return historyItem;
    }

    /**
     * Deserializes the list level metadata from disk, keeping only the url
     * count: every other field is cleared again right after the load.
     *
     * The load is skipped entirely when the sub-domain metadata file is
     * missing, since in that case the metadata is rebuilt during the
     * sub-domain metadata rescan.
     *
     * @throws IOException if the list metadata file cannot be opened or read
     */
    void loadMetadataFromDisk() throws IOException {

        if (!_subDomainMetadataFile.exists()) {
            // nothing to do - metadata gets rebuilt by the sub-domain rescan
            return;
        }

        final RandomAccessFile metadataFile = new RandomAccessFile(_listMetadataFile, "rw");
        try {
            _metadata.deserialize(metadataFile, new BinaryProtocol());
            // preserve the url count across the reset of all other fields
            final int persistedUrlCount = _metadata.getUrlCount();
            _metadata.clear();
            _metadata.setUrlCount(persistedUrlCount);
        } finally {
            metadataFile.close();
        }
    }

    /**
     * Serializes the list level metadata to the start of the list metadata
     * file while holding the lock on the metadata object.
     *
     * @throws IOException if the list metadata file cannot be opened or written
     */
    void writeMetadataToDisk() throws IOException {

        synchronized (_metadata) {
            final RandomAccessFile metadataFile = new RandomAccessFile(_listMetadataFile, "rw");
            try {
                // always overwrite from the beginning of the file
                metadataFile.seek(0);
                _metadata.serialize(metadataFile, new BinaryProtocol());
            } finally {
                metadataFile.close();
            }
        }
    }

    /**
     * Writes the given urls, one per line, to the output file using UTF-8.
     *
     * @param outputFile file to create or overwrite
     * @param urlList    urls to write, in order
     * @throws IOException if the file cannot be opened or if any write fails
     */
    public static void generateTestURLFile(File outputFile, String... urlList) throws IOException {
        PrintWriter writer = new PrintWriter(outputFile, "UTF-8");
        try {
            for (String url : urlList) {
                writer.println(url);
            }
            // PrintWriter never throws on write failures; checkError() flushes the
            // stream and reports whether any error occurred along the way
            if (writer.checkError()) {
                throw new IOException("Failed to write URL list to file:" + outputFile);
            }
        } finally {
            // always release the file handle, even when a write fails
            writer.close();
        }
    }

    /**
     * Self-test for the list code. Builds a list from a small generated url
     * file, marks three of the six urls complete through a stub
     * CrawlHistoryStorage, and then checks that:
     * <ul>
     * <li>every url can be retrieved by fingerprint, both from the freshly
     * initialized list and from a list re-loaded from its on-disk data;</li>
     * <li>re-applying an update whose strings have not changed leaves the sizes
     * of the fixed and variable data files untouched;</li>
     * <li>queueUnCrawledItems only hands out urls that were not marked.</li>
     * </ul>
     *
     * @param dataDirectory directory in which the list files are created
     * @param listId        id to use for the test list
     * @throws IOException if any of the underlying file operations fail
     */
    private static void validateListCode(final File dataDirectory, long listId) throws IOException {

        final String urlList[] = new String[] { "http://www.yahoo.com/1", "http://www.google.com/1",
                "http://www.cnn.com/1", "http://www.yahoo.com/2", "http://www.google.com/2",
                "http://www.cnn.com/2" };

        // createTempFile is only used to obtain a unique file name; the actual
        // url file lives in the data directory
        File tempFile = File.createTempFile("CrawlList", "validateListInit");
        File localTempFile = new File(dataDirectory, tempFile.getName());
        // remove the placeholder so it does not linger in the system temp directory
        tempFile.delete();

        generateTestURLFile(localTempFile, urlList);

        final TreeMap<String, URLFP> urlToFPMap = new TreeMap<String, URLFP>();
        final TreeMap<URLFP, String> urlFPToString = new TreeMap<URLFP, String>();

        for (String url : urlList) {
            URLFP fp = URLUtils.getURLFPFromURL(url, true);
            urlToFPMap.put(url, fp);
            urlFPToString.put(fp, url);
        }

        final TreeMap<URLFP, ProxyCrawlHistoryItem> itemsToMarkComplete = new TreeMap<URLFP, ProxyCrawlHistoryItem>();

        // item 1: failed crawl (robots excluded)
        ProxyCrawlHistoryItem item1 = new ProxyCrawlHistoryItem();

        item1.setCrawlStatus(CrawlURL.FailureReason.RobotsExcluded);
        item1.setOriginalURL(urlList[1]);

        // item 2: redirect (301) that resolved successfully (200)
        ProxyCrawlHistoryItem item2 = new ProxyCrawlHistoryItem();

        item2.setCrawlStatus(0);
        item2.setOriginalURL(urlList[3]);
        item2.setHttpResultCode(301);
        item2.setRedirectURL("http://www.yahoo.com/3");
        item2.setRedirectStatus(0);
        item2.setRedirectHttpResult(200);

        // item 3: redirect (301) whose target failed with an IOException
        ProxyCrawlHistoryItem item3 = new ProxyCrawlHistoryItem();

        item3.setCrawlStatus(0);
        item3.setOriginalURL(urlList[4]);
        item3.setHttpResultCode(301);
        item3.setRedirectURL("http://www.google.com/3");
        item3.setRedirectStatus(CrawlURL.FailureReason.IOException);

        itemsToMarkComplete.put(urlToFPMap.get(item1.getOriginalURL()), item1);
        itemsToMarkComplete.put(urlToFPMap.get(item2.getOriginalURL()), item2);
        itemsToMarkComplete.put(urlToFPMap.get(item3.getOriginalURL()), item3);

        final Set<URLFP> itemsToMarkCompleteFPSet = itemsToMarkComplete.keySet();
        final Set<URLFP> itemsNotMarked = new TreeSet<URLFP>(urlToFPMap.values());
        itemsNotMarked.removeAll(itemsToMarkCompleteFPSet);

        // stub storage that reports the three prepared items as already crawled
        CrawlHistoryStorage storage = new CrawlHistoryStorage() {

            @Override
            public void syncList(long listId, TreeSet<URLFP> matchCriteria, ItemUpdater targetList)
                    throws IOException {

                for (URLFP matchItem : matchCriteria) {
                    if (itemsToMarkCompleteFPSet.contains(matchItem)) {
                        targetList.updateItemState(matchItem, itemsToMarkComplete.get(matchItem));
                    }
                }
            }

            @Override
            public File getLocalDataDir() {
                return dataDirectory;
            }
        };

        CrawlList list1 = new CrawlList(storage, listId, localTempFile, 0);

        for (int pass = 0; pass < 2; ++pass) {

            CrawlList list = null;

            if (pass == 0) {
                System.out.println("Pass 0 - Initialize from URLList");
                list = list1;
            } else {
                System.out.println("Pass 1 - Initialize from OnDisk Data");
                list = new CrawlList(storage, listId);
            }

            // iterate fingerprints 
            for (URLFP fingerprint : urlToFPMap.values()) {
                ProxyCrawlHistoryItem itemRetrieved = list.getHistoryItemFromURLFP(fingerprint);
                if (itemsToMarkCompleteFPSet.contains(fingerprint)) {
                    ProxyCrawlHistoryItem itemExpected = itemsToMarkComplete.get(fingerprint);
                    Assert.assertTrue(itemExpected.equals(itemRetrieved));
                } else {
                    // fail with an assertion instead of a NullPointerException when
                    // the lookup comes back empty
                    Assert.assertTrue(itemRetrieved != null);
                    // un-marked items must only carry their original url
                    Assert.assertTrue(itemRetrieved.getOriginalURL().equals(urlFPToString.get(fingerprint))
                            && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS)
                            && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE)
                            && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT)
                            && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTSTATUS)
                            && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL));
                }
            }
        }
        // validate string code does not update when strings have not changed 
        item3.setRedirectStatus(0);
        item3.setRedirectHttpResult(200);

        long variableDataLength = list1._variableDataFile.length();
        long fixedDataLength = list1._fixedDataFile.length();

        list1.updateItemState(urlToFPMap.get(item3.getOriginalURL()), item3);

        Assert.assertTrue(fixedDataLength == list1._fixedDataFile.length());
        Assert.assertTrue(variableDataLength == list1._variableDataFile.length());

        // only the urls that were never marked may be handed to the queue loader
        list1.queueUnCrawledItems(new CrawlQueueLoader() {

            @Override
            public void queueURL(URLFP urlfp, String url) {
                Assert.assertTrue(itemsNotMarked.contains(urlfp));
                Assert.assertTrue(urlFPToString.get(urlfp).equals(url));
            }

            @Override
            public void flush() {
                // nothing to flush in the test loader
            }

        });

    }

    /**
     * Manual test entry point: sets up a Hadoop configuration against the
     * local file system, recreates /tmp/CrawlListTests and runs
     * validateListCode against it. IOExceptions raised by the validation are
     * printed rather than propagated.
     *
     * @param args unused
     */
    public static void testmain(String[] args) {

        final String gzipCodecClass = "org.apache.hadoop.io.compress.GzipCodec";
        final String configResources[] = { "nutch-default.xml", "nutch-site.xml", "core-site.xml",
                "hdfs-site.xml", "mapred-site.xml" };

        // initialize ...
        Configuration conf = new Configuration();
        for (String resource : configResources) {
            conf.addResource(resource);
        }

        BasicConfigurator.configure();

        conf.set("mapred.output.compression.codec", gzipCodecClass);
        conf.set("mapred.map.output.compression.codec", gzipCodecClass);

        CrawlEnvironment.setHadoopConfig(conf);
        CrawlEnvironment.setDefaultHadoopFSURI("file:///");

        // start from an empty test directory 
        File testDirectory = new File("/tmp/CrawlListTests");
        FileUtils.recursivelyDeleteFile(testDirectory);
        testDirectory.mkdir();

        try {
            final long listId = System.currentTimeMillis();
            validateListCode(testDirectory, listId);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Size in bytes of a single lookup table entry: a long domain hash
     * (8 bytes) followed by an int record offset (4 bytes).
     */
    private static final int OFFSET_TABLE_ENTRY_SIZE = 12;

    /**
     * Binary searches the in-memory offset lookup table for a domain hash.
     *
     * @param domainHash hash of the root domain to locate
     * @return the int offset stored alongside the matching hash
     * @throws IOException with message "NOT-FOUND!" when the table holds no
     *                     entry for the hash, or if reading the table fails
     */
    private final int getOffsetForSubDomainData(long domainHash) throws IOException {
        final DataInputBuffer tableReader = new DataInputBuffer();

        int first = 0;
        int last = (int) (_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) - 1;

        while (first <= last) {

            final int probe = first + ((last - first) / 2);

            // position the reader at the start of the probed entry
            tableReader.reset(_offsetLookupTable.getData(), _offsetLookupTable.getLength());
            tableReader.skip(probe * OFFSET_TABLE_ENTRY_SIZE);

            final long probedHash = tableReader.readLong();

            if (probedHash == domainHash) {
                // match - the offset immediately follows the hash
                return tableReader.readInt();
            }
            if (probedHash > domainHash) {
                last = probe - 1;
            } else {
                first = probe + 1;
            }
        }
        throw new IOException("NOT-FOUND!");
    }

    /**
     * Adjusts the queued item count stored in the on-disk sub-domain metadata
     * record of the given root domain.
     *
     * @param rootDomainName   root domain whose record should be updated
     * @param deltaQueuedCount amount added to the stored queued item count
     * @throws IOException if the domain has no lookup table entry, if its
     *                     recorded offset is zero, or if file access fails
     */
    void updateSubDomainQueueStatus(String rootDomainName, int deltaQueuedCount) throws IOException {
        final long rootDomainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);
        synchronized (_subDomainMetadataFile) {
            final CrawlListMetadata record = new CrawlListMetadata();

            final RandomAccessFile metadataFile = new RandomAccessFile(_subDomainMetadataFile, "rw");
            try {
                final int recordOffset = getOffsetForSubDomainData(rootDomainHash);
                if (recordOffset == 0) {
                    throw new IOException("Data Offset Zero for host:" + rootDomainName);
                }
                // load the current record ...
                metadataFile.seek(recordOffset);
                record.readFields(metadataFile);
                // ... apply the delta to the queued count ...
                record.setQueuedItemCount(record.getQueuedItemCount() + deltaQueuedCount);
                // ... and overwrite the record in place
                metadataFile.seek(recordOffset);
                record.write(metadataFile);
            } finally {
                metadataFile.close();
            }
        }
    }

    /**
     * Returns the sub-domain metadata for the host of the given url.
     *
     * @param originalURL url whose host is used for the lookup
     * @return the metadata returned by getSubDomainMetadataByDomain for the host
     * @throws IOException if the lookup fails
     */
    public CrawlListMetadata getSubDomainMetadataByURL(String originalURL) throws IOException {
        final String hostName = new GoogleURL(originalURL).getHost();
        return getSubDomainMetadataByDomain(hostName);
    }

    /**
     * Returns the sub-domain metadata for the root domain of the given host.
     *
     * @param hostName host name to resolve to a root domain
     * @return the metadata returned by getSubDomainMetadataByRootDomain
     * @throws IOException if no root domain can be extracted from the host
     *                     name, or if the lookup fails
     */
    public CrawlListMetadata getSubDomainMetadataByDomain(String hostName) throws IOException {
        final String rootDomainName = URLUtils.extractRootDomainName(hostName);

        if (rootDomainName == null) {
            throw new IOException("Unable to Extract RootDomainName for host:" + hostName);
        }
        return getSubDomainMetadataByRootDomain(rootDomainName);
    }

    /**
     * Reads the on-disk sub-domain metadata record of a root domain.
     *
     * @param rootDomainName root domain to look up
     * @return the loaded metadata, with its sub-domain data offset set to the
     *         record's file position so that later updates write to the
     *         proper location
     * @throws IOException if the domain has no lookup table entry, if its
     *                     recorded offset is zero, or if file access fails
     */
    public CrawlListMetadata getSubDomainMetadataByRootDomain(String rootDomainName) throws IOException {
        final long rootDomainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);

        final CrawlListMetadata record = new CrawlListMetadata();
        synchronized (_subDomainMetadataFile) {
            final RandomAccessFile metadataFile = new RandomAccessFile(_subDomainMetadataFile, "rw");
            try {
                final int recordOffset = getOffsetForSubDomainData(rootDomainHash);
                if (recordOffset == 0) {
                    throw new IOException("Data Offset Zero for host:" + rootDomainName);
                }
                metadataFile.seek(recordOffset);
                record.readFields(metadataFile);
                // remember where the record lives for subsequent write backs
                record.setSubDomainDataOffset(recordOffset);
            } finally {
                metadataFile.close();
            }
        }
        return record;
    }

    /**
     * Returns the transient (in-memory) metadata object for the root domain of
     * the given url, creating and registering a new one on first use.
     *
     * @param originalURL url whose root domain selects the metadata object
     * @return the existing or newly created metadata object
     * @throws IOException if no root domain can be extracted from the url
     */
    CrawlListMetadata getTransientSubDomainMetadata(String originalURL) throws IOException {
        final GoogleURL parsedURL = new GoogleURL(originalURL);
        final String rootDomainName = URLUtils.extractRootDomainName(parsedURL.getHost());
        if (rootDomainName == null) {
            throw new IOException("Unable to Extract RootDomainName for url:" + originalURL);
        }

        final long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName);

        CrawlListMetadata stats = _transientSubDomainStats.get(domainHash);
        if (stats == null) {
            // first url seen for this domain - register a fresh stats object
            stats = new CrawlListMetadata();
            _transientSubDomainStats.put(domainHash, stats);
            stats.setDomainName(rootDomainName);
            stats.setDomainHash(domainHash);
        }
        return stats;
    }

    /**
     * Serializes a sub-domain metadata record and writes it to the sub-domain
     * metadata file at the offset stored in the record.
     *
     * When the serialized form exceeds the fixed record size, an error is
     * logged, the record's domain name is replaced with "<<CORRUPT>>" and the
     * record is serialized again before being written.
     *
     * @param subDomainData record to persist; its domain name is modified in
     *                      the oversized case described above
     * @throws IOException if the record's data offset is zero or if file
     *                     access fails
     */
    void writeSubDomainMetadataToDisk(CrawlListMetadata subDomainData) throws IOException {

        final int fixedRecordSize = CrawlListMetadata.Constants.FixedDataSize;
        final DataOutputBuffer serialized = new DataOutputBuffer(fixedRecordSize);

        subDomainData.serialize(serialized, new BinaryProtocol());

        if (serialized.getLength() > fixedRecordSize) {
            LOG.error("ListMetadata Serialize for List:" + subDomainData.getDomainName() + " > FixedDataSize!!!");
            // retry with a placeholder domain name
            serialized.reset();
            subDomainData.setDomainName("<<CORRUPT>>");
            subDomainData.serialize(serialized, new BinaryProtocol());
        }

        synchronized (_subDomainMetadataFile) {
            final RandomAccessFile metadataFile = new RandomAccessFile(_subDomainMetadataFile, "rw");
            try {
                final long recordOffset = subDomainData.getSubDomainDataOffset();
                if (recordOffset == 0) {
                    throw new IOException("Data Offset Zero during write!");
                }
                metadataFile.seek(recordOffset);
                metadataFile.write(serialized.getData(), 0, serialized.getLength());
            } finally {
                metadataFile.close();
            }
        }
    }

    /**
     * Writes the initial sub-domain metadata file from the transient
     * per-domain stats and rebuilds the in-memory offset lookup table.
     *
     * File layout: one version byte (0), an int record count, then one fixed
     * size record per domain. Records are ordered by descending url count,
     * ties broken by ascending domain name. The transient stats map is
     * released (set to null) once its values have been captured.
     *
     * @throws IOException if the sub-domain metadata file cannot be written
     */
    void writeInitialSubDomainMetadataToDisk() throws IOException {

        final RandomAccessFile metadataFile = new RandomAccessFile(_subDomainMetadataFile, "rw");

        try {

            // header: version followed by record count
            metadataFile.writeByte(0);
            metadataFile.writeInt(_transientSubDomainStats.size());

            // capture the records and release the transient map
            final CrawlListMetadata records[] = _transientSubDomainStats.values()
                    .toArray(new CrawlListMetadata[0]);
            _transientSubDomainStats = null;

            Arrays.sort(records, new Comparator<CrawlListMetadata>() {

                @Override
                public int compare(CrawlListMetadata left, CrawlListMetadata right) {
                    // larger url counts sort first 
                    if (left.getUrlCount() != right.getUrlCount()) {
                        return (right.getUrlCount() > left.getUrlCount()) ? 1 : -1;
                    }
                    return left.getDomainName().compareTo(right.getDomainName());
                }
            });

            final int fixedRecordSize = CrawlListMetadata.Constants.FixedDataSize;
            final DataOutputBuffer recordBuffer = new DataOutputBuffer(fixedRecordSize);
            final TreeMap<Long, Integer> hashToOffset = new TreeMap<Long, Integer>();

            for (CrawlListMetadata record : records) {
                recordBuffer.reset();
                record.serialize(recordBuffer, new BinaryProtocol());

                // NOTE(review): an oversized record is only reported; the write
                // below still emits just its first FixedDataSize bytes - confirm
                // whether truncation is acceptable here
                if (recordBuffer.getLength() > fixedRecordSize) {
                    final String message = "Metadata Serialization for List:" + getListId() + " SubDomain:"
                            + record.getDomainName();
                    LOG.fatal(message);
                    System.out.println(message);
                }
                // remember where this record starts 
                hashToOffset.put(record.getDomainHash(), (int) metadataFile.getFilePointer());
                // every record occupies exactly the fixed record size on disk
                metadataFile.write(recordBuffer.getData(), 0, fixedRecordSize);
            }

            // rebuild the lookup table in ascending hash order 
            _offsetLookupTable = new DataOutputBuffer(hashToOffset.size() * OFFSET_TABLE_ENTRY_SIZE);

            for (Map.Entry<Long, Integer> mapping : hashToOffset.entrySet()) {
                _offsetLookupTable.writeLong(mapping.getKey());
                _offsetLookupTable.writeInt(mapping.getValue());
            }
        } finally {
            metadataFile.close();
        }
    }

    /**
     * Resets every record in the sub-domain metadata file, preserving only the
     * url count, first/last record offsets, domain name and domain hash. All
     * other fields are cleared and the record is rewritten in place.
     *
     * Records that fail to deserialize are logged and left untouched on disk;
     * rewriting them would persist stale or partially read values into the
     * record's slot.
     *
     * Does nothing (apart from logging) when the file does not exist.
     *
     * @throws IOException if the sub-domain metadata file cannot be read or written
     */
    void resetSubDomainCounts() throws IOException {

        LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");

        if (_subDomainMetadataFile.exists()) {

            LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");

            RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
            DataInputBuffer inputBuffer = new DataInputBuffer();
            DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

            try {
                // skip version 
                file.read();
                // read item count 
                int itemCount = file.readInt();

                LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

                CrawlListMetadata newMetadata = new CrawlListMetadata();

                for (int i = 0; i < itemCount; ++i) {

                    long orignalPos = file.getFilePointer();
                    // the output buffer's backing array doubles as the read buffer
                    file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
                    inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
                    try {
                        newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                    } catch (Exception e) {
                        LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                                + CCStringUtils.stringifyException(e));
                        // drop whatever was partially read and leave the record as is;
                        // the file pointer already sits at the start of the next record
                        newMetadata.clear();
                        continue;
                    }
                    // ok reset everything except hashes and first/last url pointers 
                    int urlCount = newMetadata.getUrlCount();
                    long firstRecordOffset = newMetadata.getFirstRecordOffset();
                    long lastRecordOffset = newMetadata.getLastRecordOffset();
                    String domainName = newMetadata.getDomainName();
                    long domainHash = newMetadata.getDomainHash();

                    // reset 
                    newMetadata.clear();
                    // restore 
                    newMetadata.setUrlCount(urlCount);
                    newMetadata.setFirstRecordOffset(firstRecordOffset);
                    newMetadata.setLastRecordOffset(lastRecordOffset);
                    newMetadata.setDomainName(domainName);
                    newMetadata.setDomainHash(domainHash);

                    // serialize it ... 
                    outputBuffer.reset();
                    newMetadata.serialize(outputBuffer, new BinaryProtocol());
                    // seek back to the start of the record 
                    file.seek(orignalPos);
                    // and rewrite it as a full fixed size block 
                    file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
                }
            } finally {
                file.close();
            }
            LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS");
        }
    }

    /**
     * Initializes the sub-domain metadata state.
     *
     * When the sub-domain metadata file exists, its fixed size records are
     * scanned and the in-memory lookup table (domain hash to record offset) is
     * rebuilt. Records that fail to deserialize are logged and left out of the
     * lookup table, so that a stale hash from a previous record is never mapped
     * to the offset of an unreadable one.
     *
     * When the file does not exist, the top level and per-domain metadata are
     * rebuilt by scanning the fixed/variable list data files, the top level
     * metadata is written to disk, and the initial sub-domain metadata file is
     * written via writeInitialSubDomainMetadataToDisk.
     *
     * @throws IOException if the data files cannot be opened, or if reading or
     *                     writing the sub-domain metadata file fails
     */
    void loadSubDomainMetadataFromDisk() throws IOException {
        LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ...  ");
        if (_subDomainMetadataFile.exists()) {

            LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

            RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
            DataInputBuffer inputBuffer = new DataInputBuffer();
            byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

            try {
                // skip version 
                file.read();
                // read item count 
                int itemCount = file.readInt();

                LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

                CrawlListMetadata newMetadata = new CrawlListMetadata();

                TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
                for (int i = 0; i < itemCount; ++i) {

                    long orignalPos = file.getFilePointer();
                    file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                    inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                    try {
                        newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                    } catch (Exception e) {
                        LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                                + CCStringUtils.stringifyException(e));
                        // do not index this record: the metadata object holds stale or
                        // partial values, and its hash may belong to a previous record
                        newMetadata.clear();
                        continue;
                    }
                    idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
                }

                // write lookup table 
                _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
                for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
                    _offsetLookupTable.writeLong(entry.getKey());
                    _offsetLookupTable.writeInt(entry.getValue());
                }
            } finally {
                file.close();
            }
            LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
        } else {

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

            // the readers are opened and closed in nested try/finally blocks so that
            // the first one is released even if opening or closing the second fails
            RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
            try {
                RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

                try {

                    //ok rebuild top level metadata as well 
                    _metadata.clear();

                    OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

                    int processedCount = 0;
                    while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

                        long position = fixedDataReader.getFilePointer();

                        // store offset in item 
                        item._fileOffset = position;
                        // load from disk 
                        item.deserialize(fixedDataReader);
                        try {
                            // seek to string data 
                            stringDataReader.seek(item._stringsOffset);
                            // and skip buffer length 
                            WritableUtils.readVInt(stringDataReader);
                            // and read primary string 
                            String url = stringDataReader.readUTF();

                            // get metadata object for subdomain 
                            CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

                            // increment url count 
                            subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);

                            // increment top level metadata count 
                            _metadata.setUrlCount(_metadata.getUrlCount() + 1);

                            // update top level metadata ..
                            updateMetadata(item, _metadata, 0);

                            // update sub-domain metadata object  from item data
                            updateMetadata(item, subDomainMetadata, 0);

                            ++processedCount;
                        } catch (IOException e) {
                            // a bad string record only skips this item; the scan continues
                            LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
                            LOG.error("Exception:" + CCStringUtils.stringifyException(e));
                            LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                                    + stringDataReader.getFilePointer());
                        }

                        if (processedCount % 10000 == 0) {
                            LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
                        }
                    }

                    // ok commit top level metadata to disk as well 
                    writeMetadataToDisk();

                } catch (IOException e) {
                    LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                            + CCStringUtils.stringifyException(e));
                    LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                            + stringDataReader.getFilePointer());
                    _queueState = QueueState.QUEUED;
                } finally {
                    stringDataReader.close();
                }
            } finally {
                fixedDataReader.close();
            }
            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");

            // write metadata to disk 
            writeInitialSubDomainMetadataToDisk();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
        }
    }

    /**
     * Returns the number of entries in the in-memory offset lookup table,
     * i.e. its byte length divided by the size of one entry.
     *
     * @return number of sub-domain entries currently indexed
     */
    public int getSubDomainItemCount() {
        synchronized (_metadata) {
            final int tableSizeInBytes = _offsetLookupTable.getLength();
            return tableSizeInBytes / OFFSET_TABLE_ENTRY_SIZE;
        }
    }

    /**
     * Returns a page of per-domain summaries read from the sub-domain metadata
     * file, in the order the records are stored on disk.
     *
     * IOExceptions are logged and not propagated; the items collected up to
     * the point of failure (possibly none) are returned.
     *
     * @param offset index of the first record to return
     * @param count  maximum number of records to return; large values such as
     *               Integer.MAX_VALUE are safe and mean "up to the end"
     * @return the summaries for records [offset, min(offset + count, itemCount))
     */
    public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) {
        synchronized (_metadata) {

            ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();

            try {
                synchronized (_subDomainMetadataFile) {
                    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
                    DataInputBuffer inputBuffer = new DataInputBuffer();
                    byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

                    try {
                        // skip version 
                        file.read();
                        // read item count 
                        int itemCount = file.readInt();

                        // compute the end index in long arithmetic: offset + count would
                        // wrap negative for large counts and silently yield no results
                        int end = (int) Math.min((long) offset + (long) count, (long) itemCount);

                        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

                        if (offset < itemCount) {

                            // records start after the 5 byte header (version byte + int count)
                            file.seek(5L + ((long) CrawlListMetadata.Constants.FixedDataSize * offset));

                            CrawlListMetadata newMetadata = new CrawlListMetadata();

                            for (int i = offset; i < end; ++i) {
                                file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                                inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                                newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                                itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata));
                            }
                        }
                    } finally {
                        file.close();
                    }
                }
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
            LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");

            return itemsOut;
        }
    }

    /**
     * Builds a domain summary item from a sub-domain metadata record.
     *
     * The error count is the sum of the timeout, IOException, DNS and other
     * error counts. The crawled count is the http 200 count plus the 403, 404,
     * 500 and other http result counts. The in-cache count is always set to 0.
     *
     * @param domainName name to store in the summary
     * @param metadata   record supplying the counts, offsets and domain hash
     * @return a newly allocated summary item
     */
    private static CrawlListDomainItem buildSubDomainSummary(String domainName, CrawlListMetadata metadata) {
        CrawlListDomainItem domainItem = new CrawlListDomainItem();

        domainItem.setDomainName(domainName);

        int robotsExcludedItemsCount = 0;
        int errorItemsCount = 0;
        int otherHTTPResultsCount = 0;

        // previously the getter result was discarded, so the summary always
        // reported zero robots excluded items
        robotsExcludedItemsCount += metadata.getRobotsExcludedCount();

        errorItemsCount += metadata.getTimeoutErrorCount();
        errorItemsCount += metadata.getIOExceptionCount();
        errorItemsCount += metadata.getDNSErrorCount();
        errorItemsCount += metadata.getOtherErrorCount();

        otherHTTPResultsCount += metadata.getHttp403Count();
        otherHTTPResultsCount += metadata.getHttp404Count();
        otherHTTPResultsCount += metadata.getHttp500Count();
        otherHTTPResultsCount += metadata.getHttpOtherCount();

        domainItem.setUrlCount(metadata.getUrlCount());
        domainItem.setUrlsCrawled(metadata.getHttp200Count() + otherHTTPResultsCount);
        domainItem.setHttp200Count(metadata.getHttp200Count());
        domainItem.setInCacheItemsCount(0);
        domainItem.setRobotsExcludedCount(robotsExcludedItemsCount);
        domainItem.setErrorCount(errorItemsCount);

        domainItem.setFirstItemOffset(metadata.getFirstRecordOffset());
        domainItem.setLastItemOffset(metadata.getLastRecordOffset());
        // the 64 bit domain hash is narrowed to an int for the summary's hash code
        domainItem.setHashCode((int) metadata.getDomainHash());

        domainItem.setQueuedCount(metadata.getQueuedItemCount());

        return domainItem;
    }

    /*
     public CrawlListMetadata getSubDomainMetadata() { 
        synchronized (_metadata) {
     ImmutableSortedSet.Builder<String> builder = ImmutableSortedSet.naturalOrder();
     builder.addAll(_subDomainNameToStatsMap.keySet());
     return builder.build();
      }
     }
     */
    /**********************************************************************/
    /**
     * Command line entry point. Supported invocation:
     *
     * <pre>dump &lt;dataDir&gt; &lt;listId&gt; &lt;outputFile&gt;</pre>
     *
     * which dumps the un-crawled items of the given list (robots excluded
     * items included) to the output file. Any other invocation prints a usage
     * message to stderr instead of failing with an
     * ArrayIndexOutOfBoundsException.
     *
     * @param args command line arguments
     * @throws IOException if the dump fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length >= 4 && args[0].equalsIgnoreCase("dump")) {
            File dataDir = new File(args[1]);
            long listId = Long.parseLong(args[2]);
            File outputPath = new File(args[3]);
            dumpUnCrawledItems(dataDir, listId, outputPath, true);
        } else {
            System.err.println("Usage: CrawlList dump <dataDir> <listId> <outputFile>");
        }
    }

    /**
     * Dumps every item of a crawl list that still needs (re)crawling to a JSON file.
     *
     * <p>The list's fixed-size records are read sequentially from the file named
     * {@code LIST_VALUE_MAP_PREFIX + listId}; for each record the url is read from the file
     * named {@code LIST_STRING_MAP_PREFIX + listId} at the record's {@code _stringsOffset}.
     * The output document has the shape
     * {@code {"urls":[{"url":..,"redirected":..,"lastStatus":..,"updateTime":..}, ...]}}.
     *
     * <p>A failure while scanning the data files is logged and ends the scan early; the
     * entries gathered up to that point are still written as a well-formed document
     * (best-effort behaviour).
     *
     * @param dataDir directory containing the list's data files
     * @param listId id of the list to dump
     * @param outputFilePath file that receives the JSON document (encoded as UTF-8)
     * @param includeRobotsExcludedItems not consulted by this method
     * @throws IOException if a data file or the output file cannot be opened, or if
     *         writing the document fails
     */
    public static void dumpUnCrawledItems(File dataDir, long listId, File outputFilePath,
            boolean includeRobotsExcludedItems) throws IOException {

        File fixedDataFile = new File(dataDir, LIST_VALUE_MAP_PREFIX + Long.toString(listId));
        File variableDataFile = new File(dataDir, LIST_STRING_MAP_PREFIX + Long.toString(listId));

        LOG.info("FixedDataFile is:" + fixedDataFile);
        LOG.info("VariableDataFile is:" + variableDataFile);

        // Each reader is guarded by its own finally so that a failure while opening a
        // later resource (second reader, output file) cannot leak an earlier one.
        RandomAccessFile fixedDataReader = new RandomAccessFile(fixedDataFile, "r");
        try {
            RandomAccessFile stringDataReader = new RandomAccessFile(variableDataFile, "r");
            try {
                writeUnCrawledItems(fixedDataReader, stringDataReader, listId, outputFilePath);
            } finally {
                closeDumpReader(stringDataReader);
            }
        } finally {
            closeDumpReader(fixedDataReader);
        }
    }

    /** Scans the list's records and writes the JSON document of items that need crawling. */
    private static void writeUnCrawledItems(RandomAccessFile fixedDataReader, RandomAccessFile stringDataReader,
            long listId, File outputFilePath) throws IOException {

        // Encode explicitly as UTF-8: urls are read via readUTF and may contain non-ASCII
        // characters, which the platform default charset is not guaranteed to represent.
        JsonWriter writer = new JsonWriter(new BufferedWriter(
                new java.io.OutputStreamWriter(new FileOutputStream(outputFilePath), Charset.forName("UTF-8")),
                1024 * 1024 * 10));

        boolean completed = false;
        try {
            writer.setIndent(" ");
            writer.beginObject();
            writer.name("urls");
            writer.beginArray();
            try {
                // a single item instance is reused for every record 
                OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

                while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

                    item.deserialize(fixedDataReader);

                    // seek to string data 
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length 
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string 
                    String url = stringDataReader.readUTF();

                    if (isDumpCandidate(item)) {
                        writeDumpEntry(writer, item, url);
                    }
                }
            } catch (IOException e) {
                // best effort: log, stop scanning and still terminate the document below 
                LOG.error("Encountered Exception Queueing Items for List:" + listId + " Exception:"
                        + CCStringUtils.stringifyException(e));
            }

            writer.endArray();
            writer.endObject();
            completed = true;
        } catch (Exception e) {
            LOG.error(CCStringUtils.stringifyException(e));
            throw new IOException(e);
        } finally {
            if (completed) {
                try {
                    writer.flush();
                } finally {
                    writer.close();
                }
            } else {
                // An exception is already propagating; closing a half-written document can
                // fail as well, and that secondary failure must not replace the original.
                try {
                    writer.close();
                } catch (Exception closeFailure) {
                    LOG.warn("Failed to close " + outputFilePath + " after an earlier error: " + closeFailure);
                }
            }
        }
    }

    /**
     * Returns true when the item has never been crawled, or when its last (redirect) crawl
     * attempt ended with a non-zero status or with an HTTP result other than 200 or 404.
     */
    private static boolean isDumpCandidate(OnDiskCrawlHistoryItem item) {
        // any item that has not been crawled needs to be queued 
        if (!item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
            return true;
        }
        if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
            return item._redirectStatus != 0
                    || (item._redirectHttpResult != 200 && item._redirectHttpResult != 404);
        }
        return item._crawlStatus != 0 || (item._httpResultCode != 200 && item._httpResultCode != 404);
    }

    /** Writes one {url, redirected, lastStatus, updateTime} object for the given item. */
    private static void writeDumpEntry(JsonWriter writer, OnDiskCrawlHistoryItem item, String url)
            throws IOException {
        boolean redirected = item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS);

        writer.beginObject();
        writer.name("url");
        writer.value(url);
        writer.name("redirected");
        writer.value(redirected);
        writer.name("lastStatus");
        if (redirected) {
            if (item._redirectStatus == 0) {
                writer.value("HTTP-" + item._redirectHttpResult);
            } else {
                // The failure is carried by _redirectStatus (mirrors the _crawlStatus branch
                // below); the http result is only reported when the status is zero.
                writer.value(CrawlURL.FailureReason.toString(item._redirectStatus));
            }
        } else if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
            if (item._crawlStatus == 0) {
                writer.value("HTTP-" + item._httpResultCode);
            } else {
                writer.value(CrawlURL.FailureReason.toString(item._crawlStatus));
            }
        } else {
            writer.value("UNCRAWLED");
        }
        writer.name("updateTime");
        writer.value(item._updateTimestamp);
        writer.endObject();
    }

    /** Closes a read-only data file, logging instead of propagating a close failure. */
    private static void closeDumpReader(RandomAccessFile reader) {
        try {
            reader.close();
        } catch (IOException e) {
            LOG.warn("Failed to close list data file: " + e);
        }
    }

}