org.commoncrawl.service.crawler.CrawlLog.java Source code


Introduction

Here is the source code for org.commoncrawl.service.crawler.CrawlLog.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.commoncrawl.service.crawler;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.zip.CRC32;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.commoncrawl.async.ConcurrentTask;
import org.commoncrawl.async.EventLoop;
import org.commoncrawl.async.Timer;
import org.commoncrawl.async.ConcurrentTask.CompletionCallback;
import org.commoncrawl.common.Environment;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CrawlSegmentDetail;
import org.commoncrawl.protocol.CrawlSegmentHost;
import org.commoncrawl.protocol.CrawlSegmentURL;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.FPGenerator;
import org.commoncrawl.util.FileUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.MovingAverage;
import org.commoncrawl.util.RuntimeStatsCollector;
import org.commoncrawl.util.SmoothedAverage;
import org.mortbay.jetty.security.Credential.MD5;

import com.google.common.collect.Iterators;

public final class CrawlLog {

    public static final Log LOG = LogFactory.getLog(CrawlLog.class);

    private static final int LOG_FLUSH_INTERVAL = 30000;

    private static final int LOG_CHECKPOINT_INTERVAL = 60000 * 5;

    private static final int LOG_FILE_CHECKPOINT_ITEM_COUNT_THRESHOLD = 100000;
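
    // Note: the log flusher timer presumably fires every LOG_FLUSH_INTERVAL milliseconds, while
    // isCheckpointPossible() (below) only allows a checkpoint once LOG_CHECKPOINT_INTERVAL has
    // elapsed since the last one and the active log holds at least
    // LOG_FILE_CHECKPOINT_ITEM_COUNT_THRESHOLD items (or has exceeded its size threshold).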

    /** log file header **/
    LogFileHeader _header = new LogFileHeader();

    /** node name **/
    String _nodeName;

    /** data directory **/
    File _rootDirectory;

    /** event loop **/
    EventLoop _eventLoop;

    /** thread pool **/
    ExecutorService _threadPool;

    /** crawler engine pointer **/
    CrawlerEngine _engine;

    /** robots segment logger **/
    CrawlSegmentLog _robotsSegment = new CrawlSegmentLog(null, -1, -1, null);

    /** individual crawl segment loggers **/
    Map<Long, CrawlSegmentLog> _loggers = new HashMap<Long, CrawlSegmentLog>();

    /** checkpoint completion callback **/
    CheckpointCompletionCallback _checkpointCompletionCallback = null;

    /** checkpoint id - analogous to parse segment id **/
    long _checkpointId;

    /** flush in progress flag **/
    boolean _flushInProgress = false;

    /** a shutdown operation is in progress **/
    boolean _shutdownInProgress = false;

    /** log flusher timer **/
    Timer _logFlusherTimer = null;

    /** last checkpoint time **/
    long _lastCheckpointTime = -1;

    MovingAverage _flushTimeAVG = new MovingAverage(10);
    SmoothedAverage _flushTimeSmoothed = new SmoothedAverage(.8);
    long _lastFlushTime = 0;

    /** get active log file path **/
    public static File getActivePath(File directoryRoot) {
        // and construct a path to the local crawl segment directory ...
        File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath());
        // append the active crawl log file name ...
        return new File(crawlDataDir, CrawlEnvironment.ActiveCrawlLog);
    }

    /** get checkpoint log file path **/
    public static File getCheckpointPath(File directoryRoot) {
        // and construct a path to the local crawl segment directory ...
        File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath());
        // append the checkpoint crawl log file name ...
        return new File(crawlDataDir, CrawlEnvironment.CheckpointCrawlLog);
    }

    public static void ensureDataDirectory(File directoryRoot) {
        // and construct a path to the local crawl segment directory ...
        File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath());

        if (!crawlDataDir.exists()) {
            crawlDataDir.mkdir();
        }
    }

    /** purge local data directory **/
    public static void purgeDataDirectory(File directoryRoot) {
        // get crawl output path ...
        File crawlDataDir = new File(directoryRoot, CrawlEnvironment.getCrawlerLocalOutputPath());
        // delete entire directory and all contents underneath it
        FileUtils.recursivelyDeleteFile(crawlDataDir);
        // recreate directory
        crawlDataDir.mkdirs();
    }

    /** unit test constructor **/
    public CrawlLog() throws IOException {
        _rootDirectory = new File(CrawlEnvironment.DEFAULT_DATA_DIR, "crawlLog_unittest");
        if (!_rootDirectory.exists())
            _rootDirectory.mkdir();
        _eventLoop = new EventLoop();
        _nodeName = "test";
        _eventLoop.start();
        _threadPool = Executors.newFixedThreadPool(1);

        initialize();

    }

    public CrawlLog(CrawlerEngine engine) throws IOException {

        _engine = engine;
        _rootDirectory = engine.getServer().getDataDirectory();
        _nodeName = engine.getServer().getHostName();
        _eventLoop = engine.getEventLoop();
        _threadPool = engine.getServer().getDefaultThreadPool();

        initialize();
    }

    private void initialize() throws IOException {

        // create data directory if necessary ...
        ensureDataDirectory(_rootDirectory);

        File checkpointLogPath = getCheckpointPath(_rootDirectory);
        File activeLogPath = getActivePath(_rootDirectory);

        // check if it exists ...
        if (checkpointLogPath.exists()) {
            // log it ...
            LOG.warn("####Checkpoint Crawl Log Found - Possible Crash Recovery");
            // rename it as the active log ...
            checkpointLogPath.renameTo(activeLogPath);
        }

        LOG.info("Crawl Log Initializing Active Log");
        // either way call initialize active log ...
        _header = initializeActiveLog(_rootDirectory);

        LOG.info("Crawl Log Initialize returned " + _header._itemCount + " Entries in Active Log");

    }

    /** initialize log (file) **/
    private static LogFileHeader initializeActiveLog(File rootDirectory) throws IOException {
        File activeLogPath = getActivePath(rootDirectory);
        return initializeLogFileHeaderFromLogFile(activeLogPath);
    }

    private static LogFileHeader initializeLogFileHeaderFromLogFile(File logFilePath) throws IOException {

        LogFileHeader headerOut = null;
        if (!logFilePath.exists()) {
            DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(logFilePath));
            try {
                headerOut = initializeEmptyLogFile(outputStream);
            } finally {
                outputStream.close();
            }
        } else {
            headerOut = new LogFileHeader();

            DataInputStream inputStream = new DataInputStream(new FileInputStream(logFilePath));

            try {
                headerOut.readHeader(inputStream);
            } finally {
                inputStream.close();
            }
        }

        return headerOut;
    }

    /** get the host name **/
    public String getNodeName() {
        return _nodeName;
    }
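
    // A "packed" segment log id encodes the list id in the high 32 bits and the segment id in
    // the low 32 bits; e.g. makeSegmentLogId(2, 7) yields 0x0000000200000007L, from which
    // getListIdFromLogId() recovers 2 and getSegmentIdFromLogId() recovers 7.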

    /** make packed log id from list id and segment log id **/
    public static long makeSegmentLogId(int listId, int segmentId) {
        return (((long) listId) << 32) | (long) segmentId;
    }

    /** get segment log id from packed id **/
    public static int getSegmentIdFromLogId(long logId) {
        return (int) (logId & 0xFFFFFFFFL);
    }

    /** get list id from packed id **/
    public static int getListIdFromLogId(long logId) {
        return (int) ((logId >> 32) & 0xFFFFFFFFL);
    }

    /** add a segment log given segment id **/
    public void addSegmentLog(CrawlSegmentLog log) {
        if (_loggers.get(makeSegmentLogId(log.getListId(), log.getSegmentId())) != null) {
            LOG.error("Attempt to Activate an Already Active Segment Log. Segment Id:" + log.getSegmentId());
            throw new RuntimeException(
                    "Attempt to Activate an Already Active Segment Log. Segment Id:" + log.getSegmentId());
        }
        _loggers.put(makeSegmentLogId(log.getListId(), log.getSegmentId()), log);
    }

    /** get the special robots crawl segment **/
    public CrawlSegmentLog getRobotsSegment() {
        return _robotsSegment;
    }

    /** get a segment log given segment id **/
    public CrawlSegmentLog getLogForSegment(int listId, int segmentId) {
        return _loggers.get(makeSegmentLogId(listId, segmentId));
    }

    /** remove segment log **/
    public CrawlSegmentLog removeSegmentLog(int listId, int segmentId) {
        return _loggers.remove(makeSegmentLogId(listId, segmentId));
    }
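
    // On-disk layout of the active/checkpoint crawl log: a fixed-size header (magic bytes,
    // version, file size, item count) written at offset zero, followed by the sync-delimited
    // records appended during log flushes (see SyncedCrawlURLLogWriter). updateLogFileHeader()
    // rewrites this header in place after each flush.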

    private static class LogFileHeader {

        public static final int LogFileHeaderBytes = 0xCC00CC00;
        public static final int LogFileVersion = 1;

        public LogFileHeader() {
            _fileSize = 0;
            _itemCount = 0;
        }

        public long _fileSize;
        public long _itemCount;

        public void writeHeader(DataOutput stream) throws IOException {
            stream.writeInt(LogFileHeaderBytes);
            stream.writeInt(LogFileVersion);
            stream.writeLong(_fileSize);
            stream.writeLong(_itemCount);
        }

        public void readHeader(DataInput stream) throws IOException {
            int headerBytes = stream.readInt();
            int version = stream.readInt();

            if (headerBytes != LogFileHeaderBytes || version != LogFileVersion) {
                throw new IOException("Invalid CrawlLog File Header Detected!");
            }
            _fileSize = stream.readLong();
            _itemCount = stream.readLong();
        }
    }

    private static void updateLogFileHeader(File logFileName, LogFileHeader header, long addedRecordCount)
            throws IOException {

        RandomAccessFile file = new RandomAccessFile(logFileName, "rw");

        try {

            // update cached header ...
            header._fileSize = file.getChannel().size();
            header._itemCount += addedRecordCount;
            // set the position at zero ..
            file.seek(0);
            // and write header to disk ...
            header.writeHeader(file);
        } finally {
            // major bottle neck..
            // file.getFD().sync();
            file.close();
        }
    }

    private static LogFileHeader initializeEmptyLogFile(DataOutput stream) throws IOException {

        LogFileHeader header = new LogFileHeader();
        header.writeHeader(stream);

        return header;
    }

    public static LogFileHeader readLogFileHeader(File logFileName) throws IOException {

        LogFileHeader headerOut = new LogFileHeader();
        RandomAccessFile file = new RandomAccessFile(logFileName, "r");
        try {
            headerOut = readLogFileHeader(file);
        } finally {
            file.close();
        }
        return headerOut;
    }

    private static LogFileHeader readLogFileHeader(DataInput reader) throws IOException {

        LogFileHeader headerOut = new LogFileHeader();

        headerOut.readHeader(reader);

        return headerOut;
    }

    private boolean isCheckpointInProgress() {
        return _checkpointCompletionCallback != null;
    }

    private boolean isFlushInProgress() {
        return _flushInProgress;
    }

    private void setFlushInProgress(boolean value) {
        _flushInProgress = value;

        if (value == false) {
            // since we are in the async thread at this point, check to see if a
            // checkpoint is in progress
            if (isCheckpointInProgress()) {
                // if so, it was deferred, because of the flush in progress... so we
                // need to actually kick off the checkpoint progress
                // now that the flush is complete
                doCheckpoint();
            }
        }
    }

    public static interface CheckpointCompletionCallback {

        public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList);

        public void checkpointFailed(long checkpointId, Exception e);

    }

    public static interface FlushCompletionCallback {
        public void flushComplete();

        public void flushFailed(Exception e);
    }

    /** essentially swap crawl logs **/
    private void checkpointLocalCrawlLog() throws IOException {
        File activeCrawlLog = getActivePath(_rootDirectory);
        File checkpointCrawlLog = getCheckpointPath(_rootDirectory);

        LOG.info("MOVING ACTIVE:" + activeCrawlLog + "TO:" + checkpointCrawlLog);
        // delete any existing checkpoint log ...
        checkpointCrawlLog.delete();
        // rename active log to check point log
        activeCrawlLog.renameTo(checkpointCrawlLog);
        // and create a new active crawlLog ...
        _header = initializeActiveLog(_rootDirectory);
    }

    public void checkpoint(long checkpointStartTime, CheckpointCompletionCallback callback, long checkpointId) {

        // first check to see if checkpoint is already in progress ...
        if (isCheckpointInProgress()) {
            // immediately fail the call and bail out ...
            callback.checkpointFailed(checkpointId,
                    new Exception("Invalid State. Checkpoint already in progress!"));
            return;
        }

        _lastCheckpointTime = checkpointStartTime;

        // otherwise transition to a checkpoint in progress state
        _checkpointCompletionCallback = callback;
        _checkpointId = checkpointId;

        // now check to see if we are not in the middle of a flush ...
        if (!isFlushInProgress()) {
            // if not we can directly start the actual checkpoint process ...
            doCheckpoint();
        }
        // otherwise wait for the flush to finish (and thus trigger the checkpoint
        // process)
    }

    public void finalizeCheckpoint() {
        File checkpointLogFile = getCheckpointPath(_rootDirectory);
        checkpointLogFile.delete();
    }

    public void abortCheckpoint() {
        File activeLogFile = getActivePath(_rootDirectory);
        File checkpointLogFile = getCheckpointPath(_rootDirectory);
        LOG.info("###ABORTING CHECKPOINT! RENAMING:" + checkpointLogFile + " TO:" + activeLogFile);
        checkpointLogFile.renameTo(activeLogFile);
    }

    public void purgeActiveLog() throws IOException {
        File activeLogFilePath = getActivePath(_rootDirectory);

        if (activeLogFilePath.exists())
            activeLogFilePath.delete();

        _header = initializeActiveLog(_rootDirectory);
    }

    private static class CorruptCrawlLogException extends IOException {

        public CorruptCrawlLogException(String description) {
            super(description);
        }
    }

    /**
     * seek out the next instance of the sync bytes in the given file
     * 
     * @param syncBytesBuffer scratch buffer of SYNC_BYTES_SIZE bytes used for the rolling scan
     * @param file the crawl log file to scan, starting at its current position
     * @param maxFileSize upper bound (taken from the log header) on how far to scan
     * @return true if the sync bytes were found (the file is repositioned to just before them),
     *         false otherwise
     * @throws IOException
     */
    private static boolean seekToNextSyncBytesPos(byte[] syncBytesBuffer, RandomAccessFile file, long maxFileSize)
            throws IOException {

        while (file.getFilePointer() < maxFileSize) {
            try {
                // read in a sync.length buffer amount
                file.read(syncBytesBuffer);

                int syncLen = SYNC_BYTES_SIZE;

                // start scan for next sync position ...
                for (int i = 0; file.getFilePointer() < maxFileSize; i++) {
                    int j = 0;
                    for (; j < syncLen; j++) {
                        if (_sync[j] != syncBytesBuffer[(i + j) % syncLen])
                            break;
                    }
                    if (j == syncLen) {
                        // found matching sync bytes - reset file pos to before sync bytes
                        file.seek(file.getFilePointer() - SYNC_BYTES_SIZE); // position
                        // before
                        // sync
                        return true;
                    }
                    syncBytesBuffer[i % syncLen] = file.readByte();
                }
            } catch (IOException e) {
                LOG.warn("IOException at:" + file.getFilePointer() + " Exception:"
                        + CCStringUtils.stringifyException(e));
                LOG.warn("Skipping to:" + file.getFilePointer() + 4096);
                file.seek(file.getFilePointer() + 4096);
            }
        }
        return false;
    }

    private static interface HDFSCrawlURLWriter {
        public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException;

        public void close() throws IOException;

        public List<Path> getFilenames();
    }
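
    // HDFSCrawlURLWriter implementation that writes checkpointed CrawlURL records to block
    // compressed SequenceFiles in the HDFS staging directory, rolling over to a newly named
    // checkpoint file once the current writer reaches roughly 1GB. Files that end up with zero
    // records are deleted on close; the surviving paths are reported via getFilenames().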

    private static class SequenceFileCrawlURLWriter implements HDFSCrawlURLWriter {

        FileSystem _fs;
        Configuration _conf;
        Path _stagingDirectory;
        String _nodeName;
        long _currentFileRecordCount = 0;
        ArrayList<Path> _outputPaths = new ArrayList<Path>();

        long _nextFileId = -1L;
        Path currentFilePath = null;
        SequenceFile.Writer writer = null;
        long _prevPos;

        public SequenceFileCrawlURLWriter(Configuration conf, FileSystem fs, Path path, String nodeName,
                long checkpointId) throws IOException {
            _conf = conf;
            _fs = fs;
            _stagingDirectory = path;
            _nodeName = nodeName;
            _nextFileId = checkpointId;

            flushFile(true);
        }

        private void flushFile(boolean openNew) throws IOException {
            if (writer != null) {
                writer.close();
                if (_currentFileRecordCount != 0) {
                    LOG.info("Flushed Temp Checkpoint File:" + currentFilePath);
                    _outputPaths.add(currentFilePath);
                } else {
                    _fs.delete(currentFilePath, false);
                }
                writer = null;
                _currentFileRecordCount = 0;
                currentFilePath = null;
            }

            if (openNew) {
                // allocate a new filename
                currentFilePath = new Path(_stagingDirectory,
                        CrawlEnvironment.buildCrawlLogCheckpointName(_nodeName, _nextFileId++));
                LOG.info("Allocating new Checkpoint File:" + currentFilePath);
                // delete it
                if (_fs.exists(currentFilePath)) {
                    LOG.warn("Existing Checkpoint TempFile found at:" + currentFilePath + " - Deleting");
                    _fs.delete(currentFilePath, false);
                }
                // open a sequence file writer at the temp file location ...
                writer = SequenceFile.createWriter(_fs, _conf, currentFilePath, Text.class, CrawlURL.class,
                        CompressionType.BLOCK, new SnappyCodec());
                // reset record count ...
                _currentFileRecordCount = 0;
            }
        }

        @Override
        public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException {
            writer.append(url, urlObject);
            ++_currentFileRecordCount;
            long pos = writer.getLength();
            if (pos != _prevPos) {
                _prevPos = pos;
                if (pos >= 1073741824L) {
                    flushFile(true);
                }
            }
        }

        public void close() throws IOException {
            flushFile(false);
        }

        public List<Path> getFilenames() {
            return _outputPaths;
        }
    };

    private static class URLWriterException extends IOException {
        public URLWriterException() {

        }
    }
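
    // Replays a local checkpointed crawl log into the supplied HDFSCrawlURLWriter: for each
    // record it seeks to the next sync marker, reads the length/CRC-prefixed payload, validates
    // the CRC32, and on a mismatch logs the error and resynchronizes at the next marker instead
    // of failing. Failures raised by the writer itself are wrapped as URLWriterException and
    // propagated to the caller.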

    private static void transferLocalCheckpointLog(File crawlLogPath, HDFSCrawlURLWriter writer, long checkpointId)
            throws IOException {

        // and open the crawl log file ...
        RandomAccessFile inputStream = null;

        IOException exception = null;

        CRC32 crc = new CRC32();
        CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 17);
        byte[] syncBytesBuffer = new byte[SYNC_BYTES_SIZE];

        // save position for potential debug output.
        long lastReadPosition = 0;

        try {
            inputStream = new RandomAccessFile(crawlLogPath, "rw");
            // and a data input stream ...
            RandomAccessFile reader = inputStream;
            // seek to zero
            reader.seek(0L);

            // read the header ...
            LogFileHeader header = readLogFileHeader(reader);

            // read a crawl url from the stream...

            while (inputStream.getFilePointer() < header._fileSize) {

                if (seekToNextSyncBytesPos(syncBytesBuffer, reader, header._fileSize)) {

                    try {
                        lastReadPosition = inputStream.getFilePointer();

                        // skip sync
                        inputStream.skipBytes(SYNC_BYTES_SIZE);

                        // read length ...
                        int urlDataLen = reader.readInt();
                        long urlDataCRC = reader.readLong();

                        if (urlDataLen > buffer.getBuffer().length) {
                            buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536);
                        }
                        reader.readFully(buffer.getBuffer(), 0, urlDataLen);
                        crc.reset();
                        crc.update(buffer.getBuffer(), 0, urlDataLen);

                        long computedValue = crc.getValue();

                        // validate crc values ...
                        if (computedValue != urlDataCRC) {
                            LOG.error("CRC Mismatch Detected during HDFS transfer in CrawlLog:"
                                    + crawlLogPath.getAbsolutePath() + " Checkpoint Id:" + checkpointId
                                    + " FilePosition:" + lastReadPosition);
                            inputStream.seek(lastReadPosition + 1);
                        } else {
                            // allocate a crawl url data structure
                            CrawlURL url = new CrawlURL();
                            DataInputStream bufferReader = new DataInputStream(
                                    new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen));
                            // populate it from the (in memory) data stream
                            url.readFields(bufferReader);
                            try {
                                // and write out appropriate sequence file entries ...
                                writer.writeCrawlURLItem(new Text(url.getUrl()), url);
                            } catch (IOException e) {
                                LOG.error("Failed to write CrawlURL to SequenceFileWriter with Exception:"
                                        + CCStringUtils.stringifyException(e));
                                throw new URLWriterException();
                            }
                        }
                    } catch (URLWriterException e) {
                        LOG.error("Caught URLRewriter Exception! - Throwing to outer layer!");
                        throw e;
                    } catch (Exception e) {
                        LOG.error("Ignoring Error Processing CrawlLog Entry at Position:" + lastReadPosition
                                + " Exception:" + CCStringUtils.stringifyException(e));
                    }
                } else {
                    break;
                }
            }
        } catch (EOFException e) {
            LOG.error("Caught EOF Exception during read of local CrawlLog:" + crawlLogPath.getAbsolutePath()
                    + " Checkpoint Id:" + checkpointId + " FilePosition:" + lastReadPosition);
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            exception = e;
            throw e;
        } finally {
            if (inputStream != null)
                inputStream.close();
        }
    }

    private Path getFinalSegmentLogPath(FileSystem hdfs, long checkpointId, int listId, int segmentId)
            throws IOException {
        Path listLogDirectory = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory(),
                ((Integer) listId).toString());
        Path segmentLogDirectory = new Path(listLogDirectory, ((Integer) segmentId).toString());
        Path completionLogFilePath = new Path(segmentLogDirectory,
                CrawlEnvironment.buildCrawlSegmentLogCheckpointFileName(getNodeName(), checkpointId));

        return completionLogFilePath;
    }

    private Path transferLocalSegmentLog(FileSystem hdfs, File localSegmentLogFile, long checkpointId, int listId,
            int segmentId) throws IOException {

        if (localSegmentLogFile.exists()) {

            // determine the file's size ...
            // if > header size (in other words it has data ... )
            if (localSegmentLogFile.length() > CrawlSegmentLog.getHeaderSize()) {
                // construct a target path (where we are going to store the checkpointed
                // crawl log )
                Path remoteLogFileName = new Path(CrawlEnvironment.getCheckpointStagingDirectory(),
                        CrawlEnvironment.buildCrawlSegmentLogCheckpointFileName(getNodeName(), checkpointId) + "_"
                                + Integer.toString(listId) + "_" + Integer.toString(segmentId));

                hdfs.copyFromLocalFile(new Path(localSegmentLogFile.getAbsolutePath()), remoteLogFileName);

                return remoteLogFileName;
            }
        }
        return null;
    }

    private void purgeHDFSSegmentLogs(FileSystem hdfs, int listId, int segmentId) throws IOException {

        Path listLogDirectory = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory(),
                ((Integer) listId).toString());
        Path segmentLogDirectory = new Path(listLogDirectory, ((Integer) segmentId).toString());
        Path completionLogFilePath = new Path(segmentLogDirectory,
                CrawlEnvironment.buildCrawlSegmentCompletionLogFileName(getNodeName()));

        if (!hdfs.exists(completionLogFilePath)) {
            // create a zero length completion log file on hdfs ...
            hdfs.createNewFile(completionLogFilePath);
        }

        // skip this step as history servers now manage segment logs
        /*
         * // and now ... delete all logs Path segmentLogWildcardPath = new
         * Path(segmentLogDirectory
         * ,CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString
         * (getNodeName())); FileStatus paths[] =
         * hdfs.globStatus(segmentLogWildcardPath); if (paths != null) { for
         * (FileStatus path : paths) { // hdfs.delete(path.getPath()); } }
         */
    }
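
    // Checkpoint flow: (1) on the async thread, swap the active crawl log (and each active
    // segment log) over to its checkpoint file; (2) in a background task, stream the checkpointed
    // crawl log into SequenceFiles in the HDFS staging directory, copy each segment log up, and
    // rename everything from staging into the final checkpoint / segment log directories;
    // (3) back on the async thread, finalize (or, on failure, abort) the local checkpoint files
    // and notify the registered CheckpointCompletionCallback.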

    /** perform the actual checkpoint work here ... **/
    private void doCheckpoint() {
        // at this point, we should be in the async thread, and all flusher
        // activities are blocked ...
        LOG.info("CrawlLog Checkpoint - Starting ");
        // collect all necessary information from thread-unsafe data structure now
        // (in async thread context)
        final Set<Long> activeSegments = new HashSet<Long>();

        try {
            // add all active segment ids to our key set ...
            activeSegments.addAll(_loggers.keySet());
            LOG.info("CrawlLog Checkpoint - Preparing CrawlLog Files");
            // checkpoint crawl log ...
            checkpointLocalCrawlLog();
            LOG.info("CrawlLog Checkpoint - Preparing Segment Log Files");
            // next checkpoint all active segment logs ...
            for (CrawlSegmentLog segmentLog : _loggers.values()) {
                segmentLog.checkpointLocalLog();
            }
            LOG.info("CrawlLog Checkpoint - Ready for HDFS Transfer");
        } catch (IOException e) {
            LOG.error("Checkpoint failed with Exception:" + CCStringUtils.stringifyException(e));
        }

        // spawn a thread to do most of the blocking io ...
        _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop,

                new Callable<Boolean>() {

                    public Boolean call() throws Exception {

                        // we need to track these in case of failure ...
                        Vector<Path> segmentLogStagingPaths = new Vector<Path>();
                        Vector<Path> segmentLogFinalPaths = new Vector<Path>();

                        // get the file system
                        final FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();

                        try {

                            LOG.info("CrawlLog Checkpoint - Transferring CrawlLog to HDFS");

                            // construct a target path (where we are going to store the
                            // checkpointed crawl log )
                            Path stagingDirectory = new Path(CrawlEnvironment.getCheckpointStagingDirectory());

                            SequenceFileCrawlURLWriter hdfsWriter = new SequenceFileCrawlURLWriter(
                                    CrawlEnvironment.getHadoopConfig(), hdfs, stagingDirectory, getNodeName(),
                                    _checkpointId);

                            try {
                                // write out crawl log to hdfs ...
                                transferLocalCheckpointLog(getCheckpointPath(_rootDirectory), hdfsWriter,
                                        _checkpointId);
                            } catch (Exception e) {
                                LOG.error("HDFS Write of CrawlLog failed. Deleting tempFiles:"
                                        + hdfsWriter.getFilenames() + " Exception:"
                                        + CCStringUtils.stringifyException(e));

                                // close writer
                                hdfsWriter.close();
                                // delete any hdfs output ...
                                for (Path path : hdfsWriter.getFilenames()) {
                                    LOG.info("Deleting temp (HDFS) checkpoint file:" + path);
                                    hdfs.delete(path, false);
                                }
                                throw e;
                            } finally {
                                hdfsWriter.close();
                            }

                            LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Logs");
                            // and next for every segment
                            for (long packedLogId : activeSegments) {

                                File segmentLogPath = CrawlSegmentLog.buildCheckpointPath(_rootDirectory,
                                        getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId));

                                // LOG.info("CrawlLog Checkpoint - Transferring CrawlSegment Log for Segment:"
                                // + segmentId);
                                // copy the segment log ...
                                Path remoteLogFilePath = transferLocalSegmentLog(hdfs, segmentLogPath,
                                        _checkpointId, getListIdFromLogId(packedLogId),
                                        getSegmentIdFromLogId(packedLogId));
                                // if path is not null (data was copied) ...
                                if (remoteLogFilePath != null) {
                                    // add it to vector ...
                                    segmentLogStagingPaths.add(remoteLogFilePath);
                                    // and add final path to vector while we are at it ...
                                    segmentLogFinalPaths.add(getFinalSegmentLogPath(hdfs, _checkpointId,
                                            getListIdFromLogId(packedLogId), getSegmentIdFromLogId(packedLogId)));
                                }
                            }
                            LOG.info("CrawlLog Checkpoint - Finished Transferring CrawlSegment Logs");

                            // now if we got here ... all hdfs transfers succeeded ...
                            // go ahead and move checkpoint log from staging to final data
                            // directory ...
                            Path checkpointDirectory = new Path(CrawlEnvironment.getCheckpointDataDirectory());

                            // if no checkpoint data directory ... create one ...
                            if (!hdfs.exists(checkpointDirectory))
                                hdfs.mkdirs(checkpointDirectory);

                            for (Path checkpointTempFilePath : hdfsWriter.getFilenames()) {
                                Path checkpointFinalPath = new Path(checkpointDirectory,
                                        checkpointTempFilePath.getName());
                                LOG.info("Promoting Checking File From:" + checkpointTempFilePath + " to:"
                                        + checkpointFinalPath);
                                // and essentially move the crawl log file from staging to data
                                // directory ..
                                boolean success = hdfs.rename(checkpointTempFilePath, checkpointFinalPath);
                                if (!success) {
                                    throw new IOException("Failed to Rename Checkpoint Temp:"
                                            + checkpointTempFilePath + " to:" + checkpointFinalPath);
                                }
                            }
                            // and now do the same thing for each segment log file
                            for (int i = 0; i < segmentLogStagingPaths.size(); ++i) {
                                hdfs.rename(segmentLogStagingPaths.get(i), segmentLogFinalPaths.get(i));
                            }
                            // if we got here the checkpoint was successful ...
                            return true;
                        } catch (Exception e) {
                            LOG.error("Checkpoint:" + _checkpointId + " FAILED with exception:"
                                    + CCStringUtils.stringifyException(e));
                            for (Path segmentPath : segmentLogStagingPaths) {
                                hdfs.delete(segmentPath, false);
                            }
                            for (Path segmentPath : segmentLogFinalPaths) {
                                hdfs.delete(segmentPath, false);
                            }
                            throw e;
                        }
                    }
                },

                new CompletionCallback<Boolean>() {

                    public void taskComplete(Boolean updateResult) {

                        Vector<Long> completedSegmentList = new Vector<Long>();

                        LOG.info("CrawlLog Checkpoint - Finalizing CrawlLog Checkpoint");
                        // delete the local checkpoint log ...
                        finalizeCheckpoint();

                        LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLogs");
                        for (CrawlSegmentLog segmentLog : _loggers.values()) {
                            // LOG.info("CrawlLog Checkpoint - Finalizing CrawlSegmentLog for Segment:"
                            // + segmentLog.getSegmentId());
                            // finalize the checkpoint on the segment log ...
                            segmentLog.finalizeCheckpoint();
                            // and check to see if the segment has been completed ...
                            if (segmentLog.isSegmentComplete()) {
                                // if so, add it our completed segments list ...
                                completedSegmentList
                                        .add(makeSegmentLogId(segmentLog.getListId(), segmentLog.getSegmentId()));
                            }
                        }

                        // now for all completed segments ... purge hdfs logs ...
                        for (long packedSegmentId : completedSegmentList) {
                            try {
                                LOG.info(
                                        "CrawlLog Checkpoint - Purging HDFS CrawlSegmentLogs from Completed Segment. List:"
                                                + getListIdFromLogId(packedSegmentId) + " Segment:"
                                                + getSegmentIdFromLogId(packedSegmentId));
                                // purge hdfs files (and create a completion log file)
                                purgeHDFSSegmentLogs(CrawlEnvironment.getDefaultFileSystem(),
                                        getListIdFromLogId(packedSegmentId),
                                        getSegmentIdFromLogId(packedSegmentId));
                                LOG.info(
                                        "CrawlLog Checkpoint - Purging Local CrawlSegmentLogs from Completed Segment. List:"
                                                + getListIdFromLogId(packedSegmentId) + " Segment:"
                                                + getSegmentIdFromLogId(packedSegmentId));
                                // and purge local files as well ...
                                _loggers.get(packedSegmentId).purgeLocalFiles();
                            } catch (IOException e) {
                                LOG.error("Purge SegmentLog for Segment List:" + getListIdFromLogId(packedSegmentId)
                                        + " Segment:" + getSegmentIdFromLogId(packedSegmentId)
                                        + " threw IOException:" + CCStringUtils.stringifyException(e));
                            }
                            LOG.info("CrawlLog Checkpoint - DeRegistering Segment List:"
                                    + getListIdFromLogId(packedSegmentId) + " Segment:"
                                    + getSegmentIdFromLogId(packedSegmentId) + " From CrawlLog");
                            // no matter what ... unload the segment ...
                            _loggers.remove(packedSegmentId);
                        }

                        CheckpointCompletionCallback callback = _checkpointCompletionCallback;
                        long checkpointId = _checkpointId;

                        // and clear the checkpoint-in-progress state
                        _checkpointCompletionCallback = null;
                        _checkpointId = -1;

                        LOG.info("CrawlLog Checkpoint - Checkpoint Complete - Initiating Callback");

                        // and complete transaction ...
                        callback.checkpointComplete(checkpointId, completedSegmentList);

                    }

                    public void taskFailed(Exception e) {

                        // all failures are critical in this particular task ...
                        LOG.error("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));

                        // revert checkpoint logs ...
                        abortCheckpoint();

                        for (CrawlSegmentLog segmentLog : _loggers.values()) {
                            segmentLog.abortCheckpoint();
                        }

                        CheckpointCompletionCallback callback = _checkpointCompletionCallback;
                        long checkpointId = _checkpointId;

                        // and clear the checkpoint-in-progress state
                        _checkpointCompletionCallback = null;
                        _checkpointId = -1;

                        // now check to see if this was corrupt crawl log exception
                        if (e.getCause() instanceof CorruptCrawlLogException) {
                            // ACK!!!
                            LOG.fatal("Corrupt CrawlLog detected with Exception:"
                                    + CCStringUtils.stringifyException(e));

                            try {
                                // this is a serious error ... time to purge the crawl log directory
                                // altogether ...
                                purgeActiveLog();

                                // and all active segment logs as well...
                                for (CrawlSegmentLog segmentLog : _loggers.values()) {
                                    segmentLog.purgeActiveLog();
                                }
                            } catch (IOException e2) {
                                LOG.error("IOException during Segment Log PURGE:"
                                        + CCStringUtils.stringifyException(e2));
                            }

                            // time to die hard ...
                            throw new RuntimeException(e);
                        }

                        // and complete transaction ...
                        callback.checkpointFailed(checkpointId, e);
                    }
                }));
    }

    private static final class CustomByteArrayOutputStream extends ByteArrayOutputStream {
        public CustomByteArrayOutputStream(int initialSize) {
            super(initialSize);
        }

        public byte[] getBuffer() {
            return buf;
        }
    }

    private void logCrawlLogWrite(CrawlURL url, int bufferSizeOut) {
        StringBuffer sb = new StringBuffer();

        sb.append(String.format("%1$20.20s ", CCStringUtils.dateStringFromTimeValue(System.currentTimeMillis())));
        sb.append(String.format("%1$4.4s ", url.getResultCode()));
        sb.append(String.format("%1$10.10s ", url.getContentRaw().getCount()));
        if ((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
            sb.append(url.getRedirectURL());
            sb.append(" ");
        }
        sb.append(url.getUrl());
        _engine.getCrawlLogLog().info(sb.toString());
    }

    static byte[] _sync; // 16 sync marker bytes (MD5 digest of a fixed string)
    static final int SYNC_BYTES_SIZE = 16;
    static {
        try {
            MessageDigest digester = MessageDigest.getInstance("MD5");
            digester.update("SOME RANDOM BYTES".getBytes());
            _sync = digester.digest();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
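
    // Each record appended to the crawl log is framed as: the 16 _sync marker bytes, a 4-byte
    // payload length, an 8-byte CRC32 of the payload, and then the serialized CrawlURL payload.
    // The optional error-injection mode corrupts the CRC of every other record, presumably so
    // that the resynchronization path in transferLocalCheckpointLog() can be exercised in tests.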

    private static class SyncedCrawlURLLogWriter {

        boolean _injectErrors = false;
        boolean _corruptThisEntry = false;

        public SyncedCrawlURLLogWriter(boolean injectErrors) {
            _injectErrors = injectErrors;
        }

        public SyncedCrawlURLLogWriter() {

        }

        private CustomByteArrayOutputStream bufferOutputStream = new CustomByteArrayOutputStream(1 << 17);
        private DataOutputStream dataOutputStream = new DataOutputStream(bufferOutputStream);
        private CRC32 crc = new CRC32();

        public void writeItem(DataOutputStream crawlLogStream, CrawlURL url) throws IOException {

            bufferOutputStream.reset();
            // write to intermediate stream ...
            url.write(dataOutputStream);
            // and crc the data ...
            crc.reset();
            crc.update(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());
            // write out sync bytes first
            crawlLogStream.write(_sync);
            // write out length
            crawlLogStream.writeInt(bufferOutputStream.size());
            // crc next
            long computedValue = crc.getValue();
            if (_injectErrors) {
                _corruptThisEntry = !_corruptThisEntry;
                if (_corruptThisEntry) {
                    LOG.info("Intentionally Corrupting URL:" + url.getUrl());
                    computedValue += 12;
                }
            }
            crawlLogStream.writeLong(computedValue);
            // and then the data
            crawlLogStream.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());
        }
    }
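
    // Flush flow: log item buffers are collected from the robots segment and every active
    // CrawlSegmentLog on the async thread (so no synchronization is required), then a background
    // task appends the entries to the active crawl log and to each segment's active log file,
    // and finally rewrites the crawl log and segment log headers with the updated record counts.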

    private void flushLog(final FlushCompletionCallback completionCallback) {
        if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH:Collecting Entries....");
        // set flush in progress indicator ...
        setFlushInProgress(true);
        // and collect buffers in async thread context (thus not requiring
        // synchronization)
        final LinkedList<CrawlSegmentLog.LogItemBuffer> collector = new LinkedList<CrawlSegmentLog.LogItemBuffer>();
        // flush robots log
        _robotsSegment.flushLog(collector);
        // walk segments collecting log items ....
        for (CrawlSegmentLog logger : _loggers.values()) {
            // flush any log items into the collector
            logger.flushLog(collector);
        }
        if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH:Collection Returned " + collector.size() + " Buffers");

        // walk collector list identifying the list of unique segment ids
        final Set<Long> packedSegmentIdSet = new HashSet<Long>();

        int urlItemCount = 0;

        for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
            if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
                packedSegmentIdSet.add(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
            }
            urlItemCount += buffer.getItemCount();
        }

        if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH:There are  " + urlItemCount + " Items in Flush Buffer Associated With "
                    + packedSegmentIdSet.size() + " Segments");

        final File crawlLogFile = getActivePath(_rootDirectory);

        // now check to see if there is anything to do ...
        if (collector.size() != 0) {
            if (Environment.detailLogEnabled())
                LOG.info("LOG_FLUSH: Collector Size is NOT Zero... Starting Log Flusher Thread");
            // ok ... time to spawn a thread to do the blocking flush io
            _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop,

                    new Callable<Boolean>() {

                        public Boolean call() throws Exception {

                            if (Environment.detailLogEnabled())
                                LOG.info("LOG_FLUSH: Log Flusher Thread Started");
                            long startTime = System.currentTimeMillis();

                            Map<Long, DataOutputStream> streamsMapByPackedId = new HashMap<Long, DataOutputStream>();
                            Map<Long, Integer> recordCountsByPackedId = new HashMap<Long, Integer>();

                            long crawlLogRecordCount = 0;

                            // open the actual crawler log file ...
                            final DataOutputStream crawlLogStream = new DataOutputStream(
                                    new FileOutputStream(crawlLogFile, true));

                            try {
                                if (Environment.detailLogEnabled())
                                    LOG.info(
                                            "LOG_FLUSH: Log Flusher Thread Opening Streams for Segments in Buffer");
                                // now open a set of file descriptors related to the identified
                                // segments
                                for (long packedSegmentId : packedSegmentIdSet) {
                                    // construct the unique filename for the given log file...
                                    File activeSegmentLog = CrawlSegmentLog.buildActivePath(_rootDirectory,
                                            getListIdFromLogId(packedSegmentId),
                                            getSegmentIdFromLogId(packedSegmentId));
                                    // initialize the segment log ...
                                    CrawlSegmentLog.initializeLogFile(activeSegmentLog);
                                    // initialize record counts per stream ...
                                    recordCountsByPackedId.put(packedSegmentId,
                                            CrawlSegmentLog.readerHeader(activeSegmentLog));
                                    // and open an output stream for the specified log file ...
                                    streamsMapByPackedId.put(packedSegmentId,
                                            new DataOutputStream(new FileOutputStream(activeSegmentLog, true)));
                                }

                                if (Environment.detailLogEnabled())
                                    LOG.info("LOG_FLUSH: Log Flusher Thread Walking Items in Buffer");

                                // initialize a total item count variable
                                int totalItemCount = 0;

                                // crawl history stream
                                DataOutputBuffer historyStream = new DataOutputBuffer();

                                // and now walk log buffers ...
                                for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
                                    if (Environment.detailLogEnabled())
                                        LOG.info("LOG_FLUSH: Log Flusher Thread Writing " + buffer.getItemCount()
                                                + " Entries for Segment:" + buffer.getSegmentId());

                                    // output stream
                                    DataOutputStream segmentLogStream = null;

                                    if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
                                        // update segment count first ...
                                        recordCountsByPackedId.put(
                                                makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()),
                                                recordCountsByPackedId.get(
                                                        makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()))
                                                        + buffer.getItemCount());
                                        // get output stream associated with segment id
                                        segmentLogStream = streamsMapByPackedId
                                                .get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
                                    }

                                    // and our local record counter ...
                                    crawlLogRecordCount += buffer.getItemCount();

                                    // and next do the actual disk flush ...
                                    totalItemCount += buffer.flushToDisk(totalItemCount,

                                            new CrawlSegmentLog.LogItemBuffer.CrawlURLWriter() {

                                                SyncedCrawlURLLogWriter syncedLogWriter = new SyncedCrawlURLLogWriter();

                                                public void writeItem(CrawlURL url) throws IOException {
                                                    // log it
                                                    logCrawlLogWrite(url, url.getContentSize());
                                                    // write it
                                                    syncedLogWriter.writeItem(crawlLogStream, url);
                                                }

                                                public void writeItemCount(int entryCount) throws IOException {
                                                }

                                            }, segmentLogStream, historyStream);
                                }

                                if (Environment.detailLogEnabled())
                                    LOG.info("LOG_FLUSH: Log Flusher Finished Writing Entries To Disk");
                                collector.clear();

                            } catch (IOException e) {
                                LOG.error("Critical Exception during Crawl Log Flush:"
                                        + CCStringUtils.stringifyException(e));
                                throw e;
                            } finally {
                                if (crawlLogStream != null) {
                                    crawlLogStream.flush();
                                    crawlLogStream.close();
                                }

                                for (DataOutputStream stream : streamsMapByPackedId.values()) {
                                    if (stream != null) {
                                        stream.flush();
                                        stream.close();
                                    }
                                }
                            }
                            // at this point... update the crawl log header ...
                            try {
                                if (Environment.detailLogEnabled())
                                    LOG.info("LOG_FLUSH: Updating Log File Headers");
                                // update the log file header
                                updateLogFileHeader(crawlLogFile, _header, crawlLogRecordCount);
                                // and update each completion log header ...
                                for (long packedSegmentId : recordCountsByPackedId.keySet()) {
                                    File activeSegmentLogPath = CrawlSegmentLog.buildActivePath(_rootDirectory,
                                            getListIdFromLogId(packedSegmentId),
                                            getSegmentIdFromLogId(packedSegmentId));
                                    CrawlSegmentLog.writeHeader(activeSegmentLogPath,
                                            recordCountsByPackedId.get(packedSegmentId));
                                }
                            } catch (IOException e) {
                                LOG.error("Criticial Exception during Crawl Log Fluhs:"
                                        + CCStringUtils.stringifyException(e));
                                throw e;
                            } finally {

                            }

                            long endTime = System.currentTimeMillis();

                            _flushTimeAVG.addSample((double) endTime - startTime);
                            _flushTimeSmoothed.addSample((double) endTime - startTime);
                            _lastFlushTime = endTime - startTime;

                            LOG.info("LOG_FLUSH: Log Flusher Flushed Successfully");
                            return true;
                        }
                    },

                    new CompletionCallback<Boolean>() {

                        public void taskComplete(Boolean updateResult) {
                            setFlushInProgress(false);
                            if (completionCallback != null) {
                                completionCallback.flushComplete();
                            }
                        }

                        public void taskFailed(Exception e) {

                            setFlushInProgress(false);

                            if (completionCallback != null) {
                                completionCallback.flushFailed(e);
                            }

                            // all failures are critical in this particular task ...
                            LOG.fatal("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));

                            // no matter ... it is time to CORE the server ...
                            throw new RuntimeException("CRITICAL FAILURE: Crawl Log FLUSH Threw Exception:"
                                    + CCStringUtils.stringifyException(e));

                        }
                    }));
        } else {
            setFlushInProgress(false);
            if (completionCallback != null) {
                completionCallback.flushComplete();
            }
        }
    }

    public boolean isForcedCheckpointPossible() {
        // a forced checkpoint only requires that there is at least one item in the log ...
        return _header._itemCount != 0;
    }

    public boolean isCheckpointPossible(long currentTime) {

        if (_lastCheckpointTime == -1 || currentTime - _lastCheckpointTime >= LOG_CHECKPOINT_INTERVAL) {

            // now one more check to see if we have enough items to do a checkpoint
            // ...
            if (_header._itemCount >= LOG_FILE_CHECKPOINT_ITEM_COUNT_THRESHOLD
                    || _header._fileSize >= 1073741824L * 4) {
                return true;
            }
        }
        return false;
    }
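
    // Illustrative constant (hypothetical, not referenced by the original code): the
    // checkpoint file-size gate above corresponds to 4GB expressed in bytes; a long
    // literal is required, since a plain int product such as 1073741824 * 4 would
    // silently overflow to zero.
    private static final long EXAMPLE_CHECKPOINT_FILE_SIZE_LIMIT = 1073741824L * 4;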

    public void forceFlushAndCheckpointLog(final CheckpointCompletionCallback outerCallback) {
        if (isCheckpointInProgress() || isFlushInProgress()) {
            throw new RuntimeException("forceFlush called while active Checkpoint or Flush In Progress!!");
        }

        flushLog(new FlushCompletionCallback() {

            @Override
            public void flushComplete() {

                long currentTime = System.currentTimeMillis();

                LOG.info("LOG_FLUSH Flush Complete... Checking to see if Checkpoint Possilbe");
                if (isForcedCheckpointPossible()) {
                    // yes .. go ahead and checkpoint log
                    LOG.info("Checkpointing Logs to HDFS");
                    // start the checkpoint ...
                    checkpoint(currentTime, new CheckpointCompletionCallback() {

                        public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList) {
                            LOG.info("CrawlLog Checkpoint:" + checkpointId + " completed");

                            if (completedSegmentList != null) {
                                // walk completed segments ... updating their crawl state ...
                                if (_engine != null) {
                                    for (long packedSegmentId : completedSegmentList) {
                                        // notify crawler engine of status change ...
                                        _engine.crawlSegmentComplete(packedSegmentId);
                                    }
                                }
                            }
                            // ok initiate outer callback
                            outerCallback.checkpointComplete(checkpointId, null);
                        }

                        public void checkpointFailed(long checkpointId, Exception e) {
                            LOG.error("Checkpoint Failed for Checkpoint:" + checkpointId + " With Exception:"
                                    + CCStringUtils.stringifyException(e));
                            outerCallback.checkpointFailed(checkpointId, e);
                        }

                    }, currentTime);
                } else {
                    if (Environment.detailLogEnabled())
                        LOG.info("Checkpoint Skipped. Nothing to checkpoint");
                    outerCallback.checkpointComplete(0, null);
                }
            }

            @Override
            public void flushFailed(Exception e) {
                // log error and bail ...
                LOG.error(CCStringUtils.stringifyException(e));
                // initiate callback
                outerCallback.checkpointFailed(0, e);
            }

        });
    }
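
    /**
     * Illustrative usage sketch (hypothetical helper, not part of the original API
     * surface): how a caller might drive a forced flush + checkpoint via
     * forceFlushAndCheckpointLog and react to the completion callback.
     */
    public void exampleForceCheckpointUsage() {
        forceFlushAndCheckpointLog(new CheckpointCompletionCallback() {

            public void checkpointComplete(long checkpointId, Vector<Long> completedSegmentList) {
                // the forced path above always passes null for the completed segment list
                LOG.info("Forced checkpoint:" + checkpointId + " completed");
            }

            public void checkpointFailed(long checkpointId, Exception e) {
                LOG.error("Forced checkpoint:" + checkpointId + " failed:"
                        + CCStringUtils.stringifyException(e));
            }
        });
    }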

    public void startLogFlusher() {

        _logFlusherTimer = new Timer(LOG_FLUSH_INTERVAL, true, new Timer.Callback() {

            public void timerFired(Timer timer) {
                // if checkpoint is NOT in progress ...
                if (!isCheckpointInProgress() && !isFlushInProgress()) {

                    LOG.info("LOG_FLUSH Starting ...");

                    flushLog(

                            new FlushCompletionCallback() {

                                public void flushComplete() {
                                    // flush is complete ... check to see if we want to do a
                                    // checkpoint ...
                                    long currentTime = System.currentTimeMillis();

                                    LOG.info("LOG_FLUSH Flush Complete... Checking to see if Checkpoint Possilbe");
                                    if (isCheckpointPossible(currentTime)) {

                                        LOG.info("Checkpointing Logs to HDFS");

                                        // pause fetcher to prevent race condition where log flush takes
                                        // a long time and causes the fetcher to consume all available
                                        // memory with content buffers
                                        _engine.pauseFetch();

                                        // start the checkpoint ...
                                        checkpoint(currentTime, new CheckpointCompletionCallback() {

                                            public void checkpointComplete(long checkpointId,
                                                    Vector<Long> completedSegmentList) {
                                                LOG.info("CrawlLog Checkpoint:" + checkpointId + " completed");

                                                _engine.resumeFetch();

                                                if (completedSegmentList != null) {
                                                    // walk completed segments ... updating their crawl state
                                                    // ...
                                                    if (_engine != null) {
                                                        for (long packedSegmentId : completedSegmentList) {
                                                            // notify crawler engine of status change ...
                                                            _engine.crawlSegmentComplete(packedSegmentId);
                                                        }
                                                    }
                                                }
                                            }

                                            public void checkpointFailed(long checkpointId, Exception e) {

                                                _engine.resumeFetch();

                                                LOG.error("Checkpoint Failed for Checkpoint:" + checkpointId
                                                        + " With Exception:" + CCStringUtils.stringifyException(e));
                                            }

                                        }, currentTime);
                                    }
                                }

                                public void flushFailed(Exception e) {
                                    LOG.error("Flush Failed with Exception:" + CCStringUtils.stringifyException(e));
                                }

                            }

                    );

                    _engine.resumeFetch();
                }

            }
        });

        _eventLoop.setTimer(_logFlusherTimer);
    }

    public interface LogFlusherStopActionCallback {
        public void stopComplete();
    }

    public void stopLogFlusher(final LogFlusherStopActionCallback completionCallback) {

        // indicate that a shutdown is in progress ...
        _shutdownInProgress = true;

        // stop the log flusher timer ...
        if (_logFlusherTimer != null) {
            _eventLoop.cancelTimer(_logFlusherTimer);
        }

        // create a polling timer ...
        final Timer waitTimer = new Timer(1000, true, new Timer.Callback() {

            public void timerFired(Timer timer) {

                // check to see if we are done flushing or checkpointing ...
                if (!isFlushInProgress() && !isCheckpointInProgress()) {
                    LOG.info(
                            "CrawlLog - stopLog Timer - No Flush or Checkpoint in Progress... Initiating CrawlLog Shutdown");
                    // good to go ... cancel timer first ...
                    _eventLoop.cancelTimer(timer);
                    // and cleanup ...
                    _logFlusherTimer = null;
                    _shutdownInProgress = false;
                    // initiate callback ...
                    completionCallback.stopComplete();
                } else {
                    LOG.info("CrawlLog - stopLog Timer - Flush or Checkpoint in Progress... Waiting ... ");
                }
            }
        });
        // and start the timer ...
        _eventLoop.setTimer(waitTimer);
    }
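
    /**
     * Illustrative shutdown sketch (hypothetical helper): stop the flusher and wait
     * for the polling timer above to confirm that no flush or checkpoint is in
     * flight before proceeding with the rest of the shutdown.
     */
    public void exampleStopLogFlusherUsage() {
        stopLogFlusher(new LogFlusherStopActionCallback() {
            public void stopComplete() {
                LOG.info("CrawlLog flusher stopped - safe to continue shutdown");
            }
        });
    }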

    public void collectStats(RuntimeStatsCollector collector) {

        collector.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeAVG,
                _flushTimeAVG.getAverage());
        collector.setDoubleValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeSmoothed,
                _flushTimeSmoothed.getAverage());
        collector.setLongValue(CrawlerEngineStats.ID, CrawlerEngineStats.Name.CrawlLog_FlushTimeLast,
                _lastFlushTime);
    }

    private static CrawlSegmentHost createHost(String hostName) {
        CrawlSegmentHost host = new CrawlSegmentHost();
        host.setHostName(hostName);
        byte[] hostNameAsBytes = host.getHostName().getBytes();
        host.setHostFP(FPGenerator.std64.fp(hostNameAsBytes, 0, hostNameAsBytes.length));
        return host;
    }

    private static CrawlSegmentURL createSegmentURL(URL url) {
        CrawlSegmentURL segmentURL = new CrawlSegmentURL();
        segmentURL.setUrl(url.toString());
        byte[] urlAsBytes = segmentURL.getUrl().getBytes();
        segmentURL.setUrlFP(FPGenerator.std64.fp(urlAsBytes, 0, urlAsBytes.length));
        return segmentURL;
    }
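
    // Illustrative sketch (hypothetical helper): building a single-host fragment with
    // the createHost/createSegmentURL helpers above. The URL is a placeholder.
    private static CrawlSegmentHost exampleBuildSingleURLHost() throws MalformedURLException {
        URL url = new URL("http://example.com/index.html");
        CrawlSegmentHost host = createHost(url.getHost());
        host.getUrlTargets().add(createSegmentURL(url));
        return host;
    }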

    private static CrawlSegmentDetail loadCrawlSegment(String fileName) throws IOException {

        TreeMap<String, CrawlSegmentHost> hosts = new TreeMap<String, CrawlSegmentHost>();

        URL resourceURL = CrawlEnvironment.getHadoopConfig().getResource(fileName);

        if (resourceURL == null) {
            throw new FileNotFoundException();
        }
        InputStream stream = resourceURL.openStream();
        BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(stream)));

        String line = null;

        do {
            line = reader.readLine();
            if (line != null) {
                if (Environment.detailLogEnabled())
                    LOG.info(line);
                try {
                    URL theURL = new URL(line);

                    CrawlSegmentHost host = hosts.get(theURL.getHost());
                    if (host == null) {

                        host = createHost(theURL.getHost());

                        hosts.put(theURL.getHost(), host);
                    }
                    CrawlSegmentURL segmentURL = createSegmentURL(theURL);
                    host.getUrlTargets().add(segmentURL);
                } catch (MalformedURLException e) {
                    LOG.error("SKIPPING Malformed URL::" + line);
                }
            }
        } while (line != null);

        // close the reader now that the resource has been fully consumed
        reader.close();

        CrawlSegmentDetail crawlSegmentDetail = new CrawlSegmentDetail();

        int urlCount = 0;
        crawlSegmentDetail.setSegmentId(1);
        for (CrawlSegmentHost host : hosts.values()) {
            crawlSegmentDetail.getHosts().add(host);
            urlCount += host.getUrlTargets().size();
        }

        crawlSegmentDetail.setUrlCount(urlCount);

        // finally, sort by host (as would be the case in a properly map-reduce
        // produced segment) ...
        Collections.sort(crawlSegmentDetail.getHosts());

        return crawlSegmentDetail;

    }
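
    // Note on loadCrawlSegment input (illustrative): the resource is expected to
    // contain one absolute URL per line, for example:
    //
    //   http://host-a.com/page1.html
    //   http://host-a.com/page2.html
    //   http://host-b.org/index.html
    //
    // Lines that fail URL parsing are logged and skipped rather than aborting the load.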

    public Vector<Long> getActiveSegmentIdList() {

        Vector<Long> segmentIdList = new Vector<Long>();
        segmentIdList.addAll(_loggers.keySet());
        return segmentIdList;
    }

    static void validateInputOutputCrawlURLArrays(ArrayList<CrawlURL> input, ArrayList<CrawlURL> output)
            throws IOException {
        Assert.assertTrue(input.size() == output.size());
        for (int i = 0; i < input.size(); ++i) {
            CrawlURL left = input.get(i);
            CrawlURL right = output.get(i);
            Assert.assertTrue(left.getUrl().equals(right.getUrl()));
            // compare raw content by value - Object.equals on byte arrays only tests reference identity
            Assert.assertTrue(java.util.Arrays.equals(left.getContentRaw().getReadOnlyBytes(),
                    right.getContentRaw().getReadOnlyBytes()));
        }
    }

    static void validateLogFlusherCode(final File localDirPath, final Path remotePath, boolean injectErrors)
            throws IOException {

        final Configuration conf = new Configuration();

        final FileSystem fs = FileSystem.get(conf);

        fs.mkdirs(remotePath);

        // ok create a crawlLog test file
        File localFile = File.createTempFile("crawlLog", "test", localDirPath);
        localFile.delete();

        LOG.info("Initializing Temp File:" + localFile);
        // initialize
        LogFileHeader fileHeader = initializeLogFileHeaderFromLogFile(localFile);

        LOG.info("Creating SyncedCrawl URL Writer");
        // create synced url writer ...
        SyncedCrawlURLLogWriter crawlURLWriter = new SyncedCrawlURLLogWriter(injectErrors);

        ArrayList<CrawlURL> urlObjects = new ArrayList<CrawlURL>();
        // write a couple of url objects
        for (int i = 0; i < 100; ++i) {
            CrawlURL url = new CrawlURL();
            url.setUrl("http://someurl.com/" + i);
            byte bytes[] = MD5.digest("Some Random:" + Math.random() + " Number").getBytes();
            url.setContentRaw(new FlexBuffer(bytes));
            final DataOutputStream crawlLogStream = new DataOutputStream(new FileOutputStream(localFile, true));
            try {
                LOG.info("Appending object to log");
                crawlURLWriter.writeItem(crawlLogStream, url);
            } finally {
                LOG.info("Flushing Log");
                crawlLogStream.flush();
                crawlLogStream.close();
            }
            LOG.info("Updating Header");
            updateLogFileHeader(localFile, fileHeader, 1);

            if (!injectErrors || i % 2 == 0) {
                urlObjects.add(url);
            } else {
                // drop odd entry
                LOG.info("Dropping Odd Entry:" + url.getUrl());
            }
        }

        final ArrayList<CrawlURL> urlObjectsOut = new ArrayList<CrawlURL>();

        HDFSCrawlURLWriter stubWriter = new HDFSCrawlURLWriter() {

            SequenceFileCrawlURLWriter innerWriter = new SequenceFileCrawlURLWriter(conf, fs, remotePath,
                    "testNode", 1L);

            @Override
            public void writeCrawlURLItem(Text url, CrawlURL urlObject) throws IOException {
                LOG.info("Got URL:" + url.toString());
                urlObjectsOut.add(urlObject);
                innerWriter.writeCrawlURLItem(url, urlObject);
            }

            @Override
            public void close() throws IOException {
                innerWriter.close();
            }

            public List<Path> getFilenames() {
                return innerWriter.getFilenames();
            }
        };

        try {
            LOG.info("Transferring from Local to Remote");
            transferLocalCheckpointLog(localFile, stubWriter, 1L);
        } finally {
            stubWriter.close();
        }
        LOG.info("Validating Input/Output");
        validateInputOutputCrawlURLArrays(urlObjects, urlObjectsOut);
        // read via sequenceFile
        urlObjectsOut.clear();
        Path firstFile = Iterators.getNext(stubWriter.getFilenames().iterator(), null);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, firstFile, conf);
        Text key = new Text();
        CrawlURL value = new CrawlURL();
        while (reader.next(key, value)) {
            LOG.info("Got:" + key.toString());
            urlObjectsOut.add(value);
            value = new CrawlURL();
        }
        reader.close();
        LOG.info("Validating Input/Output");
        validateInputOutputCrawlURLArrays(urlObjects, urlObjectsOut);

        LOG.info("Done!");
    }
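
    // Illustrative driver (hypothetical paths): exercising the validation harness
    // above both without and with injected errors.
    static void exampleRunLogFlusherValidation() throws IOException {
        File localScratchDir = new File(System.getProperty("java.io.tmpdir"));
        Path remoteScratchPath = new Path("/tmp/crawlLogValidationTest");
        validateLogFlusherCode(localScratchDir, remoteScratchPath, false);
        validateLogFlusherCode(localScratchDir, remoteScratchPath, true);
    }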

    public static void walkCrawlLogFile(File crawlLogPath, long startOffset) throws IOException {

        // and open the crawl log file ...
        RandomAccessFile inputStream = null;

        CRC32 crc = new CRC32();
        CustomByteArrayOutputStream buffer = new CustomByteArrayOutputStream(1 << 17);
        byte[] syncBytesBuffer = new byte[SYNC_BYTES_SIZE];

        // save position for potential debug output.
        long lastReadPosition = 0;

        try {
            // open the file read-only - this routine only walks and validates the log
            inputStream = new RandomAccessFile(crawlLogPath, "r");

            // and a data input stream ...
            RandomAccessFile reader = inputStream;
            // seek to zero
            reader.seek(0L);

            // read the header ...
            LogFileHeader header = readLogFileHeader(reader);

            System.out.println("Header ItemCount:" + header._itemCount + " FileSize:" + header._fileSize);

            if (startOffset != 0L) {
                System.out.println("Preseeking to:" + startOffset);
                reader.seek(startOffset);
            }

            // read a crawl url from the stream...

            long recordCount = 0;
            while (inputStream.getFilePointer() < header._fileSize) {

                // System.out.println("PRE-SYNC SeekPos:"+
                // inputStream.getFilePointer());
                if (seekToNextSyncBytesPos(syncBytesBuffer, reader, header._fileSize)) {

                    // System.out.println("POST-SYNC SeekPos:"+
                    // inputStream.getFilePointer());

                    lastReadPosition = inputStream.getFilePointer();

                    // skip sync
                    inputStream.skipBytes(SYNC_BYTES_SIZE);

                    // read length ...
                    int urlDataLen = reader.readInt();
                    long urlDataCRC = reader.readLong();

                    if (urlDataLen > buffer.getBuffer().length) {
                        buffer = new CustomByteArrayOutputStream(((urlDataLen / 65536) + 1) * 65536);
                    }
                    // readFully guarantees the entire record is read (read() may return short)
                    reader.readFully(buffer.getBuffer(), 0, urlDataLen);
                    crc.reset();
                    crc.update(buffer.getBuffer(), 0, urlDataLen);

                    long computedValue = crc.getValue();

                    // validate crc values ...
                    if (computedValue != urlDataCRC) {
                        LOG.error("CRC Mismatch Detected during HDFS transfer in CrawlLog:"
                                + crawlLogPath.getAbsolutePath() + " FilePosition:" + lastReadPosition);
                        inputStream.seek(lastReadPosition + 1);
                    } else {
                        if (recordCount++ % 10000 == 0) {
                            // allocate a crawl url data structure
                            CrawlURL url = new CrawlURL();
                            DataInputStream bufferReader = new DataInputStream(
                                    new ByteArrayInputStream(buffer.getBuffer(), 0, urlDataLen));
                            // populate it from the (in memory) data stream
                            url.readFields(bufferReader);

                            System.out.println("Record:" + recordCount + " At:" + lastReadPosition + " URL:"
                                    + url.getUrl() + " BuffSize:" + urlDataLen + " ContentLen:"
                                    + url.getContentRaw().getCount() + " LastModified:"
                                    + new Date(url.getLastAttemptTime()).toString());
                        }
                    }
                } else {
                    break;
                }
            }
        } catch (EOFException e) {
            LOG.error("Caught EOF Exception during read of local CrawlLog:" + crawlLogPath.getAbsolutePath()
                    + " FilePosition:" + lastReadPosition);
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            throw e;
        } finally {
            if (inputStream != null)
                inputStream.close();
        }
    }
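
    // Illustrative driver (hypothetical path): walking a local crawl log from the
    // start of its data region; a non-zero offset resumes the walk mid-file.
    static void exampleWalkCrawlLog() throws IOException {
        walkCrawlLogFile(new File("/tmp/crawlLog.active"), 0L);
    }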
}