Java tutorial: Heritrix's StatisticsTracker
/*
 * Copyright (C) 2009 Internet Archive
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Created on Jul 16, 2003
 */
package org.archive.crawler.admin;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.Comparator;
import java.util.Date;
import java.util.EventObject;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Vector;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.collections.Closure;
import org.apache.commons.httpclient.HttpStatus;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.crawler.event.CrawlURIDispositionListener;
import org.archive.crawler.framework.AbstractTracker;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.net.UURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.MimetypeUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.PaddingStringBuffer;
import org.archive.util.Supplier;

/**
 * This is an implementation of the AbstractTracker. It is designed to function
 * with the WUI as well as performing various logging activity.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre> [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made, we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number
 * is the current rate and the average is in parentheses.
 * <p>
 * <b>doc/s(avg)</b> works the same way as KB/s(avg) except it shows the number
 * of documents (URIs) rather than KB downloaded.
 * <p>
 * <b>busy-threads</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally, <b>mem-use-KB</b> is extracted from the runtime environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
 * <p>
 * In addition to the data collected for the above logs, various other data
 * is gathered and stored by this tracker.
 * <ul>
 * <li> Successfully downloaded documents per fetch status code
 * <li> Successfully downloaded documents per document mime type
 * <li> Amount of data per mime type
 * <li> Successfully downloaded documents per host
 * <li> Amount of data per host
 * <li> Disposition of all seeds (this is written to 'reports.log' at end of
 *      crawl)
 * <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * @author Parker Thompson
 * @author Kristinn Sigurdsson
 *
 * @see org.archive.crawler.framework.StatisticsTracking
 * @see org.archive.crawler.framework.AbstractTracker
 */
public class StatisticsTracker extends AbstractTracker
        implements CrawlURIDispositionListener, Serializable {

    private static final long serialVersionUID = 8004878315916392305L;

    /**
     * Messages from the StatisticsTracker.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

    // TODO: Need to be able to specify file where the object will be
    // written once the CrawlEnded event occurs

    protected long lastPagesFetchedCount = 0;
    protected long lastProcessedBytesCount = 0;

    /*
     * Snapshot data.
     */
    protected long discoveredUriCount = 0;
    protected long queuedUriCount = 0;
    protected long finishedUriCount = 0;
    protected long downloadedUriCount = 0;
    protected long downloadFailures = 0;
    protected long downloadDisregards = 0;
    protected double docsPerSecond = 0;
    protected double currentDocsPerSecond = 0;
    protected int currentKBPerSec = 0;
    protected long totalKBPerSec = 0;
    protected int busyThreads = 0;
    protected long totalProcessedBytes = 0;
    protected float congestionRatio = 0;
    protected long deepestUri;
    protected long averageDepth;

    /*
     * Cumulative data
     */
    /** tally sizes novel, verified (same hash), vouched (not-modified) */
    protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();
    protected long notModifiedUriCount = 0;
    protected long dupByHashUriCount = 0;
    protected long novelUriCount = 0;

    /** Keep track of the file types we see (mime type -> count) */
    protected ConcurrentMap<String, AtomicLong> mimeTypeDistribution =
        new ConcurrentHashMap<String, AtomicLong>();
    protected ConcurrentMap<String, AtomicLong> mimeTypeBytes =
        new ConcurrentHashMap<String, AtomicLong>();

    /** Keep track of fetch status codes */
    protected ConcurrentMap<String, AtomicLong> statusCodeDistribution =
        new ConcurrentHashMap<String, AtomicLong>();

    /** reusable Supplier for initial zero AtomicLong instances */
    private static final Supplier<AtomicLong> ATOMIC_ZERO_SUPPLIER =
        new Supplier<AtomicLong>() {
            public AtomicLong get() {
                return new AtomicLong(0);
            }
        };

    /**
     * Keep track of hosts.
     *
     * <p>They're transient because they're usually bigmaps that get
     * reconstituted on recovery from a checkpoint.
     */
    protected transient ObjectIdentityCache<String, AtomicLong> hostsDistribution = null;
    protected transient ObjectIdentityCache<String, AtomicLong> hostsBytes = null;
    protected transient ObjectIdentityCache<String, AtomicLong> hostsLastFinished = null;

    /** Keep track of URL counts per host per seed */
    protected transient ObjectIdentityCache<String, ConcurrentMap<String, AtomicLong>> sourceHostDistribution = null;

    /**
     * Record of seeds' latest actions.
     */
    protected transient ObjectIdentityCache<String, SeedRecord> processedSeedsRecords;

    // seeds tallies: ONLY UPDATED WHEN SEED REPORT WRITTEN
    private int seedsCrawled;
    private int seedsNotCrawled;

    // sExitMessage: only set at crawl-end
    private String sExitMessage = "Before crawl end";

    public StatisticsTracker(String name) {
        super(name, "A statistics tracker that is integrated into "
            + "the web UI and that creates the progress-statistics log.");
    }

    public void initialize(CrawlController c) throws FatalConfigurationException {
        super.initialize(c);
        try {
            this.sourceHostDistribution = c.getBigMap("sourceHostDistribution",
                ConcurrentMap.class);
            this.hostsDistribution = c.getBigMap("hostsDistribution",
                AtomicLong.class);
            this.hostsBytes = c.getBigMap("hostsBytes", AtomicLong.class);
            this.hostsLastFinished = c.getBigMap("hostsLastFinished",
                AtomicLong.class);
            this.processedSeedsRecords = c.getBigMap("processedSeedsRecords",
                SeedRecord.class);
        } catch (Exception e) {
            throw new FatalConfigurationException("Failed setup of"
                + " StatisticsTracker: " + e);
        }
        controller.addCrawlURIDispositionListener(this);
    }

    protected void finalCleanup() {
        super.finalCleanup();
        if (this.hostsBytes != null) {
            this.hostsBytes.close();
            this.hostsBytes = null;
        }
        if (this.hostsDistribution != null) {
            this.hostsDistribution.close();
            this.hostsDistribution = null;
        }
        if (this.hostsLastFinished != null) {
            this.hostsLastFinished.close();
            this.hostsLastFinished = null;
        }
        if (this.processedSeedsRecords != null) {
            this.processedSeedsRecords.close();
            this.processedSeedsRecords = null;
        }
        if (this.sourceHostDistribution != null) {
            this.sourceHostDistribution.close();
            this.sourceHostDistribution = null;
        }
    }

    protected synchronized void progressStatisticsEvent(final EventObject e) {
        // This method loads "snapshot" data.
        discoveredUriCount = discoveredUriCount();
        downloadedUriCount = successfullyFetchedCount();
        finishedUriCount = finishedUriCount();
        queuedUriCount = queuedUriCount();
        downloadFailures = failedFetchAttempts();
        downloadDisregards = disregardedFetchAttempts();
        totalProcessedBytes = totalBytesCrawled();
        congestionRatio = congestionRatio();
        deepestUri = deepestUri();
        averageDepth = averageDepth();

        if (finishedUriCount() == 0) {
            docsPerSecond = 0;
            totalKBPerSec = 0;
        } else if (getCrawlerTotalElapsedTime() < 1000) {
            return; // Not enough time has passed for a decent snapshot.
        } else {
            docsPerSecond = (double) downloadedUriCount
                / (double) (getCrawlerTotalElapsedTime() / 1000);
            // Round to nearest long.
            totalKBPerSec = (long) (((totalProcessedBytes / 1024)
                / ((getCrawlerTotalElapsedTime()) / 1000)) + .5);
        }

        busyThreads = activeThreadCount();

        if (shouldrun || (System.currentTimeMillis() - lastLogPointTime) >= 1000) {
            // If shouldrun is false there is a chance that the time interval
            // since last time is too small for a good sample. We only want
            // to update "current" data when the interval is long enough or
            // shouldrun is true.
            currentDocsPerSecond = 0;
            currentKBPerSec = 0;

            // Note time.
            long currentTime = System.currentTimeMillis();
            long sampleTime = currentTime - lastLogPointTime;

            // If we haven't done anything or there isn't a reasonable sample
            // size, give up.
            if (sampleTime >= 1000) {
                // Update docs/sec snapshot
                long currentPageCount = successfullyFetchedCount();
                long samplePageCount = currentPageCount - lastPagesFetchedCount;
                currentDocsPerSecond = (double) samplePageCount
                    / (double) (sampleTime / 1000);
                lastPagesFetchedCount = currentPageCount;

                // Update kbytes/sec snapshot
                long currentProcessedBytes = totalProcessedBytes;
                long sampleProcessedBytes = currentProcessedBytes - lastProcessedBytesCount;
                currentKBPerSec = (int) (((sampleProcessedBytes / 1024)
                    / (sampleTime / 1000)) + .5);
                lastProcessedBytesCount = currentProcessedBytes;
            }
        }

        if (this.controller != null) {
            this.controller.logProgressStatistics(getProgressStatisticsLine());
        }
        lastLogPointTime = System.currentTimeMillis();
        super.progressStatisticsEvent(e);
    }
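    // Illustrative sketch, not part of the original class: how the paired
    // "current(average)" rate figures above are derived. 'sampleBytes' and
    // 'sampleMs' describe the interval since the last snapshot; 'totalBytes'
    // and 'totalMs' cover the whole crawl. The method name is hypothetical.
    private static String exampleKBPerSecColumn(long sampleBytes, long sampleMs,
            long totalBytes, long totalMs) {
        long current = (long) (((sampleBytes / 1024) / (sampleMs / 1000)) + .5);
        long average = (long) (((totalBytes / 1024) / (totalMs / 1000)) + .5);
        // e.g. 5 MB in a 10s sample of a 120s crawl totaling 60 MB -> "512(512)".
        return current + "(" + average + ")";
    }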
    /**
     * Return one line of current progress-statistics
     *
     * @param now
     * @return String of stats
     */
    public String getProgressStatisticsLine(Date now) {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.getLog14Date(now))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.doubleToString(currentDocsPerSecond, 2)
                + "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKBPerSec + "(" + totalKBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            .raAppend(126, (Runtime.getRuntime().totalMemory()
                - Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    public Map<String, Number> getProgressStatistics() {
        Map<String, Number> stats = new HashMap<String, Number>();
        stats.put("discoveredUriCount", new Long(discoveredUriCount));
        stats.put("queuedUriCount", new Long(queuedUriCount));
        stats.put("downloadedUriCount", new Long(downloadedUriCount));
        stats.put("currentDocsPerSecond", new Double(currentDocsPerSecond));
        stats.put("docsPerSecond", new Double(docsPerSecond));
        stats.put("totalKBPerSec", new Long(totalKBPerSec));
        stats.put("totalProcessedBytes", new Long(totalProcessedBytes));
        stats.put("currentKBPerSec", new Long(currentKBPerSec));
        stats.put("downloadFailures", new Long(downloadFailures));
        stats.put("busyThreads", new Integer(busyThreads));
        stats.put("congestionRatio", new Double(congestionRatio));
        stats.put("deepestUri", new Long(deepestUri));
        stats.put("averageDepth", new Long(averageDepth));
        stats.put("totalMemory", new Long(Runtime.getRuntime().totalMemory()));
        stats.put("freeMemory", new Long(Runtime.getRuntime().freeMemory()));
        return stats;
    }
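    // Hypothetical usage sketch (method name is mine): how WUI-style code
    // might poll the snapshot map returned by getProgressStatistics().
    private void progressSnapshotDemo() {
        Map<String, Number> stats = getProgressStatistics();
        System.out.println("queued=" + stats.get("queuedUriCount")
            + " doc/s=" + stats.get("currentDocsPerSecond")
            + " KB/s=" + stats.get("currentKBPerSec"));
    }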
    /**
     * Return one line of current progress-statistics
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return getProgressStatisticsLine(new Date());
    }

    public double processedDocsPerSec() {
        return docsPerSecond;
    }

    public double currentProcessedDocsPerSec() {
        return currentDocsPerSecond;
    }

    public long processedKBPerSec() {
        return totalKBPerSec;
    }

    public int currentProcessedKBPerSec() {
        return currentKBPerSec;
    }

    /**
     * Returns a HashMap that contains information about distributions of
     * encountered mime types. Key/value pairs represent
     * mime type -> count.
     * <p>
     * <b>Note:</b> All the values are wrapped with a {@link AtomicLong AtomicLong}
     * @return mimeTypeDistribution
     */
    public Map<String, AtomicLong> getFileDistribution() {
        return mimeTypeDistribution;
    }

    /**
     * Increment a counter for a key in a given HashMap. Used for various
     * aggregate data.
     *
     * As this is used to change Maps which depend on StatisticsTracker
     * for their synchronization, this method should only be invoked
     * from a block synchronized on 'this'.
     *
     * @param map The HashMap
     * @param key The key for the counter to be incremented; if it does not
     *            exist it will be added (set to 1). If null it will
     *            increment the counter "unknown".
     */
    protected static void incrementMapCount(ConcurrentMap<String, AtomicLong> map,
            String key) {
        incrementMapCount(map, key, 1);
    }

    /**
     * Increment a counter for a key in a given cache. Used for various
     * aggregate data.
     *
     * @param cache the ObjectIdentityCache
     * @param key The key for the counter to be incremented; if it does not
     *            exist it will be added (set to 1). If null it will
     *            increment the counter "unknown".
     */
    protected static void incrementCacheCount(ObjectIdentityCache<String, AtomicLong> cache,
            String key) {
        incrementCacheCount(cache, key, 1);
    }

    /**
     * Increment a counter for a key in a given cache by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     * @param cache
     *            The ObjectIdentityCache
     * @param key
     *            The key for the counter to be incremented; if it does not
     *            exist it will be added (set equal to <code>increment</code>).
     *            If null it will increment the counter "unknown".
     * @param increment
     *            The amount to increment the counter related to the
     *            <code>key</code>.
     */
    protected static void incrementCacheCount(ObjectIdentityCache<String, AtomicLong> cache,
            String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        AtomicLong lw = cache.getOrUse(key, ATOMIC_ZERO_SUPPLIER);
        lw.addAndGet(increment);
    }

    /**
     * Increment a counter for a key in a given HashMap by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     * @param map
     *            The Map or ConcurrentMap
     * @param key
     *            The key for the counter to be incremented; if it does not
     *            exist it will be added (set equal to <code>increment</code>).
     *            If null it will increment the counter "unknown".
     * @param increment
     *            The amount to increment the counter related to the
     *            <code>key</code>.
     */
    protected static void incrementMapCount(ConcurrentMap<String, AtomicLong> map,
            String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        AtomicLong lw = map.get(key);
        if (lw == null) {
            lw = new AtomicLong();
            AtomicLong prevVal = map.putIfAbsent(key, lw);
            if (prevVal != null) {
                lw = prevVal;
            }
        }
        lw.addAndGet(increment);
    }
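    // Usage sketch (hypothetical, not in the original source). The
    // putIfAbsent dance above lets many threads bump a counter without
    // locking: a thread that loses the race to install a fresh AtomicLong
    // simply adopts the winner's instance before incrementing.
    private static void incrementMapCountDemo() {
        ConcurrentMap<String, AtomicLong> counts =
            new ConcurrentHashMap<String, AtomicLong>();
        incrementMapCount(counts, "text/html");    // creates the counter, now 1
        incrementMapCount(counts, "text/html", 2); // bumps it to 3
        incrementMapCount(counts, null);           // tallied under "unknown"
    }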
    /**
     * Sort the entries of the given HashMap in descending order by their
     * values, which must be longs wrapped with <code>AtomicLong</code>.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted in an arbitrary, but consistent manner by their keys. Only items
     * with identical value and key are considered equal.
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization.
     *
     * @param mapOfAtomicLongValues
     *            Assumes values are wrapped with AtomicLong.
     * @return a sorted set containing the same elements as the map.
     */
    public TreeMap<String, AtomicLong> getReverseSortedCopy(
            final Map<String, AtomicLong> mapOfAtomicLongValues) {
        TreeMap<String, AtomicLong> sortedMap =
            new TreeMap<String, AtomicLong>(new Comparator<String>() {
                public int compare(String e1, String e2) {
                    long firstVal = mapOfAtomicLongValues.get(e1).get();
                    long secondVal = mapOfAtomicLongValues.get(e2).get();
                    if (firstVal < secondVal) {
                        return 1;
                    }
                    if (secondVal < firstVal) {
                        return -1;
                    }
                    // If the values are the same, sort by keys.
                    return e1.compareTo(e2);
                }
            });
        try {
            sortedMap.putAll(mapOfAtomicLongValues);
        } catch (UnsupportedOperationException e) {
            for (String key : mapOfAtomicLongValues.keySet()) {
                sortedMap.put(key, mapOfAtomicLongValues.get(key));
            }
        }
        return sortedMap;
    }

    /**
     * Sort the entries of the given ObjectIdentityCache in descending order by
     * their values, which must be longs wrapped with <code>AtomicLong</code>.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted in an arbitrary, but consistent manner by their keys. Only items
     * with identical value and key are considered equal.
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization.
     *
     * @param mapOfAtomicLongValues
     *            Assumes values are wrapped with AtomicLong.
     * @return a sorted set containing the same elements as the map.
     */
    public TreeMap<String, AtomicLong> getReverseSortedCopy(
            final ObjectIdentityCache<String, AtomicLong> mapOfAtomicLongValues) {
        TreeMap<String, AtomicLong> sortedMap =
            new TreeMap<String, AtomicLong>(new Comparator<String>() {
                public int compare(String e1, String e2) {
                    long firstVal = mapOfAtomicLongValues.get(e1).get();
                    long secondVal = mapOfAtomicLongValues.get(e2).get();
                    if (firstVal < secondVal) {
                        return 1;
                    }
                    if (secondVal < firstVal) {
                        return -1;
                    }
                    // If the values are the same, sort by keys.
                    return e1.compareTo(e2);
                }
            });
        for (String key : mapOfAtomicLongValues.keySet()) {
            sortedMap.put(key, mapOfAtomicLongValues.get(key));
        }
        return sortedMap;
    }

    /**
     * Return a HashMap representing the distribution of status codes for
     * successfully fetched curis, as represented by a hashmap where key ->
     * val represents (string)code -> (integer)count.
     *
     * <b>Note: </b> All the values are wrapped with a
     * {@link AtomicLong AtomicLong}
     *
     * @return statusCodeDistribution
     */
    public Map<String, AtomicLong> getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * Returns the time (in millisec) when a URI belonging to a given host was
     * last finished processing.
     *
     * @param host The host to look up time of last completed URI.
     * @return Returns the time (in millisec) when a URI belonging to a given
     *         host was last finished processing. If no URI has been completed
     *         for the host, a counter holding zero is returned.
     */
    public AtomicLong getHostLastFinished(String host) {
        AtomicLong fini = hostsLastFinished.getOrUse(host, ATOMIC_ZERO_SUPPLIER);
        return fini;
    }

    /**
     * Returns the accumulated number of bytes downloaded from a given host.
     * @param host name of the host
     * @return the accumulated number of bytes downloaded from a given host
     */
    public long getBytesPerHost(String host) {
        return ((AtomicLong) hostsBytes.get(host)).get();
    }
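    // Hypothetical usage sketch for getReverseSortedCopy (method name is
    // mine): reverse-sorting a small tally map. Iteration visits
    // b.example.com (42) before a.example.com (10).
    private void reverseSortDemo() {
        Map<String, AtomicLong> tallies = new HashMap<String, AtomicLong>();
        tallies.put("a.example.com", new AtomicLong(10));
        tallies.put("b.example.com", new AtomicLong(42));
        for (Map.Entry<String, AtomicLong> entry : getReverseSortedCopy(tallies).entrySet()) {
            System.out.println(entry.getKey() + " -> " + entry.getValue());
        }
    }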
    /**
     * Returns the accumulated number of bytes from files of a given file type.
     * @param filetype Filetype to check.
     * @return the accumulated number of bytes from files of a given mime type
     */
    public long getBytesPerFileType(String filetype) {
        return ((AtomicLong) mimeTypeBytes.get(filetype)).get();
    }

    /**
     * Get the total number of ToeThreads (sleeping and active)
     *
     * @return The total number of ToeThreads
     */
    public int threadCount() {
        return this.controller != null ? controller.getToeCount() : 0;
    }

    /**
     * @return Current thread count (or zero if can't figure it out).
     */
    public int activeThreadCount() {
        return this.controller != null ? controller.getActiveToeCount() : 0;
        // note: reuse of old busy value seemed misleading: anyone asking
        // for thread count when paused or stopped still wants accurate reading
    }

    /**
     * This returns the number of completed URIs as a percentage of the total
     * number of URIs encountered (should be inverse to the discovery curve)
     *
     * @return The number of completed URIs as a percentage of the total
     *         number of URIs encountered
     */
    public int percentOfDiscoveredUrisCompleted() {
        long completed = finishedUriCount();
        long total = discoveredUriCount();
        if (total == 0) {
            return 0;
        }
        return (int) (100 * completed / total);
    }
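    // Worked example (not in the original source) of the percentage math
    // above: 400 finished of 1,200 discovered URIs gives
    // (int) (100 * 400 / 1200) = 33; the integer division truncates.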
    /**
     * Number of <i>discovered</i> URIs.
     *
     * <p>If crawl not running (paused or stopped) this will return the value of
     * the last snapshot.
     *
     * @return A count of all uris encountered
     *
     * @see org.archive.crawler.framework.Frontier#discoveredUriCount()
     */
    public long discoveredUriCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().discoveredUriCount() : discoveredUriCount;
    }

    /**
     * Number of URIs that have <i>finished</i> processing.
     *
     * @return Number of URIs that have finished processing
     *
     * @see org.archive.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().finishedUriCount() : finishedUriCount;
    }

    /**
     * Get the total number of failed fetch attempts (connection failures ->
     * give up, etc.)
     *
     * @return The total number of failed fetch attempts
     */
    public long failedFetchAttempts() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().failedFetchCount() : downloadFailures;
    }

    /**
     * Get the total number of disregarded fetch attempts.
     *
     * @return The total number of disregarded fetch attempts
     */
    public long disregardedFetchAttempts() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().disregardedUriCount() : downloadDisregards;
    }

    public long successfullyFetchedCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().succeededFetchCount() : downloadedUriCount;
    }

    public long totalCount() {
        return queuedUriCount() + activeThreadCount() + successfullyFetchedCount();
    }

    /**
     * Ratio of number of threads that would theoretically allow
     * maximum crawl progress (if each was as productive as current
     * threads), to current number of threads.
     *
     * @return float congestion ratio
     */
    public float congestionRatio() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().congestionRatio() : congestionRatio;
    }

    /**
     * Ordinal position of the 'deepest' URI eligible
     * for crawling. Essentially, the length of the longest
     * frontier internal queue.
     *
     * @return long URI count to deepest URI
     */
    public long deepestUri() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().deepestUri() : deepestUri;
    }

    /**
     * Average depth of the last URI in all eligible queues.
     * That is, the average length of all eligible queues.
     *
     * @return long average depth of last URIs in queues
     */
    public long averageDepth() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().averageDepth() : averageDepth;
    }

    /**
     * Number of URIs <i>queued</i> up and waiting for processing.
     *
     * <p>If crawl not running (paused or stopped) this will return the value
     * of the last snapshot.
     *
     * @return Number of URIs queued up and waiting for processing.
     *
     * @see org.archive.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        // While shouldrun is true we can use info direct from the crawler.
        // After that our last snapshot will have to do.
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().queuedUriCount() : queuedUriCount;
    }

    /** @deprecated use totalBytesCrawled */
    public long totalBytesWritten() {
        // return totalBytesCrawled();
        return shouldrun && this.controller != null
                && this.controller.getFrontier() != null ?
            controller.getFrontier().totalBytesWritten() : totalProcessedBytes;
    }

    public long totalBytesCrawled() {
        return shouldrun ? crawledBytes.getTotal() : totalProcessedBytes;
    }

    public String crawledBytesSummary() {
        return crawledBytes.summary();
    }
    /**
     * If the curi is a seed, we insert it into the processedSeedsRecords map.
     *
     * @param curi The CrawlURI that may be a seed.
     * @param disposition The disposition of the CrawlURI.
     */
    private void handleSeed(final CrawlURI curi, final String disposition) {
        if (curi.isSeed()) {
            SeedRecord sr = processedSeedsRecords.getOrUse(curi.toString(),
                new Supplier<SeedRecord>() {
                    public SeedRecord get() {
                        return new SeedRecord(curi, disposition);
                    }
                });
            sr.updateWith(curi, disposition);
        }
    }

    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_SUCCESS);
        // save crawled bytes tally
        crawledBytes.accumulate(curi);

        // save crawled docs tally
        if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
            notModifiedUriCount++;
        } else if (IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
            dupByHashUriCount++;
        } else {
            novelUriCount++;
        }

        // Save status codes
        incrementMapCount(statusCodeDistribution,
            Integer.toString(curi.getFetchStatus()));

        // Save mime types
        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

        // Save hosts stats.
        saveHostStats(curi.getFetchStatus() == FetchStatusCodes.S_DNS_SUCCESS
                ? "dns:"
                : this.controller.getServerCache().getHostFor(curi).getHostName(),
            curi.getContentSize());

        if (curi.containsKey(CrawlURI.A_SOURCE_TAG)) {
            saveSourceStats(curi.getString(CrawlURI.A_SOURCE_TAG),
                this.controller.getServerCache().getHostFor(curi).getHostName());
        }
    }

    protected void saveSourceStats(String source, String hostname) {
        synchronized (sourceHostDistribution) {
            ConcurrentMap<String, AtomicLong> hostUriCount =
                sourceHostDistribution.getOrUse(source,
                    new Supplier<ConcurrentMap<String, AtomicLong>>() {
                        public ConcurrentMap<String, AtomicLong> get() {
                            return new ConcurrentHashMap<String, AtomicLong>();
                        }
                    });
            incrementMapCount(hostUriCount, hostname);
        }
    }

    protected void saveHostStats(String hostname, long size) {
        incrementCacheCount(hostsDistribution, hostname);
        incrementCacheCount(hostsBytes, hostname, size);
        long time = System.currentTimeMillis();
        getHostLastFinished(hostname).set(time);
    }

    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_RETRY);
    }

    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_DISREGARD);
    }

    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi, SEED_DISPOSITION_FAILURE);
    }

    /**
     * Get a seed iterator for the job being monitored.
     *
     * <b>Note:</b> This iterator will iterate over a list of <i>strings</i> not
     * UURIs like the Scope seed iterator. The strings are equal to the URIs'
     * getURIString() values.
     * @return the seed iterator
     * FIXME: Consider using TransformingIterator here
     */
    public Iterator<String> getSeeds() {
        List<String> seedsCopy = new Vector<String>();
        Iterator<UURI> i = controller.getScope().seedsIterator();
        while (i.hasNext()) {
            seedsCopy.add(i.next().toString());
        }
        return seedsCopy.iterator();
    }

    public Iterator<SeedRecord> getSeedRecordsSortedByStatusCode() {
        return getSeedRecordsSortedByStatusCode(getSeeds());
    }

    protected Iterator<SeedRecord> getSeedRecordsSortedByStatusCode(Iterator<String> i) {
        TreeSet<SeedRecord> sortedSet =
            new TreeSet<SeedRecord>(new Comparator<SeedRecord>() {
                public int compare(SeedRecord sr1, SeedRecord sr2) {
                    int code1 = sr1.getStatusCode();
                    int code2 = sr2.getStatusCode();
                    if (code1 == code2) {
                        // If the values are equal, sort by URIs.
                        return sr1.getUri().compareTo(sr2.getUri());
                    }
                    // Mirror and shift the number line so as to place zero at
                    // the beginning, then all negatives in order of ascending
                    // absolute value, then all positives descending.
                    code1 = -code1 - Integer.MAX_VALUE;
                    code2 = -code2 - Integer.MAX_VALUE;
                    return new Integer(code1).compareTo(new Integer(code2));
                }
            });
        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
            if (sr == null) {
                sr = new SeedRecord(seed, SEED_DISPOSITION_NOT_PROCESSED);
            }
            sortedSet.add(sr);
        }
        return sortedSet.iterator();
    }
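    // Hypothetical spot-check (not in the original source) of the
    // mirror-and-shift comparator key above: for the fetch status codes
    // {200, 404, -6, 0} the seeds come out ordered 0, -6, 404, 200: zero
    // first, negatives by ascending magnitude, then positives descending.
    private static int mirrorAndShiftDemo(int statusCode) {
        return -statusCode - Integer.MAX_VALUE;
    }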
    public void crawlEnded(String message) {
        logger.info("Entered crawlEnded");
        this.sExitMessage = message; // held for reference by reports
        super.crawlEnded(message);
        logger.info("Leaving crawlEnded");
    }

    /**
     * @param writer Where to write.
     */
    protected void writeSeedsReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[code] [status] [seed] [redirect]\n");

        seedsCrawled = 0;
        seedsNotCrawled = 0;
        for (Iterator<SeedRecord> i = getSeedRecordsSortedByStatusCode(getSeeds());
                i.hasNext();) {
            SeedRecord sr = i.next();
            writer.print(sr.getStatusCode());
            writer.print(" ");
            if ((sr.getStatusCode() > 0)) {
                seedsCrawled++;
                writer.print("CRAWLED");
            } else {
                seedsNotCrawled++;
                writer.print("NOTCRAWLED");
            }
            writer.print(" ");
            writer.print(sr.getUri());
            if (sr.getRedirectUri() != null) {
                writer.print(" ");
                writer.print(sr.getRedirectUri());
            }
            writer.print("\n");
        }
    }

    protected void writeSourceReportTo(PrintWriter writer) {
        writer.print("[source] [host] [#urls]\n");
        // for each source
        for (String sourceKey : sourceHostDistribution.keySet()) {
            Map<String, AtomicLong> hostCounts = sourceHostDistribution.get(sourceKey);
            // sort hosts by #urls
            SortedMap<String, AtomicLong> sortedHostCounts =
                getReverseSortedHostCounts(hostCounts);
            // for each host
            for (String hostKey : sortedHostCounts.keySet()) {
                AtomicLong hostCount = hostCounts.get(hostKey);
                writer.print(sourceKey.toString());
                writer.print(" ");
                writer.print(hostKey.toString());
                writer.print(" ");
                writer.print(hostCount.get());
                writer.print("\n");
            }
        }
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted (largest
     * first) order.
     *
     * @return SortedMap of hosts distribution
     */
    public SortedMap<String, AtomicLong> getReverseSortedHostCounts(
            Map<String, AtomicLong> hostCounts) {
        return getReverseSortedCopy(hostCounts);
    }

    protected void writeHostsReportTo(final PrintWriter writer) {
        // TODO: use CrawlHosts for all stats; only perform sorting on
        // manageable number of hosts
        SortedMap<String, AtomicLong> hd = getReverseSortedHostsDistribution();
        // header
        writer.print("[#urls] [#bytes] [host] [#robots] [#remaining] "
            + "[#novel-urls] [#novel-bytes] [#dup-by-hash-urls] "
            + "[#dup-by-hash-bytes] [#not-modified-urls] [#not-modified-bytes]\n");
        for (String key : hd.keySet()) {
            // Key is 'host'.
            CrawlHost host = controller.getServerCache().getHostFor(key);
            AtomicLong val = hd.get(key);
            writeReportLine(writer,
                val == null ? "-" : val.get(),
                getBytesPerHost(key),
                key,
                host.getSubstats().getRobotsDenials(),
                host.getSubstats().getRemaining(),
                host.getSubstats().getNovelUrls(),
                host.getSubstats().getNovelBytes(),
                host.getSubstats().getDupByHashUrls(),
                host.getSubstats().getDupByHashBytes(),
                host.getSubstats().getNotModifiedUrls(),
                host.getSubstats().getNotModifiedBytes());
        }

        // StatisticsTracker doesn't know of zero-completion hosts;
        // so supplement report with those entries from host cache
        Closure logZeros = new Closure() {
            public void execute(Object obj) {
                CrawlHost host = (CrawlHost) obj;
                if (host.getSubstats().getRecordedFinishes() == 0) {
                    writeReportLine(writer,
                        host.getSubstats().getRecordedFinishes(),
                        host.getSubstats().getTotalBytes(),
                        host.getHostName(),
                        host.getSubstats().getRobotsDenials(),
                        host.getSubstats().getRemaining(),
                        host.getSubstats().getNovelUrls(),
                        host.getSubstats().getNovelBytes(),
                        host.getSubstats().getDupByHashUrls(),
                        host.getSubstats().getDupByHashBytes(),
                        host.getSubstats().getNotModifiedUrls(),
                        host.getSubstats().getNotModifiedBytes());
                }
            }
        };
        controller.getServerCache().forAllHostsDo(logZeros);
    }

    protected void writeReportLine(PrintWriter writer, Object... fields) {
        for (Object field : fields) {
            writer.print(field);
            writer.print(" ");
        }
        writer.print("\n");
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     * @return SortedMap of hosts distribution
     */
    public SortedMap<String, AtomicLong> getReverseSortedHostsDistribution() {
        return getReverseSortedCopy(hostsDistribution);
    }

    protected void writeMimetypesReportTo(PrintWriter writer) {
        // header
        writer.print("[#urls] [#bytes] [mime-types]\n");
        TreeMap<String, AtomicLong> fd = getReverseSortedCopy(getFileDistribution());
        for (String key : fd.keySet()) {
            // Key is mime type.
            writer.print(Long.toString(fd.get(key).get()));
            writer.print(" ");
            writer.print(Long.toString(getBytesPerFileType(key)));
            writer.print(" ");
            writer.print(key);
            writer.print("\n");
        }
    }

    protected void writeResponseCodeReportTo(PrintWriter writer) {
        // Build header.
        writer.print("[rescode] [#urls]\n");
        TreeMap<String, AtomicLong> scd = getReverseSortedCopy(getStatusCodeDistribution());
        for (String key : scd.keySet()) {
            writer.print(key);
            writer.print(" ");
            writer.print(Long.toString(scd.get(key).get()));
            writer.print("\n");
        }
    }
    protected void writeCrawlReportTo(PrintWriter writer) {
        writer.print("Crawl Name: " + controller.getOrder().getCrawlOrderName());
        writer.print("\nCrawl Status: " + sExitMessage);
        writer.print("\nDuration Time: "
            + ArchiveUtils.formatMillisecondsToConventional(crawlDuration()));
        writer.print("\nTotal Seeds Crawled: " + seedsCrawled);
        writer.print("\nTotal Seeds not Crawled: " + seedsNotCrawled);
        // hostsDistribution contains all hosts crawled plus an entry for dns.
        writer.print("\nTotal Hosts Crawled: " + (hostsDistribution.size() - 1));
        writer.print("\nTotal Documents Crawled: " + finishedUriCount);
        writer.print("\nDocuments Crawled Successfully: " + downloadedUriCount);
        writer.print("\nNovel Documents Crawled: " + novelUriCount);
        if (dupByHashUriCount > 0) {
            writer.print("\nDuplicate-by-hash Documents Crawled: " + dupByHashUriCount);
        }
        if (notModifiedUriCount > 0) {
            writer.print("\nNot-modified Documents Crawled: " + notModifiedUriCount);
        }
        writer.print("\nProcessed docs/sec: "
            + ArchiveUtils.doubleToString(docsPerSecond, 2));
        writer.print("\nBandwidth in Kbytes/sec: " + totalKBPerSec);
        writer.print("\nTotal Raw Data Size in Bytes: " + totalProcessedBytes
            + " (" + ArchiveUtils.formatBytesForDisplay(totalProcessedBytes)
            + ") \n");
        writer.print("Novel Bytes: "
            + crawledBytes.get(CrawledBytesHistotable.NOVEL)
            + " (" + ArchiveUtils.formatBytesForDisplay(
                crawledBytes.get(CrawledBytesHistotable.NOVEL))
            + ") \n");
        if (crawledBytes.containsKey(CrawledBytesHistotable.DUPLICATE)) {
            writer.print("Duplicate-by-hash Bytes: "
                + crawledBytes.get(CrawledBytesHistotable.DUPLICATE)
                + " (" + ArchiveUtils.formatBytesForDisplay(
                    crawledBytes.get(CrawledBytesHistotable.DUPLICATE))
                + ") \n");
        }
        if (crawledBytes.containsKey(CrawledBytesHistotable.NOTMODIFIED)) {
            writer.print("Not-modified Bytes: "
                + crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED)
                + " (" + ArchiveUtils.formatBytesForDisplay(
                    crawledBytes.get(CrawledBytesHistotable.NOTMODIFIED))
                + ") \n");
        }
    }

    protected void writeProcessorsReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.PROCESSORS_REPORT, writer);
    }

    protected void writeReportFile(String reportName, String filename) {
        File f = new File(controller.getDisk().getPath(), filename);
        try {
            PrintWriter bw = new PrintWriter(new OutputStreamWriter(
                new FileOutputStream(f, false), "UTF-8"));
            writeReportTo(reportName, bw);
            bw.close();
            controller.addToManifest(f.getAbsolutePath(),
                CrawlController.MANIFEST_REPORT_FILE, true);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath()
                + " at the end of crawl.", e);
        }
        logger.info("wrote report: " + f.getAbsolutePath());
    }

    /**
     * @param writer Where to write.
     */
    protected void writeManifestReportTo(PrintWriter writer) {
        controller.reportTo(CrawlController.MANIFEST_REPORT, writer);
    }

    /**
     * @param reportName Name of report.
     * @param w Where to write.
     */
    private void writeReportTo(String reportName, PrintWriter w) {
        if ("hosts".equals(reportName)) {
            writeHostsReportTo(w);
        } else if ("mime types".equals(reportName)) {
            writeMimetypesReportTo(w);
        } else if ("response codes".equals(reportName)) {
            writeResponseCodeReportTo(w);
        } else if ("seeds".equals(reportName)) {
            writeSeedsReportTo(w);
        } else if ("crawl".equals(reportName)) {
            writeCrawlReportTo(w);
        } else if ("processors".equals(reportName)) {
            writeProcessorsReportTo(w);
        } else if ("manifest".equals(reportName)) {
            writeManifestReportTo(w);
        } else if ("frontier".equals(reportName)) {
            writeFrontierReportTo(w);
        } else if ("source".equals(reportName)) {
            writeSourceReportTo(w);
        }
        // TODO: else default/error
    }

    /**
     * Write the Frontier's 'nonempty' report (if available)
     * @param writer to report to
     */
    protected void writeFrontierReportTo(PrintWriter writer) {
        if (controller.getFrontier().isEmpty()) {
            writer.println("frontier empty");
        } else {
            controller.getFrontier().reportTo("nonempty", writer);
        }
    }

    /**
     * Run the reports.
     */
    public void dumpReports() {
        // Add all files mentioned in the crawl order to the manifest set.
        controller.addOrderToManifest();
        controller.installThreadContextSettingsHandler();
        writeReportFile("hosts", "hosts-report.txt");
        writeReportFile("mime types", "mimetype-report.txt");
        writeReportFile("response codes", "responsecode-report.txt");
        writeReportFile("seeds", "seeds-report.txt");
        writeReportFile("crawl", "crawl-report.txt");
        writeReportFile("processors", "processors-report.txt");
        writeReportFile("manifest", "crawl-manifest.txt");
        writeReportFile("frontier", "frontier-report.txt");
        if (sourceHostDistribution.size() > 0) {
            writeReportFile("source", "source-report.txt");
        }
        // TODO: Save object to disk?
    }

    public void crawlCheckpoint(File cpDir) throws Exception {
        // CrawlController is managing the checkpointing of this object.
        logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
    }
}
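The progress-statistics.log lines described in the class javadoc are plain whitespace-padded columns, so they are easy to post-process outside the crawler. Below is a minimal, self-contained sketch of reading the documented columns back. The class name ProgressLineParser and the sample values are hypothetical, and any trailing columns that getProgressStatisticsLine() appends beyond the documented header (congestion ratio, queue depths) are simply ignored here.

// Hypothetical post-processing sketch: parse the documented columns of one
// progress-statistics.log line. Column order follows the header quoted in
// the class javadoc; the sample line below is invented.
public class ProgressLineParser {
    public static void main(String[] args) {
        String line = "20090716120000 1200 800 400 1.50(1.20) 512(480) 3 25 51200";
        String[] f = line.trim().split("\\s+");
        long discovered = Long.parseLong(f[1]);
        long queued = Long.parseLong(f[2]);
        long downloaded = Long.parseLong(f[3]);
        String docsPerSec = f[4];   // "current(average)" pair
        String kbPerSec = f[5];     // "current(average)" pair
        long dlFailures = Long.parseLong(f[6]);
        int busyThreads = Integer.parseInt(f[7]);
        long memUseKB = Long.parseLong(f[8]);
        System.out.println("discovered=" + discovered + " queued=" + queued
            + " downloaded=" + downloaded + " doc/s=" + docsPerSec
            + " KB/s=" + kbPerSec + " failures=" + dlFailures
            + " busy=" + busyThreads + " mem=" + memUseKB + "KB");
    }
}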