org.archive.crawler.reporting.StatisticsTracker.java Source code

Introduction

Here is the source code for org.archive.crawler.reporting.StatisticsTracker.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.reporting;

import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.collections.Closure;
import org.archive.bdb.BdbModule;
import org.archive.bdb.DisposableStoredSortedMap;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.event.CrawlStateEvent;
import org.archive.crawler.event.CrawlURIDispositionEvent;
import org.archive.crawler.event.StatSnapshotEvent;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Engine;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.crawler.util.TopNSet;
import org.archive.modules.CrawlURI;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.seeds.SeedListener;
import org.archive.modules.seeds.SeedModule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.JSONUtils;
import org.archive.util.MimetypeUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.ObjectIdentityMemCache;
import org.archive.util.PaddingStringBuffer;
import org.archive.util.Supplier;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.Lifecycle;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;

import com.sleepycat.je.DatabaseException;

/**
 * This is an implementation of the AbstractTracker. It is designed to function
 * with the WUI as well as performing various logging activity.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre> [timestamp] [discovered]    [queued] [downloaded] [doc/s(avg)]  [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage.  We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number is
 * the current and the average is in parenthesis.
 * <p>
 * <b>doc/s(avg)</b> works the same way as doc/s except it show the number of
 * documents (URIs) rather then KB downloaded.
 * <p>
 * <b>busy-threads</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally mem-use-KB is extracted from the run time environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
 * <p>
 * In addition to the data collected for the above logs, various other data
 * is gathered and stored by this tracker.
 * <ul>
 *   <li> Successfully downloaded documents per fetch status code
 *   <li> Successfully downloaded documents per document mime type
 *   <li> Amount of data per mime type
 *   <li> Successfully downloaded documents per host
 *   <li> Amount of data per host
 *   <li> Disposition of all seeds (this is written to 'reports.log' at end of
 *        crawl)
 *   <li> Successfully downloaded documents per host per source
 * </ul>
 *
 * @contributor Parker Thompson
 * @contributor Kristinn Sigurdsson
 * @contributor gojomo
 */
public class StatisticsTracker implements ApplicationContextAware, ApplicationListener<ApplicationEvent>,
        SeedListener, Lifecycle, Runnable, Checkpointable, BeanNameAware {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 5L;

    protected SeedModule seeds;

    public SeedModule getSeeds() {
        return this.seeds;
    }

    @Autowired
    public void setSeeds(SeedModule seeds) {
        this.seeds = seeds;
    }

    protected BdbModule bdb;

    @Autowired
    public void setBdbModule(BdbModule bdb) {
        this.bdb = bdb;
    }

    protected ConfigPath reportsDir = new ConfigPath(Engine.REPORTS_DIR_NAME, "${launchId}/reports");

    public ConfigPath getReportsDir() {
        return reportsDir;
    }

    public void setReportsDir(ConfigPath reportsDir) {
        this.reportsDir = reportsDir;
    }

    protected ServerCache serverCache;

    public ServerCache getServerCache() {
        return this.serverCache;
    }

    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }

    protected int liveHostReportSize = 20;

    public int getLiveHostReportSize() {
        return liveHostReportSize;
    }

    public void setLiveHostReportSize(int liveHostReportSize) {
        this.liveHostReportSize = liveHostReportSize;
    }

    protected ApplicationContext appCtx;

    public void setApplicationContext(ApplicationContext appCtx) throws BeansException {
        this.appCtx = appCtx;
    }

    /**
     * Messages from the StatisticsTracker.
     */
    private final static Logger logger = Logger.getLogger(StatisticsTracker.class.getName());

    /**
     * Whether to maintain seed disposition records (expensive in 
     * crawls with millions of seeds)
     */
    protected boolean trackSeeds = true;

    public boolean getTrackSeeds() {
        return this.trackSeeds;
    }

    public void setTrackSeeds(boolean trackSeeds) {
        this.trackSeeds = trackSeeds;
    }

    /**
     * Whether to maintain hosts-per-source-tag records for; very expensive in 
     * crawls with large numbers of source-tags (seeds) or large crawls 
     * over many hosts
     */
    protected boolean trackSources = true;

    public boolean getTrackSources() {
        return this.trackSources;
    }

    public void setTrackSources(boolean trackSources) {
        this.trackSources = trackSources;
    }

    /**
     * The interval between writing progress information to log.
     */
    protected int intervalSeconds = 20;

    public int getIntervalSeconds() {
        return this.intervalSeconds;
    }

    public void setIntervalSeconds(int interval) {
        this.intervalSeconds = interval;
    }

    /**
     * Number of crawl-stat sample snapshots to keep for calculation 
     * purposes.
     */
    protected int keepSnapshotsCount = 5;

    public int getKeepSnapshotsCount() {
        return this.keepSnapshotsCount;
    }

    public void setKeepSnapshotsCount(int count) {
        this.keepSnapshotsCount = count;
    }

    protected CrawlController controller;

    public CrawlController getCrawlController() {
        return this.controller;
    }

    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }

    /** wall-clock time the crawl started */
    protected long crawlStartTime;
    /** wall-clock time the crawl ended */
    protected long crawlEndTime = -1; // Until crawl ends, this value is -1.
    /** wall-clock time of last pause, while pause in progres */
    protected long crawlPauseStarted = 0;
    /** duration tally of all time spent in paused state */
    protected long crawlTotalPausedTime = 0;

    /** snapshots of crawl tallies and rates */
    protected LinkedList<CrawlStatSnapshot> snapshots = new LinkedList<CrawlStatSnapshot>();

    protected ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();

    /*
     * Cumulative data
     */
    /** tally sizes novel, verified (same hash), vouched (not-modified) */
    protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();

    public CrawledBytesHistotable getCrawledBytes() {
        return crawledBytes;
    }

    // TODO: fortify these against key explosion with bigmaps like other tallies
    /** Keep track of the file types we see (mime type -> count) */
    protected ConcurrentMap<String, AtomicLong> mimeTypeDistribution = new ConcurrentHashMap<String, AtomicLong>();
    protected ConcurrentMap<String, AtomicLong> mimeTypeBytes = new ConcurrentHashMap<String, AtomicLong>();

    /** Keep track of fetch status codes */
    protected ConcurrentMap<String, AtomicLong> statusCodeDistribution = new ConcurrentHashMap<String, AtomicLong>();

    /** Keep track of URL counts per host per seed */
    // TODO: restore spill-to-disk, like with processedSeedsRecords
    protected ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>> sourceHostDistribution = new ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>>();

    /* Keep track of 'top' hosts for live reports */
    protected TopNSet hostsDistributionTop;
    protected TopNSet hostsBytesTop;
    protected TopNSet hostsLastFinishedTop;

    /**
     * Record of seeds and latest results
     */
    protected ObjectIdentityCache<SeedRecord> processedSeedsRecords = new ObjectIdentityMemCache<SeedRecord>();
    protected long seedsTotal = -1;
    protected long seedsCrawled = -1;

    public StatisticsTracker() {

    }

    protected List<Report> reports;

    public List<Report> getReports() {
        // lazy initialization so we don't pointlessly create a bunch of beans
        // right before setReports is called
        if (reports == null) {
            reports = new LinkedList<Report>();
            reports.add(new CrawlSummaryReport());
            reports.add(new SeedsReport());
            reports.add(new HostsReport());
            reports.add(new SourceTagsReport());
            reports.add(new MimetypesReport());
            reports.add(new ResponseCodeReport());
            reports.add(new ProcessorsReport());
            reports.add(new FrontierSummaryReport());
            reports.add(new ToeThreadsReport());
        }

        return reports;
    }

    public void setReports(List<Report> reports) {
        this.reports = reports;
    }

    protected boolean isRunning = false;

    public boolean isRunning() {
        return isRunning;
    }

    public void stop() {
        isRunning = false;
        executor.shutdownNow();
        progressStatisticsEvent();
        dumpReports();
    }

    @SuppressWarnings("unchecked")
    public void start() {
        isRunning = true;
        boolean isRecover = (recoveryCheckpoint != null);
        try {
            this.processedSeedsRecords = bdb.getObjectCache("processedSeedsRecords", isRecover, SeedRecord.class);

            this.hostsDistributionTop = new TopNSet(getLiveHostReportSize());
            this.hostsBytesTop = new TopNSet(getLiveHostReportSize());
            this.hostsLastFinishedTop = new TopNSet(getLiveHostReportSize());

            if (isRecover) {
                JSONObject json = recoveryCheckpoint.loadJson(beanName);

                crawlStartTime = json.getLong("crawlStartTime");
                crawlEndTime = json.getLong("crawlEndTime");
                crawlTotalPausedTime = json.getLong("crawlTotalPausedTime");
                crawlPauseStarted = json.getLong("crawlPauseStarted");
                tallyCurrentPause();

                JSONUtils.putAllLongs(hostsDistributionTop.getTopSet(), json.getJSONObject("hostsDistributionTop"));
                hostsDistributionTop.updateBounds();
                JSONUtils.putAllLongs(hostsBytesTop.getTopSet(), json.getJSONObject("hostsBytesTop"));
                hostsBytesTop.updateBounds();
                JSONUtils.putAllLongs(hostsLastFinishedTop.getTopSet(), json.getJSONObject("hostsLastFinishedTop"));
                hostsLastFinishedTop.updateBounds();

                JSONUtils.putAllAtomicLongs(mimeTypeDistribution, json.getJSONObject("mimeTypeDistribution"));
                JSONUtils.putAllAtomicLongs(mimeTypeBytes, json.getJSONObject("mimeTypeBytes"));
                JSONUtils.putAllAtomicLongs(statusCodeDistribution, json.getJSONObject("statusCodeDistribution"));

                JSONObject shd = json.getJSONObject("sourceHostDistribution");
                Iterator<String> keyIter = shd.keys();
                for (; keyIter.hasNext();) {
                    String source = keyIter.next();
                    ConcurrentHashMap<String, AtomicLong> hostUriCount = new ConcurrentHashMap<String, AtomicLong>();
                    JSONUtils.putAllAtomicLongs(hostUriCount, shd.getJSONObject(source));
                    sourceHostDistribution.put(source, hostUriCount);
                }

                JSONUtils.putAllLongs(crawledBytes, json.getJSONObject("crawledBytes"));
            }
        } catch (DatabaseException e) {
            throw new IllegalStateException(e);
        } catch (JSONException e) {
            throw new IllegalStateException(e);
        }
        // Log the legend
        this.controller.logProgressStatistics(progressStatisticsLegend());
        executor.scheduleAtFixedRate(this, 0, getIntervalSeconds(), TimeUnit.SECONDS);
    }

    /**
     * Do activity. Is called by ScheduledExecutorService at intervals specified by
     * intervalSeconds
     *
     */
    public void run() {
        progressStatisticsEvent();
    }

    /**
     * @return legend for progress-statistics lines/log
     */
    public String progressStatisticsLegend() {
        return "           timestamp" + "  discovered   " + "   queued   downloaded       doc/s(avg)  KB/s(avg) "
                + "  dl-failures   busy-thread   mem-use-KB  heap-size-KB "
                + "  congestion   max-depth   avg-depth";
    }

    public String getProgressStamp() {
        return progressStatisticsLegend() + "\n" + getSnapshot().getProgressStatisticsLine();
    }

    /**
     * Notify tracker that crawl has begun. Must be called
     * outside tracker's own thread, to ensure it is noted
     * before other threads start interacting with tracker. 
     */
    public void noteStart() {
        if (this.crawlStartTime == 0) {
            // Note the time the crawl starts (only if not already set)
            this.crawlStartTime = System.currentTimeMillis();
        }
    }

    /**
     * A method for logging current crawler state.
     *
     * This method will be called by run() at intervals specified in
     * the crawl order file.  It is also invoked when pausing or
     * stopping a crawl to capture the state at that point.  Default behavior is
     * call to {@link CrawlController#logProgressStatistics} so CrawlController
     * can act on progress statistics event.
     * <p>
     * It is recommended that for implementations of this method it be
     * carefully considered if it should be synchronized in whole or in
     * part
     * @param e Progress statistics event.
     */
    protected synchronized void progressStatisticsEvent() {
        CrawlStatSnapshot snapshot = getSnapshot();

        if (this.controller != null) {
            this.controller.logProgressStatistics(snapshot.getProgressStatisticsLine());
        }
        snapshots.addFirst(snapshot);
        while (snapshots.size() > getKeepSnapshotsCount()) {
            snapshots.removeLast();
        }

        // publish app event 
        appCtx.publishEvent(new StatSnapshotEvent(this, snapshot));

        // temporary workaround for 
        // [ 996161 ] Fix DNSJava issues (memory) -- replace with JNDI-DNS?
        // http://sourceforge.net/support/tracker.php?aid=996161
        Lookup.getDefaultCache(DClass.IN).clearCache();
    }

    public CrawlStatSnapshot getSnapshot() {
        // TODO: take snapshot implementation from a spring prototype?
        CrawlStatSnapshot snapshot = new CrawlStatSnapshot();
        snapshot.collect(controller, this);
        return snapshot;
    }

    public LinkedList<CrawlStatSnapshot> listSnapshots() {
        // not named getSnapshots to avoid autodiscovery as a (invalid) bean-property
        return snapshots;
    }

    public CrawlStatSnapshot getLastSnapshot() {
        CrawlStatSnapshot snap = snapshots.peek();
        return snap == null ? getSnapshot() : snap;
    }

    public long getCrawlElapsedTime() {
        if (crawlStartTime == 0) {
            // if no start time set yet, consider elapsed time zero
            return 0;
        }
        if (crawlPauseStarted != 0) {
            // currently paused, calculate time up to last pause
            return crawlPauseStarted - crawlTotalPausedTime - crawlStartTime;
        }

        // not paused, calculate total time to end or (if running) now
        return ((crawlEndTime > 0) ? crawlEndTime : System.currentTimeMillis()) - crawlTotalPausedTime
                - crawlStartTime;
    }

    public void crawlPausing(String statusMessage) {
        logNote("CRAWL WAITING - " + statusMessage);
    }

    protected void logNote(final String note) {
        this.controller.logProgressStatistics(new PaddingStringBuffer()
                .append(ArchiveUtils.getLog14Date(new Date())).append(" ").append(note).toString());
    }

    public void crawlPaused(String statusMessage) {
        crawlPauseStarted = System.currentTimeMillis();
        progressStatisticsEvent();
        logNote("CRAWL PAUSED - " + statusMessage);
    }

    public void crawlResuming(String statusMessage) {
        tallyCurrentPause();
        if (this.crawlStartTime == 0) {
            noteStart();
        }
        logNote("CRAWL RUNNING - " + statusMessage);
    }

    public void crawlEmpty(String statusMessage) {
        logNote("CRAWL EMPTY - " + statusMessage);
    }

    /**
     * For a current pause (if any), add paused time to total and reset
     */
    protected void tallyCurrentPause() {
        if (this.crawlPauseStarted > 0) {
            // Ok, we managed to actually pause before resuming.
            this.crawlTotalPausedTime += (System.currentTimeMillis() - this.crawlPauseStarted);
        }
        this.crawlPauseStarted = 0;
    }

    public void crawlEnding(String sExitMessage) {
        logNote("CRAWL ENDING - " + sExitMessage);
    }

    public void crawlEnded(String sExitMessage) {
        crawlEndTime = System.currentTimeMillis();
        logNote("CRAWL ENDED - " + sExitMessage);
    }

    /**
     * Returns how long the current crawl has been running *including*
     * time paused (contrast with getCrawlElapsedTime()).
     *
     * @return The length of time - in msec - that this crawl has been running.
     */
    public long getCrawlDuration() {
        return ((crawlEndTime > 0) ? crawlEndTime : System.currentTimeMillis()) - crawlStartTime;
    }

    /** Returns a HashMap that contains information about distributions of
     *  encountered mime types.  Key/value pairs represent
     *  mime type -> count.
     * <p>
     * <b>Note:</b> All the values are wrapped with a {@link AtomicLong AtomicLong}
     * @return mimeTypeDistribution
     */
    public Map<String, AtomicLong> getFileDistribution() {
        return mimeTypeDistribution;
    }

    /**
     * Increment a counter for a key in a given HashMap. Used for various
     * aggregate data.
     * 
     * @param map The Map or ConcurrentMap
     * @param key The key for the counter to be incremented, if it does not
     *               exist it will be added (set to 1).  If null it will
     *            increment the counter "unknown".
     */
    protected static void incrementMapCount(ConcurrentMap<String, AtomicLong> map, String key) {
        incrementMapCount(map, key, 1);
    }

    /**
     * Increment a counter for a key in a given HashMap by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     *
     * @param map
     *            The HashMap
     * @param key
     *            The key for the counter to be incremented, if it does not exist
     *            it will be added (set to equal to <code>increment</code>).
     *            If null it will increment the counter "unknown".
     * @param increment
     *            The amount to increment counter related to the <code>key</code>.
     */
    protected static void incrementMapCount(ConcurrentMap<String, AtomicLong> map, String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        AtomicLong lw = (AtomicLong) map.get(key);
        if (lw == null) {
            lw = new AtomicLong(0);
            AtomicLong prevVal = map.putIfAbsent(key, lw);
            if (prevVal != null) {
                lw = prevVal;
            }
        }
        lw.addAndGet(increment);
    }

    /**
     * Sort the entries of the given Map in descending order by their
     * values, which must be longs wrapped with <code>AtomicLong</code>.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted by their keys. The returned map is a StoredSortedMap, and
     * thus may include duplicate keys. 
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization. 
     * 
     * @param mapOfAtomicLongValues
     *            Assumes values are wrapped with AtomicLong.
     * @return a sorted set containing the same elements as the map.
     */
    public DisposableStoredSortedMap<Long, String> getReverseSortedCopy(
            final Map<String, AtomicLong> mapOfAtomicLongValues) {
        DisposableStoredSortedMap<Long, String> sortedMap = bdb.getStoredMap(null, Long.class, String.class, true,
                false);
        for (String k : mapOfAtomicLongValues.keySet()) {
            sortedMap.put(-mapOfAtomicLongValues.get(k).longValue(), k);
        }
        return sortedMap;
    }

    /**
     * Return a objectCache representing the distribution of status codes for
     * successfully fetched curis, as represented by a cache where key -&gt;
     * val represents (string)code -&gt; (integer)count.
     * 
     * <b>Note: </b> All the values are wrapped with a
     * {@link AtomicLong AtomicLong}
     * @return 
     * 
     * @return statusCodeDistribution
     */
    public Map<String, AtomicLong> getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * Returns the time (in millisec) when a URI belonging to a given host was
     * last finished processing. 
     * 
     * @param host The host to look up time of last completed URI.
     * @return Returns the time (in millisec) when a URI belonging to a given 
     * host was last finished processing. If no URI has been completed for host
     * -1 will be returned. 
     */
    public long getHostLastFinished(String host) {
        return serverCache.getHostFor(host).getSubstats().getLastSuccessTime();
    }

    /**
     * Returns the accumulated number of bytes downloaded from a given host.
     * @param host name of the host
     * @return the accumulated number of bytes downloaded from a given host
     */
    public long getBytesPerHost(String host) {
        return serverCache.getHostFor(host).getSubstats().getTotalBytes();
    }

    /**
     * Returns the accumulated number of bytes from files of a given file type.
     * @param filetype Filetype to check.
     * @return the accumulated number of bytes from files of a given mime type
     */
    public long getBytesPerFileType(String filetype) {
        return getReportValue(mimeTypeBytes, filetype);
    }

    /**
     * Get the total number of ToeThreads (sleeping and active)
     *
     * @return The total number of ToeThreads
     */
    public int threadCount() {
        return this.controller != null ? controller.getToeCount() : 0;
    }

    public String crawledBytesSummary() {
        return crawledBytes.summary();
    }

    /**
     * If the curi is a seed, we update the processedSeeds cache.
     *
     * @param curi The CrawlURI that may be a seed.
     * @param disposition The disposition of the CrawlURI.
     */
    protected void handleSeed(final CrawlURI curi, final String disposition) {
        if (getTrackSeeds()) {
            if (curi.isSeed()) {
                SeedRecord sr = processedSeedsRecords.getOrUse(curi.getURI(), new Supplier<SeedRecord>() {
                    public SeedRecord get() {
                        return new SeedRecord(curi, disposition);
                    }
                });
                sr.updateWith(curi, disposition);
            }
        } // else ignore
    }

    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi, "Seed successfully crawled");
        // save crawled bytes tally
        crawledBytes.accumulate(curi);

        // Save status codes
        incrementMapCount(statusCodeDistribution, Integer.toString(curi.getFetchStatus()));

        // Save mime types
        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

        // Save hosts stats.
        ServerCache sc = serverCache;
        saveHostStats(sc.getHostFor(curi.getUURI()).getHostName(), curi.getContentSize());

        if (getTrackSources() && curi.getData().containsKey(A_SOURCE_TAG)) {
            saveSourceStats((String) curi.getData().get(A_SOURCE_TAG), sc.getHostFor(curi.getUURI()).getHostName());
        }
    }

    protected void saveSourceStats(String source, String hostname) {
        ConcurrentMap<String, AtomicLong> hostUriCount = sourceHostDistribution.get(source);
        if (hostUriCount == null) {
            hostUriCount = new ConcurrentHashMap<String, AtomicLong>();
            ConcurrentMap<String, AtomicLong> prevVal = sourceHostDistribution.putIfAbsent(source, hostUriCount);
            if (prevVal != null) {
                hostUriCount = prevVal;
            }
        }
        incrementMapCount(hostUriCount, hostname);

    }

    /**
     * Update some running-stats based on a URI success
     * 
     * @param hostname
     * @param size
     */
    protected void saveHostStats(String hostname, long size) {
        // TODO: consider moving 'top' accounting elsewhere, such 
        // as the frontier or ServerCache itself

        CrawlHost host = serverCache.getHostFor(hostname);
        hostsDistributionTop.update(hostname, host.getSubstats().getFetchSuccesses());
        hostsBytesTop.update(hostname, host.getSubstats().getSuccessBytes());
        hostsLastFinishedTop.update(hostname, host.getSubstats().getLastSuccessTime());
    }

    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi, "Failed to crawl seed, will retry");
    }

    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi, "Seed was disregarded");
    }

    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi, "Failed to crawl seed");
    }

    /**
     * Get a seed iterator for the job being monitored. Only reports
     * known seeds from processedSeedsRecords -- but as a SeedListener, 
     * that should be complete. 
     * 
     * <b>Note:</b> This iterator will iterate over a list of <i>strings</i> not
     * UURIs like the Scope seed iterator. The strings are equal to the URIs'
     * getURIString() values.
     * @return the seed iterator
     */
    public Iterator<String> getSeedsIterator() {
        return processedSeedsRecords.keySet().iterator();
    }

    public DisposableStoredSortedMap<Integer, SeedRecord> calcSeedRecordsSortedByStatusCode() {
        Iterator<String> i = getSeedsIterator();
        DisposableStoredSortedMap<Integer, SeedRecord> sortedMap = bdb.getStoredMap(null, Integer.class,
                SeedRecord.class, true, false);

        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = (SeedRecord) processedSeedsRecords.get(seed);
            if (sr == null) {
                sr = new SeedRecord(seed, "Seed has not been processed");
                // no need to retain synthesized record
            }
            sortedMap.put(sr.sortShiftStatusCode(), sr);
        }
        return sortedMap;
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted (largest first)
     * order.
     * 
     * @return SortedMap of hosts distribution
     */
    public DisposableStoredSortedMap<Long, String> getReverseSortedHostCounts(Map<String, AtomicLong> hostCounts) {
        synchronized (hostCounts) {
            return getReverseSortedCopy(hostCounts);
        }
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order. 
     * @return SortedMap of hosts distribution
     */
    public DisposableStoredSortedMap<Long, String> calcReverseSortedHostsDistribution() {
        final DisposableStoredSortedMap<Long, String> sortedMap = bdb.getStoredMap(null, Long.class, String.class,
                true, false);
        serverCache.forAllHostsDo(new Closure() {
            @Override
            public void execute(Object hostObj) {
                CrawlHost host = (CrawlHost) hostObj;
                sortedMap.put(-host.getSubstats().getFetchSuccesses(), host.getHostName());
            }
        });
        return sortedMap;
    }

    public File writeReportFile(String reportName) {
        for (Report report : getReports()) {
            if (report.getClass().getSimpleName().equals(reportName)) {
                return writeReportFile(report, false);
            }
        }
        return null;
    }

    protected File writeReportFile(Report report, boolean force) {
        File f = new File(getReportsDir().getFile(), report.getFilename());

        if (f.exists() && !controller.isRunning() && controller.hasStarted() && !force) {
            // controller already started and stopped 
            // and file exists
            // and force not requested
            // so, don't overwrite
            logger.info("reusing report: " + f.getAbsolutePath());
            return f;
        }

        try {
            FileUtils.ensureWriteableDirectory(f.getParentFile());
            PrintWriter bw = new PrintWriter(new FileWriter(f));
            report.write(bw, this);
            bw.close();
            addToManifest(f.getAbsolutePath(), CrawlerLoggerModule.MANIFEST_REPORT_FILE, true);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() + " at the end of crawl.", e);
        }
        logger.info("wrote report: " + f.getAbsolutePath());
        return f;
    }

    protected void addToManifest(String absolutePath, char manifest_report_file, boolean b) {
        // TODO Auto-generated method stub

    }

    /**
     * Run the reports.
     */
    public void dumpReports() {
        // TODO: sooner than here! Add all files mentioned in the crawl 
        // order to the manifest set.
        //controller.addOrderToManifest();

        for (Report report : getReports()) {
            if (report.getShouldReportAtEndOfCrawl()) {
                try {
                    writeReportFile(report, true);
                } catch (RuntimeException re) {
                    logger.log(Level.SEVERE, re.getMessage(), re);
                }
            }
        }
    }

    public void crawlCheckpoint(/*StateProvider*/ Object def, File cpDir) throws Exception {
        // CrawlController is managing the checkpointing of this object.
        logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
    }

    private long getReportValue(Map<String, AtomicLong> map, String key) {
        if (key == null) {
            return -1;
        }
        Object o = map.get(key);
        if (o == null) {
            return -2;
        }
        if (!(o instanceof AtomicLong)) {
            throw new IllegalStateException("Expected AtomicLong but got " + o.getClass() + " for " + key);
        }
        return ((AtomicLong) o).get();
    }

    public void onApplicationEvent(ApplicationEvent event) {
        if (event instanceof CrawlStateEvent) {
            CrawlStateEvent event1 = (CrawlStateEvent) event;
            switch (event1.getState()) {
            case PAUSED:
                this.crawlPaused(event1.getMessage());
                break;
            case RUNNING:
                this.crawlResuming(event1.getMessage());
                break;
            case EMPTY:
                this.crawlEmpty(event1.getMessage());
                break;
            case PAUSING:
                this.crawlPausing(event1.getMessage());
                break;
            case STOPPING:
                this.crawlEnding(event1.getMessage());
                break;
            case FINISHED:
                this.crawlEnded(event1.getMessage());
                break;
            case PREPARING:
                this.crawlResuming(event1.getMessage());
                break;
            default:
                throw new RuntimeException("Unknown state: " + event1.getState());
            }
        }

        if (event instanceof CrawlURIDispositionEvent) {
            CrawlURIDispositionEvent dvent = (CrawlURIDispositionEvent) event;
            switch (dvent.getDisposition()) {
            case SUCCEEDED:
                this.crawledURISuccessful(dvent.getCrawlURI());
                break;
            case FAILED:
                this.crawledURIFailure(dvent.getCrawlURI());
                break;
            case DISREGARDED:
                this.crawledURIDisregard(dvent.getCrawlURI());
                break;
            case DEFERRED_FOR_RETRY:
                this.crawledURINeedRetry(dvent.getCrawlURI());
                break;
            default:
                throw new RuntimeException("Unknown disposition: " + dvent.getDisposition());
            }
        }
    }

    public void tallySeeds() {
        seedsTotal = 0;
        seedsCrawled = 0;
        if (processedSeedsRecords == null) {
            // nothing to tally
            return;
        }
        for (Iterator<String> i = getSeedsIterator(); i.hasNext();) {
            SeedRecord sr = processedSeedsRecords.get(i.next());
            seedsTotal++;
            if (sr != null && (sr.getStatusCode() > 0)) {
                seedsCrawled++;
            }
        }
    }

    /** 
     * Create a seed record, even on initial notification (before
     * any real attempt/processing.
     * 
     * @see org.archive.modules.seeds.SeedListener#addedSeed(org.archive.modules.CrawlURI)
     */
    public void addedSeed(CrawlURI curi) {
        // record even undisposed-seeds for reporting purposes
        handleSeed((CrawlURI) curi, "");
    }

    /**
     * Do nothing with nonseed lines.
     * 
     * @see org.archive.modules.seeds.SeedListener#nonseedLine(java.lang.String)
     */
    public boolean nonseedLine(String line) {
        return false;
    }

    public void concludedSeedBatch() {
        // do nothing;
    }

    // BeanNameAware
    protected String beanName;

    public void setBeanName(String name) {
        this.beanName = name;
    }

    // Checkpointable
    public void startCheckpoint(Checkpoint checkpointInProgress) {
    }

    public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
        JSONObject json = new JSONObject();
        try {
            json.put("crawlStartTime", crawlStartTime);
            json.put("crawlEndTime", crawlEndTime);
            long virtualCrawlPauseStarted = crawlPauseStarted;
            if (virtualCrawlPauseStarted < 1) {
                // TODO: use instant checkpoint started?
                virtualCrawlPauseStarted = System.currentTimeMillis();
            }
            json.put("crawlPauseStarted", virtualCrawlPauseStarted);
            json.put("crawlTotalPausedTime", crawlTotalPausedTime);

            json.put("hostsDistributionTop", hostsDistributionTop.getTopSet());
            json.put("hostsBytesTop", hostsBytesTop.getTopSet());
            json.put("hostsLastFinishedTop", hostsLastFinishedTop.getTopSet());

            json.put("mimeTypeDistribution", mimeTypeDistribution);
            json.put("mimeTypeBytes", mimeTypeBytes);
            json.put("statusCodeDistribution", statusCodeDistribution);

            json.put("sourceHostDistribution", sourceHostDistribution);

            json.put("crawledBytes", crawledBytes);

            // TODO: save crawledBytesHistotable
            checkpointInProgress.saveJson(beanName, json);
        } catch (JSONException e) {
            // impossible
            throw new RuntimeException(e);
        }
    }

    public void finishCheckpoint(Checkpoint checkpointInProgress) {
    }

    protected Checkpoint recoveryCheckpoint;

    public void setRecoveryCheckpoint(Checkpoint recoveryCheckpoint) {
        this.recoveryCheckpoint = recoveryCheckpoint;
    }

}