/* AbstractFrontier
 *
 * $Id: AbstractFrontier.java 5882 2008-07-17 21:02:28Z gojomo $
 *
 * Created on Aug 17, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package com.cyberway.issue.crawler.frontier;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;

import com.cyberway.issue.crawler.datamodel.CandidateURI;
import com.cyberway.issue.crawler.datamodel.CoreAttributeConstants;
import com.cyberway.issue.crawler.datamodel.CrawlHost;
import com.cyberway.issue.crawler.datamodel.CrawlOrder;
import com.cyberway.issue.crawler.datamodel.CrawlServer;
import com.cyberway.issue.crawler.datamodel.CrawlSubstats;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.crawler.datamodel.FetchStatusCodes;
import com.cyberway.issue.crawler.datamodel.RobotsExclusionPolicy;
import com.cyberway.issue.crawler.datamodel.CrawlSubstats.Stage;
import com.cyberway.issue.crawler.event.CrawlStatusListener;
import com.cyberway.issue.crawler.framework.CrawlController;
import com.cyberway.issue.crawler.framework.Frontier;
import com.cyberway.issue.crawler.framework.ToeThread;
import com.cyberway.issue.crawler.framework.exceptions.EndedException;
import com.cyberway.issue.crawler.framework.exceptions.FatalConfigurationException;
import com.cyberway.issue.crawler.settings.ModuleType;
import com.cyberway.issue.crawler.settings.RegularExpressionConstraint;
import com.cyberway.issue.crawler.settings.SimpleType;
import com.cyberway.issue.crawler.settings.Type;
import com.cyberway.issue.crawler.url.Canonicalizer;
import com.cyberway.issue.net.UURI;
import com.cyberway.issue.util.ArchiveUtils;

/**
 * Shared facilities for Frontier implementations.
 *
 * @author gojomo
 */
public abstract class AbstractFrontier extends ModuleType
implements CrawlStatusListener, Frontier, FetchStatusCodes,
        CoreAttributeConstants, Serializable {

    private static final long serialVersionUID = -4766504935003203930L;

    private static final Logger logger =
        Logger.getLogger(AbstractFrontier.class.getName());

    protected transient CrawlController controller;

    /** ordinal numbers to assign to created CrawlURIs */
    protected AtomicLong nextOrdinal = new AtomicLong(1);

    /** should the frontier hold any threads asking for URIs? */
    protected boolean shouldPause = false;
    /**
     * should the frontier send an EndedException to any threads asking for
     * URIs?
     */
    protected transient boolean shouldTerminate = false;

    /**
     * how many multiples of last fetch elapsed time to wait before
     * recontacting same server
     */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";

    protected final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /**
     * always wait this long after one completion before recontacting same
     * server, regardless of multiple
     */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";

    // 3 secs
    protected final static Integer DEFAULT_MIN_DELAY = new Integer(3000);

    /**
     * Whether to respect a 'Crawl-Delay' (in seconds) given in a site's
     * robots.txt
     */
    public final static String ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS =
        "respect-crawl-delay-up-to-secs";

    // by default, respect robots.txt-provided Crawl-Delay up to 300 secs
    protected final static Integer DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS =
        300; // 5 minutes

    /** never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";

    // 30 secs
    protected final static Integer DEFAULT_MAX_DELAY = new Integer(30000);
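    // Worked example with the defaults above: a fetch that took 400 ms gives
    // a nominal delay of 5 * 400 = 2000 ms, raised to the 3000 ms minimum;
    // a fetch that took 20 s gives 100 s, capped at the 30 s maximum. The
    // actual computation happens in politenessDelayFor() below.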
    /** number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS =
        "preference-embed-hops";

    protected final static Integer DEFAULT_PREFERENCE_EMBED_HOPS =
        new Integer(1);

    /** maximum per-host bandwidth usage */
    public final static String ATTR_MAX_HOST_BANDWIDTH_USAGE =
        "max-per-host-bandwidth-usage-KB-sec";

    protected final static Integer DEFAULT_MAX_HOST_BANDWIDTH_USAGE =
        new Integer(0);

    /** maximum overall bandwidth usage */
    public final static String ATTR_MAX_OVERALL_BANDWIDTH_USAGE =
        "total-bandwidth-usage-KB-sec";

    protected final static Integer DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE =
        new Integer(0);

    /** for retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";

    // 15 mins
    protected final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";

    protected final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    public final static String ATTR_QUEUE_ASSIGNMENT_POLICY =
        "queue-assignment-policy";

    /** queue assignment to force onto CrawlURIs; intended to be overridden */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";

    protected final static String DEFAULT_FORCE_QUEUE = "";

    // word chars, dash, period, comma, colon
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** whether to pause, rather than finish, when crawl appears done */
    public final static String ATTR_PAUSE_AT_FINISH = "pause-at-finish";

    // TODO: change default to true once well-tested
    protected final static Boolean DEFAULT_PAUSE_AT_FINISH = Boolean.FALSE;

    /** whether to pause at crawl start */
    public final static String ATTR_PAUSE_AT_START = "pause-at-start";

    protected final static Boolean DEFAULT_PAUSE_AT_START = Boolean.FALSE;

    /** whether to tag seeds with their own URI as a heritable 'source' */
    public final static String ATTR_SOURCE_TAG_SEEDS = "source-tag-seeds";

    protected final static Boolean DEFAULT_SOURCE_TAG_SEEDS = Boolean.FALSE;

    /**
     * Recover log on or off attribute.
     */
    protected final static String ATTR_RECOVERY_ENABLED =
        "recovery-log-enabled";

    protected final static Boolean DEFAULT_ATTR_RECOVERY_ENABLED =
        Boolean.TRUE;

    // to maintain serialization compatibility, stored under old names
    protected long queuedUriCount;
    protected long succeededFetchCount;
    protected long failedFetchCount;
    protected long disregardedUriCount;

    // top-level stats
    /** total URIs queued to be visited */
    transient protected AtomicLong liveQueuedUriCount = new AtomicLong(0);

    transient protected AtomicLong liveSucceededFetchCount = new AtomicLong(0);

    transient protected AtomicLong liveFailedFetchCount = new AtomicLong(0);

    /** URIs that are disregarded (for example because of robots.txt rules) */
    transient protected AtomicLong liveDisregardedUriCount = new AtomicLong(0);

    /**
     * Used when bandwidth constraints are in effect.
     */
    protected long totalProcessedBytes = 0;

    private transient long nextURIEmitTime = 0;

    protected long processedBytesAfterLastEmittedURI = 0;

    protected int lastMaxBandwidthKB = 0;

    /**
     * Crawl replay logger.
     *
     * Currently captures Frontier/URI transitions. Can be null if user chose
     * not to run a recovery.log.
     */
    private transient FrontierJournal recover = null;

    /** file collecting report of ignored seed-file entries (if any) */
    public static final String IGNORED_SEEDS_FILENAME = "seeds.ignored";

    /**
     * @param name Name of this frontier.
     * @param description Description for this frontier.
     */
    public AbstractFrontier(String name, String description) {
        super(name, description);
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
            "How many multiples of last fetch elapsed time to wait before "
                + "recontacting same server", DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
            "Never wait more than this long.", DEFAULT_MAX_DELAY));
        addElementToDefinition(new SimpleType(ATTR_MIN_DELAY,
            "Always wait this long after one completion before recontacting "
                + "same server.", DEFAULT_MIN_DELAY));
        addElementToDefinition(new SimpleType(
            ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS,
            "Respect a Crawl-Delay directive in a site's robots.txt "
                + "up to this value in seconds. (If longer, simply "
                + "respect this value.) Default is 300 seconds (5 minutes).",
            DEFAULT_RESPECT_CRAWL_DELAY_UP_TO_SECS));
        addElementToDefinition(new SimpleType(ATTR_MAX_RETRIES,
            "How often to retry fetching a URI that failed to be retrieved. "
                + "If zero, the crawler will get the robots.txt only.",
            DEFAULT_MAX_RETRIES));
        addElementToDefinition(new SimpleType(ATTR_RETRY_DELAY,
            "How long to wait by default until we retry fetching a"
                + " URI that failed to be retrieved (seconds). ",
            DEFAULT_RETRY_DELAY));
        addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
            "Number of embedded (or redirected) hops up to which "
                + "a URI has higher priority scheduling. For example, if set "
                + "to 1 (the default), items such as inline images (1-hop "
                + "embedded resources) will be scheduled ahead of all regular "
                + "links (or many-hop resources, like nested frames). If set "
                + "to zero, no preferencing will occur, and embeds/redirects "
                + "are scheduled the same as regular links.",
            DEFAULT_PREFERENCE_EMBED_HOPS));
        Type t;
        t = addElementToDefinition(new SimpleType(
            ATTR_MAX_OVERALL_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use. "
                + "The actual read speed is not affected by this setting, it "
                + "only holds back new URIs from being processed when the "
                + "bandwidth usage has been too high. 0 means no bandwidth "
                + "limitation.", DEFAULT_MAX_OVERALL_BANDWIDTH_USAGE));
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(
            ATTR_MAX_HOST_BANDWIDTH_USAGE,
            "The maximum average bandwidth the crawler is allowed to use per "
                + "host. The actual read speed is not affected by this "
                + "setting, it only holds back new URIs from being processed "
                + "when the bandwidth usage has been too high. 0 means no "
                + "bandwidth limitation.", DEFAULT_MAX_HOST_BANDWIDTH_USAGE));
        t.setExpertSetting(true);

        // Read the list of permissible choices from heritrix.properties.
        // It's a list of space- or comma-separated values.
        String queueStr = System.getProperty(AbstractFrontier.class.getName()
                + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
            HostnameQueueAssignmentPolicy.class.getName() + " "
                + IPQueueAssignmentPolicy.class.getName() + " "
                + BucketQueueAssignmentPolicy.class.getName() + " "
                + SurtAuthorityQueueAssignmentPolicy.class.getName() + " "
                + TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String[] queues = p.split(queueStr);
        if (queues.length <= 0) {
            throw new RuntimeException("Failed parse of "
                + " assignment queue policy string: " + queueStr);
        }
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
            "Defines how to assign URIs to queues. Can assign by host, "
                + "by ip, and into one of a fixed set of buckets (1k).",
            queues[0], queues));
        t.setExpertSetting(true);
        t.setOverrideable(true);
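        // Note: the first class in the list becomes the default policy. The
        // list can be replaced via the system property read above, which the
        // startup presumably seeds from heritrix.properties; e.g.
        // (hypothetical value):
        //   com.cyberway.issue.crawler.frontier.AbstractFrontier.\
        //   queue-assignment-policy = \
        //   com.cyberway.issue.crawler.frontier.IPQueueAssignmentPolicy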
        t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
            "The queue name into which to force URIs. Should "
                + "be left blank at global level. Specify a "
                + "per-domain/per-host override to force URIs into "
                + "a particular named queue, regardless of the assignment "
                + "policy in effect (domain or ip-based politeness). "
                + "This could be used on domains known to all be from "
                + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                + "to simulate IP-based politeness, or it could be used if "
                + "you wanted to enforce politeness over a whole domain, even "
                + "though the subdomains are split across many IPs.",
            DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        t.addConstraint(new RegularExpressionConstraint(
            ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
            "This field must contain only alphanumeric "
                + "characters plus period, dash, comma, colon, or "
                + "underscore."));
        t = addElementToDefinition(new SimpleType(ATTR_PAUSE_AT_START,
            "Whether to pause when the crawl begins, before any URIs "
                + "are tried. This gives the operator a chance to verify or "
                + "adjust the crawl before actual work begins. "
                + "Default is false.", DEFAULT_PAUSE_AT_START));
        t = addElementToDefinition(new SimpleType(ATTR_PAUSE_AT_FINISH,
            "Whether to pause when the crawl appears finished, rather "
                + "than immediately end the crawl. This gives the operator an "
                + "opportunity to view crawl results, and possibly add URIs "
                + "or adjust settings, while the crawl state is still "
                + "available. Default is false.", DEFAULT_PAUSE_AT_FINISH));
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(ATTR_SOURCE_TAG_SEEDS,
            "Whether to tag seeds with their own URI as a heritable "
                + "'source' String, which will be carried-forward to all URIs "
                + "discovered on paths originating from that seed. When "
                + "present, such source tags appear in the second-to-last "
                + "crawl.log field.", DEFAULT_SOURCE_TAG_SEEDS));
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(ATTR_RECOVERY_ENABLED,
            "Set to false to disable recovery log writing. Do this if "
                + "you are using the checkpoint feature for recovering "
                + "crashed crawls.", DEFAULT_ATTR_RECOVERY_ENABLED));
        t.setExpertSetting(true);
        // No sense in it being overrideable.
        t.setOverrideable(false);
    }

    public void start() {
        if (((Boolean) getUncheckedAttribute(null, ATTR_PAUSE_AT_START))
                .booleanValue()) {
            // trigger crawl-wide pause
            controller.requestCrawlPause();
        } else {
            // simply begin
            unpause();
        }
    }

    synchronized public void pause() {
        shouldPause = true;
    }

    synchronized public void unpause() {
        shouldPause = false;
        notifyAll();
    }

    public void initialize(CrawlController c)
            throws FatalConfigurationException, IOException {
        c.addCrawlStatusListener(this);
        File logsDisk = null;
        try {
            logsDisk = c.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, "Failed to get logs directory", e);
        }
        if (logsDisk != null) {
            String logsPath = logsDisk.getAbsolutePath() + File.separatorChar;
            if (((Boolean) getUncheckedAttribute(null, ATTR_RECOVERY_ENABLED))
                    .booleanValue()) {
                this.recover = new RecoveryJournal(logsPath,
                    FrontierJournal.LOGNAME_RECOVER);
            }
        }
        // try {
        //     final Class qapClass = Class.forName(
        //         (String)getUncheckedAttribute(
        //             null, ATTR_QUEUE_ASSIGNMENT_POLICY));
        //     queueAssignmentPolicy =
        //         (QueueAssignmentPolicy)qapClass.newInstance();
        // } catch (Exception e) {
        //     logger.log(Level.SEVERE, "Bad queue assignment policy class", e);
        //     throw new FatalConfigurationException(e.getMessage());
        // }
    }

    synchronized public void terminate() {
        shouldTerminate = true;
        if (this.recover != null) {
            this.recover.close();
            this.recover = null;
        }
        unpause();
    }

    /**
     * Report CrawlURI to each of the three 'substats' accumulators
     * (group/queue, server, host) for a given stage.
     *
     * @param curi
     * @param stage
     */
    protected void tally(CrawlURI curi, Stage stage) {
        // Tally per-server, per-host, per-frontier-class running totals
        CrawlServer server = controller.getServerCache().getServerFor(curi);
        if (server != null) {
            server.getSubstats().tally(curi, stage);
        }
        CrawlHost host = controller.getServerCache().getHostFor(curi);
        if (host != null) {
            host.getSubstats().tally(curi, stage);
        }
        FrontierGroup group = controller.getFrontier().getGroup(curi);
        group.getSubstats().tally(curi, stage);
    }
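    // The doJournal* helpers below pair each recovery-journal event with its
    // corresponding substats Stage, so the per-queue/server/host accounting
    // stays in step with what the journal records.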
    protected void doJournalFinishedSuccess(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.SUCCEEDED);
        if (this.recover != null) {
            this.recover.finishedSuccess(c);
        }
    }

    protected void doJournalAdded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.SCHEDULED);
        if (this.recover != null) {
            this.recover.added(c);
        }
    }

    protected void doJournalRescheduled(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.RETRIED);
        if (this.recover != null) {
            this.recover.rescheduled(c);
        }
    }

    protected void doJournalFinishedFailure(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.FAILED);
        if (this.recover != null) {
            this.recover.finishedFailure(c);
        }
    }

    protected void doJournalDisregarded(CrawlURI c) {
        tally(c, CrawlSubstats.Stage.DISREGARDED);
        if (this.recover != null) {
            this.recover.finishedDisregard(c);
        }
    }

    protected void doJournalEmitted(CrawlURI c) {
        if (this.recover != null) {
            this.recover.emitted(c);
        }
    }

    /**
     * Frontier is empty only if all queues are empty and no URIs are
     * in-process.
     *
     * @return True if queues are empty.
     */
    public boolean isEmpty() {
        return liveQueuedUriCount.get() == 0;
    }

    /**
     * Increment the running count of queued URIs.
     */
    protected void incrementQueuedUriCount() {
        liveQueuedUriCount.incrementAndGet();
    }

    /**
     * Increment the running count of queued URIs by the given amount.
     *
     * @param increment amount to increment the queued count
     */
    protected void incrementQueuedUriCount(long increment) {
        liveQueuedUriCount.addAndGet(increment);
    }

    /**
     * Note that a number of queued Uris have been deleted.
     *
     * @param numberOfDeletes
     */
    protected void decrementQueuedCount(long numberOfDeletes) {
        liveQueuedUriCount.addAndGet(-numberOfDeletes);
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#queuedUriCount()
     */
    public long queuedUriCount() {
        return liveQueuedUriCount.get();
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        return liveSucceededFetchCount.get() + liveFailedFetchCount.get()
            + liveDisregardedUriCount.get();
    }

    /**
     * Increment the running count of successfully fetched URIs.
     */
    protected void incrementSucceededFetchCount() {
        liveSucceededFetchCount.incrementAndGet();
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#succeededFetchCount()
     */
    public long succeededFetchCount() {
        return liveSucceededFetchCount.get();
    }

    /**
     * Increment the running count of failed URIs.
     */
    protected void incrementFailedFetchCount() {
        liveFailedFetchCount.incrementAndGet();
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#failedFetchCount()
     */
    public long failedFetchCount() {
        return liveFailedFetchCount.get();
    }

    /**
     * Increment the running count of disregarded URIs.
     */
    protected void incrementDisregardedUriCount() {
        liveDisregardedUriCount.incrementAndGet();
    }

    public long disregardedUriCount() {
        return liveDisregardedUriCount.get();
    }

    /** @deprecated misnomer; use StatisticsTracking figures instead */
    public long totalBytesWritten() {
        return totalProcessedBytes;
    }

    /**
     * Load up the seeds.
     *
     * This method is called at initialization, and by the CrawlController
     * when it wants to force a reload of configuration.
     *
     * @see com.cyberway.issue.crawler.framework.CrawlController#kickUpdate()
     */
    public void loadSeeds() {
        Writer ignoredWriter = new StringWriter();
        logger.info("beginning");
        // Get the seeds to refresh.
        Iterator iter = this.controller.getScope()
            .seedsIterator(ignoredWriter);
        int count = 0;
        while (iter.hasNext()) {
            UURI u = (UURI) iter.next();
            CandidateURI caUri = CandidateURI.createSeedCandidateURI(u);
            caUri.setSchedulingDirective(CandidateURI.MEDIUM);
            if (((Boolean) getUncheckedAttribute(null, ATTR_SOURCE_TAG_SEEDS))
                    .booleanValue()) {
                caUri.putString(CoreAttributeConstants.A_SOURCE_TAG,
                    caUri.toString());
                caUri.makeHeritable(CoreAttributeConstants.A_SOURCE_TAG);
            }
            schedule(caUri);
            count++;
            if (count % 1000 == 0) {
                logger.info(count + " seeds");
            }
        }
        // save ignored items (if any) where they can be consulted later
        saveIgnoredItems(ignoredWriter.toString(), controller.getDisk());
        logger.info("finished");
    }
    /**
     * Dump ignored seed items (if any) to disk; delete the file otherwise.
     * Static to allow non-derived sibling classes (frontiers not yet
     * subclassed here) to reuse.
     *
     * @param ignoredItems
     * @param dir
     */
    public static void saveIgnoredItems(String ignoredItems, File dir) {
        File ignoredFile = new File(dir, IGNORED_SEEDS_FILENAME);
        if (ignoredItems != null && ignoredItems.length() > 0) {
            try {
                BufferedWriter bw =
                    new BufferedWriter(new FileWriter(ignoredFile));
                bw.write(ignoredItems);
                bw.close();
            } catch (IOException e) {
                // TODO make an alert?
                e.printStackTrace();
            }
        } else {
            // delete any older file (if any)
            ignoredFile.delete();
        }
    }

    protected CrawlURI asCrawlUri(CandidateURI caUri) {
        CrawlURI curi;
        if (caUri instanceof CrawlURI) {
            curi = (CrawlURI) caUri;
        } else {
            curi = CrawlURI.from(caUri, nextOrdinal.getAndIncrement());
        }
        curi.setClassKey(getClassKey(curi));
        return curi;
    }

    /**
     * @param now
     * @throws InterruptedException
     * @throws EndedException
     */
    protected synchronized void preNext(long now)
            throws InterruptedException, EndedException {
        if (this.controller == null) {
            return;
        }

        // Check completion conditions
        if (this.controller.atFinish()) {
            if (((Boolean) getUncheckedAttribute(null, ATTR_PAUSE_AT_FINISH))
                    .booleanValue()) {
                this.controller.requestCrawlPause();
            } else {
                this.controller.beginCrawlStop();
            }
        }

        // enforce operator pause
        if (shouldPause) {
            while (shouldPause) {
                this.controller.toePaused();
                wait();
            }
            // exited pause; possibly finish regardless of pause-at-finish
            if (controller != null && controller.atFinish()) {
                this.controller.beginCrawlStop();
            }
        }

        // enforce operator terminate or thread retirement
        if (shouldTerminate
                || ((ToeThread) Thread.currentThread()).shouldRetire()) {
            throw new EndedException("terminated");
        }

        enforceBandwidthThrottle(now);
    }

    /**
     * Perform any special handling of the CrawlURI, such as promoting its URI
     * to seed-status, or preferencing it because it is an embed.
     *
     * @param curi
     */
    protected void applySpecialHandling(CrawlURI curi) {
        if (curi.isSeed() && curi.getVia() != null
                && curi.flattenVia().length() > 0) {
            // The only way a seed can have a non-empty via is if it is the
            // result of a seed redirect. Add it to the seeds list.
            //
            // This is a feature. This is handling for the case where a seed
            // gets immediately redirected to another page. What we're doing
            // is treating the immediate redirect target as a seed.
            this.controller.getScope().addSeed(curi);
            // And it needs rapid scheduling.
            if (curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
            }
        }

        // optionally preferencing embeds up to MEDIUM
        int prefHops = ((Integer) getUncheckedAttribute(curi,
            ATTR_PREFERENCE_EMBED_HOPS)).intValue();
        if (prefHops > 0) {
            int embedHops = curi.getTransHops();
            if (embedHops > 0 && embedHops <= prefHops
                    && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                // number of embed hops falls within the preferenced range,
                // and uri is not already MEDIUM -- so promote it
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
            }
        }
    }
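    // For instance, with preference-embed-hops at its default of 1, an inline
    // image reached by one trans-hop from an HTML page (transHops == 1) is
    // promoted to MEDIUM, while a resource nested two frames deep
    // (transHops == 2) keeps NORMAL scheduling.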
    /**
     * Perform fixups on a CrawlURI about to be returned via next().
     *
     * @param curi CrawlURI about to be returned by next()
     * @param q the queue from which the CrawlURI came
     */
    protected void noteAboutToEmit(CrawlURI curi, WorkQueue q) {
        curi.setHolder(q);
        // if (curi.getServer() == null) {
        //     // TODO: perhaps short-circuit the emit here,
        //     // because URI will be rejected as unfetchable
        // }
        doJournalEmitted(curi);
    }

    /**
     * @param curi
     * @return the CrawlServer to be associated with this CrawlURI
     */
    protected CrawlServer getServer(CrawlURI curi) {
        return this.controller.getServerCache().getServerFor(curi);
    }

    /**
     * Return a suitable value to wait before retrying the given URI.
     *
     * @param curi CrawlURI to be retried
     * @return millisecond delay before retry
     */
    protected long retryDelayFor(CrawlURI curi) {
        int status = curi.getFetchStatus();
        return (status == S_CONNECT_FAILED || status == S_CONNECT_LOST
                || status == S_DOMAIN_UNRESOLVABLE)
            ? ((Long) getUncheckedAttribute(curi, ATTR_RETRY_DELAY))
                .longValue()
            : 0; // no delay for most statuses
    }

    /**
     * Update any scheduling structures with the new information in this
     * CrawlURI. Chiefly means making necessary arrangements so that no other
     * URIs at the same host are visited within the appropriate politeness
     * window.
     *
     * @param curi The CrawlURI
     * @return millisecond politeness delay
     */
    protected long politenessDelayFor(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME)) {
            long completeTime = curi.getLong(A_FETCH_COMPLETED_TIME);
            long durationTaken =
                completeTime - curi.getLong(A_FETCH_BEGAN_TIME);
            durationToWait = (long) (((Float) getUncheckedAttribute(curi,
                ATTR_DELAY_FACTOR)).floatValue() * durationTaken);

            long minDelay = ((Integer) getUncheckedAttribute(curi,
                ATTR_MIN_DELAY)).longValue();
            if (minDelay > durationToWait) {
                // wait at least the minimum
                durationToWait = minDelay;
            }

            long maxDelay = ((Integer) getUncheckedAttribute(curi,
                ATTR_MAX_DELAY)).longValue();
            if (durationToWait > maxDelay) {
                // wait no more than the maximum
                durationToWait = maxDelay;
            }

            long respectThreshold = ((Integer) getUncheckedAttribute(curi,
                ATTR_RESPECT_CRAWL_DELAY_UP_TO_SECS)).longValue() * 1000;
            if (durationToWait < respectThreshold) {
                // may need to extend wait
                CrawlServer s =
                    controller.getServerCache().getServerFor(curi);
                String ua = curi.getUserAgent();
                if (ua == null) {
                    ua = controller.getOrder().getUserAgent(curi);
                }
                RobotsExclusionPolicy rep = s.getRobots();
                if (rep != null) {
                    long crawlDelay =
                        (long) (1000 * s.getRobots().getCrawlDelay(ua));
                    crawlDelay = (crawlDelay > respectThreshold)
                        ? respectThreshold
                        : crawlDelay;
                    if (crawlDelay > durationToWait) {
                        // wait at least the directive crawl-delay
                        durationToWait = crawlDelay;
                    }
                }
            }

            long now = System.currentTimeMillis();
            int maxBandwidthKB = ((Integer) getUncheckedAttribute(curi,
                ATTR_MAX_HOST_BANDWIDTH_USAGE)).intValue();
            if (maxBandwidthKB > 0) {
                // Enforce bandwidth limit
                CrawlHost host = controller.getServerCache().getHostFor(curi);
                long minDurationToWait =
                    host.getEarliestNextURIEmitTime() - now;
                float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
                long processedBytes = curi.getContentSize();
                host.setEarliestNextURIEmitTime(
                    (long) (processedBytes / maxBandwidth) + now);
                if (minDurationToWait > durationToWait) {
                    durationToWait = minDurationToWait;
                }
            }
        }
        return durationToWait;
    }
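    // A note on the arithmetic in enforceBandwidthThrottle() below: a limit
    // of B KB/sec is treated as B * 1.024 bytes per millisecond, so emitting
    // P bytes pushes the next emit time P / (B * 1.024) ms into the future,
    // plus any wait still outstanding from the previous schedule.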
    /**
     * Ensure that any overall-bandwidth-usage limit is respected, by pausing
     * as long as necessary.
     *
     * @param now
     * @throws InterruptedException
     */
    private void enforceBandwidthThrottle(long now)
            throws InterruptedException {
        int maxBandwidthKB = ((Integer) getUncheckedAttribute(null,
            ATTR_MAX_OVERALL_BANDWIDTH_USAGE)).intValue();
        if (maxBandwidthKB > 0) {
            // Make sure that new bandwidth setting doesn't affect total crawl
            if (maxBandwidthKB != lastMaxBandwidthKB) {
                lastMaxBandwidthKB = maxBandwidthKB;
                processedBytesAfterLastEmittedURI = totalProcessedBytes;
            }

            // Enforce bandwidth limit
            long sleepTime = nextURIEmitTime - now;
            float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
            long processedBytes =
                totalProcessedBytes - processedBytesAfterLastEmittedURI;
            long shouldHaveEmittedDiff =
                nextURIEmitTime == 0 ? 0 : nextURIEmitTime - now;
            nextURIEmitTime = (long) (processedBytes / maxBandwidth)
                + now + shouldHaveEmittedDiff;
            processedBytesAfterLastEmittedURI = totalProcessedBytes;
            if (sleepTime > 0) {
                long targetTime = now + sleepTime;
                now = System.currentTimeMillis();
                while (now < targetTime) {
                    synchronized (this) {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("Frontier waits for: " + sleepTime
                                + "ms to respect bandwidth limit.");
                        }
                        // TODO: now that this is a wait(), frontier can
                        // still schedule and finish items while waiting,
                        // which is good, but multiple threads could all
                        // wait for the same wakeTime, which somewhat
                        // spoils the throttle... should be fixed.
                        wait(targetTime - now);
                    }
                    now = System.currentTimeMillis();
                }
            }
        }
    }

    /**
     * Take note of any processor-local errors that have been entered into the
     * CrawlURI.
     *
     * @param curi
     */
    protected void logLocalizedErrors(CrawlURI curi) {
        if (curi.containsKey(A_LOCALIZED_ERRORS)) {
            List localErrors = (List) curi.getObject(A_LOCALIZED_ERRORS);
            Iterator iter = localErrors.iterator();
            while (iter.hasNext()) {
                Object array[] = {curi, iter.next()};
                controller.localErrors.log(Level.WARNING,
                    curi.getUURI().toString(), array);
            }
            // once logged, discard
            curi.remove(A_LOCALIZED_ERRORS);
        }
    }

    /**
     * Utility method to return a scratch dir for the given key's temp files.
     * Every key gets its own subdir. To avoid having any one directory with
     * thousands of files, there are also two levels of enclosing directory
     * named by the least-significant hex digits of the key string's java
     * hashcode.
     *
     * @param key
     * @return File representing scratch directory
     */
    protected File scratchDirFor(String key) {
        String hex = Integer.toHexString(key.hashCode());
        while (hex.length() < 4) {
            hex = "0" + hex;
        }
        int len = hex.length();
        return new File(this.controller.getStateDisk(),
            hex.substring(len - 2, len) + File.separator
                + hex.substring(len - 4, len - 2) + File.separator + key);
    }
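    // For example, the key "foo" has hashCode 101574 (hex 18cc6), so its
    // scratch files land under <state-disk>/c6/8c/foo.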
    protected boolean overMaxRetries(CrawlURI curi) {
        // never retry more than the max number of times
        if (curi.getFetchAttempts() >= ((Integer) getUncheckedAttribute(curi,
                ATTR_MAX_RETRIES)).intValue()) {
            return true;
        }
        return false;
    }

    public void importRecoverLog(String pathToLog, boolean retainFailures)
            throws IOException {
        File source = new File(pathToLog);
        if (!source.isAbsolute()) {
            source = new File(getSettingsHandler().getOrder().getController()
                .getDisk(), pathToLog);
        }
        RecoveryJournal.importRecoverLog(source, controller, retainFailures);
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.URIFrontier#kickUpdate()
     */
    public void kickUpdate() {
        // by default, do nothing
        // (scope will loadSeeds, if appropriate)
    }

    /**
     * Log to the main crawl.log
     *
     * @param curi
     */
    protected void log(CrawlURI curi) {
        curi.aboutToLog();
        Object array[] = {curi};
        this.controller.uriProcessing.log(Level.INFO,
            curi.getUURI().toString(), array);
    }

    protected boolean isDisregarded(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
        case S_ROBOTS_PRECLUDED: // they don't want us to have it
        case S_BLOCKED_BY_CUSTOM_PROCESSOR:
        case S_OUT_OF_SCOPE: // filtered out by scope
        case S_BLOCKED_BY_USER: // filtered out by user
        case S_TOO_MANY_EMBED_HOPS: // too far from last true link
        case S_TOO_MANY_LINK_HOPS: // too far from seeds
        case S_DELETED_BY_USER: // user deleted
            return true;
        default:
            return false;
        }
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish
     * successfully needs to be retried (processed again after some time
     * elapses).
     *
     * @param curi The CrawlURI to check
     * @return True if we need to retry.
     */
    protected boolean needsRetrying(CrawlURI curi) {
        if (overMaxRetries(curi)) {
            return false;
        }

        switch (curi.getFetchStatus()) {
        case HttpStatus.SC_UNAUTHORIZED:
            // We can get here though usually a positive status code is a
            // success. We get here if there is rfc2617 credential data
            // loaded and we're supposed to go around again. See if any
            // rfc2617 credential present and if there, assume it got loaded
            // in FetchHTTP on expectation that we're to go around again. If
            // no rfc2617 loaded, we should not be here.
            boolean loaded = curi.hasRfc2617CredentialAvatar();
            if (!loaded && logger.isLoggable(Level.INFO)) {
                logger.info("Have 401 but no creds loaded " + curi);
            }
            return loaded;
        case S_DEFERRED:
        case S_CONNECT_FAILED:
        case S_CONNECT_LOST:
        case S_DOMAIN_UNRESOLVABLE:
            // these are all worth a retry
            // TODO: consider if any others (S_TIMEOUT in some cases?)
            // deserve retry
            return true;
        default:
            return false;
        }
    }

    /**
     * Canonicalize passed uuri. It would be sweeter if this canonicalize
     * function was encapsulated by that which it canonicalizes, but because
     * settings change with context -- i.e. there may be overrides in
     * operation for a particular URI -- it's not so easy; each CandidateURI
     * would need a reference to the settings system, which is awkward to
     * pass in.
     *
     * @param uuri Candidate URI to canonicalize.
     * @return Canonicalized version of passed <code>uuri</code>.
     */
    protected String canonicalize(UURI uuri) {
        return Canonicalizer.canonicalize(uuri, this.controller.getOrder());
    }

    /**
     * Canonicalize passed CandidateURI. This method differs from
     * {@link #canonicalize(UURI)} in that it takes a look at the CandidateURI
     * context, possibly overriding any canonicalization effect if it could
     * make us miss content. If canonicalization produces a URL that was
     * 'alreadyseen', but the entry in the 'alreadyseen' database did nothing
     * but redirect to the current URL, we won't get the current URL; we'll
     * think we've already seen it. Examples would be archive.org redirecting
     * to www.archive.org or the inverse, www.netarkivet.net redirecting to
     * netarkivet.net (assuming the stripWWW rule is enabled).
     * <p>Note, this method under some circumstances sets the forceFetch
     * flag.
     *
     * @param cauri CandidateURI to examine.
     * @return Canonicalized <code>cauri</code>.
     */
    protected String canonicalize(CandidateURI cauri) {
        String canon = canonicalize(cauri.getUURI());
        if (cauri.isLocation()) {
            // If the via is not the same as where we're being redirected
            // (i.e. we're not being redirected back to the same page), AND
            // the canonicalization of the via equals that of the current
            // cauri, THEN force-fetch, so there is no chance of missing
            // content because the alreadyseen check thinks it has seen the
            // url before.
            // An example of a URL that redirects to itself is:
            // http://bridalelegance.com/images/buttons3/tuxedos-off.gif.
            // An example of a URL whose canonicalization equals its via's
            // canonicalization, and where we want to fetch content at the
            // redirection (i.e. need to set forcefetch), is netarkivet.dk.
            if (!cauri.toString().equals(cauri.getVia().toString())
                    && canonicalize(cauri.getVia()).equals(canon)) {
                cauri.setForceFetch(true);
            }
        }
        return canon;
    }
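    // getClassKey() below consults the force-queue override first; only when
    // it is blank (the global default) does the configured
    // queue-assignment-policy pick the queue for a URI.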
    /**
     * @param cauri CandidateURI we're to get a class key for.
     * @return a String token representing a queue
     */
    public String getClassKey(CandidateURI cauri) {
        String queueKey = (String) getUncheckedAttribute(cauri,
            ATTR_FORCE_QUEUE);
        if ("".equals(queueKey)) {
            // no forced override
            QueueAssignmentPolicy queueAssignmentPolicy =
                getQueueAssignmentPolicy(cauri);
            queueKey = queueAssignmentPolicy.getClassKey(this.controller,
                cauri);
        }
        return queueKey;
    }

    protected QueueAssignmentPolicy getQueueAssignmentPolicy(
            CandidateURI cauri) {
        String clsName = (String) getUncheckedAttribute(cauri,
            ATTR_QUEUE_ASSIGNMENT_POLICY);
        try {
            return (QueueAssignmentPolicy) Class.forName(clsName)
                .newInstance();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @return RecoveryJournal instance. May be null.
     */
    public FrontierJournal getFrontierJournal() {
        return this.recover;
    }

    public void crawlEnding(String sExitMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlEnded(String sExitMessage) {
        if (logger.isLoggable(Level.INFO)) {
            logger.info("Closing with " + Long.toString(queuedUriCount())
                + " urls still in queue.");
        }
    }

    public void crawlStarted(String message) {
        // TODO Auto-generated method stub
    }

    public void crawlPausing(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlPaused(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlResuming(String statusMessage) {
        // TODO Auto-generated method stub
    }

    public void crawlCheckpoint(File checkpointDir) throws Exception {
        if (this.recover == null) {
            return;
        }
        this.recover.checkpoint(checkpointDir);
    }

    //
    // Reporter implementation
    //
    public String singleLineReport() {
        return ArchiveUtils.singleLineReport(this);
    }

    public void reportTo(PrintWriter writer) {
        reportTo(null, writer);
    }

    // maintain serialization compatibility to pre-AtomicLong impl
    private void writeObject(java.io.ObjectOutputStream out)
            throws IOException {
        queuedUriCount = liveQueuedUriCount.get();
        succeededFetchCount = liveSucceededFetchCount.get();
        failedFetchCount = liveFailedFetchCount.get();
        disregardedUriCount = liveDisregardedUriCount.get();
        out.defaultWriteObject();
    }

    private void readObject(java.io.ObjectInputStream in)
            throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        liveQueuedUriCount = new AtomicLong(queuedUriCount);
        liveSucceededFetchCount = new AtomicLong(succeededFetchCount);
        liveFailedFetchCount = new AtomicLong(failedFetchCount);
        liveDisregardedUriCount = new AtomicLong(disregardedUriCount);
    }
}