com.cyberway.issue.crawler.frontier.AdaptiveRevisitFrontier.java Source code

Introduction

Here is the source code for com.cyberway.issue.crawler.frontier.AdaptiveRevisitFrontier.java
Source

/* AdaptiveRevisitFrontier.java
*
* Created on Sep 13, 2004
*
* Copyright (C) 2004 Kristinn Sigurðsson.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.cyberway.issue.crawler.frontier;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.AttributeNotFoundException;

import org.apache.commons.httpclient.HttpStatus;
import com.cyberway.issue.crawler.datamodel.CandidateURI;
import com.cyberway.issue.crawler.datamodel.CoreAttributeConstants;
import com.cyberway.issue.crawler.datamodel.CrawlServer;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.crawler.datamodel.FetchStatusCodes;
import com.cyberway.issue.crawler.datamodel.UriUniqFilter;
import com.cyberway.issue.crawler.datamodel.UriUniqFilter.HasUriReceiver;
import com.cyberway.issue.crawler.event.CrawlStatusListener;
import com.cyberway.issue.crawler.framework.CrawlController;
import com.cyberway.issue.crawler.framework.Frontier;
import com.cyberway.issue.crawler.framework.FrontierMarker;
import com.cyberway.issue.crawler.framework.ToeThread;
import com.cyberway.issue.crawler.framework.exceptions.EndedException;
import com.cyberway.issue.crawler.framework.exceptions.FatalConfigurationException;
import com.cyberway.issue.crawler.framework.exceptions.InvalidFrontierMarkerException;
import com.cyberway.issue.crawler.settings.ModuleType;
import com.cyberway.issue.crawler.settings.RegularExpressionConstraint;
import com.cyberway.issue.crawler.settings.SimpleType;
import com.cyberway.issue.crawler.settings.Type;
import com.cyberway.issue.crawler.url.Canonicalizer;
import com.cyberway.issue.crawler.util.BdbUriUniqFilter;
import com.cyberway.issue.net.UURI;
import com.cyberway.issue.queue.MemQueue;
import com.cyberway.issue.queue.Queue;
import com.cyberway.issue.util.ArchiveUtils;

/**
 * A Frontier that will repeatedly visit all encountered URIs. 
 * <p>
 * Wait time between visits is configurable and varies based on observed 
 * changes of documents.
 * <p>
 * The Frontier borrows many things from HostQueuesFrontier, but implements 
 * an entirely different strategy in issuing URIs and consequently in keeping a
 * record of discovered URIs.
 *
 * @author Kristinn Sigurdsson
 */
public class AdaptiveRevisitFrontier extends ModuleType implements Frontier, FetchStatusCodes,
        CoreAttributeConstants, AdaptiveRevisitAttributeConstants, CrawlStatusListener, HasUriReceiver {

    /** Serialization version identifier. */
    private static final long serialVersionUID = -8666872690438543671L;

    /** Class-wide logger, named after this class. */
    private static final Logger logger = Logger.getLogger(AdaptiveRevisitFrontier.class.getName());

    /** How many multiples of last fetch elapsed time to wait before recontacting
     * same server */
    public final static String ATTR_DELAY_FACTOR = "delay-factor";
    private final static Float DEFAULT_DELAY_FACTOR = new Float(5);

    /** Always wait this long after one completion before recontacting
     * same server, regardless of multiple */
    public final static String ATTR_MIN_DELAY = "min-delay-ms";

    // 2 seconds (value is in milliseconds)
    private final static Integer DEFAULT_MIN_DELAY = new Integer(2000);

    /** Never wait more than this long, regardless of multiple */
    public final static String ATTR_MAX_DELAY = "max-delay-ms";

    // 30 seconds (value is in milliseconds)
    private final static Integer DEFAULT_MAX_DELAY = new Integer(30000);

    /** Maximum times to emit a CrawlURI without final disposition */
    public final static String ATTR_MAX_RETRIES = "max-retries";
    private final static Integer DEFAULT_MAX_RETRIES = new Integer(30);

    /** For retryable problems, seconds to wait before a retry */
    public final static String ATTR_RETRY_DELAY = "retry-delay-seconds";

    // 15 minutes (value is in seconds)
    private final static Long DEFAULT_RETRY_DELAY = new Long(900);

    /** Maximum simultaneous requests in process to a host (queue) */
    public final static String ATTR_HOST_VALENCE = "host-valence";
    private final static Integer DEFAULT_HOST_VALENCE = new Integer(1);

    /** Number of hops of embeds (ERX) to bump to front of host queue */
    public final static String ATTR_PREFERENCE_EMBED_HOPS = "preference-embed-hops";
    private final static Integer DEFAULT_PREFERENCE_EMBED_HOPS = new Integer(0);

    /** Queue assignment to force on CrawlURIs. Intended to be used
     *  via overrides. */
    public final static String ATTR_FORCE_QUEUE = "force-queue-assignment";
    protected final static String DEFAULT_FORCE_QUEUE = "";
    /** Acceptable characters in forced queue names.
     *  Word chars, dash, period, comma, colon */
    protected final static String ACCEPTABLE_FORCE_QUEUE = "[-\\w\\.,:]*";

    /** Should the queue assignment ignore www in hostnames, effectively
     *  stripping them away.
     */
    public final static String ATTR_QUEUE_IGNORE_WWW = "queue-ignore-www";
    protected final static Boolean DEFAULT_QUEUE_IGNORE_WWW = new Boolean(false);

    /** Should the Frontier use a separate 'already included' data structure
     *  or rely on the queues'.
     */
    public final static String ATTR_USE_URI_UNIQ_FILTER = "use-uri-uniq-filter";
    protected final static Boolean DEFAULT_USE_URI_UNIQ_FILTER = new Boolean(false);

    /** The Class to use for QueueAssignmentPolicy
     */
    public final static String ATTR_QUEUE_ASSIGNMENT_POLICY = "queue-assignment-policy";
    protected final static String DEFAULT_QUEUE_ASSIGNMENT_POLICY = HostnameQueueAssignmentPolicy.class.getName();

    /** Crawl controller this frontier belongs to; assigned in initialize(). */
    private CrawlController controller;

    /** The host queues, looked up by CrawlURI class key; created in initialize(). */
    private AdaptiveRevisitQueueList hostQueues;

    /** Optional duplicate filter; stays null unless ATTR_USE_URI_UNIQ_FILTER
     *  is true when initialize() runs. */
    private UriUniqFilter alreadyIncluded;

    /** Holds CandidateURIs handed to batchSchedule() until the next flush. */
    private ThreadLocalQueue threadWaiting = new ThreadLocalQueue();

    /** Policy for assigning CrawlURIs to named queues */
    private QueueAssignmentPolicy queueAssignmentPolicy = null;

    // top-level stats
    // Incremented once per successDisposition() / failureDisposition() call.
    private long succeededFetchCount = 0;
    private long failedFetchCount = 0;
    // URIs that are disregarded (for example because of robots.txt rules)
    private long disregardedUriCount = 0;

    // Running sum of getContentSize() over successfully crawled URIs.
    private long totalProcessedBytes = 0;

    // Flags indicating operator-specified crawl pause/end; both are checked in next().
    private boolean shouldPause = false;
    private boolean shouldTerminate = false;

    /**
     * Creates the frontier with the stock description text.
     *
     * @param name module name, handed on unchanged to the two-argument
     *             constructor
     */
    public AdaptiveRevisitFrontier(String name) {
        this(name, "AdaptiveRevisitFrontier. EXPERIMENTAL Frontier that will repeatedly visit all "
                + "encountered URIs. Wait time between visits is configurable and is determined by "
                + "seperate Processor(s). See WaitEvaluators See documentation for ARFrontier limitations.");
    }

    /**
     * Creates the frontier and registers all of its configurable settings,
     * in the order they appear below.
     *
     * @param name        NOTE(review): not used by this constructor; the module
     *                    is always registered under Frontier.ATTR_NAME
     * @param description description text passed to the superclass
     */
    public AdaptiveRevisitFrontier(String name, String description) {
        super(Frontier.ATTR_NAME, description);
        // --- Politeness delays -------------------------------------------
        addElementToDefinition(new SimpleType(ATTR_DELAY_FACTOR,
                "How many multiples of last fetch elapsed time to wait before " + "recontacting same server",
                DEFAULT_DELAY_FACTOR));
        addElementToDefinition(new SimpleType(ATTR_MAX_DELAY,
                "Never wait more than this long, regardless of multiple", DEFAULT_MAX_DELAY));
        addElementToDefinition(
                new SimpleType(ATTR_MIN_DELAY, "Always wait this long after one completion before recontacting "
                        + "same server, regardless of multiple", DEFAULT_MIN_DELAY));
        // --- Retry handling ----------------------------------------------
        addElementToDefinition(
                new SimpleType(ATTR_MAX_RETRIES, "How often to retry fetching a URI that failed to be retrieved.\n"
                        + "If zero, the crawler will get the robots.txt only.", DEFAULT_MAX_RETRIES));
        addElementToDefinition(
                new SimpleType(ATTR_RETRY_DELAY, "How long to wait by default until we retry fetching a"
                        + " URI that failed to be retrieved (seconds). ", DEFAULT_RETRY_DELAY));
        // NOTE(review): the help text below calls 1 "the default", but
        // DEFAULT_PREFERENCE_EMBED_HOPS is 0 in this class.
        addElementToDefinition(new SimpleType(ATTR_PREFERENCE_EMBED_HOPS,
                "Number of embedded (or redirected) hops up to which "
                        + "a URI has higher priority scheduling. For example, if set "
                        + "to 1 (the default), items such as inline images (1-hop "
                        + "embedded resources) will be scheduled ahead of all regular "
                        + "links (or many-hop resources, like nested frames). If set to "
                        + "zero, no preferencing will occur, and embeds/redirects are "
                        + "scheduled the same as regular links.",
                DEFAULT_PREFERENCE_EMBED_HOPS));
        // --- Expert settings: 't' is reused to tweak each newly added type ---
        Type t;
        t = addElementToDefinition(new SimpleType(ATTR_HOST_VALENCE,
                "Maximum number of simultaneous requests to a single" + " host.", DEFAULT_HOST_VALENCE));
        t.setExpertSetting(true);
        t = addElementToDefinition(
                new SimpleType(ATTR_QUEUE_IGNORE_WWW, "If true then documents from x.com, www.x.com and any "
                        + "www[0-9]+.x.com will be assigned to the same queue.", DEFAULT_QUEUE_IGNORE_WWW));
        t.setExpertSetting(true);
        t = addElementToDefinition(new SimpleType(ATTR_FORCE_QUEUE,
                "The queue name into which to force URIs. Should " + "be left blank at global level.  Specify a "
                        + "per-domain/per-host override to force URIs into "
                        + "a particular named queue, regardless of the assignment "
                        + "policy in effect (domain or ip-based politeness). "
                        + "This could be used on domains known to all be from "
                        + "the same small set of IPs (eg blogspot, dailykos, etc.) "
                        + "to simulate IP-based politeness, or it could be used if "
                        + "you wanted to enforce politeness over a whole domain, even "
                        + "though the subdomains are split across many IPs.",
                DEFAULT_FORCE_QUEUE));
        t.setOverrideable(true);
        t.setExpertSetting(true);
        // Forced queue names are validated against ACCEPTABLE_FORCE_QUEUE.
        t.addConstraint(new RegularExpressionConstraint(ACCEPTABLE_FORCE_QUEUE, Level.WARNING,
                "This field must contain only alphanumeric "
                        + "characters plus period, dash, comma, colon, or underscore."));
        t = addElementToDefinition(new SimpleType(ATTR_USE_URI_UNIQ_FILTER,
                "If true then the Frontier will use a seperate "
                        + "datastructure to detect and eliminate duplicates.\n"
                        + "This is required for Canonicalization rules to work.",
                DEFAULT_USE_URI_UNIQ_FILTER));
        t.setExpertSetting(true);
        t.setOverrideable(false);
        // Read the list of permissible choices from heritrix.properties.
        // It's a list of space- or comma-separated values.
        // Note that the property key is built from AbstractFrontier's class
        // name, not this class's; the second argument is the fallback list.
        String queueStr = System.getProperty(AbstractFrontier.class.getName() + "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
                HostnameQueueAssignmentPolicy.class.getName() + " " + IPQueueAssignmentPolicy.class.getName() + " "
                        + BucketQueueAssignmentPolicy.class.getName() + " "
                        + SurtAuthorityQueueAssignmentPolicy.class.getName() + " "
                        + TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
        // Split on a comma (with optional surrounding whitespace) or on whitespace.
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String[] queues = p.split(queueStr);
        if (queues.length <= 0) {
            throw new RuntimeException("Failed parse of " + " assignment queue policy string: " + queueStr);
        }
        // The parsed class names become the legal values of the policy setting.
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_ASSIGNMENT_POLICY,
                "Defines how to assign URIs to queues. Can assign by host, "
                        + "by ip, and into one of a fixed set of buckets (1k). NOTE: "
                        + "Use of policies other than the default "
                        + "HostnameQueueAssignmentPolicy is untested and provided "
                        + "for use at your own risk. Further, changing this policy "
                        + "during a crawl, or between restarts using the same data "
                        + "directory, is likely to cause unrecoverable problems.",
                DEFAULT_QUEUE_ASSIGNMENT_POLICY, queues));
        t.setExpertSetting(true);

        // Register persistent CrawlURI items
        CrawlURI.addAlistPersistentMember(A_CONTENT_STATE_KEY);
        CrawlURI.addAlistPersistentMember(A_TIME_OF_NEXT_PROCESSING);
    }

    /**
     * Wires the frontier to its controller, instantiates the configured queue
     * assignment policy, creates the host queue list, optionally creates the
     * duplicate filter, and finally loads the seeds.
     *
     * @param c the controller this frontier will serve
     * @throws FatalConfigurationException declared by the signature
     * @throws IOException declared by the signature; createAlreadyIncluded()
     *                     is one declared source
     * @throws RuntimeException wrapping any failure to load or instantiate
     *                          the configured policy class
     */
    public synchronized void initialize(CrawlController c) throws FatalConfigurationException, IOException {
        this.controller = c;
        this.controller.addCrawlStatusListener(this);

        final String policyClassName = (String) getUncheckedAttribute(null, ATTR_QUEUE_ASSIGNMENT_POLICY);
        try {
            Object policy = Class.forName(policyClassName).newInstance();
            this.queueAssignmentPolicy = (QueueAssignmentPolicy) policy;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        this.hostQueues = new AdaptiveRevisitQueueList(c.getBdbEnvironment(),
                c.getBdbEnvironment().getClassCatalog());

        // The separate 'already included' structure is strictly opt-in.
        final boolean useUniqFilter = ((Boolean) getUncheckedAttribute(null, ATTR_USE_URI_UNIQ_FILTER))
                .booleanValue();
        this.alreadyIncluded = useUniqFilter ? createAlreadyIncluded() : null;

        loadSeeds();
    }

    /**
     * Builds the UriUniqFilter used as the record of already seen URIs.
     * The filter is backed by the controller's Bdb environment and is told
     * to deliver its URIs to this frontier.
     *
     * @return a new filter whose destination is this frontier
     * @throws IOException declared by the signature
     */
    protected UriUniqFilter createAlreadyIncluded() throws IOException {
        final UriUniqFilter filter = new BdbUriUniqFilter(controller.getBdbEnvironment());
        filter.setDestination(this);
        return filter;
    }

    /**
     * Schedules every seed supplied by the crawl scope at MEDIUM priority,
     * flushes the pending batch, and stores whatever the seed iterator
     * reported as ignored.
     * <p>
     * Called from initialize().
     */
    public void loadSeeds() {
        final Writer ignored = new StringWriter();
        // The scope writes ignored seed entries into the supplied writer.
        for (Iterator seeds = controller.getScope().seedsIterator(ignored); seeds.hasNext();) {
            UURI seed = (UURI) seeds.next();
            CandidateURI seedUri = CandidateURI.createSeedCandidateURI(seed);
            seedUri.setSchedulingDirective(CandidateURI.MEDIUM);
            schedule(seedUri);
        }
        batchFlush();
        // Keep the ignored items (if any) where they can be consulted later.
        AbstractFrontier.saveIgnoredItems(ignored.toString(), controller.getDisk());
    }

    /**
     * Determines the name of the queue a URI belongs to.
     * <p>
     * A forced queue name (ATTR_FORCE_QUEUE) other than the empty string
     * wins outright. Otherwise the queue assignment policy picks the key
     * and, when ATTR_QUEUE_IGNORE_WWW is set, a leading
     * <code>www[0-9]*.</code> prefix is stripped from it.
     *
     * @param cauri the URI to classify
     * @return the queue (class) key for the URI
     */
    public String getClassKey(CandidateURI cauri) {
        final String forced = (String) getUncheckedAttribute(cauri, ATTR_FORCE_QUEUE);
        if (!"".equals(forced)) {
            // An override is in effect; use it verbatim.
            return forced;
        }
        // Typical case, barring overrides.
        final String policyKey = queueAssignmentPolicy.getClassKey(controller, cauri);
        final boolean ignoreWww = ((Boolean) getUncheckedAttribute(cauri, ATTR_QUEUE_IGNORE_WWW)).booleanValue();
        return ignoreWww ? policyKey.replaceAll("^www[0-9]{0,}\\.", "") : policyKey;
    }

    /**
     * Canonicalizes a UURI using the settings of the current crawl order.
     * <p>
     * Canonicalization lives here rather than on the URI itself because
     * settings can vary with context (overrides may apply to a particular
     * URI), and giving every CandidateURI a reference to the settings
     * system would be awkward.
     *
     * @param uuri URI to canonicalize
     * @return canonical form of <code>uuri</code>
     */
    protected String canonicalize(UURI uuri) {
        final String canonical = Canonicalizer.canonicalize(uuri, controller.getOrder());
        return canonical;
    }

    /**
     * Canonicalizes a CandidateURI, taking its redirect context into account.
     * <p>
     * Unlike {@link #canonicalize(UURI)}, this variant guards against losing
     * content behind a redirect: if canonicalization yields a URL that is
     * already recorded as seen, but that record only redirected to the
     * current URL, the current URL would wrongly be treated as seen.
     * Examples are archive.org redirecting to www.archive.org or the inverse,
     * www.netarkivet.net redirecting to netarkivet.net (assuming a stripWWW
     * rule is enabled).
     * <p>
     * Side effect: when the URI is a redirect target, differs textually from
     * its via, and canonicalizes to the same string as its via, the
     * forceFetch flag is set on <code>cauri</code>.
     *
     * @param cauri CandidateURI to examine
     * @return canonical form of <code>cauri</code>
     */
    protected String canonicalize(CandidateURI cauri) {
        final String canonical = canonicalize(cauri.getUURI());
        if (!cauri.isLocation()) {
            return canonical;
        }
        // A URL that redirects to itself (for example
        // http://bridalelegance.com/images/buttons3/tuxedos-off.gif) must
        // not be force-fetched.
        if (cauri.toString().equals(cauri.getVia().toString())) {
            return canonical;
        }
        // The via and the target collapse to the same canonical form (for
        // example netarkivet.dk): force the fetch so the already-seen check
        // cannot suppress the redirect target.
        if (canonicalize(cauri.getVia()).equals(canonical)) {
            cauri.setForceFetch(true);
        }
        return canonical;
    }

    /**
     * Places a URI into the host queue that matches its class key.
     * <p>
     * A plain CandidateURI is first converted to a CrawlURI. A class key is
     * assigned if missing, redirected seeds are added to the scope's seed
     * list, embeds within the configured hop range may be promoted to
     * MEDIUM, and the time of next processing is set to the current time
     * before the URI is added to its queue.
     * <p>
     * An IOException from the queue is logged at SEVERE level together with
     * the affected URI; it is not propagated.
     *
     * @param caUri The URI to schedule.
     */
    protected void innerSchedule(CandidateURI caUri) {
        CrawlURI curi;
        if (caUri instanceof CrawlURI) {
            curi = (CrawlURI) caUri;
        } else {
            curi = CrawlURI.from(caUri, System.currentTimeMillis());
            // New CrawlURIs get 'current time' as the time of next processing.
            curi.putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis());
        }

        if (curi.getClassKey() == null) {
            curi.setClassKey(getClassKey(curi));
        }

        if (curi.isSeed() && curi.getVia() != null && curi.flattenVia().length() > 0) {
            // The only way a seed can have a non-empty via is if it is the
            // result of a seed redirect.  Add it to the seeds list.
            //
            // This is a feature.  This is handling for case where a seed
            // gets immediately redirected to another page.  What we're doing
            // is treating the immediate redirect target as a seed.
            this.controller.getScope().addSeed(curi);
            // And it needs rapid scheduling.
            curi.setSchedulingDirective(CandidateURI.MEDIUM);
        }

        // Optionally preferencing embeds up to MEDIUM
        int prefHops = ((Integer) getUncheckedAttribute(curi, ATTR_PREFERENCE_EMBED_HOPS)).intValue();
        boolean prefEmbed = false;
        if (prefHops > 0) {
            int embedHops = curi.getTransHops();
            if (embedHops > 0 && embedHops <= prefHops && curi.getSchedulingDirective() == CandidateURI.NORMAL) {
                // number of embed hops falls within the preferenced range, and
                // uri is not already MEDIUM -- so promote it
                curi.setSchedulingDirective(CandidateURI.MEDIUM);
                prefEmbed = true;
            }
        }

        // Finally, allow curi to be fetched right now
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis());

        try {
            // Only build the message when FINEST is actually enabled.
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("scheduling " + curi.toString());
            }
            AdaptiveRevisitHostQueue hq = getHQ(curi);
            hq.add(curi, prefEmbed);
        } catch (IOException e) {
            // TODO Decide on a recovery strategy. Until then, report the lost
            // URI through the logger rather than on stderr.
            logger.log(Level.SEVERE, "Failed to schedule " + curi.toString() + ": " + e.getMessage(), e);
        }

    }

    /**
     * Looks up the host queue for a CrawlURI's class key and creates the
     * queue when none exists yet.
     * <p>
     * A new queue gets the ATTR_HOST_VALENCE value that applies to the URI;
     * if that attribute cannot be found, the failure is logged and
     * DEFAULT_HOST_VALENCE is used.
     *
     * @param curi CrawlURI for which to get a queue
     * @return the existing or newly created queue for the URI's class key
     * @throws IOException declared by the signature
     */
    protected AdaptiveRevisitHostQueue getHQ(CrawlURI curi) throws IOException {
        final String classKey = curi.getClassKey();
        final AdaptiveRevisitHostQueue existing = hostQueues.getHQ(classKey);
        if (existing != null) {
            return existing;
        }

        // No queue yet for this key: work out the valence, then create one.
        int valence;
        try {
            valence = ((Integer) getAttribute(curi, ATTR_HOST_VALENCE)).intValue();
        } catch (AttributeNotFoundException e2) {
            logger.severe("Unable to load valence.");
            valence = DEFAULT_HOST_VALENCE.intValue();
        }
        return hostQueues.createHQ(classKey, valence);
    }

    /**
     * Appends a URI to the queue of URIs waiting for the next flush.
     *
     * @param caUri the URI to hold for later scheduling
     */
    protected void batchSchedule(CandidateURI caUri) {
        final Queue pending = threadWaiting.getQueue();
        pending.enqueue(caUri);
    }

    /**
     * Flushes the waiting URIs while holding this frontier's monitor;
     * the actual work is done by innerBatchFlush().
     */
    protected synchronized void batchFlush() {
        this.innerBatchFlush();
    }

    /**
     * Drains the queue of waiting URIs.
     * <p>
     * Without a duplicate filter each URI goes straight to innerSchedule().
     * With a filter, the URI is canonicalized and handed to the filter,
     * using addForce() when the URI's forceFetch flag is set and add()
     * otherwise.
     */
    private void innerBatchFlush() {
        Queue q = threadWaiting.getQueue();
        while (!q.isEmpty()) {
            CandidateURI caUri = (CandidateURI) q.dequeue();
            if (alreadyIncluded == null) {
                innerSchedule(caUri);
                continue;
            }
            // canonicalize() may set the forceFetch flag, so it must run
            // before the flag is consulted below.
            String canon = canonicalize(caUri);
            // Diagnostic output belongs in the log, not on stdout.
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Canonical form of " + caUri + " is " + canon);
            }
            if (caUri.forceFetch()) {
                alreadyIncluded.addForce(canon, caUri);
            } else {
                alreadyIncluded.add(canon, caUri);
            }
        }
    }

    /**
     * Looks up the CrawlServer for a URI in the controller's server cache.
     *
     * @param curi the URI whose server is wanted
     * @return the CrawlServer to be associated with this CrawlURI
     */
    protected CrawlServer getServer(CrawlURI curi) {
        final CrawlServer server = controller.getServerCache().getServerFor(curi);
        return server;
    }

    /**
     * Hands out the next CrawlURI to process, blocking while the crawl is
     * paused or while the top host queue is not yet ready.
     * <p>
     * The issued URI has A_FETCH_OVERDUE set to the difference between the
     * current time and its scheduled time of next processing.
     *
     * @return the next URI to crawl, or <code>null</code> if reading it from
     *         its queue failed with an IOException (which is logged)
     * @throws InterruptedException if the thread is interrupted while waiting
     * @throws EndedException if the crawl has been terminated
     * @see com.cyberway.issue.crawler.framework.Frontier#next()
     */
    public synchronized CrawlURI next() throws InterruptedException, EndedException {
        controller.checkFinish();

        while (shouldPause) {
            controller.toePaused();
            wait();
        }

        if (shouldTerminate) {
            throw new EndedException("terminated");
        }

        AdaptiveRevisitHostQueue hq = hostQueues.getTopHQ();

        while (hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_READY) {
            // Ok, so we don't have a ready queue, wait until the top one
            // will become available.
            long waitTime = hq.getNextReadyTime() - System.currentTimeMillis();
            if (waitTime > 0) {
                wait(waitTime);
            }
            // The top HQ may have changed, so get it again
            hq = hostQueues.getTopHQ();
        }

        if (shouldTerminate) {
            // May have been terminated while thread was waiting for IO
            throw new EndedException("terminated");
        }

        try {
            CrawlURI curi = hq.next();
            // Populate CURI with 'transient' variables such as server.
            logger.fine("Issuing " + curi.toString());
            long temp = curi.getLong(A_TIME_OF_NEXT_PROCESSING);
            long currT = System.currentTimeMillis();
            long overdue = (currT - temp);
            if (logger.isLoggable(Level.FINER)) {
                String waitI = "not set";
                if (curi.containsKey(A_WAIT_INTERVAL)) {
                    waitI = ArchiveUtils.formatMillisecondsToConventional(curi.getLong(A_WAIT_INTERVAL));
                }
                logger.finer("Wait interval: " + waitI + ", Time of next proc: " + temp + ", Current time: " + currT
                        + ", Overdue by: " + overdue + "ms");
            }
            if (overdue < 0) {
                // This should never happen.
                logger.severe("Time overdue for " + curi.toString() + " is negative (" + overdue + ")!");
            }
            curi.putLong(A_FETCH_OVERDUE, overdue);
            return curi;
        } catch (IOException e) {
            // TODO: Need to handle this in an intelligent manner.
            //       Is probably fatal?
            // Routed through the logger so the failure shows up in the crawl logs.
            logger.log(Level.SEVERE, "IOException while getting next URI from host queue: " + e.getMessage(), e);
        }

        return null;
    }

    /**
     * Reports whether the host queue list has a size of zero.
     * <p>
     * Technically the Frontier should never become empty, because URIs are
     * only discarded under exceptional circumstances.
     *
     * @return <code>true</code> when the host queue list reports size zero
     * @see com.cyberway.issue.crawler.framework.Frontier#isEmpty()
     */
    public boolean isEmpty() {
        final long queued = hostQueues.getSize();
        return queued == 0;
    }

    /**
     * Queues a URI for scheduling. When the caller is not a ToeThread the
     * batch is flushed immediately, since no processing-cycle flush can be
     * counted on for such threads.
     *
     * @param caURI the URI to schedule
     * @see com.cyberway.issue.crawler.framework.Frontier#schedule(com.cyberway.issue.crawler.datamodel.CandidateURI)
     */
    public void schedule(CandidateURI caURI) {
        batchSchedule(caURI);
        final boolean onToeThread = Thread.currentThread() instanceof ToeThread;
        if (onToeThread) {
            return;
        }
        // can't count on processing-cycle flush; force
        batchFlush();
    }

    /**
     * Records the outcome of a processed URI: logs its fetch status, counts
     * the fetch attempt, reports any processor-local errors, and then hands
     * the URI to innerFinished() for disposition.
     *
     * @param curi the URI that has finished processing
     * @see com.cyberway.issue.crawler.framework.Frontier#finished(com.cyberway.issue.crawler.datamodel.CrawlURI)
     */
    public synchronized void finished(CrawlURI curi) {
        final String uriText = curi.toString();
        final String statusText = CrawlURI.fetchStatusCodesToString(curi.getFetchStatus());
        logger.fine(uriText + " " + statusText);

        curi.incrementFetchAttempts();
        logLocalizedErrors(curi);
        innerFinished(curi);
    }

    /**
     * Decides what happens to a URI after processing.
     * <p>
     * Pending batched URIs are flushed first. The URI is then routed, in this
     * order of precedence, to: success handling, prompt retry, delayed retry
     * (which also fires the need-retry event), disregard handling, or failure
     * handling. Afterwards all waiting threads are notified.
     * <p>
     * A RuntimeException during disposition marks the URI with
     * S_RUNTIME_EXCEPTION, attaches the exception to it, and sends it to
     * failureDisposition(). Both exception paths are reported through the
     * logger with the full cause.
     *
     * @param curi the URI to dispose of
     */
    protected synchronized void innerFinished(CrawlURI curi) {
        try {
            innerBatchFlush();

            if (curi.isSuccess()) {
                successDisposition(curi);
            } else if (needsPromptRetry(curi)) {
                // Consider statuses which allow nearly-immediate retry
                // (like deferred to allow precondition to be fetched)
                reschedule(curi, false);
            } else if (needsRetrying(curi)) {
                // Consider errors which can be retried
                reschedule(curi, true);
                controller.fireCrawledURINeedRetryEvent(curi);
            } else if (isDisregarded(curi)) {
                // Check for codes that mean that while the crawler did
                // manage to get it it must be disregarded for any reason.
                disregardDisposition(curi);
            } else {
                // In that case FAILURE, note & log
                failureDisposition(curi);
            }

            // New items might be available, let waiting threads know
            // More than one queue might have become available due to
            // scheduling of items outside the parent URIs host, so we
            // wake all waiting threads.
            notifyAll();
        } catch (RuntimeException e) {
            curi.setFetchStatus(S_RUNTIME_EXCEPTION);
            // Log with the cause so the stack trace reaches the log handlers.
            logger.log(Level.WARNING, "RTE in innerFinished() " + e.getMessage(), e);
            // store exception temporarily for logging
            curi.putObject(A_RUNTIME_EXCEPTION, e);
            failureDisposition(curi);
        } catch (AttributeNotFoundException e) {
            logger.log(Level.SEVERE, e.getMessage(), e);
        }
    }

    /**
     * Writes any processor-local errors stored on the CrawlURI to the
     * controller's local error log, then removes them from the URI.
     * Does nothing when the URI carries no such errors.
     *
     * @param curi CrawlURI that may carry localized errors
     */
    private void logLocalizedErrors(CrawlURI curi) {
        if (!curi.containsKey(A_LOCALIZED_ERRORS)) {
            return;
        }
        final List errors = (List) curi.getObject(A_LOCALIZED_ERRORS);
        for (Iterator it = errors.iterator(); it.hasNext();) {
            // Each record pairs the URI with one of its errors.
            Object[] params = new Object[] { curi, it.next() };
            controller.localErrors.log(Level.WARNING, curi.getUURI().toString(), params);
        }
        // once logged, discard
        curi.remove(A_LOCALIZED_ERRORS);
    }

    /**
     * Handles a CrawlURI that has been successfully crawled.
     * <p>
     * The URI is annotated (wait interval, visit and version counts, overdue
     * time, where present), written to the URI processing log, counted in the
     * statistics, and announced to listeners. It is then reset to NORMAL
     * priority, given a new time of next processing (current time plus wait
     * interval, or plus zero when no interval is set), cleaned up, and
     * written back to its host queue.
     * <p>
     * An IOException from the queue update is logged at SEVERE level with its
     * cause; it is not propagated.
     *
     * @param curi The CrawlURI
     */
    protected void successDisposition(CrawlURI curi) {
        curi.aboutToLog();

        long waitInterval = 0;

        if (curi.containsKey(A_WAIT_INTERVAL)) {
            waitInterval = curi.getLong(A_WAIT_INTERVAL);
            curi.addAnnotation("wt:" + ArchiveUtils.formatMillisecondsToConventional(waitInterval));
        } else {
            logger.severe("Missing wait interval for " + curi.toString() + " WaitEvaluator may be missing.");
        }
        if (curi.containsKey(A_NUMBER_OF_VISITS)) {
            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VISITS) + "vis");
        }
        if (curi.containsKey(A_NUMBER_OF_VERSIONS)) {
            curi.addAnnotation(curi.getInt(A_NUMBER_OF_VERSIONS) + "ver");
        }
        if (curi.containsKey(A_FETCH_OVERDUE)) {
            curi.addAnnotation(
                    "ov:" + ArchiveUtils.formatMillisecondsToConventional((curi.getLong(A_FETCH_OVERDUE))));
        }

        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO, curi.getUURI().toString(), array);

        succeededFetchCount++;
        totalProcessedBytes += curi.getContentSize();

        // Let everyone know in case they want to do something before we strip
        // the curi.
        controller.fireCrawledURISuccessfulEvent(curi);

        curi.setSchedulingDirective(CandidateURI.NORMAL);

        // Set time of next processing
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, System.currentTimeMillis() + waitInterval);

        /* Update HQ */
        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());

        // Wake up time is based on the time when a fetch was completed + the
        // calculated snooze time for politeness. If the fetch completion time
        // is missing, we'll use current time.
        long fetchCompleted = curi.containsKey(A_FETCH_COMPLETED_TIME) ? curi.getLong(A_FETCH_COMPLETED_TIME)
                : System.currentTimeMillis();
        long wakeupTime = fetchCompleted + calculateSnoozeTime(curi);

        // Ready the URI for reserialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();

        try {
            hq.update(curi, true, wakeupTime, shouldBeForgotten(curi));
        } catch (IOException e) {
            // Pass the exception to the logger so the stack trace is kept
            // with the message instead of going to stderr.
            logger.log(Level.SEVERE,
                    "An IOException occured when updating " + curi.toString() + "\n" + e.getMessage(), e);
        }
    }

    /**
     * Puts the URI back into its host queue for another attempt.
     * <p>
     * The retry time is the fetch completion time (or the current time when
     * that is missing) plus a delay. The delay is zero unless
     * <code>errorWait</code> is set; in that case it comes from the URI's
     * own A_RETRY_DELAY value or, failing that, from this frontier's
     * ATTR_RETRY_DELAY setting. That setting is expressed in seconds and is
     * converted to milliseconds here, because the retry time is a
     * millisecond timestamp.
     * <p>
     * An IOException from the queue update is logged at SEVERE level; it is
     * not propagated.
     *
     * @param curi CrawlURI to reschedule. Its time of next processing is not
     *             modified.
     * @param errorWait signals if there should be a wait before retrying.
     * @throws AttributeNotFoundException if the ATTR_RETRY_DELAY setting is
     *             needed but cannot be found
     */
    protected void reschedule(CrawlURI curi, boolean errorWait) throws AttributeNotFoundException {
        long delay = 0;
        if (errorWait) {
            if (curi.containsKey(A_RETRY_DELAY)) {
                // NOTE(review): the unit of a per-URI A_RETRY_DELAY is not
                // visible from this class; it is used unchanged. Confirm
                // whether it also needs a seconds-to-milliseconds conversion.
                delay = curi.getLong(A_RETRY_DELAY);
            } else {
                // use ARFrontier default, which is configured in seconds
                long delaySeconds = ((Long) getAttribute(ATTR_RETRY_DELAY, curi)).longValue();
                delay = delaySeconds * 1000L;
            }
        }

        long fetchCompleted = curi.containsKey(A_FETCH_COMPLETED_TIME) ? curi.getLong(A_FETCH_COMPLETED_TIME)
                : System.currentTimeMillis();
        long retryTime = fetchCompleted + delay;

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for reserialization.
        curi.processingCleanup();
        if (errorWait) {
            curi.resetDeferrals(); // Deferrals only refer to immediate retries.
        }
        try {
            hq.update(curi, errorWait, retryTime, shouldBeForgotten(curi));
        } catch (IOException e) {
            // TODO Handle IOException. Until then, report it through the
            // logger together with the affected URI.
            logger.log(Level.SEVERE,
                    "An IOException occurred when rescheduling " + curi.toString() + ": " + e.getMessage(), e);
        }
    }

    /**
     * The CrawlURI has encountered a problem, and will not
     * be retried.
     *
     * @param curi The CrawlURI
     */
    protected void failureDisposition(CrawlURI curi) {
        // Let interested listeners know of failed disposition.
        this.controller.fireCrawledURIFailureEvent(curi);

        // send to basic log
        curi.aboutToLog();
        Object array[] = { curi };
        this.controller.uriProcessing.log(Level.INFO, curi.getUURI().toString(), array);

        // if exception, also send to crawlErrors
        if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
            this.controller.runtimeErrors.log(Level.WARNING, curi.getUURI().toString(), array);
        }
        failedFetchCount++;

        // Put the failed URI at the very back of the queue.
        curi.setSchedulingDirective(CandidateURI.NORMAL);
        // TODO: reconsider this
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, Long.MAX_VALUE);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for serialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            // No wait on failure. No contact was made with the server.
            boolean shouldForget = shouldBeForgotten(curi);
            if (shouldForget && alreadyIncluded != null) {
                // Drop the URI from the already-included filter as well, so
                // the filter no longer remembers it.
                alreadyIncluded.forget(canonicalize(curi.getUURI()), curi);
            }

            hq.update(curi, false, 0, shouldForget);
        } catch (IOException e) {
            // Report through the class logger (keeping the cause) rather than
            // dumping the stack trace to stderr.
            logger.log(Level.SEVERE, "An IOException occurred when updating failed " + curi.toString(), e);
        }
    }

    /**
     * Handles a CrawlURI that is being disregarded: notifies listeners, logs
     * it, counts it and files it in its host queue with a time of next
     * processing of Long.MAX_VALUE.
     *
     * @param curi The CrawlURI
     */
    protected void disregardDisposition(CrawlURI curi) {
        // Let interested listeners know of disregard disposition.
        controller.fireCrawledURIDisregardEvent(curi);

        // send to basic log
        curi.aboutToLog();
        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO, curi.getUURI().toString(), array);

        disregardedUriCount++;

        // TODO: consider timeout before retrying disregarded elements.
        //       Possibly add a setting to the WaitEvaluators?
        curi.putLong(A_TIME_OF_NEXT_PROCESSING, Long.MAX_VALUE);
        curi.setSchedulingDirective(CandidateURI.NORMAL);

        AdaptiveRevisitHostQueue hq = hostQueues.getHQ(curi.getClassKey());
        // Ready the URI for reserialization.
        curi.processingCleanup();
        curi.resetDeferrals();
        curi.resetFetchAttempts();
        try {
            // No politeness wait on disregard. No contact was made with server
            hq.update(curi, false, 0, shouldBeForgotten(curi));
        } catch (IOException e) {
            // Report through the class logger (keeping the cause) rather than
            // dumping the stack trace to stderr.
            logger.log(Level.SEVERE, "An IOException occurred when updating disregarded " + curi.toString(), e);
        }
    }

    /**
     * Some URIs, if they recur,  deserve another
     * chance at consideration: they might not be too
     * many hops away via another path, or the scope
     * may have been updated to allow them passage.
     *
     * @param curi
     * @return True if curi should be forgotten.
     */
    protected boolean shouldBeForgotten(CrawlURI curi) {
        // Statuses that deserve another chance if the URI recurs. Each case
        // returns directly: an assignment without a break would fall through
        // into the default branch and be overwritten.
        switch (curi.getFetchStatus()) {
        case S_OUT_OF_SCOPE:
        case S_TOO_MANY_EMBED_HOPS:
        case S_TOO_MANY_LINK_HOPS:
            return true;
        default:
            break;
        }

        // Otherwise, forget the URI only if it carries the discard-revisit
        // flag set to true. Boolean.TRUE.equals tolerates a null value
        // stored under the key.
        if (curi.containsKey(A_DISCARD_REVISIT) && Boolean.TRUE.equals(curi.getObject(A_DISCARD_REVISIT))) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("NO_REVISIT tag set for URI: " + curi.getUURI().toString());
            }
            return true;
        }

        return false;
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish successfully
     * needs to be retried immediately (processed again as soon as politeness
     * allows.)
     *
     * @param curi The CrawlURI to check
     * @return True if we need to retry promptly.
     * @throws AttributeNotFoundException If problems occur trying to read the
     *            maximum number of retries from the settings framework.
     */
    protected boolean needsPromptRetry(CrawlURI curi) throws AttributeNotFoundException {
        // Retry budget exhausted: never retry promptly, whatever the status.
        final int attemptsSoFar = curi.getFetchAttempts();
        final int maxRetries = ((Integer) getAttribute(ATTR_MAX_RETRIES, curi)).intValue();
        if (attemptsSoFar >= maxRetries) {
            return false;
        }

        final int status = curi.getFetchStatus();
        if (status == S_DEFERRED) {
            return true;
        }
        if (status != HttpStatus.SC_UNAUTHORIZED) {
            return false;
        }

        // A 401 normally counts as a completed fetch. It ends up here only
        // when rfc2617 credential data was loaded (presumably by FetchHTTP)
        // in the expectation that the URI goes around again. Without such
        // credentials we should not be here at all, so complain loudly.
        final boolean credentialsLoaded = curi.hasRfc2617CredentialAvatar();
        if (!credentialsLoaded) {
            logger.severe("Have 401 but no creds loaded " + curi);
        }
        return credentialsLoaded;
    }

    /**
     * Checks if a recently completed CrawlURI that did not finish successfully
     * needs to be retried (processed again after some time elapses)
     *
     * @param curi The CrawlURI to check
     * @return True if we need to retry.
     * @throws AttributeNotFoundException If problems occur trying to read the
     *            maximum number of retries from the settings framework.
     */
    protected boolean needsRetrying(CrawlURI curi) throws AttributeNotFoundException {
        // Nothing is retried once the maximum number of retries is reached.
        final int attemptsSoFar = curi.getFetchAttempts();
        final int maxRetries = ((Integer) getAttribute(ATTR_MAX_RETRIES, curi)).intValue();
        if (attemptsSoFar >= maxRetries) {
            return false;
        }

        // Only connection-level failures are worth a delayed retry.
        // TODO: consider if any others (S_TIMEOUT in some cases?)
        //       deserve retry
        final int status = curi.getFetchStatus();
        return status == S_CONNECT_FAILED
                || status == S_CONNECT_LOST
                || status == S_DOMAIN_UNRESOLVABLE;
    }

    /**
     * Tells whether the fetch status of a CrawlURI is one of the
     * 'disregard' statuses.
     *
     * @param curi The CrawlURI to check
     * @return True if the fetch status is one of the statuses listed below.
     */
    protected boolean isDisregarded(CrawlURI curi) {
        final int status = curi.getFetchStatus();
        return status == S_ROBOTS_PRECLUDED            // they don't want us to have it
                || status == S_OUT_OF_SCOPE            // filtered out by scope
                || status == S_BLOCKED_BY_CUSTOM_PROCESSOR
                || status == S_BLOCKED_BY_USER         // filtered out by user
                || status == S_TOO_MANY_EMBED_HOPS     // too far from last true link
                || status == S_TOO_MANY_LINK_HOPS      // too far from seeds
                || status == S_DELETED_BY_USER;        // user deleted
    }

    /**
     * Calculates how long a host queue needs to be snoozed following the
     * crawling of a URI.
     *
     * @param curi The CrawlURI
     * @return How long to snooze.
     */
    protected long calculateSnoozeTime(CrawlURI curi) {
        long snooze = 0;
        final boolean fetchWasTimed = curi.containsKey(A_FETCH_BEGAN_TIME)
                && curi.containsKey(A_FETCH_COMPLETED_TIME);

        if (fetchWasTimed) {
            try {
                final long finishedAt = curi.getLong(A_FETCH_COMPLETED_TIME);
                final long elapsed = finishedAt - curi.getLong(A_FETCH_BEGAN_TIME);

                // Scale the time the fetch took by the configured delay factor.
                final float factor = ((Float) getAttribute(ATTR_DELAY_FACTOR, curi)).floatValue();
                snooze = (long) (factor * elapsed);

                // Wait at least the configured minimum ...
                final long lowerBound = ((Integer) getAttribute(ATTR_MIN_DELAY, curi)).longValue();
                snooze = Math.max(snooze, lowerBound);

                // ... and no more than the configured maximum.
                final long upperBound = ((Integer) getAttribute(ATTR_MAX_DELAY, curi)).longValue();
                snooze = Math.min(snooze, upperBound);
            } catch (AttributeNotFoundException e) {
                logger.severe("Unable to find attribute. " + curi.toString());
                // Settings unavailable: fall back to the default maximum delay.
                snooze = DEFAULT_MAX_DELAY.longValue();
            }
        }

        // Whatever was computed, never snooze for less than the default minimum.
        final long result = Math.max(snooze, DEFAULT_MIN_DELAY.longValue());
        logger.finest("Snooze time for " + curi.toString() + " = " + result);
        return result;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#discoveredUriCount()
     */
    public synchronized long discoveredUriCount() {
        // Without an already-included filter, fall back to the host queues' size.
        if (this.alreadyIncluded == null) {
            return hostQueues.getSize();
        }
        return this.alreadyIncluded.count();
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#queuedUriCount()
     */
    public synchronized long queuedUriCount() {
        // Reported straight from the host queues container.
        return this.hostQueues.getSize();
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#finishedUriCount()
     */
    public long finishedUriCount() {
        // Finished = succeeded + failed + disregarded.
        return this.succeededFetchCount
                + this.failedFetchCount
                + this.disregardedUriCount;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#succeededFetchCount()
     */
    public long succeededFetchCount() {
        // Plain read of the running success counter.
        return this.succeededFetchCount;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#failedFetchCount()
     */
    public long failedFetchCount() {
        // Plain read of the running failure counter.
        return this.failedFetchCount;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#disregardedUriCount()
     */
    public long disregardedUriCount() {
        // Read-only accessor: reporting the count must not change it. The
        // counter is incremented where URIs are actually disregarded.
        return disregardedUriCount;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#totalBytesWritten()
     */
    public long totalBytesWritten() {
        // Backed by the processed-bytes counter of this frontier.
        return this.totalProcessedBytes;
    }

    /**
     * Method is not supported by this Frontier implementation.
     * @param pathToLog
     * @throws IOException
     */
    public void importRecoverLog(String pathToLog) throws IOException {
        // Always fails: pathToLog is never read.
        throw new IOException("Unsupported by this frontier.");
    }

    /**
     * Not implemented by this frontier; both arguments are ignored.
     *
     * @param regexpr ignored
     * @param inCacheOnly ignored
     * @return always null
     */
    public synchronized FrontierMarker getInitialMarker(String regexpr, boolean inCacheOnly) {
        return null;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#getURIsList(com.cyberway.issue.crawler.framework.FrontierMarker, int, boolean)
     */
    public synchronized ArrayList<String> getURIsList(FrontierMarker marker, int numberOfMatches, boolean verbose)
            throws InvalidFrontierMarkerException {
        // TODO: not implemented. All arguments are ignored and null is
        // always returned.
        return null;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#deleteURIs(java.lang.String)
     */
    public synchronized long deleteURIs(String match) {
        // TODO: implement? Currently a stub: match is ignored, nothing is
        // deleted and 0 is always returned.
        return 0;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#deleteURIs(java.lang.String, java.lang.String)
     */
    public synchronized long deleteURIs(String uriMatch, String queueMatch) {
        // TODO: implement? Currently a stub: both patterns are ignored,
        // nothing is deleted and 0 is always returned.
        return 0;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#deleted(com.cyberway.issue.crawler.datamodel.CrawlURI)
     */
    public synchronized void deleted(CrawlURI curi) {
        // TODO: not implemented. Currently a no-op; curi is ignored.
    }

    /**
     * Wraps the given UURI in a new CrawlURI and passes it to innerSchedule.
     *
     * @param u the UURI to wrap
     */
    public void considerIncluded(UURI u) {
        // Beware: scheduling the URI this way will cause it to be crawled!
        innerSchedule(new CrawlURI(u));
    }

    /**
     * Responds to an update 'kick' by calling loadSeeds().
     */
    public void kickUpdate() {
        this.loadSeeds();
    }

    /**
     * Starts the frontier; implemented as a call to unpause().
     */
    public void start() {
        this.unpause();
    }

    /**
     * Raises the pause flag and wakes every thread waiting on this frontier.
     */
    public synchronized void pause() {
        this.shouldPause = true;
        this.notifyAll();
    }

    /**
     * Clears the pause flag and wakes every thread waiting on this frontier.
     */
    public synchronized void unpause() {
        this.shouldPause = false;
        this.notifyAll();
    }

    /**
     * Raises the terminate flag. Waiting threads are not notified here.
     */
    public synchronized void terminate() {
        this.shouldTerminate = true;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#getFrontierJournal()
     */
    public FrontierJournal getFrontierJournal() {
        // This frontier never hands out a journal: always null.
        return null;
    }

    /**
     * Thread-local holder giving each thread its own queue of CandidateURIs,
     * created on first access as an empty MemQueue.
     */
    private static class ThreadLocalQueue extends ThreadLocal<Queue<CandidateURI>> implements Serializable {

        private static final long serialVersionUID = 8268977225156462059L;

        /**
         * @return a fresh, empty queue for the calling thread
         */
        @Override
        protected Queue<CandidateURI> initialValue() {
            return new MemQueue<CandidateURI>();
        }

        /**
         * @return Queue of 'batched' items
         */
        public Queue<CandidateURI> getQueue() {
            return this.get();
        }
    }

    /**
     * This method is not supported by this Frontier implementation
     * @param pathToLog
     * @param retainFailures
     * @throws IOException
     */
    public void importRecoverLog(String pathToLog, boolean retainFailures) throws IOException {
        // Always fails: neither argument is read.
        throw new IOException("Unsupported");
    }

    //
    // Reporter implementation
    //

    /**
     * @return an empty array; no named reports besides the default one
     */
    public String[] getReports() {
        // none but default for now
        return new String[0];
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.util.Reporter#singleLineReport()
     */
    public String singleLineReport() {
        // Delegate to the shared helper, handing it this frontier.
        final String report = ArchiveUtils.singleLineReport(this);
        return report;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.util.Reporter#reportTo(java.io.PrintWriter)
     */
    public void reportTo(PrintWriter writer) throws IOException {
        // No report name given: forward with a null name.
        final String unnamedReport = null;
        reportTo(unnamedReport, writer);
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#oneLineReport()
     */
    public synchronized void singleLineReportTo(PrintWriter w) throws IOException {
        // The one-line report is produced entirely by the host queues.
        this.hostQueues.singleLineReportTo(w);
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.util.Reporter#singleLineLegend()
     */
    public String singleLineLegend() {
        // The legend comes from the host queues, like the single-line report.
        final String legend = this.hostQueues.singleLineLegend();
        return legend;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#report()
     */
    public synchronized void reportTo(String name, PrintWriter writer) {
        // Only one report exists for now; the name is simply passed along
        // to the host queues.
        this.hostQueues.reportTo(name, writer);
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
     */
    public void crawlStarted(String message) {
        // Intentionally empty: this frontier does nothing on crawl start.
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
     */
    public void crawlEnding(String sExitMessage) {
        // Intentionally empty: this frontier does nothing while the crawl is
        // ending.
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
     */
    public void crawlEnded(String sExitMessage) {
        // Cleanup: close and drop the already-included filter when there is
        // one, then close the host queues.
        if (alreadyIncluded != null) {
            alreadyIncluded.close();
            alreadyIncluded = null;
        }
        this.hostQueues.close();
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
     */
    public void crawlPausing(String statusMessage) {
        // Intentionally empty: this frontier does nothing on this event.
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
     */
    public void crawlPaused(String statusMessage) {
        // Intentionally empty: this frontier does nothing on this event.
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
     */
    public void crawlResuming(String statusMessage) {
        // Intentionally empty: this frontier does nothing on this event.
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.event.CrawlStatusListener#crawlCheckpoint(java.io.File)
     */
    public void crawlCheckpoint(File checkpointDir) throws Exception {
        // Intentionally empty: nothing is written to checkpointDir by this
        // method.
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.datamodel.UriUniqFilter.HasUriReceiver#receive(com.cyberway.issue.crawler.datamodel.CandidateURI)
     */
    public void receive(CandidateURI item) {
        // Trace through the class logger instead of printing to stdout, and
        // only build the message when FINE logging is enabled.
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("Received " + item);
        }
        innerSchedule(item);
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.framework.Frontier#getGroup(com.cyberway.issue.crawler.datamodel.CrawlURI)
     */
    public FrontierGroup getGroup(CrawlURI curi) {
        final FrontierGroup group;
        try {
            group = getHQ(curi);
        } catch (IOException ioe) {
            // The signature declares no checked exception, so rethrow
            // unchecked with the original as cause.
            throw new RuntimeException(ioe);
        }
        return group;
    }

    /**
     * @return the average depth as reported by the host queues
     */
    public long averageDepth() {
        return this.hostQueues.getAverageDepth();
    }

    /**
     * @return the congestion ratio as reported by the host queues
     */
    public float congestionRatio() {
        return this.hostQueues.getCongestionRatio();
    }

    /**
     * @return the size of the deepest queue as reported by the host queues
     */
    public long deepestUri() {
        return this.hostQueues.getDeepestQueueSize();
    }
}