com.cyberway.issue.crawler.frontier.WorkQueueFrontier.java Source code

Java tutorial

Introduction

Here is the source code for com.cyberway.issue.crawler.frontier.WorkQueueFrontier.java

Source

/* $Id: WorkQueueFrontier.java 5439 2007-08-28 05:15:25Z gojomo $
 * Created on Sep 24, 2004
 *
 *  Copyright (C) 2004 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
  */
package com.cyberway.issue.crawler.frontier;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Queue;
import java.util.SortedSet;
import java.util.Timer;
import java.util.TimerTask;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.collections.Bag;
import org.apache.commons.collections.BagUtils;
import org.apache.commons.collections.bag.HashBag;
import org.apache.commons.lang.StringUtils;
import com.cyberway.issue.crawler.datamodel.CandidateURI;
import com.cyberway.issue.crawler.datamodel.CoreAttributeConstants;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.crawler.datamodel.FetchStatusCodes;
import com.cyberway.issue.crawler.datamodel.UriUniqFilter;
import com.cyberway.issue.crawler.datamodel.UriUniqFilter.HasUriReceiver;
import com.cyberway.issue.crawler.framework.CrawlController;
import com.cyberway.issue.crawler.framework.Frontier;
import com.cyberway.issue.crawler.framework.exceptions.EndedException;
import com.cyberway.issue.crawler.framework.exceptions.FatalConfigurationException;
import com.cyberway.issue.crawler.settings.SimpleType;
import com.cyberway.issue.crawler.settings.Type;
import com.cyberway.issue.net.UURI;
import com.cyberway.issue.util.ArchiveUtils;

import com.sleepycat.collections.StoredIterator;

/**
 * A common Frontier base using several queues to hold pending URIs. 
 * 
 * Uses in-memory map of all known 'queues' inside a single database.
 * Round-robins between all queues.
 *
 * @author Gordon Mohr
 * @author Christian Kohlschuetter
 */
public abstract class WorkQueueFrontier extends AbstractFrontier
        implements FetchStatusCodes, CoreAttributeConstants, HasUriReceiver, Serializable {
    private static final long serialVersionUID = 570384305871965843L;

    /**
     * Timer task that triggers {@link #wakeQueues()}. Only the task that is
     * currently registered as {@code nextWake} does any work; a task that
     * has since been superseded by a newer one returns without acting.
     */
    public class WakeTask extends TimerTask {
        @Override
        public void run() {
            synchronized (snoozedClassQueues) {
                // act only if no intervening WakeTask has replaced this one
                if (this == nextWake) {
                    wakeQueues();
                }
            }
        }
    }

    /** Truncate reporting of queues at some large but not unbounded number. */
    private static final int REPORT_MAX_QUEUES = 2000;

    /**
     * If we know that only a small number of queues will be held in memory,
     * we can avoid using a disk-based BigMap.
     * This only works efficiently if the WorkQueue does not hold its
     * entries in memory as well.
     */
    private static final int MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY = 3000;

    /**
     * When a snooze target for a queue is longer than this amount, and 
     * there are already ready queues, deactivate rather than snooze 
     * the current queue -- so other more responsive sites get a chance
     * in active rotation. (As a result, queue's next try may be much
     * further in the future than the snooze target delay.)
     */
    public final static String ATTR_SNOOZE_DEACTIVATE_MS = "snooze-deactivate-ms";
    // NOTE(review): this default is public, static and non-final, so it can be
    // reassigned from outside the class -- confirm whether that is intended.
    public static Long DEFAULT_SNOOZE_DEACTIVATE_MS = new Long(5 * 60 * 1000); // 5 minutes

    /** Logger shared by all instances of this frontier. */
    private static final Logger logger = Logger.getLogger(WorkQueueFrontier.class.getName());

    /** Whether to hold queues INACTIVE until needed for throughput. */
    public final static String ATTR_HOLD_QUEUES = "hold-queues";
    protected final static Boolean DEFAULT_HOLD_QUEUES = new Boolean(true);

    /** Amount to replenish budget on each activation (duty cycle). */
    public final static String ATTR_BALANCE_REPLENISH_AMOUNT = "balance-replenish-amount";
    protected final static Integer DEFAULT_BALANCE_REPLENISH_AMOUNT = new Integer(3000);

    /** Amount to additionally penalize a queue when one of its URIs fails completely. */
    public final static String ATTR_ERROR_PENALTY_AMOUNT = "error-penalty-amount";
    protected final static Integer DEFAULT_ERROR_PENALTY_AMOUNT = new Integer(100);

    /** Total expenditure to allow a queue before 'retiring' it; -1 means no ceiling. */
    public final static String ATTR_QUEUE_TOTAL_BUDGET = "queue-total-budget";
    protected final static Long DEFAULT_QUEUE_TOTAL_BUDGET = new Long(-1);

    /** Cost assignment policy to use (by class name). */
    public final static String ATTR_COST_POLICY = "cost-policy";
    protected final static String DEFAULT_COST_POLICY = UnitCostAssignmentPolicy.class.getName();

    /** Target size of ready queues backlog. */
    public final static String ATTR_TARGET_READY_QUEUES_BACKLOG = "target-ready-backlog";
    protected final static Integer DEFAULT_TARGET_READY_QUEUES_BACKLOG = new Integer(50);

    /**
     * Those UURIs which are already in-process (or processed), and
     * thus should not be rescheduled.
     */
    protected transient UriUniqFilter alreadyIncluded;

    /**
     * All known queues, keyed by classKey.
     */
    protected transient Map<String, WorkQueue> allQueues = null;
    // of classKey -> WorkQueue

    /**
     * All per-class queues whose first item may be handed out.
     * Linked-list of keys for the queues.
     */
    protected BlockingQueue<String> readyClassQueues;

    /** Target (minimum) size to keep readyClassQueues */
    protected int targetSizeForReadyQueues;

    /** Single-thread access to ready-filling code (one permit). */
    // NOTE(review): transient with a field initializer -- default Java
    // deserialization would leave this null; confirm it is re-created
    // somewhere on restore.
    protected transient Semaphore readyFiller = new Semaphore(1);

    /** 
     * All 'inactive' queues, not yet in active rotation.
     * Linked-list of keys for the queues.
     */
    protected Queue<String> inactiveQueues;

    /**
     * 'retired' queues, no longer considered for activation.
     * Linked-list of keys for queues.
     */
    protected Queue<String> retiredQueues;

    /** All per-class queues from whom a URI is outstanding. */
    protected Bag inProcessQueues = BagUtils.synchronizedBag(new HashBag()); // of WorkQueue

    /**
     * All per-class queues held in snoozed state, sorted by wake time.
     */
    protected SortedSet<WorkQueue> snoozedClassQueues;

    /** Timer for tasks which wake head item of snoozedClassQueues */
    protected transient Timer wakeTimer;

    /** Task for next wake */
    protected transient WakeTask nextWake;

    /** Non-retired queue with the largest count seen so far while enqueueing. */
    protected WorkQueue longestActiveQueue = null;

    /** How long to wait for a ready queue when polling readyClassQueues. */
    private static final long DEFAULT_WAIT = 1000; // 1 second

    /** A policy for assigning 'cost' values to CrawlURIs. */
    private transient CostAssignmentPolicy costAssignmentPolicy;

    /** All policies available to be chosen (offered as legal values of the cost-policy setting). */
    String[] AVAILABLE_COST_POLICIES = new String[] { ZeroCostAssignmentPolicy.class.getName(),
            UnitCostAssignmentPolicy.class.getName(), WagCostAssignmentPolicy.class.getName(),
            AntiCalendarCostAssignmentPolicy.class.getName() };

    /**
     * Create the WorkQueueFrontier and register its configurable settings
     * (hold-queues, balance replenish amount, error penalty, total budget,
     * cost policy, snooze-deactivate threshold and ready-queue backlog).
     * 
     * @param name ignored; every frontier is registered under
     *        {@code Frontier.ATTR_NAME}
     * @param description human-readable description passed to the superclass
     */
    public WorkQueueFrontier(String name, String description) {
        // The 'name' of all frontiers should be the same (URIFrontier.ATTR_NAME)
        // therefore we'll ignore the supplied parameter.
        super(Frontier.ATTR_NAME, description);
        // Description states 'true' as the default, matching DEFAULT_HOLD_QUEUES.
        Type t = addElementToDefinition(new SimpleType(ATTR_HOLD_QUEUES,
                "Whether to hold newly-created per-host URI work"
                        + " queues until needed to stay busy. If false,"
                        + " all queues may contribute URIs for crawling at all"
                        + " times. If true (default), queues begin (and collect URIs) in"
                        + " an 'inactive' state, and only when the Frontier needs"
                        + " another queue to keep all ToeThreads busy will new" + " queues be activated.",
                DEFAULT_HOLD_QUEUES));
        t.setExpertSetting(true);
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(ATTR_BALANCE_REPLENISH_AMOUNT,
                "Amount to replenish a queue's activity balance when it becomes "
                        + "active. Larger amounts mean more URIs will be tried from the "
                        + "queue before it is deactivated in favor of waiting queues. " + "Default is 3000",
                DEFAULT_BALANCE_REPLENISH_AMOUNT));
        t.setExpertSetting(true);
        t.setOverrideable(true);
        // A trailing space after "one of" keeps the concatenated text readable.
        t = addElementToDefinition(new SimpleType(ATTR_ERROR_PENALTY_AMOUNT,
                "Amount to additionally penalize a queue when one of "
                        + "its URIs fails completely. Accelerates deactivation or "
                        + "full retirement of problem queues and unresponsive sites. " + "Default is 100",
                DEFAULT_ERROR_PENALTY_AMOUNT));
        t.setExpertSetting(true);
        t.setOverrideable(true);
        t = addElementToDefinition(new SimpleType(ATTR_QUEUE_TOTAL_BUDGET,
                "Total activity expenditure allowable to a single queue; queues "
                        + "over this expenditure will be 'retired' and crawled no more. "
                        + "Default of -1 means no ceiling on activity expenditures is " + "enforced.",
                DEFAULT_QUEUE_TOTAL_BUDGET));
        t.setExpertSetting(true);
        t.setOverrideable(true);

        t = addElementToDefinition(new SimpleType(ATTR_COST_POLICY,
                "Policy for calculating the cost of each URI attempted. "
                        + "The default UnitCostAssignmentPolicy considers the cost of " + "each URI to be '1'.",
                DEFAULT_COST_POLICY, AVAILABLE_COST_POLICIES));
        t.setExpertSetting(true);

        t = addElementToDefinition(new SimpleType(ATTR_SNOOZE_DEACTIVATE_MS,
                "Threshold above which any 'snooze' delay will cause the "
                        + "affected queue to go inactive, allowing other queues a "
                        + "chance to rotate into active state. Typically set to be "
                        + "longer than the politeness pauses between successful "
                        + "fetches, but shorter than the connection-failed "
                        + "'retry-delay-seconds'. (Default is 5 minutes.)",
                DEFAULT_SNOOZE_DEACTIVATE_MS));
        t.setExpertSetting(true);
        t.setOverrideable(false);
        t = addElementToDefinition(new SimpleType(ATTR_TARGET_READY_QUEUES_BACKLOG,
                "Target size for backlog of ready queues. This many queues "
                        + "will be brought into 'ready' state even if a thread is "
                        + "not waiting. Only has effect if 'hold-queues' is true. " + "Default is 50.",
                DEFAULT_TARGET_READY_QUEUES_BACKLOG));
        t.setExpertSetting(true);
        t.setOverrideable(false);
    }

    /**
     * Initializes the Frontier, given the supplied CrawlController: sets up
     * the queues-of-queues, the wake timer, the map of all queues, the
     * already-included filter and the cost policy, then loads the seeds.
     *
     * @param c controller of the crawl this frontier belongs to
     * @throws FatalConfigurationException if any part of queue setup fails
     * @throws IOException declared for the superclass initialization
     * @see com.cyberway.issue.crawler.framework.Frontier#initialize(com.cyberway.issue.crawler.framework.CrawlController)
     */
    public void initialize(CrawlController c) throws FatalConfigurationException, IOException {
        // The superclass call sets up frontier journalling.
        super.initialize(c);
        this.controller = c;

        initQueuesOfQueues();

        // never let the ready-queue target drop below one
        int configuredBacklog = (Integer) getUncheckedAttribute(null, ATTR_TARGET_READY_QUEUES_BACKLOG);
        this.targetSizeForReadyQueues = Math.max(1, configuredBacklog);
        this.wakeTimer = new Timer("waker for " + c.toString());

        try {
            if (workQueueDataOnDisk() && getQueueAssignmentPolicy(null).maximumNumberOfKeys() >= 0
                    && getQueueAssignmentPolicy(null)
                            .maximumNumberOfKeys() <= MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY) {
                // few enough queues: a plain synchronized in-memory map suffices
                this.allQueues = Collections.synchronizedMap(new HashMap<String, WorkQueue>());
            } else {
                this.allQueues = c.getBigMap("allqueues", String.class, WorkQueue.class);
                if (logger.isLoggable(Level.FINE)) {
                    // list every queue key already present in the map
                    Iterator<String> keys = this.allQueues.keySet().iterator();
                    try {
                        while (keys.hasNext()) {
                            logger.fine(keys.next());
                        }
                    } finally {
                        StoredIterator.close(keys);
                    }
                }
            }
            this.alreadyIncluded = createAlreadyIncluded();
            initQueue();
        } catch (Exception e) {
            // IOException and every other Exception get the same treatment:
            // report, then rethrow as a fatal configuration problem with cause
            e.printStackTrace();
            throw (FatalConfigurationException) new FatalConfigurationException(e.getMessage()).initCause(e);
        }

        initCostPolicy();

        loadSeeds();
    }

    /**
     * Create the collections that track queues by state (ready, inactive,
     * retired, snoozed). All are unbounded here; subclasses may override
     * to reduce or eliminate the risk of them growing without bound.
     */
    protected void initQueuesOfQueues() {
        // ready: small OutOfMemoryError risk -- with 'hold-queues' false
        // this collection may grow without bound
        this.readyClassQueues = new LinkedBlockingQueue<String>();
        // inactive: OutOfMemoryError risk -- in large crawls this
        // collection may grow without bound
        this.inactiveQueues = new LinkedBlockingQueue<String>();
        // retired: OutOfMemoryError risk -- in large crawls with queue
        // max-budgets this collection may grow without bound
        this.retiredQueues = new LinkedBlockingQueue<String>();
        // snoozed: small OutOfMemoryError risk -- in large crawls with many
        // unresponsive queues an unbounded number of snoozed queues may exist
        this.snoozedClassQueues = Collections.synchronizedSortedSet(new TreeSet<WorkQueue>());
    }

    /**
     * Set (or reset after configuration change) the cost policy in effect,
     * by instantiating the class named by the cost-policy setting.
     * 
     * @throws FatalConfigurationException if the configured policy class
     *         cannot be loaded, instantiated or cast; the original failure
     *         is attached as the cause
     */
    private void initCostPolicy() throws FatalConfigurationException {
        try {
            costAssignmentPolicy = (CostAssignmentPolicy) Class
                    .forName((String) getUncheckedAttribute(null, ATTR_COST_POLICY)).newInstance();
        } catch (Exception e) {
            logger.log(Level.SEVERE, "unable to instantiate cost policy", e);
            // keep the root cause attached, as initialize() does for its failures
            throw (FatalConfigurationException) new FatalConfigurationException(e.getMessage()).initCause(e);
        }
    }

    /**
     * Release everything held by this frontier once the crawl has ended.
     * CrawlJobs persist after the crawl has finished, so references are
     * dropped here. Safe to call even if initialization never completed:
     * members that were never created are skipped.
     *
     * @param sExitMessage exit message, passed on to the superclass
     * @see com.cyberway.issue.crawler.frontier.AbstractFrontier#crawlEnded(java.lang.String)
     */
    public void crawlEnded(String sExitMessage) {
        // Cleanup.  CrawlJobs persist after crawl has finished so undo any
        // references.
        if (this.alreadyIncluded != null) {
            this.alreadyIncluded.close();
            this.alreadyIncluded = null;
        }

        try {
            closeQueue();
        } catch (IOException e) {
            // nothing further can be done at shutdown; record and continue cleanup
            logger.log(Level.SEVERE, "problem closing queue at crawl end", e);
        }
        // wakeTimer and allQueues are only created in initialize()
        if (this.wakeTimer != null) {
            this.wakeTimer.cancel();
        }

        if (this.allQueues != null) {
            this.allQueues.clear();
            this.allQueues = null;
        }
        this.inProcessQueues = null;
        this.readyClassQueues = null;
        this.snoozedClassQueues = null;
        this.inactiveQueues = null;
        this.retiredQueues = null;

        this.costAssignmentPolicy = null;

        // Clearing controller is a problem. We get NPEs in #preNext.
        super.crawlEnded(sExitMessage);
        this.controller = null;
    }

    /**
     * Create a UriUniqFilter that will serve as record 
     * of already seen URIs. Called once from
     * {@code initialize(CrawlController)}; the result is stored in
     * {@code alreadyIncluded}.
     *
     * @return A UriUniqFilter that will serve as a record of already seen URIs
     * @throws IOException if the filter cannot be created
     */
    protected abstract UriUniqFilter createAlreadyIncluded() throws IOException;

    /**
     * Offer the given CandidateURI to the already-included filter so that
     * it gets visited if it is not already scheduled/completed. URIs
     * flagged force-fetch are added with {@code addForce}.
     *
     * @param caUri CandidateURI to schedule
     * @see com.cyberway.issue.crawler.framework.Frontier#schedule(com.cyberway.issue.crawler.datamodel.CandidateURI)
     */
    public void schedule(CandidateURI caUri) {
        // Canonicalize first: canonicalization may set the forceFetch flag
        // (see #canonicalization(CandidateURI) javadoc for circumstance).
        String canonical = canonicalize(caUri);
        if (!caUri.forceFetch()) {
            alreadyIncluded.add(canonical, caUri);
            return;
        }
        alreadyIncluded.addForce(canonical, caUri);
    }

    /**
     * Accept the given CandidateURI for scheduling, as it has
     * passed the alreadyIncluded filter.
     * 
     * The URI is converted to a CrawlURI, given any special handling,
     * sent to its per-classKey queue and finally noted in the
     * recovery journal.
     *
     * @param caUri CandidateURI.
     */
    public void receive(CandidateURI caUri) {
        final CrawlURI accepted = asCrawlUri(caUri);
        applySpecialHandling(accepted);
        sendToQueue(accepted);
        // record the addition in the recovery log
        doJournalAdded(accepted);
    }

    /**
     * Convert the candidate to a CrawlURI via the superclass and make sure
     * its cost has been calculated before it is inserted into any queue.
     *
     * @param caUri candidate to convert
     * @return the CrawlURI, with its holder cost already computed
     * @see com.cyberway.issue.crawler.frontier.AbstractFrontier#asCrawlUri(com.cyberway.issue.crawler.datamodel.CandidateURI)
     */
    protected CrawlURI asCrawlUri(CandidateURI caUri) {
        final CrawlURI converted = super.asCrawlUri(caUri);
        // called only for its side effect: forces cost calculation pre-insert
        getCost(converted);
        return converted;
    }

    /**
     * Send a CrawlURI to the appropriate subqueue.
     * 
     * The target queue is looked up (or created) via
     * {@link #getQueueFor(CrawlURI)} and all work is done while holding
     * that queue's monitor. A queue that was not yet 'held' is marked
     * held and then either deactivated or made ready. Finally
     * {@code longestActiveQueue} is updated if this queue is now longer.
     * 
     * @param curi CrawlURI to enqueue
     */
    protected void sendToQueue(CrawlURI curi) {
        WorkQueue wq = getQueueFor(curi);
        synchronized (wq) {
            wq.enqueue(this, curi);
            // URIs landing in retired queues are not counted as queued
            if (!wq.isRetired()) {
                incrementQueuedUriCount();
            }
            if (!wq.isHeld()) {
                // queue is not yet tracked in any queue-of-queues: place it
                wq.setHeld();
                if (holdQueues() && readyClassQueues.size() >= targetSizeForReadyQueues()) {
                    // enough ready queues already: park this one as inactive
                    deactivateQueue(wq);
                } else {
                    replenishSessionBalance(wq);
                    readyQueue(wq);
                }
            }
            // read the field once into a local before comparing
            WorkQueue laq = longestActiveQueue;
            if (!wq.isRetired() && ((laq == null) || wq.getCount() > laq.getCount())) {
                longestActiveQueue = wq;
            }
        }
    }

    /**
     * Read the 'hold-queues' setting: whether queues should start inactive
     * (only becoming active when needed to keep the crawler busy), or
     * start out ready.
     * 
     * @return true if new queues should be held inactive
     */
    private boolean holdQueues() {
        final Boolean hold = (Boolean) getUncheckedAttribute(null, ATTR_HOLD_QUEUES);
        return hold.booleanValue();
    }

    /**
     * Mark the given queue active and put its key on the readyClassQueues
     * queue.
     *
     * @param wq queue to make ready
     * @throws RuntimeException wrapping the InterruptedException if the
     *         thread is interrupted while adding to readyClassQueues; the
     *         thread's interrupt status is restored before throwing
     */
    private void readyQueue(WorkQueue wq) {
        try {
            wq.setActive(this, true);
            readyClassQueues.put(wq.getClassKey());
        } catch (InterruptedException e) {
            // Catching InterruptedException clears the interrupt flag; set it
            // again so code further up the stack can still see the interrupt.
            Thread.currentThread().interrupt();
            logger.log(Level.SEVERE, "unable to ready queue " + wq, e);
            // propagate interrupt up
            throw new RuntimeException(e);
        }
    }

    /**
     * Move the given queue to the inactive state: zero its session
     * balance, add its key to inactiveQueues and mark it not active.
     *
     * @param queue queue to deactivate
     */
    private void deactivateQueue(WorkQueue queue) {
        // a deactivated queue starts its next session with no balance
        queue.setSessionBalance(0);
        final String classKey = queue.getClassKey();
        inactiveQueues.add(classKey);
        queue.setActive(this, false);
    }

    /**
     * Move the given queue to the retired state: add its key to
     * retiredQueues, take its URIs out of the queued count, flag it
     * retired and mark it not active.
     *
     * @param queue queue to retire
     */
    private void retireQueue(WorkQueue queue) {
        final String classKey = queue.getClassKey();
        retiredQueues.add(classKey);
        // URIs sitting in a retired queue no longer count as queued
        decrementQueuedCount(queue.getCount());
        queue.setRetired(true);
        queue.setActive(this, false);
    }

    /** 
     * Accommodate any changes in settings: re-read the ready-queue target,
     * re-create the cost policy and give every retired queue another
     * chance by moving it back to 'inactive'.
     * 
     * @see com.cyberway.issue.crawler.framework.Frontier#kickUpdate()
     */
    public void kickUpdate() {
        super.kickUpdate();
        // the ready-queue target is clamped to at least one
        int configuredBacklog = (Integer) getUncheckedAttribute(null, ATTR_TARGET_READY_QUEUES_BACKLOG);
        this.targetSizeForReadyQueues = Math.max(1, configuredBacklog);
        try {
            initCostPolicy();
        } catch (FatalConfigurationException fce) {
            throw new RuntimeException(fce);
        }
        // The rules for a 'retired' queue may have changed, so drain the
        // retired list back to 'inactive'. Queues that still qualify as
        // retired/overbudget next time they come up will be re-retired;
        // the rest get a chance to become active under the new rules.
        for (String key = this.retiredQueues.poll(); key != null; key = this.retiredQueues.poll()) {
            WorkQueue retired = this.allQueues.get(key);
            if (retired != null) {
                unretireQueue(retired);
            }
        }
    }

    /**
     * Bring a retired queue back to the 'inactive' state and count its
     * URIs as queued again.
     * 
     * @param queue retired queue to restore
     */
    private void unretireQueue(WorkQueue queue) {
        deactivateQueue(queue);
        queue.setRetired(false);
        // its URIs were subtracted on retirement; add them back
        incrementQueuedUriCount(queue.getCount());
    }

    /**
     * Return the work queue for the given CrawlURI's classKey. URIs
     * are ordered and politeness-delayed within their 'class'.
     * If the requested queue is not found, a new instance is created.
     * Callers in this class synchronize on the returned queue, so
     * implementations are expected never to return null.
     * 
     * @param curi CrawlURI to base queue on
     * @return the found or created WorkQueue
     */
    protected abstract WorkQueue getQueueFor(CrawlURI curi);

    /**
     * Return the work queue for the given classKey, or null
     * if no such queue exists. Unlike {@link #getQueueFor(CrawlURI)},
     * this lookup is documented as never creating a queue.
     * 
     * @param classKey key to look for
     * @return the found WorkQueue, or null if none exists for the key
     */
    protected abstract WorkQueue getQueueFor(String classKey);

    /**
     * Return the next CrawlURI to be processed (and presumably
     * visited/fetched) by a worker thread.
     *
     * Relies on the readyClassQueues having been loaded with
     * any work queues that are eligible to provide a URI. 
     *
     * The method loops until a URI can be emitted: the loop is left only
     * by returning a CrawlURI or by an exception, so this implementation
     * never returns null.
     *
     * @return next CrawlURI to be processed
     * @throws InterruptedException if interrupted while waiting for a ready queue
     * @throws EndedException if the frontier has been told to terminate
     *
     * @see com.cyberway.issue.crawler.framework.Frontier#next()
     */
    public CrawlURI next() throws InterruptedException, EndedException {
        while (true) { // loop left only by explicit return or exception
            long now = System.currentTimeMillis();

            // Do common checks for pause, terminate, bandwidth-hold
            preNext(now);

            // allow up-to-1 thread to fill readyClassQueues to target
            if (readyFiller.tryAcquire()) {
                try {
                    int activationsNeeded = targetSizeForReadyQueues() - readyClassQueues.size();
                    while (activationsNeeded > 0 && !inactiveQueues.isEmpty()) {
                        activateInactiveQueue();
                        activationsNeeded--;
                    }
                } finally {
                    readyFiller.release();
                }
            }

            WorkQueue readyQ = null;
            // wait at most DEFAULT_WAIT ms for a ready queue key
            Object key = readyClassQueues.poll(DEFAULT_WAIT, TimeUnit.MILLISECONDS);
            if (key != null) {
                readyQ = (WorkQueue) this.allQueues.get(key);
            }
            if (readyQ != null) {
                while (true) { // loop left by explicit return or break on empty
                    CrawlURI curi = null;
                    synchronized (readyQ) {
                        curi = readyQ.peek(this);
                        if (curi != null) {
                            // check if curi belongs in different queue
                            String currentQueueKey = getClassKey(curi);
                            if (currentQueueKey.equals(curi.getClassKey())) {
                                // curi was in right queue, emit
                                noteAboutToEmit(curi, readyQ);
                                inProcessQueues.add(readyQ);
                                return curi;
                            }
                            // URI's assigned queue has changed since it
                            // was queued (eg because its IP has become
                            // known). Requeue to new queue.
                            curi.setClassKey(currentQueueKey);
                            readyQ.dequeue(this);
                            decrementQueuedCount(1);
                            curi.setHolderKey(null);
                            // curi will be requeued to true queue after lock
                            //  on readyQ is released, to prevent deadlock
                        } else {
                            // readyQ is empty and ready: it's exhausted
                            // release held status, allowing any subsequent 
                            // enqueues to again put queue in ready
                            readyQ.clearHeld();
                            break;
                        }
                    }
                    if (curi != null) {
                        // complete the requeuing begun earlier
                        sendToQueue(curi);
                    }
                }
            } else {
                // ReadyQ key wasn't in all queues: unexpected
                // (key == null just means the poll timed out; nothing to report)
                if (key != null) {
                    logger.severe("Key " + key + " in readyClassQueues but not allQueues");
                }
            }

            if (shouldTerminate) {
                // skip subsequent steps if already on last legs
                throw new EndedException("shouldTerminate is true");
            }

            if (inProcessQueues.size() == 0) {
                // Nothing was ready or in progress or imminent to wake; ensure 
                // any piled-up pending-scheduled URIs are considered
                this.alreadyIncluded.requestFlush();
            }
        }
    }

    /**
     * @return the current target (minimum) number of queues to keep in
     *         readyClassQueues
     */
    private int targetSizeForReadyQueues() {
        return this.targetSizeForReadyQueues;
    }

    /**
     * Return the 'cost' of a CrawlURI (how much of its associated
     * queue's budget it depletes upon attempted processing). A cost not
     * yet calculated is obtained from the cost assignment policy and
     * stored on the CrawlURI before being returned.
     * 
     * @param curi CrawlURI whose cost is wanted
     * @return the associated cost
     */
    private int getCost(CrawlURI curi) {
        final int storedCost = curi.getHolderCost();
        if (storedCost != CrawlURI.UNCALCULATED) {
            // already computed on an earlier call
            return storedCost;
        }
        final int computedCost = costAssignmentPolicy.costOf(curi);
        curi.setHolderCost(computedCost);
        return computedCost;
    }

    /**
     * Take the next key off inactiveQueues, if any, and move its queue
     * onward: retire it if still over budget after replenishing, snooze
     * it if its wake time lies in the future, otherwise make it ready.
     */
    private void activateInactiveQueue() {
        String key = this.inactiveQueues.poll();
        if (key == null) {
            // no inactive queue available
            return;
        }
        WorkQueue candidateQ = this.allQueues.get(key);
        if (candidateQ == null) {
            return;
        }
        synchronized (candidateQ) {
            replenishSessionBalance(candidateQ);
            if (candidateQ.isOverBudget()) {
                // still over-budget even after an activation & replenishing: retire
                retireQueue(candidateQ);
                return;
            }
            long now = System.currentTimeMillis();
            long remainingSnoozeMs = candidateQ.getWakeTime() - now;
            if (remainingSnoozeMs > 0) {
                // queue still due for snoozing
                snoozeQueue(candidateQ, now, remainingSnoozeMs);
                return;
            }
            // clear obsolete wake time, if any
            candidateQ.setWakeTime(0);
            readyQueue(candidateQ);
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("ACTIVATED queue: " + candidateQ.getClassKey());
            }
        }
    }

    /**
     * Reset the given queue's session balance to the configured replenish
     * amount and refresh its total budget from the current settings, both
     * read in the context of the queue's own context URI.
     * 
     * @param queue queue to replenish
     */
    private void replenishSessionBalance(WorkQueue queue) {
        UURI contextUri = queue.getContextUURI(this);

        // TODO: consider confusing cross-effects of this and IP-based politeness
        Integer replenishAmount = (Integer) getUncheckedAttribute(contextUri, ATTR_BALANCE_REPLENISH_AMOUNT);
        queue.setSessionBalance(replenishAmount.intValue());

        // reset total budget (it may have changed)
        // TODO: is this the best way to be sensitive to potential mid-crawl changes
        Long configuredBudget = (Long) getUncheckedAttribute(contextUri, ATTR_QUEUE_TOTAL_BUDGET);
        queue.setTotalBudget(configuredBudget.longValue());
    }

    /**
     * Return the given queue to rotation: onto readyClassQueues when it is
     * within budget, otherwise onto inactiveQueues.
     * 
     * @param wq queue to place
     */
    private void reenqueueQueue(WorkQueue wq) {
        if (!wq.isOverBudget()) {
            readyQueue(wq);
            return;
        }
        // over budget: deactivate instead of readying
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("DEACTIVATED queue: " + wq.getClassKey());
        }
        deactivateQueue(wq);
    }

    /**
     * Wake any queues sitting in the snoozed queue whose time has come,
     * then schedule a new WakeTask for the first queue that is still
     * snoozed. If no snoozed queues remain, nothing is scheduled.
     */
    void wakeQueues() {
        synchronized (snoozedClassQueues) {
            long now = System.currentTimeMillis();
            long nextWakeDelay;
            while (true) {
                if (snoozedClassQueues.isEmpty()) {
                    // nothing left snoozed, so no further wake-up is needed
                    return;
                }
                WorkQueue peek = snoozedClassQueues.first();
                nextWakeDelay = peek.getWakeTime() - now;
                if (nextWakeDelay > 0) {
                    // head of the set is not due yet
                    break;
                }
                // take the queue out of the sorted set before resetting its
                // wake time (presumably the set's ordering depends on it)
                snoozedClassQueues.remove(peek);
                peek.setWakeTime(0);
                reenqueueQueue(peek);
            }
            // registering the task as nextWake supersedes any earlier WakeTask
            this.nextWake = new WakeTask();
            this.wakeTimer.schedule(nextWake, nextWakeDelay);
        }
    }

    /**
     * Note that the previously emitted CrawlURI has completed
     * its processing (for now).
     *
     * <p>One of three things happens, checked in this order:
     * <ol>
     * <li>the URI carries a force-retire directive: its queue is retired;</li>
     * <li>the URI needs retrying: it is left atop its queue, and the queue
     *     is snoozed for the retry delay or re-enqueued at once;</li>
     * <li>otherwise the URI is removed from its queue, tallied and reported
     *     as a success, a disregard or a failure, and the queue is snoozed
     *     for the politeness delay or re-enqueued at once.</li>
     * </ol>
     *
     * <p>The CrawlURI may be scheduled to retry, if appropriate,
     * and other related URIs may become eligible for release
     * via the next next() call, as a result of finished().
     *
     * @param curi the CrawlURI whose processing has completed
     * @see com.cyberway.issue.crawler.framework.Frontier#finished(com.cyberway.issue.crawler.datamodel.CrawlURI)
     */
    public void finished(CrawlURI curi) {
        // single timestamp used as the base for any snooze below
        long now = System.currentTimeMillis();

        curi.incrementFetchAttempts();
        logLocalizedErrors(curi);
        WorkQueue wq = (WorkQueue) curi.getHolder();
        // the finished URI is expected to be the one currently peeked from its queue
        assert (wq.peek(this) == curi) : "unexpected peek " + wq;
        // drop one occurrence of the queue from the in-process collection
        inProcessQueues.remove(wq, 1);

        if (includesRetireDirective(curi)) {
            // CrawlURI is marked to trigger retirement of its queue
            curi.processingCleanup();
            wq.unpeek();
            wq.update(this, curi); // rewrite any changes
            retireQueue(wq);
            return;
        }

        if (needsRetrying(curi)) {
            // Consider errors which can be retried, leaving uri atop queue
            if (curi.getFetchStatus() != S_DEFERRED) {
                wq.expend(getCost(curi)); // all retries but DEFERRED cost
            }
            // delay is computed before cleanup, while the URI still has its state
            long delay_sec = retryDelayFor(curi);
            curi.processingCleanup(); // lose state that shouldn't burden retry
            synchronized (wq) {
                wq.unpeek();
                // TODO: consider if this should happen automatically inside unpeek()
                wq.update(this, curi); // rewrite any changes
                if (delay_sec > 0) {
                    // retry delay is in seconds; snoozeQueue expects milliseconds
                    long delay_ms = delay_sec * 1000;
                    snoozeQueue(wq, now, delay_ms);
                } else {
                    reenqueueQueue(wq);
                }
            }
            // Let everyone interested know that it will be retried.
            controller.fireCrawledURINeedRetryEvent(curi);
            doJournalRescheduled(curi);
            return;
        }

        // Curi will definitely be disposed of without retry, so remove from queue
        wq.dequeue(this);
        decrementQueuedCount(1);
        log(curi);

        if (curi.isSuccess()) {
            totalProcessedBytes += curi.getRecordedSize();
            incrementSucceededFetchCount();
            // Let everyone know in case they want to do something before we strip the curi.
            controller.fireCrawledURISuccessfulEvent(curi);
            doJournalFinishedSuccess(curi);
            wq.expend(getCost(curi)); // successes cost
        } else if (isDisregarded(curi)) {
            // Status codes meaning that, although the crawler did manage to
            // schedule the URI, it must be disregarded for some reason.
            // Note that no cost is expended against the queue in this branch.
            incrementDisregardedUriCount();
            // Let interested listeners know of disregard disposition.
            controller.fireCrawledURIDisregardEvent(curi);
            doJournalDisregarded(curi);
            // if exception, also send to crawlErrors
            if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
                Object[] array = { curi };
                controller.runtimeErrors.log(Level.WARNING, curi.getUURI().toString(), array);
            }
            // TODO: consider reinstating forget-uri
        } else {
            // Neither success nor disregard: treat as FAILURE, note & log.
            // Let interested listeners know of failed disposition.
            this.controller.fireCrawledURIFailureEvent(curi);
            // if exception, also send to crawlErrors
            if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
                Object[] array = { curi };
                this.controller.runtimeErrors.log(Level.WARNING, curi.getUURI().toString(), array);
            }
            incrementFailedFetchCount();
            // let queue note error
            wq.noteError(((Integer) getUncheckedAttribute(curi, ATTR_ERROR_PENALTY_AMOUNT)).intValue());
            doJournalFinishedFailure(curi);
            wq.expend(getCost(curi)); // failures cost
        }

        // politeness delay is computed before the URI is stripped below
        long delay_ms = politenessDelayFor(curi);
        synchronized (wq) {
            if (delay_ms > 0) {
                snoozeQueue(wq, now, delay_ms);
            } else {
                reenqueueQueue(wq);
            }
        }

        // the URI is done with: shed its state
        curi.stripToMinimal();
        curi.processingCleanup();

    }

    /**
     * Tell whether the given CrawlURI carries the A_FORCE_RETIRE key with
     * a true value.
     *
     * @param curi CrawlURI to inspect
     * @return true if the key is present and its Boolean value is true
     */
    private boolean includesRetireDirective(CrawlURI curi) {
        if (!curi.containsKey(A_FORCE_RETIRE)) {
            return false;
        }
        // value is stored as a Boolean; unboxed on return
        return (Boolean) curi.getObject(A_FORCE_RETIRE);
    }

    /**
     * Make the given queue wait for the given amount of time.
     *
     * The queue's wake time is always set to now + delay_ms. When the delay
     * exceeds the ATTR_SNOOZE_DEACTIVATE_MS setting and inactiveQueues is
     * not empty, the queue is deactivated instead of snoozed. Otherwise it
     * is added to snoozedClassQueues, and if it is now the first entry
     * there, a new WakeTask is scheduled for it.
     *
     * @param wq queue to snooze
     * @param now time now in ms
     * @param delay_ms time to snooze in ms
     */
    private void snoozeQueue(WorkQueue wq, long now, long delay_ms) {
        wq.setWakeTime(now + delay_ms);
        long deactivateThresholdMs = ((Long) getUncheckedAttribute(null, ATTR_SNOOZE_DEACTIVATE_MS)).longValue();
        boolean deactivateInstead = delay_ms > deactivateThresholdMs && !inactiveQueues.isEmpty();
        if (deactivateInstead) {
            deactivateQueue(wq);
            return;
        }
        synchronized (snoozedClassQueues) {
            snoozedClassQueues.add(wq);
            // only re-arm the timer when this queue became the earliest entry
            if (snoozedClassQueues.first() == wq) {
                this.nextWake = new WakeTask();
                this.wakeTimer.schedule(nextWake, delay_ms);
            }
        }
    }

    /**
     * Forget the given CrawlURI. This allows a new instance
     * to be created in the future, if it is reencountered under
     * different circumstances.
     *
     * @param curi The CrawlURI to forget
     */
    protected void forget(CrawlURI curi) {
        // guard the log call so the message string is only built when FINER is enabled
        if (logger.isLoggable(Level.FINER)) {
            logger.finer("Forgetting " + curi);
        }
        alreadyIncluded.forget(canonicalize(curi.getUURI()), curi);
    }

    /**
     * Number of URIs counted by alreadyIncluded, or 0 while that
     * structure has not been set.
     *
     * @return the discovered URI count
     * @see com.cyberway.issue.crawler.framework.Frontier#discoveredUriCount()
     */
    public long discoveredUriCount() {
        if (this.alreadyIncluded == null) {
            return 0;
        }
        return this.alreadyIncluded.count();
    }

    /**
     * Delete all scheduled URIs matching the given regex, in every queue.
     * Delegates to {@link #deleteURIs(String, String)} with a null queue
     * regex.
     *
     * @param uriMatch regex of URIs to delete
     * @return Number of items deleted.
     */
    public long deleteURIs(String uriMatch) {
        final String anyQueue = null;
        long deleted = deleteURIs(uriMatch, anyQueue);
        return deleted;
    }

    /**
     * Delete all scheduled URIs matching the given regex, in queues with
     * names matching the second given regex.
     *
     * Each affected queue is unpeeked before deletion, and the queued-URI
     * count is decremented by the total number of deleted items.
     *
     * @param uriMatch regex of URIs to delete
     * @param queueMatch regex of queues to affect, or null (or empty) for all
     * @return Number of items deleted.
     * @throws java.util.regex.PatternSyntaxException if queueMatch is
     *         non-empty and not a valid regular expression
     */
    public long deleteURIs(String uriMatch, String queueMatch) {
        // Compile the queue filter once; String.matches() would recompile
        // the same regex for every queue key.
        java.util.regex.Pattern queueFilter = StringUtils.isNotEmpty(queueMatch)
                ? java.util.regex.Pattern.compile(queueMatch)
                : null;
        long count = 0;
        // TODO: DANGER/ values() may not work right from CachedBdbMap
        for (Object key : allQueues.keySet()) {
            String queueKey = (String) key;
            if (queueFilter != null && !queueFilter.matcher(queueKey).matches()) {
                // skip this queue
                continue;
            }
            WorkQueue wq = getQueueFor(queueKey);
            wq.unpeek();
            count += wq.deleteMatching(this, uriMatch);
        }
        decrementQueuedCount(count);
        return count;
    }

    //
    // Reporter implementation
    //

    /** Name of the standard report; reportTo() also falls back to it for unrecognized names. */
    public static String STANDARD_REPORT = "standard";
    /** Name of the report that reportTo() routes to allNonemptyReportTo(). */
    public static String ALL_NONEMPTY = "nonempty";
    /** Name of the report that reportTo() routes to allQueuesReportTo(). */
    public static String ALL_QUEUES = "all";
    /** The report names returned by getReports(). */
    protected static String[] REPORTS = { STANDARD_REPORT, ALL_NONEMPTY, ALL_QUEUES };

    /**
     * List the names of the reports this frontier offers.
     *
     * @return the shared REPORTS array itself (not a copy)
     */
    public String[] getReports() {
        final String[] available = REPORTS;
        return available;
    }

    /**
     * Write a one-line tally of queues by state (active, inactive, retired,
     * exhausted) and flush the writer. Writes nothing while allQueues is
     * still null.
     *
     * @param w Where to write to.
     */
    public void singleLineReportTo(PrintWriter w) {
        if (this.allQueues == null) {
            return;
        }
        int total = allQueues.size();
        int inProcess = inProcessQueues.uniqueSet().size();
        int ready = readyClassQueues.size();
        int snoozed = snoozedClassQueues.size();
        int active = inProcess + ready + snoozed;
        int inactive = inactiveQueues.size();
        int retired = retiredQueues.size();
        // whatever is in none of the tracked states counts as exhausted
        int exhausted = total - active - inactive - retired;

        StringBuilder line = new StringBuilder();
        line.append(total).append(" queues: ");
        line.append(active).append(" active (");
        line.append(inProcess).append(" in-process; ");
        line.append(ready).append(" ready; ");
        line.append(snoozed).append(" snoozed); ");
        line.append(inactive).append(" inactive; ");
        line.append(retired).append(" retired; ");
        line.append(exhausted).append(" exhausted");
        w.print(line.toString());
        w.flush();
    }

    /**
     * Column legend matching the figures written by singleLineReportTo().
     *
     * @return the space-separated legend
     * @see com.cyberway.issue.util.Reporter#singleLineLegend()
     */
    public String singleLineLegend() {
        final String legend = "total active in-process ready snoozed inactive retired exhausted";
        return legend;
    }

    /**
     * Compile a human readable report on the status of the frontier at the
     * time of the call.
     *
     * ALL_NONEMPTY and ALL_QUEUES select their dedicated reports. Any other
     * name yields the standard report, preceded by an "unavailable" notice
     * when the name is neither null nor STANDARD_REPORT.
     *
     * @param name Name of report.
     * @param writer Where to write to.
     */
    public synchronized void reportTo(String name, PrintWriter writer) {
        if (ALL_NONEMPTY.equals(name)) {
            allNonemptyReportTo(writer);
        } else if (ALL_QUEUES.equals(name)) {
            allQueuesReportTo(writer);
        } else {
            boolean unrecognized = (name != null) && !STANDARD_REPORT.equals(name);
            if (unrecognized) {
                writer.print(name);
                writer.print(" unavailable; standard report:\n");
            }
            standardReportTo(writer);
        }
    }

    /**
     * Compact report (one queue per line) of the in-process, ready,
     * snoozed, inactive and retired queues, each group under its own
     * heading.
     *
     * @param writer where to write the report
     */
    private void allNonemptyReportTo(PrintWriter writer) {
        // Snapshot the in-process collection under its monitor so the copy
        // stays stable against modification for the duration of the report.
        ArrayList<WorkQueue> inProcessSnapshot;
        synchronized (this.inProcessQueues) {
            @SuppressWarnings("unchecked")
            Collection<WorkQueue> live = this.inProcessQueues;
            inProcessSnapshot = new ArrayList<WorkQueue>(live);
        }

        writer.print("\n -----===== IN-PROCESS QUEUES =====-----\n");
        queueSingleLinesTo(writer, inProcessSnapshot.iterator());

        // the remaining groups are iterated directly, without a snapshot
        writer.print("\n -----===== READY QUEUES =====-----\n");
        queueSingleLinesTo(writer, this.readyClassQueues.iterator());

        writer.print("\n -----===== SNOOZED QUEUES =====-----\n");
        queueSingleLinesTo(writer, this.snoozedClassQueues.iterator());

        writer.print("\n -----===== INACTIVE QUEUES =====-----\n");
        queueSingleLinesTo(writer, this.inactiveQueues.iterator());

        writer.print("\n -----===== RETIRED QUEUES =====-----\n");
        queueSingleLinesTo(writer, this.retiredQueues.iterator());
    }

    /**
     * Compact report of every queue known to allQueues (one queue per
     * line), looked up by key.
     *
     * @param writer where to write the report
     */
    private void allQueuesReportTo(PrintWriter writer) {
        Iterator queueKeys = allQueues.keySet().iterator();
        queueSingleLinesTo(writer, queueKeys);
    }

    /**
     * Write the single-line reports of all queues in the
     * iterator to the writer, preceded once by the queues' legend line.
     *
     * Elements may be WorkQueue instances or keys to look up in allQueues.
     * Null elements are skipped silently; a key with no queue behind it is
     * reported on an " ERROR: " line and skipped.
     *
     * @param writer to receive report
     * @param iterator over queues (or queue keys) of interest.
     */
    private void queueSingleLinesTo(PrintWriter writer, Iterator iterator) {
        boolean legendWritten = false;
        while (iterator.hasNext()) {
            Object obj = iterator.next();
            if (obj == null) {
                continue;
            }
            WorkQueue q = (obj instanceof WorkQueue) ? (WorkQueue) obj : (WorkQueue) this.allQueues.get(obj);
            if (q == null) {
                // No queue for this key: note it and move on rather than
                // dereferencing null below.
                writer.println(" ERROR: " + obj);
                continue;
            }
            if (!legendWritten) {
                writer.println(q.singleLineLegend());
                legendWritten = true;
            }
            q.singleLineReportTo(writer);
        }
    }

    /**
     * Write the standard frontier report: a header, URI statistics, queue
     * tallies, and per-queue reports (at most REPORT_MAX_QUEUES per group)
     * for the in-process, ready, snoozed, inactive and retired queues, plus
     * the longest active queue when one is set. Flushes the writer at the
     * end.
     *
     * @param w Writer to print to.
     */
    private void standardReportTo(PrintWriter w) {
        // take all queue tallies up front
        int total = allQueues.size();
        int inProcessTally = inProcessQueues.uniqueSet().size();
        int readyTally = readyClassQueues.size();
        int snoozedTally = snoozedClassQueues.size();
        int activeTally = inProcessTally + readyTally + snoozedTally;
        int inactiveTally = inactiveQueues.size();
        int retiredTally = retiredQueues.size();
        int exhaustedTally = total - activeTally - inactiveTally - retiredTally;

        // header
        w.print("Frontier report - ");
        w.print(ArchiveUtils.get12DigitDate());
        w.print("\n");
        w.print(" Job being crawled: ");
        w.print(controller.getOrder().getCrawlOrderName());
        w.print("\n");

        // URI statistics
        w.print("\n -----===== STATS =====-----\n");
        w.print(" Discovered:    ");
        w.print(discoveredUriCount());
        w.print("\n");
        w.print(" Queued:        ");
        w.print(queuedUriCount());
        w.print("\n");
        w.print(" Finished:      ");
        w.print(finishedUriCount());
        w.print("\n");
        w.print("  Successfully: ");
        w.print(succeededFetchCount());
        w.print("\n");
        w.print("  Failed:       ");
        w.print(failedFetchCount());
        w.print("\n");
        w.print("  Disregarded:  ");
        w.print(disregardedUriCount());
        w.print("\n");

        // queue tallies
        w.print("\n -----===== QUEUES =====-----\n");
        w.print(" Already included size:     ");
        w.print(alreadyIncluded.count());
        w.print("\n");
        w.print("               pending:     ");
        w.print(alreadyIncluded.pending());
        w.print("\n");
        w.print("\n All class queues map size: ");
        w.print(total);
        w.print("\n");
        w.print("             Active queues: ");
        w.print(activeTally);
        w.print("\n");
        w.print("                    In-process: ");
        w.print(inProcessTally);
        w.print("\n");
        w.print("                         Ready: ");
        w.print(readyTally);
        w.print("\n");
        w.print("                       Snoozed: ");
        w.print(snoozedTally);
        w.print("\n");
        w.print("           Inactive queues: ");
        w.print(inactiveTally);
        w.print("\n");
        w.print("            Retired queues: ");
        w.print(retiredTally);
        w.print("\n");
        w.print("          Exhausted queues: ");
        w.print(exhaustedTally);
        w.print("\n");

        // per-queue detail; in-process and snoozed groups are sampled via extractSome()
        w.print("\n -----===== IN-PROCESS QUEUES =====-----\n");
        @SuppressWarnings("unchecked")
        Collection<WorkQueue> inProcess = inProcessQueues;
        ArrayList<WorkQueue> inProcessSample = extractSome(inProcess, REPORT_MAX_QUEUES);
        appendQueueReports(w, inProcessSample.iterator(), inProcessSample.size(), REPORT_MAX_QUEUES);

        w.print("\n -----===== READY QUEUES =====-----\n");
        appendQueueReports(w, this.readyClassQueues.iterator(), this.readyClassQueues.size(), REPORT_MAX_QUEUES);

        w.print("\n -----===== SNOOZED QUEUES =====-----\n");
        ArrayList<WorkQueue> snoozedSample = extractSome(snoozedClassQueues, REPORT_MAX_QUEUES);
        appendQueueReports(w, snoozedSample.iterator(), snoozedSample.size(), REPORT_MAX_QUEUES);

        // read the field once so the null check and the use see the same queue
        WorkQueue longest = longestActiveQueue;
        if (longest != null) {
            w.print("\n -----===== LONGEST QUEUE =====-----\n");
            longest.reportTo(w);
        }

        w.print("\n -----===== INACTIVE QUEUES =====-----\n");
        appendQueueReports(w, this.inactiveQueues.iterator(), this.inactiveQueues.size(), REPORT_MAX_QUEUES);

        w.print("\n -----===== RETIRED QUEUES =====-----\n");
        appendQueueReports(w, this.retiredQueues.iterator(), this.retiredQueues.size(), REPORT_MAX_QUEUES);

        w.flush();
    }

    /**
     * Extract some of the elements in the given collection to an
     * ArrayList.  This method synchronizes on the given collection's
     * monitor.  The returned list will never contain more than the
     * specified maximum number of elements; a maximum of zero or less
     * yields an empty list.
     *
     * @param <T>  element type
     * @param c    the collection whose elements to extract
     * @param max  the maximum number of elements to extract
     * @return  the extraction, in the collection's iteration order
     */
    private static <T> ArrayList<T> extractSome(Collection<T> c, int max) {
        if (max <= 0) {
            // nothing may be extracted; also keeps a negative max away
            // from the ArrayList constructor, which would reject it
            return new ArrayList<T>(0);
        }
        ArrayList<T> list;
        synchronized (c) {
            // size is read under the same monitor as the iteration, so the
            // capacity needs no safety margin
            list = new ArrayList<T>(Math.min(c.size(), max));
            Iterator<T> iter = c.iterator();
            while (iter.hasNext() && list.size() < max) {
                list.add(iter.next());
            }
        }
        return list;
    }

    /**
     * Append queue reports to the general Frontier report.
     *
     * Elements may be WorkQueue instances or keys to look up in allQueues.
     * Null elements are skipped (but still count toward max); a key with
     * no queue behind it produces a warning line and is skipped. When
     * total exceeds max, a trailing "...and N more." line is written.
     *
     * @param w PrintWriter to append to.
     * @param iterator An iterator over the queues (or queue keys) to report
     * @param total number of queues in the group being reported
     * @param max maximum number of iterator elements to consume
     */
    protected void appendQueueReports(PrintWriter w, Iterator iterator, int total, int max) {
        for (int count = 0; iterator.hasNext() && (count < max); count++) {
            Object obj = iterator.next();
            if (obj == null) {
                continue;
            }
            WorkQueue q = (obj instanceof WorkQueue) ? (WorkQueue) obj : (WorkQueue) this.allQueues.get(obj);
            if (q == null) {
                // No queue for this key: warn and move on rather than
                // dereferencing null below.
                w.print("WARNING: No report for queue " + obj + "\n");
                continue;
            }
            q.reportTo(w);
        }
        if (total > max) {
            w.print("...and " + (total - max) + " more.\n");
        }
    }

    /**
     * Force logging, etc. of operator-deleted CrawlURIs.
     *
     * The URI is handled like a disregarded one: the disregard event is
     * fired, the URI is logged, the disregarded-URI count is incremented,
     * and finally the URI is stripped and cleaned up.
     *
     * @param curi the CrawlURI that was deleted
     * @see com.cyberway.issue.crawler.framework.Frontier#deleted(com.cyberway.issue.crawler.datamodel.CrawlURI)
     */
    public synchronized void deleted(CrawlURI curi) {
        // treat as disregarded
        controller.fireCrawledURIDisregardEvent(curi);
        log(curi);
        incrementDisregardedUriCount();
        // event and log come first; only then is the URI's state shed
        curi.stripToMinimal();
        curi.processingCleanup();
    }

    /**
     * Note the given UURI in alreadyIncluded, and charge its cost to the
     * queue it would belong to.
     *
     * @param u the UURI to treat as included
     */
    public void considerIncluded(UURI u) {
        this.alreadyIncluded.note(canonicalize(u));
        // throwaway CrawlURI, built only to find the queue and the cost
        CrawlURI standIn = new CrawlURI(u);
        standIn.setClassKey(getClassKey(standIn));
        WorkQueue owner = getQueueFor(standIn);
        owner.expend(getCost(standIn));
    }

    /**
     * Subclass hook, counterpart of {@link #closeQueue()}.
     * NOTE(review): presumably prepares whatever storage backs the work
     * queues; the caller is not visible here -- confirm.
     *
     * @throws IOException declared for implementations to report I/O failure
     */
    protected abstract void initQueue() throws IOException;

    /**
     * Subclass hook, counterpart of {@link #initQueue()}.
     * NOTE(review): presumably releases whatever storage backs the work
     * queues; the caller is not visible here -- confirm.
     *
     * @throws IOException declared for implementations to report I/O failure
     */
    protected abstract void closeQueue() throws IOException;

    /**
     * Returns <code>true</code> if the WorkQueue implementation of this
     * Frontier stores its workload on disk instead of relying
     * on serialization mechanisms.
     *
     * TODO: rename (the current name is very misleading), or remove
     * altogether: no implementation returning false is known.
     *
     * @return a constant boolean value for this class/instance
     */
    protected abstract boolean workQueueDataOnDisk();

    /**
     * Return the group the given CrawlURI belongs to, which is the queue
     * found for it by getQueueFor().
     *
     * @param curi CrawlURI to look up
     * @return the queue for the CrawlURI, as a FrontierGroup
     */
    public FrontierGroup getGroup(CrawlURI curi) {
        FrontierGroup group = getQueueFor(curi);
        return group;
    }

    /**
     * Average number of queued URIs per queue, taken over the in-process,
     * ready, snoozed and inactive queues (integer division).
     *
     * @return the average, or 0 when there are no such queues
     */
    public long averageDepth() {
        int inProcess = inProcessQueues.uniqueSet().size();
        int ready = readyClassQueues.size();
        int snoozed = snoozedClassQueues.size();
        int inactive = inactiveQueues.size();
        int queueTotal = inProcess + ready + snoozed + inactive;
        if (queueTotal == 0) {
            return 0;
        }
        return liveQueuedUriCount.get() / queueTotal;
    }

    /**
     * Ratio of all active and inactive queues to the queues that are
     * currently in-process or snoozed.
     *
     * @return the ratio, or 0 when no queue is in-process or snoozed (the
     *         ratio is undefined then; 0 is returned instead of NaN or
     *         Infinity, mirroring the zero guard in averageDepth())
     */
    public float congestionRatio() {
        int inProcessCount = inProcessQueues.uniqueSet().size();
        int readyCount = readyClassQueues.size();
        int snoozedCount = snoozedClassQueues.size();
        int inactiveCount = inactiveQueues.size();
        int busyCount = inProcessCount + snoozedCount;
        if (busyCount == 0) {
            // avoid float division by zero
            return 0;
        }
        int activeCount = inProcessCount + readyCount + snoozedCount;
        return (float) (activeCount + inactiveCount) / busyCount;
    }

    /**
     * Count of the longest active queue.
     *
     * @return the count of longestActiveQueue, or -1 when none is set
     */
    public long deepestUri() {
        // Read the field once, as standardReportTo() does, so the null check
        // and the use see the same queue even if the field changes meanwhile.
        WorkQueue longest = longestActiveQueue;
        return longest == null ? -1 : longest.getCount();
    }

    /**
     * The frontier is empty when no URIs are queued and alreadyIncluded
     * has nothing pending.
     *
     * @return true if both counts are zero
     * @see com.cyberway.issue.crawler.framework.Frontier#isEmpty()
     */
    public synchronized boolean isEmpty() {
        if (liveQueuedUriCount.get() != 0) {
            return false;
        }
        return alreadyIncluded.pending() == 0;
    }
}