/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.crawler.frontier; import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.DEFERRED_FOR_RETRY; import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.DISREGARDED; import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.FAILED; import static org.archive.crawler.event.CrawlURIDispositionEvent.Disposition.SUCCEEDED; import static org.archive.modules.fetcher.FetchStatusCodes.S_DEFERRED; import static org.archive.modules.fetcher.FetchStatusCodes.S_RUNTIME_EXCEPTION; import java.io.Closeable; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; import java.util.Map.Entry; import java.util.Queue; import java.util.Set; import java.util.SortedMap; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.DelayQueue; import java.util.concurrent.Delayed; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Pattern; import org.apache.commons.collections.iterators.ObjectArrayIterator; import org.archive.crawler.datamodel.UriUniqFilter; import org.archive.crawler.event.CrawlURIDispositionEvent; import org.archive.crawler.framework.ToeThread; import org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy; import org.archive.crawler.frontier.precedence.QueuePrecedencePolicy; import org.archive.crawler.util.TopNSet; import org.archive.modules.CrawlURI; import org.archive.spring.KeyedProperties; import org.archive.util.ArchiveUtils; import org.archive.util.ObjectIdentityCache; import org.archive.util.ObjectIdentityMemCache; import org.springframework.beans.BeansException; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.ApplicationContext; import org.springframework.context.ApplicationContextAware; import org.springframework.context.support.AbstractApplicationContext; import com.sleepycat.collections.StoredSortedMap; import com.sleepycat.je.DatabaseException; /** * A common Frontier base using several queues to hold pending URIs. * * Uses in-memory map of all known 'queues' inside a single database. * Round-robins between all queues. 
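 *
 * <p>Illustrative lifecycle sketch only (not a normative usage example; the
 * variable names are placeholders and a concrete subclass plus Spring wiring
 * are assumed):
 * <pre>{@code
 * WorkQueueFrontier frontier = ...;  // some concrete subclass, normally Spring-wired
 * frontier.setUriUniqFilter(alreadySeen);
 * frontier.start();                  // sets the uniq-filter destination, builds queues
 * frontier.schedule(seedCuri);       // classKey assigned, URI routed to its work queue
 *
 * // on a worker (ToeThread), via the Frontier interface:
 * CrawlURI curi = frontier.next();   // round-robins over ready queues
 * // ... fetch/process curi ...
 * frontier.finished(curi);           // charges budgets; readies, snoozes or retires queue
 * }</pre>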
* * @author Gordon Mohr * @author Christian Kohlschuetter */ public abstract class WorkQueueFrontier extends AbstractFrontier implements Closeable, ApplicationContextAware { @SuppressWarnings("unused") private static final long serialVersionUID = 570384305871965843L; /** * If we know that only a small amount of queues is held in memory, * we can avoid using a disk-based BigMap. * This only works efficiently if the WorkQueue does not hold its * entries in memory as well. */ private static final int MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY = 3000; /** * When a snooze target for a queue is longer than this amount, the queue * will be "long snoozed" instead of "short snoozed". A "long snoozed" * queue may be swapped to disk because it's not needed soon. */ protected long snoozeLongMs = 5L * 60L * 1000L; public long getSnoozeLongMs() { return snoozeLongMs; } public void setSnoozeLongMs(long snooze) { this.snoozeLongMs = snooze; } private static final Logger logger = Logger.getLogger(WorkQueueFrontier.class.getName()); // ApplicationContextAware implementation, for eventing protected AbstractApplicationContext appCtx; public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { this.appCtx = (AbstractApplicationContext) applicationContext; } /** amount to replenish budget on each activation (duty cycle) */ { setBalanceReplenishAmount(3000); } public int getBalanceReplenishAmount() { return (Integer) kp.get("balanceReplenishAmount"); } public void setBalanceReplenishAmount(int replenish) { kp.put("balanceReplenishAmount", replenish); } /** budget penalty for an error fetch */ { setErrorPenaltyAmount(100); } public int getErrorPenaltyAmount() { return (Integer) kp.get("errorPenaltyAmount"); } public void setErrorPenaltyAmount(int penalty) { kp.put("errorPenaltyAmount", penalty); } /** total expenditure to allow a queue before 'retiring' it */ { setQueueTotalBudget(-1L); } public long getQueueTotalBudget() { return (Long) kp.get("queueTotalBudget"); } public void setQueueTotalBudget(long budget) { kp.put("queueTotalBudget", budget); } /** queue precedence assignment policy to use. */ { setQueuePrecedencePolicy(new BaseQueuePrecedencePolicy()); } public QueuePrecedencePolicy getQueuePrecedencePolicy() { return (QueuePrecedencePolicy) kp.get("queuePrecedencePolicy"); } public void setQueuePrecedencePolicy(QueuePrecedencePolicy policy) { kp.put("queuePrecedencePolicy", policy); } /** precedence rank at or below which queues are not crawled */ protected int precedenceFloor = 255; public int getPrecedenceFloor() { return this.precedenceFloor; } public void setPrecedenceFloor(int floor) { this.precedenceFloor = floor; } /** truncate reporting of queues at this large but not unbounded number */ protected int maxQueuesPerReportCategory = 2000; public int getMaxQueuesPerReportCategory() { return this.maxQueuesPerReportCategory; } public void setMaxQueuesPerReportCategory(int max) { this.maxQueuesPerReportCategory = max; } /** All known queues. */ protected ObjectIdentityCache<WorkQueue> allQueues = null; // of classKey -> ClassKeyQueue /** * All per-class queues whose first item may be handed out. * Linked-list of keys for the queues. */ protected BlockingQueue<String> readyClassQueues; /** all per-class queues from whom a URI is outstanding */ protected Set<WorkQueue> inProcessQueues = Collections .newSetFromMap(new ConcurrentHashMap<WorkQueue, Boolean>()); // of ClassKeyQueue /** * All per-class queues held in snoozed state, sorted by wake time. 
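 *
 * <p>Capacity note (summarizing the behavior of {@code snoozeQueue} and
 * {@code wakeQueues} below): at most {@code MAX_SNOOZED_IN_MEMORY} entries are
 * held in this in-memory DelayQueue; further snoozes spill to the disk-backed
 * {@code snoozedOverflow} map, keyed by wake time, and are pulled back in as
 * their wake times pass.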
*/ transient protected DelayQueue<DelayedWorkQueue> snoozedClassQueues; protected StoredSortedMap<Long, DelayedWorkQueue> snoozedOverflow; protected AtomicInteger snoozedOverflowCount = new AtomicInteger(0); protected static int MAX_SNOOZED_IN_MEMORY = 10000; /** URIs scheduled to be re-enqueued at future date */ protected StoredSortedMap<Long, CrawlURI> futureUris; /** remember keys of small number of largest queues for reporting */ transient protected TopNSet largestQueues = new TopNSet(20); /** remember this many largest queues for reporting's sake; actual tracking * can be somewhat approximate when some queues shrink before others' * sizes are again noted, or if the size is adjusted mid-crawl. */ public int getLargestQueuesCount() { return largestQueues.getMaxSize(); } public void setLargestQueuesCount(int count) { largestQueues.setMaxSize(count); } protected int highestPrecedenceWaiting = Integer.MAX_VALUE; /** The UriUniqFilter to use, tracking those UURIs which are * already in-process (or processed), and thus should not be * rescheduled. Also known as the 'alreadyIncluded' or * 'alreadySeen' structure */ protected UriUniqFilter uriUniqFilter; public UriUniqFilter getUriUniqFilter() { return this.uriUniqFilter; } @Autowired public void setUriUniqFilter(UriUniqFilter uriUniqFilter) { this.uriUniqFilter = uriUniqFilter; } /** * Constructor. */ public WorkQueueFrontier() { super(); } public void start() { if (isRunning()) { return; } uriUniqFilter.setDestination(this); super.start(); try { initInternalQueues(); } catch (Exception e) { throw new IllegalStateException(e); } } /** * Initializes internal queues. May decide to keep all queues in memory based on * {@link QueueAssignmentPolicy#maximumNumberOfKeys}. Otherwise invokes * {@link #initAllQueues()} to actually set up the queues. * * Subclasses restoring queues after a checkpoint should arrange for this * method to be invoked again, e.g. from a private readObject method. * * @throws IOException * @throws DatabaseException */ protected void initInternalQueues() throws IOException, DatabaseException { this.initOtherQueues(); if (workQueueDataOnDisk() && preparer.getQueueAssignmentPolicy().maximumNumberOfKeys() >= 0 && preparer .getQueueAssignmentPolicy().maximumNumberOfKeys() <= MAX_QUEUES_TO_HOLD_ALLQUEUES_IN_MEMORY) { this.allQueues = new ObjectIdentityMemCache<WorkQueue>(701, .9f, 100); } else { this.initAllQueues(); } } /** * Initialize the allQueues field in an implementation-appropriate * way. * @throws DatabaseException */ protected abstract void initAllQueues() throws DatabaseException; /** * Initialize all other internal queues in an implementation-appropriate * way. * @throws DatabaseException */ protected abstract void initOtherQueues() throws DatabaseException; /* (non-Javadoc) * @see org.archive.crawler.frontier.AbstractFrontier#stop() */ @Override public void stop() { super.stop(); } public void destroy() { // release resources and trigger end-of-frontier actions close(); } /** * Release resources only needed when running */ public void close() { ArchiveUtils.closeQuietly(uriUniqFilter); ArchiveUtils.closeQuietly(allQueues); } /** * Accept the given CrawlURI for scheduling, as it has * passed the alreadyIncluded filter. * * Choose a per-classKey queue and enqueue it. If this * item has made an unready queue ready, place that * queue on the readyClassQueues queue. * @param curi CrawlURI to schedule.
*/ protected void processScheduleAlways(CrawlURI curi) { // assert Thread.currentThread() == managerThread; assert KeyedProperties.overridesActiveFrom(curi); prepForFrontier(curi); sendToQueue(curi); } /** * Arrange for the given CrawlURI to be visited, if it is not * already enqueued/completed. * * Differs from superclass in that it operates in calling thread, rather * than deferring operations via in-queue to managerThread. TODO: settle * on either defer or in-thread approach after testing. * * @see org.archive.crawler.framework.Frontier#schedule(org.archive.modules.CrawlURI) */ @Override public void schedule(CrawlURI curi) { sheetOverlaysManager.applyOverlaysTo(curi); try { KeyedProperties.loadOverridesFrom(curi); if (curi.getClassKey() == null) { // remedial processing preparer.prepare(curi); } processScheduleIfUnique(curi); } finally { KeyedProperties.clearOverridesFrom(curi); } } /** * Arrange for the given CrawlURI to be visited, if it is not * already scheduled/completed. * * @see org.archive.crawler.framework.Frontier#schedule(org.archive.modules.CrawlURI) */ protected void processScheduleIfUnique(CrawlURI curi) { // assert Thread.currentThread() == managerThread; assert KeyedProperties.overridesActiveFrom(curi); // Canonicalization may set forceFetch flag. See // #canonicalization(CrawlURI) javadoc for circumstance. String canon = curi.getCanonicalString(); if (curi.forceFetch()) { uriUniqFilter.addForce(canon, curi); } else { uriUniqFilter.add(canon, curi); } } /** * Send a CrawlURI to the appropriate subqueue. * * @param curi */ protected void sendToQueue(CrawlURI curi) { // assert Thread.currentThread() == managerThread; WorkQueue wq = getQueueFor(curi.getClassKey()); synchronized (wq) { int originalPrecedence = wq.getPrecedence(); wq.enqueue(this, curi); // always take budgeting values from current curi // (whose overlay settings should be active here) wq.setSessionBudget(getBalanceReplenishAmount()); wq.setTotalBudget(getQueueTotalBudget()); if (!wq.isRetired()) { incrementQueuedUriCount(); int currentPrecedence = wq.getPrecedence(); if (!wq.isManaged() || currentPrecedence < originalPrecedence) { // queue newly filled or bumped up in precedence; ensure enqueuing // at precedence level (perhaps duplicate; if so that's handled elsewhere) deactivateQueue(wq); } } } // Update recovery log. 
doJournalAdded(curi); wq.makeDirty(); largestQueues.update(wq.getClassKey(), wq.getCount()); } /** * Put the given queue on the readyClassQueues queue * @param wq */ protected void readyQueue(WorkQueue wq) { // assert Thread.currentThread() == managerThread; try { readyClassQueues.put(wq.getClassKey()); if (logger.isLoggable(Level.FINE)) { logger.log(Level.FINE, "queue readied: " + wq.getClassKey()); } } catch (InterruptedException e) { e.printStackTrace(); System.err.println("unable to ready queue " + wq); // propagate interrupt up throw new RuntimeException(e); } } /** * Put the given queue on the inactiveQueues queue * @param wq */ protected void deactivateQueue(WorkQueue wq) { int precedence = wq.getPrecedence(); synchronized (wq) { wq.noteDeactivated(); inProcessQueues.remove(wq); if (wq.getCount() == 0) { System.err.println("deactivate empty queue?"); } synchronized (getInactiveQueuesByPrecedence()) { getInactiveQueuesForPrecedence(precedence).add(wq.getClassKey()); if (wq.getPrecedence() < highestPrecedenceWaiting) { highestPrecedenceWaiting = wq.getPrecedence(); } } if (logger.isLoggable(Level.FINE)) { logger.log(Level.FINE, "queue deactivated to p" + precedence + ": " + wq.getClassKey()); } } } /** * Get the queue of inactive uri-queue names at the given precedence. * * @param precedence * @return queue of inactive uri-queue names at the given precedence */ protected Queue<String> getInactiveQueuesForPrecedence(int precedence) { Map<Integer, Queue<String>> inactiveQueuesByPrecedence = getInactiveQueuesByPrecedence(); Queue<String> candidate = inactiveQueuesByPrecedence.get(precedence); if (candidate == null) { candidate = createInactiveQueueForPrecedence(precedence); inactiveQueuesByPrecedence.put(precedence, candidate); } return candidate; } /** * Return a sorted map of all queues of WorkQueue keys, keyed by precedence * @return SortedMap<Integer, Queue<String>> of inactiveQueues */ protected abstract SortedMap<Integer, Queue<String>> getInactiveQueuesByPrecedence(); /** * Create an inactiveQueue to hold queue names at the given precedence * @param precedence * @return Queue<String> for names of inactive queues */ protected abstract Queue<String> createInactiveQueueForPrecedence(int precedence); /** * Put the given queue on the retiredQueues queue * @param wq */ protected void retireQueue(WorkQueue wq) { // assert Thread.currentThread() == managerThread; inProcessQueues.remove(wq); getRetiredQueues().add(wq.getClassKey()); decrementQueuedCount(wq.getCount()); wq.setRetired(true); if (logger.isLoggable(Level.FINE)) { logger.log(Level.FINE, "queue retired: " + wq.getClassKey()); } } /** * Return queue of all retired queue names. * * @return Queue<String> of retired queue names */ protected abstract Queue<String> getRetiredQueues(); /** * Accommodate any changes in retirement-determining settings (like * total-budget or force-retire changes/overlays). * * (Essentially, exists to be called from tools like the UI * Scripting Console when the operator knows it's necessary.) */ public void reconsiderRetiredQueues() { // The rules for a 'retired' queue may have changed; so, // unretire all queues to 'inactive'. If they still qualify // as retired/overbudget next time they come up, they'll // be re-retired; if not, they'll get a chance to become // active under the new rules. // TODO: Do this automatically, only when necessary.
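        // Drain retired-queue names one key at a time; unretireQueue() returns each
        // queue to the 'inactive' pool and re-adds its URI count to the queued tally,
        // so a still-over-budget queue will simply be re-retired on its next activation.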
String key = getRetiredQueues().poll(); while (key != null) { WorkQueue q = (WorkQueue) this.allQueues.get(key); if (q != null) { unretireQueue(q); q.makeDirty(); } key = getRetiredQueues().poll(); } } /** * Restore a retired queue to the 'inactive' state. * * @param q */ private void unretireQueue(WorkQueue q) { // assert Thread.currentThread() == managerThread; deactivateQueue(q); q.setRetired(false); incrementQueuedUriCount(q.getCount()); } /** * Return the work queue for the given classKey, or null * if no such queue exists. * * @param classKey key to look for * @return the found WorkQueue */ protected abstract WorkQueue getQueueFor(String classKey); /** * Return the next CrawlURI eligible to be processed (and presumably * visited/fetched) by a a worker thread. * * Relies on the readyClassQueues having been loaded with * any work queues that are eligible to provide a URI. * * @return next CrawlURI eligible to be processed, or null if none available * * @see org.archive.crawler.framework.Frontier#next() */ protected CrawlURI findEligibleURI() { // wake any snoozed queues wakeQueues(); // consider rescheduled URIS checkFutures(); // find a non-empty ready queue, if any // TODO: refactor to untangle these loops, early-exits, etc! WorkQueue readyQ = null; findauri: while (true) { findaqueue: do { String key = readyClassQueues.poll(); if (key == null) { // no ready queues; try to activate one if (!getInactiveQueuesByPrecedence().isEmpty() && highestPrecedenceWaiting < getPrecedenceFloor()) { activateInactiveQueue(); continue findaqueue; } else { // nothing ready or readyable break findaqueue; } } readyQ = getQueueFor(key); if (readyQ == null) { // readyQ key wasn't in all queues: unexpected logger.severe("Key " + key + " in readyClassQueues but not allQueues"); break findaqueue; } if (readyQ.getCount() == 0) { // readyQ is empty and ready: it's exhausted readyQ.noteExhausted(); readyQ.makeDirty(); readyQ = null; continue; } if (!inProcessQueues.add(readyQ)) { // double activation; discard this and move on // (this guard allows other enqueuings to ready or // the various inactive-by-precedence queues to // sometimes redundantly enqueue a queue key) readyQ = null; continue; } // queue has gone 'in process' readyQ.considerActive(); readyQ.setWakeTime(0); // clear obsolete wake time, if any // we know readyQ is not empty (getCount()!=0) so peek() shouldn't return null CrawlURI readyQUri = readyQ.peek(this); // see HER-1973 and HER-1946 sheetOverlaysManager.applyOverlaysTo(readyQUri); try { KeyedProperties.loadOverridesFrom(readyQUri); readyQ.setSessionBudget(getBalanceReplenishAmount()); readyQ.setTotalBudget(getQueueTotalBudget()); } finally { KeyedProperties.clearOverridesFrom(readyQUri); } if (readyQ.isOverSessionBudget()) { deactivateQueue(readyQ); readyQ.makeDirty(); readyQ = null; continue; } if (readyQ.isOverTotalBudget()) { retireQueue(readyQ); readyQ.makeDirty(); readyQ = null; continue; } } while (readyQ == null); if (readyQ == null) { // no queues left in ready or readiable break findauri; } returnauri: while (true) { // loop left by explicit return or break on empty CrawlURI curi = null; curi = readyQ.peek(this); if (curi == null) { // should not reach logger.severe("No CrawlURI from ready non-empty queue " + readyQ.classKey + "\n" + readyQ.shortReportLegend() + "\n" + readyQ.shortReportLine() + "\n"); break returnauri; } // from queues, override names persist but not map source curi.setOverlayMapsSource(sheetOverlaysManager); // TODO: consider optimizations avoiding this recalc of 
// overrides when not necessary sheetOverlaysManager.applyOverlaysTo(curi); // check if curi belongs in different queue String currentQueueKey; try { KeyedProperties.loadOverridesFrom(curi); currentQueueKey = getClassKey(curi); } finally { KeyedProperties.clearOverridesFrom(curi); } if (currentQueueKey.equals(curi.getClassKey())) { // curi was in right queue, emit noteAboutToEmit(curi, readyQ); return curi; } // URI's assigned queue has changed since it // was queued (eg because its IP has become // known). Requeue to new queue. // TODO: consider synchronization on readyQ readyQ.dequeue(this, curi); doJournalRelocated(curi); curi.setClassKey(currentQueueKey); decrementQueuedCount(1); curi.setHolderKey(null); sendToQueue(curi); if (readyQ.getCount() == 0) { // readyQ is empty and ready: it's exhausted // release held status, allowing any subsequent // enqueues to again put queue in ready // FIXME: tiny window here where queue could // receive new URI, be readied, fail not-in-process? inProcessQueues.remove(readyQ); readyQ.noteExhausted(); readyQ.makeDirty(); readyQ = null; continue findauri; } } } if (inProcessQueues.size() == 0) { // Nothing was ready or in progress or imminent to wake; ensure // any piled-up pending-scheduled URIs are considered uriUniqFilter.requestFlush(); } // if truly nothing ready, wait a moment before returning null // so that loop in surrounding next() has a chance of getting something // next time if (getTotalEligibleInactiveQueues() == 0) { try { Thread.sleep(1000); } catch (InterruptedException e) { // } } // nothing eligible return null; } /** * Check for any future-scheduled URIs now eligible for reenqueuing */ protected void checkFutures() { // assert Thread.currentThread() == managerThread; // TODO: consider only checking this every set interval if (!futureUris.isEmpty()) { synchronized (futureUris) { Iterator<CrawlURI> iter = futureUris.headMap(System.currentTimeMillis()).values().iterator(); while (iter.hasNext()) { CrawlURI curi = iter.next(); curi.setRescheduleTime(-1); // unless again set elsewhere iter.remove(); futureUriCount.decrementAndGet(); receive(curi); } } } } /** * Activate an inactive queue, if any are available. */ protected boolean activateInactiveQueue() { for (Entry<Integer, Queue<String>> entry : getInactiveQueuesByPrecedence().entrySet()) { int expectedPrecedence = entry.getKey(); Queue<String> queueOfWorkQueueKeys = entry.getValue(); while (true) { String workQueueKey; synchronized (getInactiveQueuesByPrecedence()) { workQueueKey = queueOfWorkQueueKeys.poll(); if (workQueueKey == null) { break; } updateHighestWaiting(expectedPrecedence); } WorkQueue candidateQ = (WorkQueue) this.allQueues.get(workQueueKey); if (candidateQ.getPrecedence() > expectedPrecedence) { // queue demoted since placed; re-deactivate deactivateQueue(candidateQ); candidateQ.makeDirty(); continue; } try { readyClassQueues.put(workQueueKey); } catch (InterruptedException e) { throw new RuntimeException(e); } return true; } } return false; } /** * Recalculate the value of thehighest-precedence queue waiting * among inactive queues. 
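 * Lower numeric precedence values rank "higher": this walks the inactive map's
 * keys in ascending order from {@code startFrom} and records the first
 * precedence that still has waiting queue keys, or {@code Integer.MAX_VALUE}
 * if none do. For example (hypothetical contents), if precedence 2 is empty
 * but precedence 3 still holds a queue key, {@code updateHighestWaiting(2)}
 * leaves {@code highestPrecedenceWaiting} at 3.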
* * @param startFrom start looking at this precedence value */ protected void updateHighestWaiting(int startFrom) { // probe for new highestWaiting for (int precedenceKey : getInactiveQueuesByPrecedence().tailMap(startFrom).keySet()) { if (!getInactiveQueuesByPrecedence().get(precedenceKey).isEmpty()) { highestPrecedenceWaiting = precedenceKey; return; } } // nothing waiting highestPrecedenceWaiting = Integer.MAX_VALUE; } /** * Enqueue the given queue to either readyClassQueues or inactiveQueues, * as appropriate. * * @param wq */ protected void reenqueueQueue(WorkQueue wq) { if (logger.isLoggable(Level.FINE)) { logger.fine("queue reenqueued: " + wq.getClassKey()); } if (highestPrecedenceWaiting < wq.getPrecedence() || wq.getPrecedence() >= getPrecedenceFloor()) { // if still over budget, deactivate deactivateQueue(wq); } else { readyQueue(wq); } } /* (non-Javadoc) * @see org.archive.crawler.frontier.AbstractFrontier#getMaxInWait() */ @Override protected long getMaxInWait() { Delayed next = snoozedClassQueues.peek(); return next == null ? 60000 : next.getDelay(TimeUnit.MILLISECONDS); } /** * Utility method for advanced users/experimentation: force wake all snoozed * queues -- for example to kick a crawl where connectivity problems have * put all queues in slow-retry-snoozes back to busy-ness. */ public void forceWakeQueues() { Iterator<DelayedWorkQueue> iterSnoozed = snoozedClassQueues.iterator(); while (iterSnoozed.hasNext()) { WorkQueue queue = iterSnoozed.next().getWorkQueue(WorkQueueFrontier.this); queue.setWakeTime(0); reenqueueQueue(queue); queue.makeDirty(); iterSnoozed.remove(); } Iterator<DelayedWorkQueue> iterOverflow = snoozedOverflow.values().iterator(); while (iterOverflow.hasNext()) { WorkQueue queue = iterOverflow.next().getWorkQueue(WorkQueueFrontier.this); queue.setWakeTime(0); reenqueueQueue(queue); queue.makeDirty(); iterOverflow.remove(); snoozedOverflowCount.decrementAndGet(); } } /** * Wake any queues sitting in the snoozed queue whose time has come. */ protected void wakeQueues() { DelayedWorkQueue waked; while ((waked = snoozedClassQueues.poll()) != null) { WorkQueue queue = waked.getWorkQueue(this); queue.setWakeTime(0); queue.makeDirty(); reenqueueQueue(queue); } // also consider overflow (usually empty) if (!snoozedOverflow.isEmpty()) { synchronized (snoozedOverflow) { Iterator<DelayedWorkQueue> iter = snoozedOverflow.headMap(System.currentTimeMillis()).values() .iterator(); while (iter.hasNext()) { DelayedWorkQueue dq = iter.next(); iter.remove(); snoozedOverflowCount.decrementAndGet(); WorkQueue queue = dq.getWorkQueue(this); queue.setWakeTime(0); queue.makeDirty(); reenqueueQueue(queue); } } } } /** * Note that the previously emitted CrawlURI has completed * its processing (for now). * * The CrawlURI may be scheduled to retry, if appropriate, * and other related URIs may become eligible for release * via the next next() call, as a result of finished(). * * TODO: make as many decisions about what happens to the CrawlURI * (success, failure, retry) and queue (retire, snooze, ready) as * possible elsewhere, such as in DispositionProcessor. Then, break * this into simple branches or focused methods for each case. 
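 *
 * Rough flow, as currently implemented below (a summary, not a contract):
 * retryable results ({@code needsReenqueuing}) leave the URI atop its queue,
 * charge its cost unless DEFERRED, and re-handle the queue after
 * {@code retryDelayFor(curi)} seconds; all other results dequeue the URI and
 * are tallied as success, disregard (no cost charged), or failure (an extra
 * error penalty is charged), after which the queue is charged the URI's cost,
 * given its politeness delay, and readied, snoozed, or retired via
 * {@code handleQueue}.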
* * @see org.archive.crawler.framework.Frontier#finished(org.archive.modules.CrawlURI) */ protected void processFinish(CrawlURI curi) { // assert Thread.currentThread() == managerThread; long now = System.currentTimeMillis(); curi.incrementFetchAttempts(); logNonfatalErrors(curi); WorkQueue wq = (WorkQueue) curi.getHolder(); // always refresh budgeting values from current curi // (whose overlay settings should be active here) wq.setSessionBudget(getBalanceReplenishAmount()); wq.setTotalBudget(getQueueTotalBudget()); assert (wq.peek(this) == curi) : "unexpected peek " + wq; int holderCost = curi.getHolderCost(); if (needsReenqueuing(curi)) { // codes/errors which don't consume the URI, leaving it atop queue if (curi.getFetchStatus() != S_DEFERRED) { wq.expend(holderCost); // all retries but DEFERRED cost } long delay_ms = retryDelayFor(curi) * 1000; curi.processingCleanup(); // lose state that shouldn't burden retry wq.unpeek(curi); wq.update(this, curi); // rewrite any changes handleQueue(wq, curi.includesRetireDirective(), now, delay_ms); appCtx.publishEvent(new CrawlURIDispositionEvent(this, curi, DEFERRED_FOR_RETRY)); doJournalReenqueued(curi); wq.makeDirty(); return; // no further dequeueing, logging, rescheduling to occur } // Curi will definitely be disposed of without retry, so remove from queue wq.dequeue(this, curi); decrementQueuedCount(1); largestQueues.update(wq.getClassKey(), wq.getCount()); log(curi); if (curi.isSuccess()) { // codes deemed 'success' incrementSucceededFetchCount(); totalProcessedBytes.addAndGet(curi.getRecordedSize()); appCtx.publishEvent(new CrawlURIDispositionEvent(this, curi, SUCCEEDED)); doJournalFinishedSuccess(curi); } else if (isDisregarded(curi)) { // codes meaning 'undo' (even though URI was enqueued, // we now want to disregard it from normal success/failure tallies) // (eg robots-excluded, operator-changed-scope, etc) incrementDisregardedUriCount(); appCtx.publishEvent(new CrawlURIDispositionEvent(this, curi, DISREGARDED)); holderCost = 0; // no charge for disregarded URIs // TODO: consider reinstating forget-URI capability, so URI could be // re-enqueued if discovered again doJournalDisregarded(curi); } else { // codes meaning 'failure' incrementFailedFetchCount(); appCtx.publishEvent(new CrawlURIDispositionEvent(this, curi, FAILED)); // if exception, also send to crawlErrors if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) { Object[] array = { curi }; loggerModule.getRuntimeErrors().log(Level.WARNING, curi.getUURI().toString(), array); } // charge queue any extra error penalty wq.noteError(getErrorPenaltyAmount()); doJournalFinishedFailure(curi); } wq.expend(holderCost); // successes & failures charge cost to queue long delay_ms = curi.getPolitenessDelay(); handleQueue(wq, curi.includesRetireDirective(), now, delay_ms); wq.makeDirty(); if (curi.getRescheduleTime() > 0) { // marked up for forced-revisit at a set time curi.processingCleanup(); curi.resetForRescheduling(); futureUris.put(curi.getRescheduleTime(), curi); futureUriCount.incrementAndGet(); } else { curi.stripToMinimal(); curi.processingCleanup(); } } /** * Send an active queue to its next state, based on the supplied * parameters. 
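 *
 * In order of precedence: a force-retire directive retires the queue; a
 * positive {@code delay_ms} snoozes it until {@code now + delay_ms};
 * otherwise its precedence is reevaluated and it is readied again (or
 * deactivated, if a higher-precedence queue is waiting or this queue sits at
 * or beyond the precedence floor).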
* * @param wq * @param forceRetire * @param now * @param delay_ms */ protected void handleQueue(WorkQueue wq, boolean forceRetire, long now, long delay_ms) { inProcessQueues.remove(wq); if (forceRetire) { retireQueue(wq); } else if (delay_ms > 0) { snoozeQueue(wq, now, delay_ms); } else { getQueuePrecedencePolicy().queueReevaluate(wq); reenqueueQueue(wq); } } /** * Place the given queue into 'snoozed' state, ineligible to * supply any URIs for crawling, for the given amount of time. * * @param wq queue to snooze * @param now time now in ms * @param delay_ms time to snooze in ms */ private void snoozeQueue(WorkQueue wq, long now, long delay_ms) { long nextTime = now + delay_ms; wq.setWakeTime(nextTime); DelayedWorkQueue dq = new DelayedWorkQueue(wq); if (snoozedClassQueues.size() < MAX_SNOOZED_IN_MEMORY) { snoozedClassQueues.add(dq); } else { synchronized (snoozedOverflow) { snoozedOverflow.put(nextTime, dq); snoozedOverflowCount.incrementAndGet(); } } } /** * Forget the given CrawlURI. This allows a new instance * to be created in the future, if it is reencountered under * different circumstances. * * @param curi The CrawlURI to forget */ protected void forget(CrawlURI curi) { logger.finer("Forgetting " + curi); uriUniqFilter.forget(curi.getCanonicalString(), curi); } /** (non-Javadoc) * @see org.archive.crawler.framework.Frontier#discoveredUriCount() */ public long discoveredUriCount() { return (this.uriUniqFilter != null) ? this.uriUniqFilter.count() : 0; } /** * @param match String to match. * @return Number of items deleted. */ public long deleteURIs(String queueRegex, String uriRegex) { long count = 0; Pattern queuePat = Pattern.compile(queueRegex); for (String qname : allQueues.keySet()) { if (queuePat.matcher(qname).matches()) { WorkQueue wq = getQueueFor(qname); wq.unpeek(null); long delCount = wq.deleteMatching(this, uriRegex); if (!wq.isRetired()) { count += delCount; } wq.makeDirty(); } } decrementQueuedCount(count); return count; } // // Reporter implementation // @Override public Map<String, Object> shortReportMap() { if (this.allQueues == null) { return null; } int allCount = allQueues.size(); int inProcessCount = inProcessQueues.size(); int readyCount = readyClassQueues.size(); int snoozedCount = getSnoozedCount(); int activeCount = inProcessCount + readyCount + snoozedCount; int inactiveCount = getTotalEligibleInactiveQueues(); int ineligibleCount = getTotalIneligibleInactiveQueues(); int retiredCount = getRetiredQueues().size(); int exhaustedCount = allCount - activeCount - inactiveCount - retiredCount; Map<String, Object> map = new LinkedHashMap<String, Object>(); map.put("totalQueues", allCount); map.put("inProcessQueues", inProcessCount); map.put("readyQueues", readyCount); map.put("snoozedQueues", snoozedCount); map.put("activeQueues", activeCount); map.put("inactiveQueues", inactiveCount); map.put("ineligibleQueues", ineligibleCount); map.put("retiredQueues", retiredCount); map.put("exhaustedQueues", exhaustedCount); map.put("lastReachedState", lastReachedState); return map; } /** * @param w Where to write to. */ @Override public void shortReportLineTo(PrintWriter w) { if (!isRunning()) return; //??? 
if (this.allQueues == null) { return; } int allCount = allQueues.size(); int inProcessCount = inProcessQueues.size(); int readyCount = readyClassQueues.size(); int snoozedCount = getSnoozedCount(); int activeCount = inProcessCount + readyCount + snoozedCount; int inactiveCount = getTotalEligibleInactiveQueues(); int ineligibleCount = getTotalIneligibleInactiveQueues(); int retiredCount = getRetiredQueues().size(); int exhaustedCount = allCount - activeCount - inactiveCount - retiredCount; State last = lastReachedState; w.print(last); w.print(" - "); w.print(allCount); w.print(" URI queues: "); w.print(activeCount); w.print(" active ("); w.print(inProcessCount); w.print(" in-process; "); w.print(readyCount); w.print(" ready; "); w.print(snoozedCount); w.print(" snoozed); "); w.print(inactiveCount); w.print(" inactive; "); w.print(ineligibleCount); w.print(" ineligible; "); w.print(retiredCount); w.print(" retired; "); w.print(exhaustedCount); w.print(" exhausted"); w.flush(); } /** * Total of all URIs in inactive queues at all precedences * @return int total */ protected int getTotalInactiveQueues() { return tallyInactiveTotals(getInactiveQueuesByPrecedence()); } /** * Total of all URIs in inactive queues at precedences above the floor * @return int total */ protected int getTotalEligibleInactiveQueues() { return tallyInactiveTotals(getInactiveQueuesByPrecedence().headMap(getPrecedenceFloor())); } /** * Total of all URIs in inactive queues at precedences at or below the floor * @return int total */ protected int getTotalIneligibleInactiveQueues() { return tallyInactiveTotals(getInactiveQueuesByPrecedence().tailMap(getPrecedenceFloor())); } /** * @param iqueue * @return */ private int tallyInactiveTotals(SortedMap<Integer, Queue<String>> iqueues) { int inactiveCount = 0; for (Queue<String> q : iqueues.values()) { inactiveCount += q.size(); } return inactiveCount; } /* (non-Javadoc) * @see org.archive.util.Reporter#singleLineLegend() */ @Override public String shortReportLegend() { return "total active in-process ready snoozed inactive retired exhausted"; } /** * This method compiles a human readable report on the status of the frontier * at the time of the call. * @param name Name of report. * @param writer Where to write to. 
*/ @Override public synchronized void reportTo(PrintWriter writer) { int allCount = allQueues.size(); int inProcessCount = inProcessQueues.size(); int readyCount = readyClassQueues.size(); int snoozedCount = getSnoozedCount(); int activeCount = inProcessCount + readyCount + snoozedCount; int inactiveCount = getTotalInactiveQueues(); int retiredCount = getRetiredQueues().size(); int exhaustedCount = allCount - activeCount - inactiveCount - retiredCount; writer.print("Frontier report - "); writer.print(ArchiveUtils.get12DigitDate()); writer.print("\n"); writer.print(" Job being crawled: "); writer.print(controller.getMetadata().getJobName()); writer.print("\n"); writer.print("\n -----===== STATS =====-----\n"); writer.print(" Discovered: "); writer.print(Long.toString(discoveredUriCount())); writer.print("\n"); writer.print(" Queued: "); writer.print(Long.toString(queuedUriCount())); writer.print("\n"); writer.print(" Finished: "); writer.print(Long.toString(finishedUriCount())); writer.print("\n"); writer.print(" Successfully: "); writer.print(Long.toString(succeededFetchCount())); writer.print("\n"); writer.print(" Failed: "); writer.print(Long.toString(failedFetchCount())); writer.print("\n"); writer.print(" Disregarded: "); writer.print(Long.toString(disregardedUriCount())); writer.print("\n"); writer.print("\n -----===== QUEUES =====-----\n"); writer.print(" Already included size: "); writer.print(Long.toString(uriUniqFilter.count())); writer.print("\n"); writer.print(" pending: "); writer.print(Long.toString(uriUniqFilter.pending())); writer.print("\n"); writer.print("\n All class queues map size: "); writer.print(Long.toString(allCount)); writer.print("\n"); writer.print(" Active queues: "); writer.print(activeCount); writer.print("\n"); writer.print(" In-process: "); writer.print(inProcessCount); writer.print("\n"); writer.print(" Ready: "); writer.print(readyCount); writer.print("\n"); writer.print(" Snoozed: "); writer.print(snoozedCount); writer.print("\n"); writer.print(" Inactive queues: "); writer.print(inactiveCount); writer.print(" ("); Map<Integer, Queue<String>> inactives = getInactiveQueuesByPrecedence(); boolean betwixt = false; for (Integer k : inactives.keySet()) { if (betwixt) { writer.print("; "); } writer.print("p"); writer.print(k); writer.print(": "); writer.print(inactives.get(k).size()); betwixt = true; } writer.print(")\n"); writer.print(" Retired queues: "); writer.print(retiredCount); writer.print("\n"); writer.print(" Exhausted queues: "); writer.print(exhaustedCount); writer.print("\n"); State last = lastReachedState; writer.print("\n Last state: " + last); writer.print("\n -----===== MANAGER THREAD =====-----\n"); ToeThread.reportThread(managerThread, writer); writer.print("\n -----===== " + largestQueues.size() + " LONGEST QUEUES =====-----\n"); appendQueueReports(writer, "LONGEST", largestQueues.getEntriesDescending().iterator(), largestQueues.size(), largestQueues.size()); writer.print("\n -----===== IN-PROCESS QUEUES =====-----\n"); Collection<WorkQueue> inProcess = inProcessQueues; ArrayList<WorkQueue> copy = extractSome(inProcess, maxQueuesPerReportCategory); appendQueueReports(writer, "IN-PROCESS", copy.iterator(), copy.size(), maxQueuesPerReportCategory); writer.print("\n -----===== READY QUEUES =====-----\n"); appendQueueReports(writer, "READY", this.readyClassQueues.iterator(), this.readyClassQueues.size(), maxQueuesPerReportCategory); writer.print("\n -----===== SNOOZED QUEUES =====-----\n"); Object[] objs = snoozedClassQueues.toArray(); 
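        // DelayQueue's iterator/toArray() return elements in no particular order, so
        // copy into a typed array and sort into wake-time order before reporting.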
DelayedWorkQueue[] qs = Arrays.copyOf(objs, objs.length, DelayedWorkQueue[].class); Arrays.sort(qs); appendQueueReports(writer, "SNOOZED", new ObjectArrayIterator(qs), getSnoozedCount(), maxQueuesPerReportCategory); writer.print("\n -----===== INACTIVE QUEUES =====-----\n"); SortedMap<Integer, Queue<String>> sortedInactives = getInactiveQueuesByPrecedence(); for (Integer prec : sortedInactives.keySet()) { Queue<String> inactiveQueues = sortedInactives.get(prec); appendQueueReports(writer, "INACTIVE-p" + prec, inactiveQueues.iterator(), inactiveQueues.size(), maxQueuesPerReportCategory); } writer.print("\n -----===== RETIRED QUEUES =====-----\n"); appendQueueReports(writer, "RETIRED", getRetiredQueues().iterator(), getRetiredQueues().size(), maxQueuesPerReportCategory); writer.flush(); } /** Compact report of all nonempty queues (one queue per line) * * @param writer */ public void allNonemptyReportTo(PrintWriter writer) { ArrayList<WorkQueue> inProcessQueuesCopy; synchronized (this.inProcessQueues) { // grab a copy that will be stable against mods for report duration Collection<WorkQueue> inProcess = this.inProcessQueues; inProcessQueuesCopy = new ArrayList<WorkQueue>(inProcess); } writer.print("\n -----===== IN-PROCESS QUEUES =====-----\n"); queueSingleLinesTo(writer, inProcessQueuesCopy.iterator()); writer.print("\n -----===== READY QUEUES =====-----\n"); queueSingleLinesTo(writer, this.readyClassQueues.iterator()); writer.print("\n -----===== SNOOZED QUEUES =====-----\n"); queueSingleLinesTo(writer, this.snoozedClassQueues.iterator()); queueSingleLinesTo(writer, this.snoozedOverflow.values().iterator()); writer.print("\n -----===== INACTIVE QUEUES =====-----\n"); for (Queue<String> inactiveQueues : getInactiveQueuesByPrecedence().values()) { queueSingleLinesTo(writer, inactiveQueues.iterator()); } writer.print("\n -----===== RETIRED QUEUES =====-----\n"); queueSingleLinesTo(writer, getRetiredQueues().iterator()); } /** Compact report of all nonempty queues (one queue per line) * * @param writer */ public void allQueuesReportTo(PrintWriter writer) { queueSingleLinesTo(writer, allQueues.keySet().iterator()); } /** * Writer the single-line reports of all queues in the * iterator to the writer * * @param writer to receive report * @param iterator over queues of interest. */ private void queueSingleLinesTo(PrintWriter writer, Iterator<?> iterator) { Object obj; WorkQueue q; boolean legendWritten = false; while (iterator.hasNext()) { obj = iterator.next(); if (obj == null) { continue; } if (obj instanceof WorkQueue) { q = (WorkQueue) obj; } else if (obj instanceof DelayedWorkQueue) { q = ((DelayedWorkQueue) obj).getWorkQueue(this); } else { try { q = this.allQueues.get((String) obj); } catch (ClassCastException cce) { logger.log(Level.SEVERE, "not convertible to workqueue:" + obj, cce); q = null; } } if (q != null) { if (!legendWritten) { writer.println(q.shortReportLegend()); legendWritten = true; } q.shortReportLineTo(writer); } else { writer.print(" ERROR: " + obj); } } } /** * Extract some of the elements in the given collection to an * ArrayList. This method synchronizes on the given collection's * monitor. The returned list will never contain more than the * specified maximum number of elements. 
* * @param c the collection whose elements to extract * @param max the maximum number of elements to extract * @return the extraction */ private static <T> ArrayList<T> extractSome(Collection<T> c, int max) { // Try to guess a sane initial capacity for ArrayList // Hopefully given collection won't grow more than 10 items // between now and the synchronized block... int initial = Math.min(c.size() + 10, max); int count = 0; ArrayList<T> list = new ArrayList<T>(initial); synchronized (c) { Iterator<T> iter = c.iterator(); while (iter.hasNext() && (count < max)) { list.add(iter.next()); count++; } } return list; } /** * Append queue report to general Frontier report. * @param w PrintWriter to append to. * @param label category label used in the per-queue headings * @param iterator An iterator over queues (or queue names/entries) of interest * @param total total number of queues in this category * @param max maximum number of queues to report individually */ @SuppressWarnings("rawtypes") protected void appendQueueReports(PrintWriter w, String label, Iterator<?> iterator, int total, int max) { Object obj; WorkQueue q; int count; for (count = 0; iterator.hasNext() && (count < max); count++) { obj = iterator.next(); if (obj == null) { continue; } if (obj instanceof WorkQueue) { q = (WorkQueue) obj; } else if (obj instanceof DelayedWorkQueue) { q = (WorkQueue) ((DelayedWorkQueue) obj).getWorkQueue(this); } else if (obj instanceof Map.Entry) { q = this.allQueues.get((String) ((Map.Entry) obj).getKey()); } else { q = this.allQueues.get((String) obj); } if (q != null) { w.println(label + "#" + count + ":"); q.reportTo(w); } else { w.print("WARNING: No report for queue " + obj); } } count++; if (count < total) { w.print("...and " + (total - count) + " more " + label + ".\n"); } } /** * Force logging, etc. of operator-deleted CrawlURIs * * @see org.archive.crawler.framework.Frontier#deleted(org.archive.modules.CrawlURI) */ public void deleted(CrawlURI curi) { //treat as disregarded appCtx.publishEvent(new CrawlURIDispositionEvent(this, curi, DISREGARDED)); log(curi); incrementDisregardedUriCount(); curi.stripToMinimal(); curi.processingCleanup(); } public void considerIncluded(CrawlURI curi) { sheetOverlaysManager.applyOverlaysTo(curi); if (curi.getClassKey() == null) { // remedial processing preparer.prepare(curi); } this.uriUniqFilter.note(curi.getCanonicalString()); try { KeyedProperties.loadOverridesFrom(curi); curi.setClassKey(getClassKey(curi)); WorkQueue wq = getQueueFor(curi.getClassKey()); wq.expend(curi.getHolderCost()); wq.makeDirty(); } finally { KeyedProperties.clearOverridesFrom(curi); } } /** * Returns <code>true</code> if the WorkQueue implementation of this * Frontier stores its workload on disk instead of relying * on serialization mechanisms. * * TODO: rename! (this is a very misleading name) or kill (don't * see any implementations that return false) * * @return a constant boolean value for this class/instance */ protected abstract boolean workQueueDataOnDisk(); public long averageDepth() { if (inProcessQueues == null || readyClassQueues == null || snoozedClassQueues == null) { return 0; } int inProcessCount = inProcessQueues.size(); int readyCount = readyClassQueues.size(); int snoozedCount = getSnoozedCount(); int activeCount = inProcessCount + readyCount + snoozedCount; int inactiveCount = getTotalInactiveQueues(); int totalQueueCount = (activeCount + inactiveCount); return (totalQueueCount == 0) ?
0 : queuedUriCount.get() / totalQueueCount; } protected int getSnoozedCount() { return snoozedClassQueues.size() + snoozedOverflowCount.get(); } public float congestionRatio() { if (inProcessQueues == null || readyClassQueues == null || snoozedClassQueues == null) { return 0; } int inProcessCount = inProcessQueues.size(); int readyCount = readyClassQueues.size(); int snoozedCount = getSnoozedCount(); int activeCount = inProcessCount + readyCount + snoozedCount; int eligibleInactiveCount = getTotalEligibleInactiveQueues(); return (float) (activeCount + eligibleInactiveCount) / (inProcessCount + snoozedCount); } public long deepestUri() { return largestQueues.getTopSet().size() == 0 ? -1 : largestQueues.getTopSet().get(largestQueues.getLargest()); } /** * Return whether frontier is exhausted: all crawlable URIs done (none * waiting or pending). Only gives precise answer inside managerThread. * * @see org.archive.crawler.framework.Frontier#isEmpty() */ public boolean isEmpty() { return queuedUriCount.get() == 0 && (uriUniqFilter == null || uriUniqFilter.pending() == 0) && futureUriCount.get() == 0; } /* (non-Javadoc) * @see org.archive.crawler.frontier.AbstractFrontier#getInProcessCount() */ @Override protected int getInProcessCount() { return inProcessQueues.size(); } } // TODO: slim class! Suspect it should be < 800 lines, shedding budgeting/reporting