Source listing: org.archive.crawler.frontier.BdbFrontier (Java)

Part of the Heritrix web crawler (crawler.archive.org). The full source
of the BdbFrontier class follows.

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.frontier;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.openmbean.CompositeData;

import org.apache.commons.collections.Closure;
import org.apache.commons.io.IOUtils;
import org.archive.bdb.BdbModule;
import org.archive.bdb.DisposableStoredSortedMap;
import org.archive.bdb.StoredQueue;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.Supplier;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;

import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseException;

/**
 * A Frontier using several BerkeleyDB JE Databases to hold its record of
 * known hosts (queues), and pending URIs. 
 *
 * @author Gordon Mohr
 */
public class BdbFrontier extends WorkQueueFrontier implements Checkpointable, BeanNameAware {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    private static final Logger logger = Logger.getLogger(BdbFrontier.class.getName());

    /** 
     * All 'inactive' queues, not yet in active rotation.
     * Linked-list of keys for the queues.
     */
    protected SortedMap<Integer, Queue<String>> inactiveQueuesByPrecedence;

    /**
     * 'retired' queues, no longer considered for activation.
     * Linked-list of keys for queues.
     */
    protected StoredQueue<String> retiredQueues;

    /** all URIs scheduled to be crawled */
    protected transient BdbMultipleWorkQueues pendingUris;

    /** BDB module supplying databases, stored collections and object caches. */
    protected BdbModule bdb;

    @Autowired
    public void setBdbModule(BdbModule bdb) {
        this.bdb = bdb;
    }

    /** Spring bean name; used as the key under which checkpoint state is saved/loaded. */
    protected String beanName;

    public void setBeanName(String name) {
        this.beanName = name;
    }

    /** If true, dump all still-pending URIs to the log during finalTasks(). */
    protected boolean dumpPendingAtClose = false;

    public boolean getDumpPendingAtClose() {
        return dumpPendingAtClose;
    }

    public void setDumpPendingAtClose(boolean dumpPendingAtClose) {
        this.dumpPendingAtClose = dumpPendingAtClose;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.WorkQueueFrontier#getInactiveQueuesByPrecedence()
     */
    @Override
    protected SortedMap<Integer, Queue<String>> getInactiveQueuesByPrecedence() {
        return inactiveQueuesByPrecedence;
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.WorkQueueFrontier#getRetiredQueues()
     */
    @Override
    protected Queue<String> getRetiredQueues() {
        return retiredQueues;
    }

    /**
     * Create the single object (within which is one BDB database)
     * inside which all the other queues live. 
     * 
     * <p>When resuming from a checkpoint, the database is opened reusing
     * its prior on-disk data (and creation is disallowed); otherwise a
     * fresh database may be created.
     * 
     * @return the created BdbMultipleWorkQueues
     * @throws DatabaseException
     */
    protected BdbMultipleWorkQueues createMultipleWorkQueues() throws DatabaseException {
        Database db;
        // 'recycle' == resuming from a checkpoint: keep existing data
        boolean recycle = (recoveryCheckpoint != null);

        BdbModule.BdbConfig dbConfig = new BdbModule.BdbConfig();
        dbConfig.setAllowCreate(!recycle);
        // Make database deferred write: URLs that are added then removed 
        // before a page-out is required need never cause disk IO.
        // NOTE(review): deferred-write is not set on dbConfig here; presumably
        // bdb.openDatabase applies it -- confirm in BdbModule.
        db = bdb.openDatabase("pending", dbConfig, recycle);

        return new BdbMultipleWorkQueues(db, bdb.getClassCatalog());
    }

    /**
     * Return the work queue for the given classKey, creating it on first
     * use via the allQueues object cache (so at most one WorkQueue instance
     * exists per classKey).
     * 
     * @param classKey key to look for
     * @return the found WorkQueue
     */
    protected WorkQueue getQueueFor(final String classKey) {
        // getOrUse atomically returns the cached queue or builds one with
        // the supplier below
        WorkQueue wq = allQueues.getOrUse(classKey, new Supplier<WorkQueue>() {
            public BdbWorkQueue get() {
                String qKey = new String(classKey); // ensure private minimal key
                // NOTE(review): the copy above only mattered on pre-7u6 JVMs
                // where String.substring shared its backing array
                BdbWorkQueue q = new BdbWorkQueue(qKey, BdbFrontier.this);
                q.setTotalBudget(getQueueTotalBudget());
                // let the precedence policy assign the new queue's precedence
                getQueuePrecedencePolicy().queueCreated(q);
                return q;
            }
        });
        return wq;
    }

    @Override
    public FrontierGroup getGroup(CrawlURI curi) {
        // each URI's frontier group is its per-classKey work queue
        return getQueueFor(curi.getClassKey());
    }

    /**
     * Return list of urls.
     * @param marker resumption marker from a previous call (paging cursor)
     * @param numberOfMatches maximum number of matches to return
     * @param pattern regular expression a URI must match to be included
     * @param verbose 
     * @return List of URIs (strings).
     * @throws IllegalStateException wrapping any underlying DatabaseException
     */
    public CompositeData getURIsList(String marker, int numberOfMatches, String pattern, final boolean verbose) {
        try {
            Pattern p = Pattern.compile(pattern);
            return pendingUris.getFrom(marker, numberOfMatches, p, verbose);
        } catch (DatabaseException e) {
            throw new IllegalStateException(e);
        }
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.AbstractFrontier#finalTasks()
     */
    @Override
    protected void finalTasks() {
        super.finalTasks();
        // before closing/releasing, dump if requested
        if (getDumpPendingAtClose()) {
            try {
                dumpAllPendingToLog();
            } catch (Exception e) {
                // dump is best-effort; never let it block shutdown
                logger.log(Level.WARNING, "dump pending problem", e);
            }
        }
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.WorkQueueFrontier#close()
     */
    @Override
    public void close() {
        // release the pending-URIs database before superclass teardown
        ArchiveUtils.closeQuietly(pendingUris);
        super.close();
    }

    protected BdbMultipleWorkQueues getWorkQueues() {
        return pendingUris;
    }

    protected boolean workQueueDataOnDisk() {
        // per-queue URI data lives in BDB, not in memory
        return true;
    }

    public BdbFrontier() {
        super();
    }

    /**
     * Begin a checkpoint: take the disposition write-lock so no URI
     * dispositions occur while state is being saved. Released in
     * finishCheckpoint().
     */
    public void startCheckpoint(Checkpoint checkpointInProgress) {
        dispositionInProgressLock.writeLock().lock();
    }

    /**
     * Save frontier state for the given checkpoint: sync the pending-URIs
     * database, write simple counters as JSON, list all active queue keys,
     * and rotate the recovery log if one is in use.
     */
    public void doCheckpoint(Checkpoint checkpointInProgress) {
        // An explicit sync on any deferred write dbs is needed to make the
        // db recoverable. Sync'ing the environment is insufficient
        this.pendingUris.sync();
        // object caches will be sync()d by BdbModule

        // save simple instance fields & inactive-levels summary
        JSONObject json = new JSONObject();
        try {
            json.put("nextOrdinal", nextOrdinal.get());
            json.put("queuedUriCount", queuedUriCount.get());
            json.put("futureUriCount", futureUriCount.get());
            json.put("succeededFetchCount", succeededFetchCount.get());
            json.put("failedFetchCount", failedFetchCount.get());
            json.put("disregardedUriCount", disregardedUriCount.get());
            json.put("totalProcessedBytes", totalProcessedBytes.get());
            // record which precedence levels have inactive queues, so the
            // per-precedence stored queues can be reattached on recovery
            json.put("inactivePrecedences", inactiveQueuesByPrecedence.keySet());
            checkpointInProgress.saveJson(beanName, json);
        } catch (JSONException e) {
            // impossible
            throw new RuntimeException(e);
        }
        // write all active (inProcess, ready, snoozed) queues to list for quick-resume-use
        PrintWriter activeQueuesWriter = null;
        try {
            activeQueuesWriter = new PrintWriter(checkpointInProgress.saveWriter(beanName, "active"));
            for (WorkQueue q : inProcessQueues) {
                activeQueuesWriter.println(q.getClassKey());
            }
            for (String qk : readyClassQueues) {
                activeQueuesWriter.println(qk);
            }
            for (DelayedWorkQueue q : snoozedClassQueues) {
                activeQueuesWriter.println(q.getClassKey());
            }
            for (DelayedWorkQueue q : snoozedOverflow.values()) {
                activeQueuesWriter.println(q.getClassKey());
            }
        } catch (IOException ioe) {
            // mark the whole checkpoint failed rather than leave partial state
            checkpointInProgress.setSuccess(false);
            logger.log(Level.SEVERE, "problem writing checkpoint", ioe);
        } finally {
            IOUtils.closeQuietly(activeQueuesWriter);
        }
        // rotate recover log, if any
        if (this.recover != null) {
            recover.rotateForCheckpoint(checkpointInProgress);
        }
    }

    /** End a checkpoint: release the lock acquired in startCheckpoint(). */
    public void finishCheckpoint(Checkpoint checkpointInProgress) {
        dispositionInProgressLock.writeLock().unlock();
    }

    /** Checkpoint to resume from, if any; null means a fresh crawl. */
    protected Checkpoint recoveryCheckpoint;

    @Autowired(required = false)
    public void setRecoveryCheckpoint(Checkpoint checkpoint) {
        this.recoveryCheckpoint = checkpoint;
    }

    /**
     * Initialize the allQueues object cache; when resuming from a
     * checkpoint, also restore counters, inactive-queue levels, and the
     * list of previously-active queue keys saved by doCheckpoint().
     */
    @Override
    protected void initAllQueues() throws DatabaseException {
        boolean isRecovery = (recoveryCheckpoint != null);
        this.allQueues = bdb.getObjectCache("allqueues", isRecovery, WorkQueue.class, BdbWorkQueue.class);
        if (isRecovery) {
            // restore simple instance fields 
            JSONObject json = recoveryCheckpoint.loadJson(beanName);
            try {
                nextOrdinal.set(json.getLong("nextOrdinal"));
                queuedUriCount.set(json.getLong("queuedUriCount"));
                futureUriCount.set(json.getLong("futureUriCount"));
                succeededFetchCount.set(json.getLong("succeededFetchCount"));
                failedFetchCount.set(json.getLong("failedFetchCount"));
                disregardedUriCount.set(json.getLong("disregardedUriCount"));
                totalProcessedBytes.set(json.getLong("totalProcessedBytes"));
                JSONArray inactivePrecedences = json.getJSONArray("inactivePrecedences");
                // restore all intended inactiveQueues
                for (int i = 0; i < inactivePrecedences.length(); i++) {
                    int precedence = inactivePrecedences.getInt(i);
                    // 'true' == reuse the stored queue's prior on-disk data
                    inactiveQueuesByPrecedence.put(precedence, createInactiveQueueForPrecedence(precedence, true));
                }
            } catch (JSONException e) {
                throw new RuntimeException(e);
            }

            // retired queues already restored with prior data in initOtherQueues

            // restore ready queues (those not already on inactive, retired)
            BufferedReader activeQueuesReader = null;
            try {
                activeQueuesReader = recoveryCheckpoint.loadReader(beanName, "active");
                String line;
                while ((line = activeQueuesReader.readLine()) != null) {
                    // all formerly-active (inProcess/ready/snoozed) queues
                    // restart as simply 'ready'
                    readyClassQueues.add(line);
                }
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            } finally {
                IOUtils.closeQuietly(activeQueuesReader);
            }

            // TODO: restore largestQueues topNset?
        }
    }

    /**
     * Create the remaining queue collections: ready, inactive-by-precedence,
     * retired, snoozed (with BDB-backed overflow), future URIs, and the
     * master pending-URIs store. Reuses prior on-disk data when resuming
     * from a checkpoint.
     */
    @Override
    protected void initOtherQueues() throws DatabaseException {
        boolean recycle = (recoveryCheckpoint != null);

        // tiny risk of OutOfMemoryError: if giant number of snoozed
        // queues all wake-to-ready at once
        readyClassQueues = new LinkedBlockingQueue<String>();

        inactiveQueuesByPrecedence = new ConcurrentSkipListMap<Integer, Queue<String>>();

        retiredQueues = bdb.getStoredQueue("retiredQueues", String.class, recycle);

        // primary snoozed queues
        snoozedClassQueues = new DelayQueue<DelayedWorkQueue>();
        // just in case: overflow for extreme situations
        snoozedOverflow = bdb.getStoredMap("snoozedOverflow", Long.class, DelayedWorkQueue.class, true, false);

        this.futureUris = bdb.getStoredMap("futureUris", Long.class, CrawlURI.class, true,
                recoveryCheckpoint != null);

        // initialize master map in which other queues live
        this.pendingUris = createMultipleWorkQueues();
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.WorkQueueFrontier#createInactiveQueueForPrecedence(int)
     */
    @Override
    protected Queue<String> createInactiveQueueForPrecedence(int precedence) {
        return createInactiveQueueForPrecedence(precedence, false);
    }

    /** 
     * Optionally reuse prior data, for use when resuming from a checkpoint
     */
    protected Queue<String> createInactiveQueueForPrecedence(int precedence, boolean usePriorData) {
        return bdb.getStoredQueue("inactiveQueues-" + precedence, String.class, usePriorData);
    }

    /**
     * Dump all still-enqueued URIs to the crawl.log -- without actually
     * dequeuing. Useful for understanding what was remaining in a crawl that
     * was ended early, for example at a time limit.
     * 
     * @throws DatabaseException
     */
    public void dumpAllPendingToLog() throws DatabaseException {
        Closure tolog = new Closure() {
            public void execute(Object curi) {
                log((CrawlURI) curi);
            }
        };
        pendingUris.forAllPendingDo(tolog);
    }

    /**
     * Run a self-consistency check over queue collections, queues-of-queues, 
     * etc. for testing purposes. Requires one of the same locks as for PAUSE, 
     * so should only be run while crawl is running. 
     * 
     * <p>Builds a temporary BDB-backed map from queue key to a
     * comma-separated list of single-character marks, one per managed
     * collection the queue appears in, then reports anomalies (non-empty
     * queues in no collection; empty 'managed' queues in no collection)
     * and concerns (queues in more than one collection) to stderr.
     */
    public void consistencyCheck() {
        //        outboundLock.writeLock().lock(); 
        dispositionInProgressLock.writeLock().lock();
        System.err.println("<<<CHECKING FRONTIER CONSISTENCY");
        // temporary scratch map; disposed at the end of this method
        DisposableStoredSortedMap<String, String> queueSummaries = bdb.getStoredMap(null, String.class,
                String.class, false, false);
        // mark every queue with the 'managed' collections it's in
        consistencyMarkup(queueSummaries, inProcessQueues, "i");
        consistencyMarkup(queueSummaries, readyClassQueues, "r");
        consistencyMarkup(queueSummaries, snoozedClassQueues, "s");
        consistencyMarkup(queueSummaries, snoozedOverflow.values(), "S");
        for (Entry<Integer, Queue<String>> entry : getInactiveQueuesByPrecedence().entrySet()) {
            // inactive queues are marked with their precedence number
            consistencyMarkup(queueSummaries, entry.getValue(), Integer.toString(entry.getKey()));
        }
        consistencyMarkup(queueSummaries, retiredQueues, "R");

        // report problems where a queue isn't as expected or ideal
        int anomalies = 0;
        for (String q : allQueues.keySet()) {
            WorkQueue wq = allQueues.get(q);
            String summary = queueSummaries.get(q);
            if (wq.getCount() > 0 && summary == null) {
                // every non-empty queue should have been in at least one collection
                System.err.println("FRONTIER ANOMALY: " + q + " " + wq.getCount() + " " + wq.isManaged()
                        + " but not in managed collections");
                //                System.err.println(wq.shortReportLegend()+"\n"+inactiveByClass.get(q)+"\n"+wq.shortReportLine());
                anomalies++;
            }
            if (wq.getCount() == 0 && summary == null && wq.isManaged()) {
                // any empty queue should only report isManaged if in a collection
                System.err.println("FRONTIER ANOMALY: " + q + " " + wq.getCount() + " " + wq.isManaged()
                        + " but not in managed collections");
                //                System.err.println(wq.shortReportLegend()+"\n"+inactiveByClass.get(q)+"\n"+wq.shortReportLine());
                anomalies++;
            }
        }
        System.err.println(anomalies + " ANOMALIES");
        int concerns = 0;
        for (String q : queueSummaries.keySet()) {
            String summary = queueSummaries.get(q);
            if (summary != null && summary.split(",").length > 1) {
                // ideally queues won't be more than one place (though frontier
                // should operate if they are, and changing precedence values 
                // will cause multiple entries by design)
                WorkQueue wq = allQueues.get(q);
                System.err.println("FRONTIER CONCERN: " + q + " " + wq.getCount() + " multiple places: " + summary);
                System.err.println("\n" + wq.shortReportLegend() + "\n" + wq.shortReportLine());
                concerns++;
            }
        }
        System.err.println(concerns + " CONCERNS");
        System.err.println("END CHECKING FRONTIER>>>");

        queueSummaries.dispose();
        dispositionInProgressLock.writeLock().unlock();
        //        outboundLock.writeLock().unlock(); 
    }

    /**
     * Append 'mark' to each listed queue's comma-separated summary entry in
     * queueSummaries. Accepts collections of String keys, WorkQueues, or
     * DelayedWorkQueues, extracting the classKey as needed.
     */
    protected void consistencyMarkup(DisposableStoredSortedMap<String, String> queueSummaries, Iterable<?> queues,
            String mark) {
        for (Object qq : queues) {
            String key = (qq instanceof String) ? (String) qq
                    : (qq instanceof WorkQueue) ? ((WorkQueue) qq).getClassKey()
                            : ((DelayedWorkQueue) qq).getClassKey();
            String val = queueSummaries.get(key);
            // first mark stands alone; later marks are comma-appended
            val = (val == null) ? mark : val + "," + mark;
            queueSummaries.put(key, val);
        }
    }
}