com.cyberway.issue.crawler.frontier.BdbMultipleWorkQueues.java Source code


Introduction

Here is the source code for com.cyberway.issue.crawler.frontier.BdbMultipleWorkQueues.java, a BerkeleyDB-backed store of pending CrawlURIs used by the crawler frontier.

Source

/* BdbMultipleWorkQueues
 * 
 * Created on Dec 24, 2004
 *
 * Copyright (C) 2004 Internet Archive.
 * 
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 * 
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 * 
 * Heritrix is distributed in the hope that it will be useful, 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.cyberway.issue.crawler.frontier;

import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.collections.Closure;
import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.cyberway.issue.crawler.framework.FrontierMarker;
import com.cyberway.issue.util.ArchiveUtils;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.util.RuntimeExceptionWrapper;

/**
 * A BerkeleyDB-database-backed structure for holding ordered
 * groupings of CrawlURIs. Reading the groupings from specific
 * per-grouping (per-classKey/per-Host) starting points allows
 * this to act as a collection of independent queues. 
 * 
 * <p>For how the bdb keys are made, see {@link #calculateInsertKey(CrawlURI)}.
 * 
 * <p>TODO: refactor, improve naming.
 * 
 * @author gojomo
 */
public class BdbMultipleWorkQueues {
    private static final long serialVersionUID = ArchiveUtils.classnameBasedUID(BdbMultipleWorkQueues.class, 1);

    private static final Logger LOGGER = Logger.getLogger(BdbMultipleWorkQueues.class.getName());

    /** Database holding all pending URIs, grouped in virtual queues */
    private Database pendingUrisDB = null;

    /**  Supporting bdb serialization of CrawlURIs */
    private RecyclingSerialBinding crawlUriBinding;

    /**
     * Create the multi queue in the given environment. 
     * 
     * @param env bdb environment to use
     * @param classCatalog Class catalog to use.
     * @param recycle True if we are to reuse db content if any.
     * @throws DatabaseException
     */
    public BdbMultipleWorkQueues(Environment env, StoredClassCatalog classCatalog, final boolean recycle)
            throws DatabaseException {
        // Open the database. Create it if it does not already exist. 
        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        if (!recycle) {
            try {
                env.truncateDatabase(null, "pending", false);
            } catch (DatabaseNotFoundException e) {
                // Ignored
            }
        }
        // Make database deferred write: URLs that are added then removed 
        // before a page-out is required need never cause disk IO.
        dbConfig.setDeferredWrite(true);

        this.pendingUrisDB = env.openDatabase(null, "pending", dbConfig);
        crawlUriBinding = new RecyclingSerialBinding(classCatalog, CrawlURI.class);
    }

    /**
     * Delete all CrawlURIs matching the given expression.
     * 
     * @param match regular expression that CrawlURIs must match to be deleted
     * @param queue classKey naming the queue to scan
     * @param headKey key of the queue's head entry, where the scan begins
     * @return count of deleted items
     * @throws DatabaseException
     */
    public long deleteMatchingFromQueue(String match, String queue, DatabaseEntry headKey)
            throws DatabaseException {
        long deletedCount = 0;
        Pattern pattern = Pattern.compile(match);
        DatabaseEntry key = headKey;
        DatabaseEntry value = new DatabaseEntry();
        Cursor cursor = null;
        try {
            cursor = pendingUrisDB.openCursor(null, null);
            OperationStatus result = cursor.getSearchKeyRange(headKey, value, null);

            while (result == OperationStatus.SUCCESS) {
                if (value.getData().length > 0) {
                    CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
                    if (!curi.getClassKey().equals(queue)) {
                        // rolled into next queue; finished with this queue
                        break;
                    }
                    if (pattern.matcher(curi.toString()).matches()) {
                        cursor.delete();
                        deletedCount++;
                    }
                }
                result = cursor.getNext(key, value, null);
            }
        } finally {
            if (cursor != null) {
                cursor.close();
            }
        }

        return deletedCount;
    }

    /**
     * @param m marker indicating where to resume the scan
     * @param maxMatches maximum number of matches to return
     * @return list of matches starting from the marker position
     * @throws DatabaseException
     */
    public List getFrom(FrontierMarker m, int maxMatches) throws DatabaseException {
        int matches = 0;
        int tries = 0;
        ArrayList<CrawlURI> results = new ArrayList<CrawlURI>(maxMatches);
        BdbFrontierMarker marker = (BdbFrontierMarker) m;

        DatabaseEntry key = marker.getStartKey();
        DatabaseEntry value = new DatabaseEntry();

        if (key != null) {
            Cursor cursor = null;
            OperationStatus result = null;
            try {
                cursor = pendingUrisDB.openCursor(null, null);
                // NOTE: this mutates key, and thus also the marker, 
                // advancing the marker as a side-effect for future 
                // followup operations
                result = cursor.getSearchKey(key, value, null);

                while (matches < maxMatches && result == OperationStatus.SUCCESS) {
                    if (value.getData().length > 0) {
                        CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
                        if (marker.accepts(curi)) {
                            results.add(curi);
                            matches++;
                        }
                        tries++;
                    }
                    result = cursor.getNext(key, value, null);
                }
            } finally {
                if (cursor != null) {
                    cursor.close();
                }
            }

            if (result != OperationStatus.SUCCESS) {
                // end of scan
                marker.setStartKey(null);
            }
        }
        return results;
    }

    /**
     * Get a marker for beginning a scan over all contents.
     * 
     * @param regexpr regular expression that reported items must match
     * @return a marker pointing to the first item
     */
    public FrontierMarker getInitialMarker(String regexpr) {
        try {
            return new BdbFrontierMarker(getFirstKey(), regexpr);
        } catch (DatabaseException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * @return the key to the first item in the database
     * @throws DatabaseException
     */
    protected DatabaseEntry getFirstKey() throws DatabaseException {
        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry value = new DatabaseEntry();
        Cursor cursor = pendingUrisDB.openCursor(null, null);
        OperationStatus status = cursor.getNext(key, value, null);
        cursor.close();
        if (status == OperationStatus.SUCCESS) {
            return key;
        }
        return null;
    }

    /**
     * Get the next nearest item after the given key. Relies on 
     * external discipline -- the caller checks the queue's own count of how
     * many items it holds -- to avoid asking for an item from a range that
     * has none; otherwise the first item of the next 'queue' could be
     * returned by mistake. 
     * 
     * <p>TODO: hold within a queue's range
     * 
     * @param headKey Key prefix that demarks the beginning of the range
     * in <code>pendingUrisDB</code> we're interested in.
     * @return CrawlURI at the head of the queue, or null if none could be read.
     * @throws DatabaseException
     */
    public CrawlURI get(DatabaseEntry headKey) throws DatabaseException {
        DatabaseEntry result = new DatabaseEntry();

        // From Linda Lee of sleepycat:
        // "You want to check the status returned from Cursor.getSearchKeyRange
        // to make sure that you have OperationStatus.SUCCESS. In that case,
        // you have found a valid data record, and result.getData()
        // (called internally by the binding code, in this case) will be
        // non-null. The other possible status return is
        // OperationStatus.NOTFOUND, in which case no data record matched
        // the criteria. "
        OperationStatus status = getNextNearestItem(headKey, result);
        CrawlURI retVal = null;
        if (status != OperationStatus.SUCCESS) {
            LOGGER.severe("See '1219854 NPE je-2.0 " + "entryToObject...'. OperationStatus " + " was not SUCCESS: "
                    + status + ", headKey " + BdbWorkQueue.getPrefixClassKey(headKey.getData()));
            return null;
        }
        try {
            retVal = (CrawlURI) crawlUriBinding.entryToObject(result);
        } catch (RuntimeExceptionWrapper rw) {
            LOGGER.log(Level.SEVERE,
                    "expected object missing in queue " + BdbWorkQueue.getPrefixClassKey(headKey.getData()), rw);
            return null;
        }
        retVal.setHolderKey(headKey);
        return retVal;
    }

    protected OperationStatus getNextNearestItem(DatabaseEntry headKey, DatabaseEntry result)
            throws DatabaseException {
        Cursor cursor = null;
        OperationStatus status;
        try {
            cursor = this.pendingUrisDB.openCursor(null, null);
            // get cap; headKey at this point should always point to 
            // a queue-beginning cap entry (zero-length value)
            status = cursor.getSearchKey(headKey, result, null);
            if (status != OperationStatus.SUCCESS || result.getData().length > 0) {
                // cap missing
                throw new DatabaseException("bdb queue cap missing");
            }
            // get next item (real first item of queue)
            status = cursor.getNext(headKey, result, null);
        } finally {
            if (cursor != null) {
                cursor.close();
            }
        }
        return status;
    }
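
    // Illustrative record layout for one virtual queue (a sketch inferred from
    // calculateOriginKey()/calculateInsertKey(), not part of the original code):
    //
    //   <classKey bytes> 0x00                      -> zero-length 'cap' value
    //   <classKey bytes> 0x00 <8 ordering bytes>   -> serialized CrawlURI (head)
    //   <classKey bytes> 0x00 <8 ordering bytes>   -> serialized CrawlURI
    //   ...
    //
    // getNextNearestItem() positions the cursor on the cap entry and then steps
    // forward once to reach the queue's current head item.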

    /**
     * Put the given CrawlURI in at the appropriate place. 
     * 
     * @param curi CrawlURI to store
     * @param overwriteIfPresent if true, replace any existing entry under the
     * same key; if false, leave an existing entry untouched
     * @throws DatabaseException
     */
    public void put(CrawlURI curi, boolean overwriteIfPresent) throws DatabaseException {
        DatabaseEntry insertKey = (DatabaseEntry) curi.getHolderKey();
        if (insertKey == null) {
            insertKey = calculateInsertKey(curi);
            curi.setHolderKey(insertKey);
        }
        DatabaseEntry value = new DatabaseEntry();
        crawlUriBinding.objectToEntry(curi, value);
        // Output tally on avg. size if level is FINE or greater.
        if (LOGGER.isLoggable(Level.FINE)) {
            tallyAverageEntrySize(curi, value);
        }
        OperationStatus status;
        if (overwriteIfPresent) {
            status = pendingUrisDB.put(null, insertKey, value);
        } else {
            status = pendingUrisDB.putNoOverwrite(null, insertKey, value);
        }
        if (status != OperationStatus.SUCCESS) {
            LOGGER.severe("failed; " + status + " " + curi);
        }
    }

    private long entryCount = 0;
    private long entrySizeSum = 0;
    private int largestEntry = 0;

    /**
     * Log average size of database entry.
     * @param curi CrawlURI this entry is for.
     * @param value Database entry value.
     */
    private synchronized void tallyAverageEntrySize(CrawlURI curi, DatabaseEntry value) {
        entryCount++;
        int length = value.getData().length;
        entrySizeSum += length;
        int avg = (int) (entrySizeSum / entryCount);
        if (entryCount % 1000 == 0) {
            LOGGER.fine("Average entry size at " + entryCount + ": " + avg);
        }
        if (length > largestEntry) {
            largestEntry = length;
            LOGGER.fine("Largest entry: " + length + " " + curi);
            if (length > (2 * avg)) {
                LOGGER.fine("excessive?");
            }
        }
    }

    /**
     * Calculate the 'origin' key for a virtual queue of items
     * with the given classKey. This origin key will be a 
     * prefix of the keys for all items in the queue. 
     * 
     * @param classKey String key to derive origin byte key from 
     * @return a byte array key 
     */
    static byte[] calculateOriginKey(String classKey) {
        byte[] classKeyBytes = null;
        int len = 0;
        try {
            classKeyBytes = classKey.getBytes("UTF-8");
            len = classKeyBytes.length;
        } catch (UnsupportedEncodingException e) {
            // should be impossible; all JVMs must support UTF-8
            e.printStackTrace();
        }
        byte[] keyData = new byte[len + 1];
        System.arraycopy(classKeyBytes, 0, keyData, 0, len);
        keyData[len] = 0;
        return keyData;
    }

    /**
     * Calculate the insertKey that places a CrawlURI in the
     * desired spot. First bytes are always classKey (usu. host)
     * based -- ensuring grouping by host -- terminated by a zero
     * byte. Then 8 bytes of data ensuring desired ordering 
     * within that 'queue' are used. The first byte of these 8 is
     * priority -- allowing 'immediate' and 'soon' items to 
     * sort above regular. Next 1 byte is 'cost'. Last 6 bytes 
     * are ordinal serial number, ensuring earlier-discovered 
     * URIs sort before later. 
     * 
     * NOTE: Dangers here are:
     * (1) priorities or costs over 2^7 (signed byte comparison)
     * (2) ordinals over 2^48
     * 
     * Package access & static for testing purposes. 
     * 
     * @param curi
     * @return a DatabaseEntry key for the CrawlURI
     */
    static DatabaseEntry calculateInsertKey(CrawlURI curi) {
        byte[] classKeyBytes = null;
        int len = 0;
        try {
            classKeyBytes = curi.getClassKey().getBytes("UTF-8");
            len = classKeyBytes.length;
        } catch (UnsupportedEncodingException e) {
            // should be impossible; all JVMs must support UTF-8
            e.printStackTrace();
        }
        byte[] keyData = new byte[len + 9];
        System.arraycopy(classKeyBytes, 0, keyData, 0, len);
        keyData[len] = 0;
        long ordinalPlus = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL;
        ordinalPlus = ((long) curi.getSchedulingDirective() << 56) | ordinalPlus;
        ordinalPlus = ((((long) curi.getHolderCost()) & 0xFFL) << 48) | ordinalPlus;
        ArchiveUtils.longIntoByteArray(ordinalPlus, keyData, len + 1);
        return new DatabaseEntry(keyData);
    }
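
    // Illustrative key layout with hypothetical values (assuming
    // ArchiveUtils.longIntoByteArray writes the long big-endian): for classKey
    // "example.com", schedulingDirective 0, holderCost 1 and ordinal 5, the
    // insert key bytes would be
    //
    //   'e','x','a','m','p','l','e','.','c','o','m',  // classKey (UTF-8)
    //   0x00,                                         // terminating zero byte
    //   0x00,                                         // scheduling directive (priority)
    //   0x01,                                         // cost
    //   0x00, 0x00, 0x00, 0x00, 0x00, 0x05            // 48-bit ordinal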

    /**
     * Delete the given CrawlURI from the persistent store. Requires that
     * the key under which it was stored be available. 
     * 
     * @param item CrawlURI to delete
     * @throws DatabaseException
     */
    public void delete(CrawlURI item) throws DatabaseException {
        OperationStatus status;
        status = pendingUrisDB.delete(null, (DatabaseEntry) item.getHolderKey());
        if (status != OperationStatus.SUCCESS) {
            LOGGER.severe("expected item not present: " + item + "("
                    + (new BigInteger(((DatabaseEntry) item.getHolderKey()).getData())).toString(16) + ")");
        }

    }

    /**
     * Method used by BdbFrontier during checkpointing.
     * <p>The backing bdbje database has been marked deferred write so we save
     * on writes to disk. That means there is no guarantee the disk has what is
     * in memory unless a sync is called (calling sync on the bdbje Environment
     * is not sufficient).
     * <p>Package access only because only Frontiers of this package would ever
     * need access.
     * @see <a href="http://www.sleepycat.com/jedocs/GettingStartedGuide/DB.html">Deferred Write Databases</a>
     */
    void sync() {
        if (this.pendingUrisDB == null) {
            return;
        }
        try {
            this.pendingUrisDB.sync();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Clean up: close the backing pending-URIs database.
     */
    public void close() {
        try {
            this.pendingUrisDB.close();
        } catch (DatabaseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Marker for remembering a position within the BdbMultipleWorkQueues.
     * 
     * @author gojomo
     */
    public class BdbFrontierMarker implements FrontierMarker {
        DatabaseEntry startKey;
        Pattern pattern;
        int nextItemNumber;

        /**
         * Create a marker pointed at the given start location.
         * 
         * @param startKey key at which the marker starts
         * @param regexpr regular expression that accepted items must match
         */
        public BdbFrontierMarker(DatabaseEntry startKey, String regexpr) {
            this.startKey = startKey;
            pattern = Pattern.compile(regexpr);
            nextItemNumber = 1;
        }

        /**
         * @param curi
         * @return whether the marker accepts the given CrawlURI
         */
        public boolean accepts(CrawlURI curi) {
            boolean retVal = pattern.matcher(curi.toString()).matches();
            if (retVal) {
                nextItemNumber++;
            }
            return retVal;
        }

        /**
         * @param key position for marker
         */
        public void setStartKey(DatabaseEntry key) {
            startKey = key;
        }

        /**
         * @return startKey
         */
        public DatabaseEntry getStartKey() {
            return startKey;
        }

        /* (non-Javadoc)
         * @see com.cyberway.issue.crawler.framework.FrontierMarker#getMatchExpression()
         */
        public String getMatchExpression() {
            return pattern.pattern();
        }

        /* (non-Javadoc)
         * @see com.cyberway.issue.crawler.framework.FrontierMarker#getNextItemNumber()
         */
        public long getNextItemNumber() {
            return nextItemNumber;
        }

        /* (non-Javadoc)
         * @see com.cyberway.issue.crawler.framework.FrontierMarker#hasNext()
         */
        public boolean hasNext() {
            // as long as a startKey is set, consider there to be more items
            return startKey != null;
        }
    }

    /**
     * Add a dummy 'cap' entry at the given insertion key. Prevents
     * 'seeks' to queue heads from holding a lock on the last item of 
     * the 'preceding' queue. See:
     * http://sourceforge.net/tracker/index.php?func=detail&aid=1262665&group_id=73833&atid=539102
     * 
     * @param origin key at which to insert the cap
     */
    public void addCap(byte[] origin) {
        try {
            pendingUrisDB.put(null, new DatabaseEntry(origin), new DatabaseEntry(new byte[0]));
        } catch (DatabaseException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Utility method to perform an action for all pending CrawlURI instances.
     * @param c Closure action to perform
     * @throws DatabaseException
     */
    protected void forAllPendingDo(Closure c) throws DatabaseException {
        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry value = new DatabaseEntry();
        Cursor cursor = pendingUrisDB.openCursor(null, null);
        try {
            while (cursor.getNext(key, value, null) == OperationStatus.SUCCESS) {
                if (value.getData().length == 0) {
                    // skip zero-length queue-head 'cap' entries
                    continue;
                }
                CrawlURI item = (CrawlURI) crawlUriBinding.entryToObject(value);
                c.execute(item);
            }
        } finally {
            cursor.close();
        }
    }
}
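
Usage example

The sketch below is not part of the original file; it shows one way the class might be driven, assuming an already-open bdbje Environment, a StoredClassCatalog, and a CrawlURI whose classKey has been set. Because calculateOriginKey and sync have package access, the sketch lives in the same com.cyberway.issue.crawler.frontier package; the class name PendingQueuesSketch and the method roundTrip are illustrative placeholders.

package com.cyberway.issue.crawler.frontier;

import com.cyberway.issue.crawler.datamodel.CrawlURI;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;

/** Usage sketch; env, catalog and curi are supplied by the caller. */
class PendingQueuesSketch {
    static void roundTrip(Environment env, StoredClassCatalog catalog, CrawlURI curi)
            throws DatabaseException {
        // Open (or create) the shared 'pending' database; recycle=false starts fresh.
        BdbMultipleWorkQueues pending = new BdbMultipleWorkQueues(env, catalog, false);

        // Each virtual queue needs its zero-length 'cap' entry before reads.
        byte[] origin = BdbMultipleWorkQueues.calculateOriginKey(curi.getClassKey());
        pending.addCap(origin);

        // Enqueue; the insert key groups the URI with others sharing its classKey.
        pending.put(curi, false);

        // Read the head item of that queue, then remove it once handled.
        CrawlURI next = pending.get(new DatabaseEntry(origin));
        if (next != null) {
            pending.delete(next);
        }

        // Flush deferred writes, then close.
        pending.sync();
        pending.close();
    }
}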