org.archive.crawler.frontier.BdbMultipleWorkQueues.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.crawler.frontier.BdbMultipleWorkQueues.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.frontier;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.management.openmbean.ArrayType;
import javax.management.openmbean.CompositeData;
import javax.management.openmbean.CompositeDataSupport;
import javax.management.openmbean.CompositeType;
import javax.management.openmbean.OpenDataException;
import javax.management.openmbean.OpenType;
import javax.management.openmbean.SimpleType;

import org.apache.commons.collections.Closure;
import org.archive.bdb.KryoBinding;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;

import com.google.common.base.Charsets;
import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.util.RuntimeExceptionWrapper;

/**
 * A BerkeleyDB-database-backed structure for holding ordered
 * groupings of CrawlURIs. Reading the groupings from specific
 * per-grouping (per-classKey/per-Host) starting points allows
 * this to act as a collection of independent queues. 
 * 
 * <p>For how the bdb keys are made, see {@link #calculateInsertKey(CrawlURI)}.
 * 
 * <p>TODO: refactor, improve naming.
 * 
 * @author gojomo
 */
public class BdbMultipleWorkQueues {
    // Retained from the original; nothing in this class reads it.
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1L;

    private static final Logger LOGGER = Logger.getLogger(BdbMultipleWorkQueues.class.getName());

    /** Database holding all pending URIs, grouped in virtual queues. */
    private final Database pendingUrisDB;

    /** Binding used to convert CrawlURIs to and from bdb entries. */
    private final EntryBinding<CrawlURI> crawlUriBinding;

    /**
     * Create the multi queue on top of the given database.
     *
     * @param db database that holds (or will hold) the pending URIs; stored
     * as-is, without validation
     * @param classCatalog class catalog; not used by this constructor, which
     * always serializes CrawlURIs through a {@link KryoBinding}. Kept so
     * existing callers continue to compile.
     * @throws DatabaseException declared for callers; the current body does
     * not perform any database operation
     */
    public BdbMultipleWorkQueues(Database db, StoredClassCatalog classCatalog) throws DatabaseException {
        this.pendingUrisDB = db;
        this.crawlUriBinding = new KryoBinding<CrawlURI>(CrawlURI.class);
    }

    /**
     * Delete all CrawlURIs of one queue whose string form matches the given
     * regular expression.
     *
     * <p>The scan starts at the first entry whose key is at or after
     * {@code headKey} and stops as soon as an entry with a different class
     * key is read. Zero-length entries are skipped without being deleted.
     *
     * @param match regular expression; an item is deleted when it matches the
     * whole of the item's {@code toString()}
     * @param queue class key of the queue to delete from
     * @param headKey key at which to start the scan; this entry object is
     * left untouched
     * @return count of deleted items
     * @throws DatabaseException if a cursor operation fails
     */
    public long deleteMatchingFromQueue(String match, String queue, DatabaseEntry headKey)
            throws DatabaseException {
        long deletedCount = 0;
        Pattern pattern = Pattern.compile(match);
        // The cursor calls below use the key entry as an output parameter.
        // Scan with a private entry so the caller's headKey is not replaced
        // by whatever key the scan happens to stop at.
        DatabaseEntry key = new DatabaseEntry(headKey.getData(), headKey.getOffset(), headKey.getSize());
        DatabaseEntry value = new DatabaseEntry();
        Cursor cursor = null;
        try {
            cursor = pendingUrisDB.openCursor(null, null);
            OperationStatus result = cursor.getSearchKeyRange(key, value, null);

            while (result == OperationStatus.SUCCESS) {
                if (value.getData().length > 0) {
                    CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
                    if (!curi.getClassKey().equals(queue)) {
                        // rolled into next queue; finished with this queue
                        break;
                    }
                    if (pattern.matcher(curi.toString()).matches()) {
                        cursor.delete();
                        deletedCount++;
                    }
                }
                result = cursor.getNext(key, value, null);
            }
        } finally {
            if (cursor != null) {
                cursor.close();
            }
        }

        return deletedCount;
    }

    /**
     * List pending CrawlURIs whose string form matches a pattern, starting
     * at a marker position.
     *
     * @param m marker returned by a previous call, or null to start with the
     * first entry in the database
     * @param maxMatches maximum number of matching items to return
     * @param pattern an item is reported when this pattern matches the whole
     * of the item's {@code toString()}
     * @param verbose if true each result is the item's class key in square
     * brackets followed by its {@code shortReportLine()}; otherwise each
     * result is the item's {@code toString()}
     * @return composite with two items: "list" (a {@code String[]} of
     * matches) and "marker" (the position to resume from, or null once the
     * scan has run off the end or found nothing at the marker)
     * @throws DatabaseException if a cursor operation fails
     */
    public CompositeData getFrom(String m, int maxMatches, Pattern pattern, boolean verbose)
            throws DatabaseException {
        // Not pre-sized with maxMatches: callers may pass a very large limit.
        ArrayList<String> results = new ArrayList<String>();

        // Keys are raw bytes, not text. ISO-8859-1 maps every byte value to
        // exactly one char, so a marker survives the byte[] -> String ->
        // byte[] round trip unchanged.
        Charset markerCharset = Charset.forName("ISO-8859-1");

        DatabaseEntry key;
        if (m == null) {
            key = getFirstKey();
        } else {
            key = new DatabaseEntry(m.getBytes(markerCharset));
        }

        OperationStatus result = OperationStatus.NOTFOUND;
        // key is null when the database is empty; there is nothing to scan.
        if (key != null) {
            DatabaseEntry value = new DatabaseEntry();
            Cursor cursor = null;
            try {
                cursor = pendingUrisDB.openCursor(null, null);
                result = cursor.getSearchKey(key, value, null);

                while (results.size() < maxMatches && result == OperationStatus.SUCCESS) {
                    if (value.getData().length > 0) {
                        CrawlURI curi = (CrawlURI) crawlUriBinding.entryToObject(value);
                        if (pattern.matcher(curi.toString()).matches()) {
                            if (verbose) {
                                results.add("[" + curi.getClassKey() + "] " + curi.shortReportLine());
                            } else {
                                results.add(curi.toString());
                            }
                        }
                    }
                    result = cursor.getNext(key, value, null);
                }
            } finally {
                if (cursor != null) {
                    cursor.close();
                }
            }
        }

        // On SUCCESS the cursor has already stepped to the first entry that
        // was not examined; its key is where the next call should resume.
        String nextMarker = null;
        if (result == OperationStatus.SUCCESS) {
            nextMarker = new String(key.getData(), markerCharset);
        }

        String[] arr = results.toArray(new String[results.size()]);
        try {
            return new CompositeDataSupport(uriListType(),
                    new String[] { "list", "marker" }, new Object[] { arr, nextMarker });
        } catch (OpenDataException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Build the open type describing the composite returned by
     * {@link #getFrom(String, int, Pattern, boolean)}: a "list" item holding
     * a one-dimensional string array and a "marker" item holding a string.
     * CompositeDataSupport rejects a null type, so one has to be supplied.
     */
    private static CompositeType uriListType() throws OpenDataException {
        return new CompositeType(
                "UriListData",
                "Pending URIs matching a pattern, plus a marker for resuming the scan",
                new String[] { "list", "marker" },
                new String[] { "matching URIs", "position to resume from; null at end of scan" },
                new OpenType<?>[] { new ArrayType<String[]>(1, SimpleType.STRING), SimpleType.STRING });
    }

    /**
     * Look up the key of the first item in the database.
     *
     * @return the key to the first item in the database, or null if no
     * record could be read (for example because the database is empty)
     * @throws DatabaseException if opening, advancing or closing the cursor
     * fails
     */
    protected DatabaseEntry getFirstKey() throws DatabaseException {
        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry value = new DatabaseEntry();
        Cursor cursor = pendingUrisDB.openCursor(null, null);
        try {
            OperationStatus status = cursor.getNext(key, value, null);
            if (status == OperationStatus.SUCCESS) {
                return key;
            }
            return null;
        } finally {
            // Close even when getNext throws, so the cursor is never leaked.
            cursor.close();
        }
    }

    /**
     * Fetch the item that follows the given queue-head key.
     *
     * <p>Nothing here keeps the lookup inside one queue's key range: callers
     * are expected to consult the queue's own item count first, otherwise
     * the first item of the next 'queue' could be returned by mistake.
     *
     * <p>TODO: hold within a queue's range
     *
     * @param headKey Key prefix that demarks the beginning of the range
     * in <code>pendingUrisDB</code> we're interested in. On success it is
     * also installed as the returned item's holder key.
     * @return the CrawlURI read, or null (after logging at SEVERE) when the
     * lookup did not succeed or the stored value could not be turned into a
     * CrawlURI
     * @throws DatabaseException if the underlying lookup fails
     */
    public CrawlURI get(DatabaseEntry headKey) throws DatabaseException {
        final DatabaseEntry fetched = new DatabaseEntry();

        // Advice from sleepycat: only an OperationStatus.SUCCESS guarantees
        // that a data record was found and that the entry's data is non-null
        // for the binding; NOTFOUND means nothing matched.
        final OperationStatus outcome = getNextNearestItem(headKey, fetched);
        if (outcome != OperationStatus.SUCCESS) {
            LOGGER.severe("See '1219854 NPE je-2.0 " + "entryToObject...'. OperationStatus " + " was not SUCCESS: "
                    + outcome + ", headKey " + BdbWorkQueue.getPrefixClassKey(headKey.getData()));
            return null;
        }

        final CrawlURI curi;
        try {
            curi = (CrawlURI) crawlUriBinding.entryToObject(fetched);
        } catch (ClassCastException cce) {
            // Deserialize once more, untyped, to report what actually came back.
            Object unexpected = crawlUriBinding.entryToObject(fetched);
            LOGGER.log(Level.SEVERE, "see [#HER-1283]: deserialized " + unexpected.getClass() + " has ClassLoader "
                    + unexpected.getClass().getClassLoader().getClass(), cce);
            return null;
        } catch (RuntimeExceptionWrapper rw) {
            LOGGER.log(Level.SEVERE,
                    "expected object missing in queue " + BdbWorkQueue.getPrefixClassKey(headKey.getData()), rw);
            return null;
        }

        curi.setHolderKey(headKey);
        return curi;
    }

    /**
     * Position on the queue's cap entry and read the record that follows it.
     *
     * @param headKey key of the queue-beginning cap entry; used as the key
     * argument of both cursor calls
     * @param result entry that receives the data of whichever record was
     * read last
     * @return the status of the cap lookup if that lookup did not succeed;
     * {@link OperationStatus#KEYEXIST} if the record found at
     * {@code headKey} has a non-empty value; otherwise the status of reading
     * the following record
     * @throws DatabaseException if a cursor operation fails
     */
    protected OperationStatus getNextNearestItem(DatabaseEntry headKey, DatabaseEntry result)
            throws DatabaseException {
        final Cursor cursor = this.pendingUrisDB.openCursor(null, null);
        try {
            // headKey should always name a cap entry, i.e. a record whose
            // value is zero-length
            final OperationStatus capStatus = cursor.getSearchKey(headKey, result, null);
            if (capStatus != OperationStatus.SUCCESS) {
                LOGGER.severe("bdb queue cap missing: " + capStatus.toString() + " " + new String(headKey.getData()));
                return capStatus;
            }

            final int capSize = result.getData().length;
            if (capSize > 0) {
                LOGGER.severe("bdb queue has nonzero size: " + capSize);
                return OperationStatus.KEYEXIST;
            }

            // step past the cap to the real first item of the queue
            return cursor.getNext(headKey, result, null);
        } finally {
            cursor.close();
        }
    }

    /**
     * Store the given CrawlURI under its holder key, computing and
     * remembering that key first if the item does not have one yet.
     *
     * @param curi item to store
     * @param overwriteIfPresent true to write with {@code put}, false to
     * write with {@code putNoOverwrite}
     * @throws DatabaseException if the write fails
     */
    public void put(CrawlURI curi, boolean overwriteIfPresent) throws DatabaseException {
        DatabaseEntry key = (DatabaseEntry) curi.getHolderKey();
        if (key == null) {
            key = calculateInsertKey(curi);
            curi.setHolderKey(key);
        }

        final DatabaseEntry serialized = new DatabaseEntry();
        crawlUriBinding.objectToEntry(curi, serialized);

        // Entry-size statistics are only gathered when FINE logging is on.
        if (LOGGER.isLoggable(Level.FINE)) {
            tallyAverageEntrySize(curi, serialized);
        }

        final OperationStatus outcome = overwriteIfPresent
                ? pendingUrisDB.put(null, key, serialized)
                : pendingUrisDB.putNoOverwrite(null, key, serialized);
        if (outcome == OperationStatus.SUCCESS) {
            return;
        }

        // The exception is created only to attach a stack trace to the log record.
        LOGGER.log(Level.SEVERE, "URI enqueueing failed; " + outcome + " " + curi, new RuntimeException());
    }

    /** Number of entries tallied so far. */
    private long entryCount = 0;
    /** Sum of the value lengths of all tallied entries. */
    private long entrySizeSum = 0;
    /** Largest value length tallied so far. */
    private int largestEntry = 0;

    /**
     * Update the entry-size statistics with one more entry and log them at
     * FINE: the running average on every 1000th entry, and every new
     * largest entry as it is seen.
     *
     * @param curi CrawlURI this entry is for.
     * @param value Database entry value.
     */
    private synchronized void tallyAverageEntrySize(CrawlURI curi, DatabaseEntry value) {
        entryCount += 1;
        final int size = value.getData().length;
        entrySizeSum += size;
        final int average = (int) (entrySizeSum / entryCount);

        final boolean reportAverage = (entryCount % 1000 == 0);
        if (reportAverage) {
            LOGGER.fine("Average entry size at " + entryCount + ": " + average);
        }

        if (size <= largestEntry) {
            return;
        }
        largestEntry = size;
        LOGGER.fine("Largest entry: " + size + " " + curi);
        if (size > (2 * average)) {
            LOGGER.fine("excessive?");
        }
    }

    /**
     * Calculate the 'origin' key for a virtual queue of items
     * with the given classKey: the UTF-8 bytes of the classKey followed by
     * a single zero byte. This origin key will be a prefix of the keys for
     * all items in the queue.
     *
     * @param classKey String key to derive origin byte key from
     * @return a byte array key, one byte longer than the UTF-8 form of
     * {@code classKey}
     */
    protected static byte[] calculateOriginKey(String classKey) {
        // Encoding through a Charset object cannot throw a checked exception,
        // so there is no failure path that leaves the byte array null.
        byte[] classKeyBytes = classKey.getBytes(Charset.forName("UTF-8"));
        byte[] keyData = new byte[classKeyBytes.length + 1];
        System.arraycopy(classKeyBytes, 0, keyData, 0, classKeyBytes.length);
        keyData[classKeyBytes.length] = 0;
        return keyData;
    }

    /**
     * Build the database key that places a CrawlURI at its spot in its
     * queue.
     *
     * <p>The key is the UTF-8 bytes of the classKey (usu. host), which
     * groups items by queue, then a zero byte, then 8 bytes produced from a
     * single long that fixes the ordering within the queue. In that long:
     * <ul>
     * <li>the top 8 bits hold the scheduling directive, so 'immediate' and
     * 'soon' items sort above regular ones;</li>
     * <li>the next 8 bits hold the precedence, capped at 127;</li>
     * <li>the low 48 bits hold the ordinal serial number, so that
     * earlier-discovered URIs sort before later ones.</li>
     * </ul>
     *
     * NOTE: Dangers here are:
     * (1) priorities or precedences over 2^7 (signed byte comparison)
     * (2) ordinals over 2^48
     *
     * Static for testing purposes.
     *
     * @param curi item to compute the key for
     * @return a DatabaseEntry key for the CrawlURI
     */
    protected static DatabaseEntry calculateInsertKey(CrawlURI curi) {
        final byte[] prefix = curi.getClassKey().getBytes(Charsets.UTF_8);
        final int prefixLength = prefix.length;

        // prefix + 1 terminator byte + 8 ordering bytes
        final byte[] keyData = new byte[prefixLength + 9];
        System.arraycopy(prefix, 0, keyData, 0, prefixLength);
        keyData[prefixLength] = 0;

        final long ordinalBits = curi.getOrdinal() & 0x0000FFFFFFFFFFFFL;
        final long directiveBits = (long) curi.getSchedulingDirective() << 56;
        final long cappedPrecedence = Math.min(curi.getPrecedence(), 127);
        final long precedenceBits = (cappedPrecedence & 0xFFL) << 48;

        final long ordering = precedenceBits | (directiveBits | ordinalBits);
        ArchiveUtils.longIntoByteArray(ordering, keyData, prefixLength + 1);
        return new DatabaseEntry(keyData);
    }

    /**
     * Render an insert key for diagnostics as the class-key text followed by
     * {@code " blah="} and the 8 bytes after the zero terminator read as one
     * big-endian long.
     *
     * <p>A key with no zero byte is treated as consisting of the class key
     * only. When fewer than 8 bytes follow the terminator the number is
     * reported as 0.
     *
     * @param holderKey key to render
     * @return printable form of the key
     */
    protected static String insertKeyToString(DatabaseEntry holderKey) {
        byte[] data = holderKey.getData();
        int terminator = findFirstZero(data);
        // Without this guard a missing terminator (-1) would be passed on as
        // a negative length and fail with an index exception.
        int classKeyLength = (terminator < 0) ? data.length : terminator;

        StringBuilder result = new StringBuilder();
        // Insert keys are written from the UTF-8 bytes of the class key, so
        // decode with the same charset rather than the platform default.
        result.append(new String(data, 0, classKeyLength, Charset.forName("UTF-8")));

        long ordering = 0;
        int start = classKeyLength + 1;
        if (data.length - start >= 8) {
            for (int i = start; i < start + 8; i++) {
                ordering = (ordering << 8) | (data[i] & 0xFFL);
            }
        }

        result.append(" blah=").append(ordering);

        return result.toString();
    }

    /**
     * Find the first zero byte in the given array.
     *
     * @param b bytes to search
     * @return index of the first zero byte, or -1 if there is none
     */
    private static int findFirstZero(byte[] b) {
        for (int i = 0; i < b.length; i++) {
            if (b[i] == 0) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Remove the given CrawlURI from the persistent store, using the holder
     * key recorded on the item. A delete that does not succeed is logged at
     * SEVERE together with the key in hexadecimal; nothing is thrown for it.
     *
     * @param item item to remove; its holder key must be available
     * @throws DatabaseException if the delete operation fails
     */
    public void delete(CrawlURI item) throws DatabaseException {
        final DatabaseEntry holderKey = (DatabaseEntry) item.getHolderKey();
        final OperationStatus outcome = pendingUrisDB.delete(null, holderKey);
        if (outcome == OperationStatus.SUCCESS) {
            return;
        }
        LOGGER.severe("expected item not present: " + item + "("
                + (new BigInteger(((DatabaseEntry) item.getHolderKey()).getData())).toString(16) + ")");
    }

    /**
     * Method used by BdbFrontier during checkpointing.
     * <p>The backing bdbje database has been marked deferred write so we save
     * on writes to disk.  Means no guarantees disk will have what's in memory
     * unless a sync is called (calling sync on the bdbje Environment is not
     * sufficient).
     * <p>Does nothing when there is no database. A failing sync is logged at
     * SEVERE and otherwise ignored, so callers never see an exception from
     * this method.
     * <p>Protected because only Frontiers of this package would ever
     * need access.
     * @see <a href="http://www.sleepycat.com/jedocs/GettingStartedGuide/DB.html">Deferred Write Databases</a>
     */
    protected void sync() {
        if (this.pendingUrisDB == null) {
            return;
        }
        try {
            this.pendingUrisDB.sync();
        } catch (DatabaseException e) {
            // Still best-effort, but report through the class logger so the
            // failure lands in the crawl logs instead of on stderr.
            LOGGER.log(Level.SEVERE, "sync of pending URIs database failed", e);
        }
    }

    /**
     * Clean up. Currently a no-op: the only statements in the body are
     * commented out, so this method does not close the backing database.
     * NOTE(review): presumably whoever supplied the database also closes
     * it -- confirm against the callers.
     */
    public void close() {
        /*        try {
        this.pendingUrisDB.close();
                } catch (DatabaseException e) {
        e.printStackTrace();
                } */
    }

    /**
     * Write a dummy 'cap' record -- the given key with a zero-length
     * value -- so that 'seeks' to a queue head do not end up holding a lock
     * on the last item of the 'preceding' queue. See:
     * http://sourceforge.net/tracker/index.php?func=detail&aid=1262665&group_id=73833&atid=539102
     *
     * @param origin key at which to insert the cap
     * @throws RuntimeException wrapping any DatabaseException raised by the
     * write
     */
    public void addCap(byte[] origin) {
        try {
            final DatabaseEntry capKey = new DatabaseEntry(origin);
            final DatabaseEntry emptyValue = new DatabaseEntry(new byte[0]);
            pendingUrisDB.put(null, capKey, emptyValue);
        } catch (DatabaseException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Utility method to perform action for all pending CrawlURI instances.
     * Records with a zero-length value are skipped; every other record is
     * deserialized and handed to the closure in cursor order.
     *
     * @param c Closure action to perform
     * @throws DatabaseException if a cursor operation fails
     */
    protected void forAllPendingDo(Closure c) throws DatabaseException {
        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry value = new DatabaseEntry();
        Cursor cursor = pendingUrisDB.openCursor(null, null);
        try {
            while (cursor.getNext(key, value, null) == OperationStatus.SUCCESS) {
                if (value.getData().length == 0) {
                    continue;
                }
                CrawlURI item = (CrawlURI) crawlUriBinding.entryToObject(value);
                c.execute(item);
            }
        } finally {
            // Deserialization or the closure may throw; close the cursor
            // regardless so it is never leaked.
            cursor.close();
        }
    }
}