org.archive.crawler.frontier.precedence.SuccessCountsQueuePrecedencePolicy.java Source code

Introduction

Here is the source code for org.archive.crawler.frontier.precedence.SuccessCountsQueuePrecedencePolicy.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.frontier.precedence;

import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Transformer;
import org.archive.crawler.frontier.WorkQueue;

/**
 * QueuePrecedencePolicy that sets a uri-queue's precedence to a configured
 * base value, then lowers its precedence with each tier of successful URIs
 * completed. Any number of comma-separated tier sizes may be provided, with 
 * the last value assumed to repeat indefinitely. For example, with a 
 * 'base-precedence' value of 2, and 'increment-counts' of "100,1000", the
 * queue will have a precedence of 2 until 100 URIs are successfully fetched, 
 * then a precedence of 3 for the next 1000 URIs successfully fetched, then 
 * continue to drop one precedence rank for each 1000 URIs successfully 
 * fetched.
 */
public class SuccessCountsQueuePrecedencePolicy extends BaseQueuePrecedencePolicy {
    private static final long serialVersionUID = -4469760728466350850L;

    // TODO: determine why this doesn't work
    //
    //    /** comma-separated list of success-counts at which precedence is bumped*/
    //    final public static Key<List<Integer>> INCREMENT_COUNTS = 
    //        Key.make((List<Integer>)Arrays.asList(new Integer[] {100}));
    //
    //    /**
    //     * @param wq
    //     * @return
    //     */
    //    protected int calculatePrecedence(WorkQueue wq) {
    //        // FIXME: it's inefficient to do this every time; optimizing 
    //        // should be possible via more sophisticated custom PrecedenceProvider
    //        int precedence = wq.get(this,BASE_PRECEDENCE) - 1;
    //        Iterator<Integer> iter = wq.get(this,INCREMENT_COUNTS).iterator();
    //        int increment = iter.next(); 
    //        long successes = wq.getSubstats().getFetchSuccesses();
    //        while(successes>0) {
    //            successes -= increment;
    //            precedence++;
    //            increment = iter.hasNext() ? iter.next() : increment; 
    //        }
    //        return precedence;
    //    }

    /** comma-separated list of success-counts at which precedence is bumped*/
    {
        setIncrementCounts("100,1000");
    }

    public String getIncrementCounts() {
        return (String) kp.get("incrementCounts");
    }

    public void setIncrementCounts(String counts) {
        kp.put("incrementCounts", counts);
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.frontier.QueuePrecedencePolicy#queueReevaluate(org.archive.crawler.frontier.WorkQueue)
     */
    @SuppressWarnings("unchecked")
    @Override
    protected int calculatePrecedence(WorkQueue wq) {
        // FIXME: it's ridiculously inefficient to do this every time, 
        // and optimizing will probably require inserting stateful policy 
        // helper object into WorkQueue -- expected when URI-precedence is
        // also supported
        int precedence = getBasePrecedence() - 1;
        Collection<Integer> increments = CollectionUtils.collect(Arrays.asList(getIncrementCounts().split(",")),
                new Transformer() {
                    public Object transform(final Object string) {
                        return Integer.parseInt((String) string);
                    }
                });
        Iterator<Integer> iter = increments.iterator();
        int increment = iter.next();
        long successes = wq.getSubstats().getFetchSuccesses();
        while (successes >= 0) {
            successes -= increment;
            precedence++;
            increment = iter.hasNext() ? iter.next() : increment;
        }
        return precedence;
    }
}