org.archive.modules.Processor.java Source code

Introduction

Here is the source code for org.archive.modules.Processor.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules;

import java.io.IOException;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.credential.Credential;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.deciderules.AcceptDecideRule;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.net.UURI;
import org.archive.spring.HasKeyedProperties;
import org.archive.spring.KeyedProperties;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

/**
 * A processor of URIs.  The URI provides the context for the process; 
 * settings can be altered based on the URI.
 *
 * <p>Subclasses supply {@link #shouldProcess(CrawlURI)} and
 * {@link #innerProcess(CrawlURI)}; the public {@link #process(CrawlURI)}
 * entry point applies the "enabled" flag and the should-process decide
 * rule before delegating to them.  The count of processed URIs is saved
 * to, and restored from, checkpoints.
 * 
 * @author pjack
 */
public abstract class Processor implements HasKeyedProperties, Lifecycle, BeanNameAware, Checkpointable {
    /** Backing store for the "enabled" and "shouldProcessRule" settings. */
    protected KeyedProperties kp = new KeyedProperties();

    /**
     * @return the keyed properties holding this processor's settings
     */
    public KeyedProperties getKeyedProperties() {
        return kp;
    }

    /** Bean name; also the key under which checkpoint state is stored. */
    protected String beanName;

    /**
     * @return the bean name last given to {@link #setBeanName(String)},
     *         or null if none has been set
     */
    public String getBeanName() {
        return this.beanName;
    }

    /**
     * @param name the bean name to record for this processor
     */
    public void setBeanName(String name) {
        this.beanName = name;
    }

    // Default: every processor starts out enabled.
    {
        setEnabled(true);
    }

    /** 
     * Whether or not this process will execute for a particular URI. 
     * If this is false for a URI, then the URI isn't processed,
     * regardless of what the DecideRules say.
     *
     * @return the current value of the "enabled" setting
     */
    public boolean getEnabled() {
        return (Boolean) kp.get("enabled");
    }

    /**
     * @param enabled new value for the "enabled" setting
     * @see #getEnabled()
     */
    public void setEnabled(boolean enabled) {
        kp.put("enabled", enabled);
    }

    // Default: a freshly constructed AcceptDecideRule.
    {
        setShouldProcessRule(new AcceptDecideRule());
    }

    /** 
     * Decide rule(s) (also particular to a URI) that determine whether 
     * or not a particular URI is processed here. If the rule(s) answer
     * REJECT, processing is skipped. (ACCEPT or PASS allow processing
     * to continue). 
     *
     * @return the current "shouldProcessRule" setting
     */
    public DecideRule getShouldProcessRule() {
        return (DecideRule) kp.get("shouldProcessRule");
    }

    /**
     * @param rule new value for the "shouldProcessRule" setting
     * @see #getShouldProcessRule()
     */
    public void setShouldProcessRule(DecideRule rule) {
        kp.put("shouldProcessRule", rule);
    }

    /**
     * The number of URIs processed by this processor.
     */
    protected AtomicLong uriCount = new AtomicLong(0);

    /**
     * Processes the given URI.  First checks {@link #getEnabled()} and
     * {@link #getShouldProcessRule()}.  If the processor is not enabled,
     * then nothing happens.  If the should-process rule answers REJECT,
     * then the {@link #innerRejectProcess(CrawlURI)} method is invoked, 
     * and the process method returns.
     * 
     * <p>Next, the {@link #shouldProcess(CrawlURI)} method is 
     * consulted to see if this Processor knows how to handle the given
     * URI.  If it returns false, then nothing further occurs.
     * 
     * <p>FIXME: Should innerRejectProcess be called when the processor is
     * disabled, or when shouldProcess returns false?  The previous
     * Processor implementation didn't handle it that way.
     * 
     * <p>Otherwise, the URI is considered valid.  This processor's count
     * of handled URIs is incremented, and the 
     * {@link #innerProcessResult(CrawlURI)} method (which by default
     * delegates to {@link #innerProcess(CrawlURI)}) is invoked to
     * actually perform the process.
     * 
     * @param uri  The URI to process
     * @return {@link ProcessResult#PROCEED} whenever the URI is skipped;
     *         otherwise whatever {@link #innerProcessResult(CrawlURI)}
     *         returns
     * @throws  InterruptedException   if the thread is interrupted
     */
    public ProcessResult process(CrawlURI uri) throws InterruptedException {
        if (!getEnabled()) {
            return ProcessResult.PROCEED;
        }

        if (getShouldProcessRule().decisionFor(uri) == DecideResult.REJECT) {
            innerRejectProcess(uri);
            return ProcessResult.PROCEED;
        }

        if (!shouldProcess(uri)) {
            return ProcessResult.PROCEED;
        }

        // Only URIs that pass all three gates are counted.
        uriCount.incrementAndGet();
        return innerProcessResult(uri);
    }

    /**
     * Returns the number of URIs this processor has handled.  The returned
     * number does not include URIs that were skipped because the processor
     * was not enabled ({@link #getEnabled()}), because the should-process 
     * rule rejected them ({@link #getShouldProcessRule()}), or because the 
     * {@link #shouldProcess(CrawlURI)} method returned false.
     * 
     * @return  the number of URIs this processor has handled
     */
    public long getURICount() {
        return uriCount.get();
    }

    /**
     * Determines whether the given uri should be processed by this 
     * processor.  For instance, a processor that only works on HTML 
     * content might reject the URI if its content type is not 
     * "text/html", if its content length is zero, and so on.
     * 
     * @param uri   the URI to test
     * @return  true if this processor should process that uri; false if not
     */
    protected abstract boolean shouldProcess(CrawlURI uri);

    /**
     * Performs the process and reports the result.  The default
     * implementation calls {@link #innerProcess(CrawlURI)} and then
     * returns {@link ProcessResult#PROCEED}; subclasses may override to
     * return a different result.
     *
     * @param uri    the URI to process
     * @return the result of processing
     * @throws InterruptedException   if the thread is interrupted
     */
    protected ProcessResult innerProcessResult(CrawlURI uri) throws InterruptedException {
        innerProcess(uri);
        return ProcessResult.PROCEED;
    }

    /**
     * Actually performs the process.  When reached through 
     * {@link #process(CrawlURI)}, it is known that the processor is
     * enabled, that the should-process rule did not reject the URI, and
     * that {@link #shouldProcess(CrawlURI)} returned true.  
     * 
     * @param uri    the URI to process
     * @throws InterruptedException   if the thread is interrupted
     */
    protected abstract void innerProcess(CrawlURI uri) throws InterruptedException;

    /**
     * Invoked after a URI has been rejected by the should-process rule.
     * The default implementation does nothing; subclasses may override
     * to log rejects or something.
     * 
     * @param uri   the URI that was rejected
     * @throws InterruptedException   if the thread is interrupted
     */
    protected void innerRejectProcess(CrawlURI uri) throws InterruptedException {
    }

    /**
     * Renders the "via" of the given URI as a string.
     *
     * @param puri the URI whose via is wanted
     * @return the via's string form, or the empty string if the via is null
     */
    public static String flattenVia(CrawlURI puri) {
        UURI via = puri.getVia();
        return (via == null) ? "" : via.toString();
    }

    /**
     * Tests whether the given URI's fetch counts as a success.
     *
     * @param puri the URI to test
     * @return false if the fetch status is 401 (Unauthorized) and the URI
     *         carries an HTTP authentication credential; otherwise true
     *         exactly when the fetch status is positive
     */
    public static boolean isSuccess(CrawlURI puri) {
        int statusCode = puri.getFetchStatus();
        if (statusCode == HttpStatus.SC_UNAUTHORIZED && hasHttpAuthenticationCredential(puri)) {
            return false;
        }
        return statusCode > 0;
    }

    /**
     * Returns the size recorded for the given URI.
     *
     * @param puri the URI to inspect
     * @return the URI's content size if it has no recorder; otherwise the
     *         size of the recorder's recorded input
     */
    public static long getRecordedSize(CrawlURI puri) {
        if (puri.getRecorder() == null) {
            return puri.getContentSize();
        }
        return puri.getRecorder().getRecordedInput().getSize();
    }

    /**
     * @param puri the URI whose credentials are examined
     * @return True if we have an HttpAuthentication (rfc2617) payload.
     */
    public static boolean hasHttpAuthenticationCredential(CrawlURI puri) {
        Set<Credential> credentials = puri.getCredentials();
        for (Credential candidate : credentials) {
            if (candidate instanceof HttpAuthenticationCredential) {
                return true;
            }
        }
        return false;
    }

    // FIXME: Raise to interface
    // FIXME: Internationalize somehow
    // FIXME: Pass in PrintWriter instead creating large in-memory strings
    /**
     * @return a one-line report naming this processor's concrete class
     */
    public String report() {
        return "Processor: " + getClass().getName() + "\n";
    }

    /** True between a call to {@link #start()} and the next {@link #stop()}. */
    protected boolean isRunning = false;

    /**
     * @return true if this processor has been started and not yet stopped
     */
    public boolean isRunning() {
        return isRunning;
    }

    /**
     * Marks this processor as running and, if a recovery checkpoint has
     * been supplied, restores state from it via 
     * {@link #fromCheckpointJson(JSONObject)}.  Does nothing if already
     * running.
     *
     * @throws RuntimeException wrapping any JSONException raised while
     *         loading or applying the checkpointed state
     */
    public void start() {
        if (isRunning) {
            return;
        }
        isRunning = true;
        if (recoveryCheckpoint != null) {
            try {
                JSONObject json = recoveryCheckpoint.loadJson(getBeanName());
                fromCheckpointJson(json);
            } catch (JSONException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Marks this processor as no longer running.
     */
    public void stop() {
        isRunning = false;
    }

    /**
     * Checkpoint-start hook; the default implementation does nothing.
     *
     * @param checkpointInProgress the checkpoint being started
     */
    public void startCheckpoint(Checkpoint checkpointInProgress) {
    }

    /**
     * Saves the state produced by {@link #toCheckpointJson()} into the
     * given checkpoint under this processor's bean name.
     *
     * @param checkpointInProgress the checkpoint to save into
     * @throws IOException if the state cannot be saved, including when it
     *         cannot be converted to JSON
     */
    public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
        try {
            JSONObject json = toCheckpointJson();
            // Same key that start() uses to load, so save and restore agree.
            checkpointInProgress.saveJson(getBeanName(), json);
        } catch (JSONException e) {
            // Fail loudly rather than record a checkpoint that silently
            // lacks this processor's state.
            throw new IOException(
                    "Unable to checkpoint state of processor '" + getBeanName() + "'", e);
        }
    }

    /**
     * Return a JSONObject of current state that can be consulted 
     * on recovery to restore necessary values.  The default
     * implementation stores only the URI count, under "uriCount".
     * 
     * @return JSONObject
     * @throws JSONException
     */
    protected JSONObject toCheckpointJson() throws JSONException {
        JSONObject json = new JSONObject();
        json.put("uriCount", getURICount());
        return json;
    }

    /**
     * Restore internal state from JSONObject stored at earlier
     * checkpoint-time.  The default implementation restores only the
     * URI count, read from "uriCount".
     * 
     * @param json JSONObject
     * @throws JSONException
     */
    protected void fromCheckpointJson(JSONObject json) throws JSONException {
        uriCount.set(json.getLong("uriCount"));
    }

    /**
     * Checkpoint-finish hook; the default implementation does nothing.
     *
     * @param checkpointInProgress the checkpoint being finished
     */
    public void finishCheckpoint(Checkpoint checkpointInProgress) {
    }

    /** Checkpoint to restore from in {@link #start()}; null if none. */
    protected Checkpoint recoveryCheckpoint;

    /**
     * @param checkpoint the checkpoint to recover from when this
     *        processor is started
     */
    @Autowired(required = false)
    public void setRecoveryCheckpoint(Checkpoint checkpoint) {
        this.recoveryCheckpoint = checkpoint;
    }
}