org.archive.modules.net.RobotsExclusionPolicy.java Source code

Introduction

Here is the source code for org.archive.modules.net.RobotsExclusionPolicy.java, from the Heritrix web crawler (crawler.archive.org).

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.net;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;

/**
 * RobotsExclusionPolicy represents the actual policy adopted with 
 * respect to a specific remote server, usually constructed by 
 * consulting the robots.txt, if any, that the server provided. 
 * 
 * (The similarly named RobotsHonoringPolicy, on the other hand, 
 * describes the strategy used by the crawler to determine to what
 * extent it respects exclusion rules.)
 * 
 * The expiration of policies after a suitable amount of time has
 * elapsed since last fetch is handled outside this class, in 
 * CrawlServer itself. 
 * 
 * TODO: refactor RobotsHonoringPolicy to be a class-per-policy, and 
 * then see if a CrawlServer with a HonoringPolicy and a RobotsTxt
 * makes this mediating class unnecessary. 
 * 
 * @author gojomo
 *
 */
public class RobotsExclusionPolicy implements Serializable {
    private static final long serialVersionUID = 3L;

    private static final Logger logger = Logger.getLogger(RobotsExclusionPolicy.class.getName());

    public static enum Type {
        NORMAL, ALLOWALL, DENYALL
    }

    private transient Type type = Type.NORMAL;

    public static final RobotsExclusionPolicy ALLOWALL = new RobotsExclusionPolicy(Type.ALLOWALL);
    public static final RobotsExclusionPolicy DENYALL = new RobotsExclusionPolicy(Type.DENYALL);

    private Robotstxt robotstxt = null;
    // FIXME?: this 'transient' seems wrong -- likely to cause
    // all non-normal policies to break when a CrawlServer
    // goes through a serialization/deserialization cycle
    RobotsHonoringPolicy honoringPolicy = null; // was transient

    private String lastUsedUserAgent = null;
    private List<String> userAgentsToTest = null;

    /**
     * @param reader reader over the fetched robots.txt content
     * @param honoringPolicy the crawler's robots honoring policy
     * @return robots exclusion policy
     * @throws IOException
     */
    public static RobotsExclusionPolicy policyFor(BufferedReader reader, RobotsHonoringPolicy honoringPolicy)
            throws IOException {
        Robotstxt robots = new Robotstxt(reader);
        return (robots.allowsAll()) ? ALLOWALL : new RobotsExclusionPolicy(robots, honoringPolicy);
    }

    /**
     * @param robotstxt parsed robots.txt rules for the server
     * @param honoringPolicy the crawler's robots honoring policy
     */
    public RobotsExclusionPolicy(Robotstxt robotstxt, RobotsHonoringPolicy honoringPolicy) {
        this.robotstxt = robotstxt;
        this.honoringPolicy = honoringPolicy;

        if (honoringPolicy == null)
            return;

        // If the honoring policy is most-favored user agent, all rules should be checked
        if (honoringPolicy.isType(RobotsHonoringPolicy.Type.MOST_FAVORED)) {
            userAgentsToTest = robotstxt.getUserAgents();

        // If the honoring policy is most-favored-of-set, make a list holding only
        // the members of the set that match a robots.txt section
        } else if (honoringPolicy.isType(RobotsHonoringPolicy.Type.MOST_FAVORED_SET)) {
            userAgentsToTest = new ArrayList<String>();
            Iterator<String> userAgentSet = honoringPolicy.getUserAgents().iterator();
            while (userAgentSet.hasNext()) {
                String userAgent = userAgentSet.next();

                Iterator<String> iter = robotstxt.getUserAgents().iterator();
                while (iter.hasNext()) {
                    String ua = iter.next();
                    // a robots.txt section applies when its user-agent token
                    // is a substring of the agent name being considered
                    if (userAgent.indexOf(ua) > -1) {
                        userAgentsToTest.add(ua);
                        break;
                    }
                }
            }
        }
    }

    public RobotsExclusionPolicy(Type type) {
        this(null, null);
        this.type = type;
    }

    public boolean disallows(CrawlURI curi, String userAgent) {
        if (this == ALLOWALL)
            return false;
        if (this == DENYALL)
            return true;

        // In the common case with policy=Classic, the user-agent is remembered from
        // URI to URI on the same server
        if ((honoringPolicy.isType(RobotsHonoringPolicy.Type.CLASSIC)
                || honoringPolicy.isType(RobotsHonoringPolicy.Type.CUSTOM))
                && (lastUsedUserAgent == null || !lastUsedUserAgent.equals(userAgent))) {

            lastUsedUserAgent = userAgent;
            userAgentsToTest = new ArrayList<String>();
            Iterator<String> iter = robotstxt.getUserAgents().iterator();
            String lowerCaseUserAgent = userAgent.toLowerCase();
            while (iter.hasNext()) {
                String ua = iter.next();
                // ua below is already lowercase. See Robotstxt.java line 60.
                if (lowerCaseUserAgent.indexOf(ua) > -1) {
                    userAgentsToTest.add(ua);
                    break; // consider no more sections
                }
            }
        }

        boolean disallow = false;
        String ua = null;

        // Go through the list of all user agents we might act as
        Iterator<String> uas = userAgentsToTest.iterator();
        while (uas.hasNext()) {
            ua = uas.next();
            String path = null;
            try {
                path = curi.getUURI().getPathQuery();
            } catch (URIException e) {
                logger.log(Level.SEVERE, "Failed getPathQuery from " + curi, e);
                disallow = false;
                break;
            }
            if (robotstxt.getDirectivesFor(ua).allows(path)) {
                // at least one applicable set of rules allows
                disallow = false;
                break;
            } else {
                // at least one applicable set of rules disallows
                // so disallow unless later test allows
                disallow = true;
            }
        }

        // Are we supposed to masquerade as the user agent whose
        // restrictions we followed?
        if (honoringPolicy.shouldMasquerade() && ua != null && !ua.equals("")) {
            curi.setUserAgent(ua);
        }
        return disallow;
    }

    // Methods for object serialization.

    /** If the object is DENYALL or ALLOWALL, only the type is
     * written to the serialization stream.
     *
     * @param stream the serialization stream.
     * @throws IOException 
     */
    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.writeInt(type.ordinal());
        if (type == Type.NORMAL) {
            stream.defaultWriteObject();
        }
    }

    /** If the object is DENYALL or ALLOWALL, only the type is
     * read from the serialization stream.
     *
     * @param stream the serialization stream.
     * @throws IOException 
     * @throws ClassNotFoundException 
     */
    private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
        int ordinal = stream.readInt();
        type = Type.values()[ordinal];
        if (type == Type.NORMAL) {
            stream.defaultReadObject();
        }
    }

    /** If the object is DENYALL or ALLOWALL, it is replaced by the
     * corresponding constant so that checks for object identity work.
     * @return Object.
     */
    private Object readResolve() {
        if (type == Type.NORMAL) {
            return this;
        } else if (type == Type.ALLOWALL) {
            return ALLOWALL;
        } else if (type == Type.DENYALL) {
            return DENYALL;
        }
        return null;
    }

    /**
     * Get the crawl-delay that applies to the given user-agent, or
     * -1 (indicating no crawl-delay known) if there is no internal
     * Robotstxt instance.
     * 
     * @param userAgent
     * @return crawl-delay value, or -1 if none available
     */
    public float getCrawlDelay(String userAgent) {
        if (robotstxt == null) {
            return -1;
        }
        return robotstxt.getDirectivesFor(userAgent).getCrawlDelay();
    }
}
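
Example

Below is a minimal usage sketch, not part of the original source. It builds a policy from an in-memory robots.txt via policyFor(...), reads the crawl-delay, and round-trips the DENYALL constant through serialization to show the readResolve() behavior. The class name RobotsExclusionPolicyDemo, the agent name "mybot", and the robots.txt body are made up for illustration. Passing null as the honoring policy is a simplification for this demo (a real crawl supplies the configured RobotsHonoringPolicy, and disallows(...) requires one), and the crawl-delay lookup assumes Robotstxt falls back to the wildcard (*) section for agents with no matching section.

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.StringReader;

import org.archive.modules.net.RobotsExclusionPolicy;

public class RobotsExclusionPolicyDemo {
    public static void main(String[] args) throws Exception {
        // Build a policy from an in-memory robots.txt body (hypothetical content).
        String robotsTxt =
            "User-agent: *\n" +
            "Disallow: /private/\n" +
            "Crawl-delay: 5\n";
        RobotsExclusionPolicy policy = RobotsExclusionPolicy.policyFor(
                new BufferedReader(new StringReader(robotsTxt)),
                null); // simplification: no honoring policy for this sketch

        // Expected 5.0, assuming the wildcard section supplies the delay.
        System.out.println(policy.getCrawlDelay("mybot"));

        // Serialize DENYALL: writeObject() stores only the type ordinal,
        // and readResolve() restores the shared constant on the way back,
        // so the identity checks in disallows() keep working.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        ObjectOutputStream oos = new ObjectOutputStream(bos);
        oos.writeObject(RobotsExclusionPolicy.DENYALL);
        oos.flush();
        ObjectInputStream ois = new ObjectInputStream(
                new ByteArrayInputStream(bos.toByteArray()));
        RobotsExclusionPolicy copy = (RobotsExclusionPolicy) ois.readObject();
        System.out.println(copy == RobotsExclusionPolicy.DENYALL); // true
    }
}

The serialization round trip works because writeObject() skips defaultWriteObject() for the non-NORMAL types, and readResolve() maps the deserialized object back onto the ALLOWALL/DENYALL constants, preserving the `this == ALLOWALL` / `this == DENYALL` shortcuts at the top of disallows().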