org.apache.nutch.protocol.http.api.RobotRulesParser.java Source code

Introduction

Here is the source code for org.apache.nutch.protocol.http.api.RobotRulesParser.java, the Apache Nutch class that fetches, parses, and caches the robots.txt rules consulted by the HTTP protocol plugins.
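
A minimal usage sketch may help before the full listing. It is illustrative only: the class name RobotRulesDemo and the inline robots.txt content are made up, and because parseRules(byte[]) and the String[] constructor are package-private, the caller has to live in the same package.

package org.apache.nutch.protocol.http.api;

public class RobotRulesDemo {
    public static void main(String[] args) throws Exception {
        String robotsTxt =
            "User-agent: mybot\n" +
            "Disallow: /private\n" +
            "Crawl-Delay: 5\n";

        // Select rules for the agent name "mybot"; "*" is registered automatically.
        RobotRulesParser parser = new RobotRulesParser(new String[] { "mybot" });
        RobotRulesParser.RobotRuleSet rules = parser.parseRules(robotsTxt.getBytes("UTF-8"));

        System.out.println(rules.isAllowed("/index.html"));   // true
        System.out.println(rules.isAllowed("/private/data")); // false
        System.out.println(rules.getCrawlDelay());            // 5000 (milliseconds)
    }
}

In Nutch itself the parser is driven through HttpBase, which is passed to the isAllowed(HttpBase, URL) and getCrawlDelay(HttpBase, URL) methods shown in the listing; those fetch, parse, and cache /robots.txt per host before checking a URL.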

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.protocol.http.api;

// JDK imports
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.io.IOException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.StringTokenizer;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.RobotRules;

/**
 * This class handles the parsing of <code>robots.txt</code> files.
 * It emits RobotRules objects, which describe the download permissions
 * granted or denied for individual paths on a host.
 *
 * @author Tom Pierce
 * @author Mike Cafarella
 * @author Doug Cutting
 */
public class RobotRulesParser implements Configurable {

    public static final Log LOG = LogFactory.getLog(RobotRulesParser.class);

    private boolean allowForbidden = false;

    private static final Hashtable CACHE = new Hashtable();

    private static final String CHARACTER_ENCODING = "UTF-8";
    private static final int NO_PRECEDENCE = Integer.MAX_VALUE;

    private static final RobotRuleSet EMPTY_RULES = new RobotRuleSet();

    private static RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();

    private Configuration conf;
    private HashMap robotNames;

    /**
     * This class holds the rules which were parsed from a robots.txt
     * file, and can test paths against those rules.
     */
    public static class RobotRuleSet implements RobotRules {
        ArrayList tmpEntries = new ArrayList();
        RobotsEntry[] entries = null;
        long expireTime;
        long crawlDelay = -1;

        /**
         * A single rule: a path prefix and whether matching paths are allowed.
         */
        private class RobotsEntry {
            String prefix;
            boolean allowed;

            RobotsEntry(String prefix, boolean allowed) {
                this.prefix = prefix;
                this.allowed = allowed;
            }
        }

        /**
         * Adds a prefix rule; if the entries array has already been built,
         * it is converted back into the temporary list first.
         */
        private void addPrefix(String prefix, boolean allow) {
            if (tmpEntries == null) {
                tmpEntries = new ArrayList();
                if (entries != null) {
                    for (int i = 0; i < entries.length; i++)
                        tmpEntries.add(entries[i]);
                }
                entries = null;
            }

            tmpEntries.add(new RobotsEntry(prefix, allow));
        }

        /**
         * Discards any prefixes collected so far; called when an empty
         * Allow or Disallow line is encountered.
         */
        private void clearPrefixes() {
            if (tmpEntries == null) {
                tmpEntries = new ArrayList();
                entries = null;
            } else {
                tmpEntries.clear();
            }
        }

        /**
         * Change when the ruleset goes stale.
         */
        public void setExpireTime(long expireTime) {
            this.expireTime = expireTime;
        }

        /**
         * Get expire time
         */
        public long getExpireTime() {
            return expireTime;
        }

        /**
         * Get Crawl-Delay, in milliseconds. This returns -1 if not set.
         */
        public long getCrawlDelay() {
            return crawlDelay;
        }

        /**
         * Set Crawl-Delay, in milliseconds
         */
        public void setCrawlDelay(long crawlDelay) {
            this.crawlDelay = crawlDelay;
        }

        /**
         *  Returns <code>false</code> if the <code>robots.txt</code> file
         *  prohibits us from accessing the given <code>url</code>, or
         *  <code>true</code> otherwise.
         */
        public boolean isAllowed(URL url) {
            String path = url.getPath(); // check rules
            if ((path == null) || "".equals(path)) {
                path = "/";
            }
            return isAllowed(path);
        }

        /** 
         *  Returns <code>false</code> if the <code>robots.txt</code> file
         *  prohibits us from accessing the given <code>path</code>, or
         *  <code>true</code> otherwise.
         */
        public boolean isAllowed(String path) {
            try {
                path = URLDecoder.decode(path, CHARACTER_ENCODING);
            } catch (Exception e) {
                // just ignore it- we can still try to match 
                // path prefixes
            }

            if (entries == null) {
                entries = new RobotsEntry[tmpEntries.size()];
                entries = (RobotsEntry[]) tmpEntries.toArray(entries);
                tmpEntries = null;
            }

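            // Entries are checked in the order they appeared in robots.txt;
            // the first matching prefix decides, and no match means allowed.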
            int pos = 0;
            int end = entries.length;
            while (pos < end) {
                if (path.startsWith(entries[pos].prefix))
                    return entries[pos].allowed;
                pos++;
            }

            return true;
        }

        /**
         * Renders the rules as "Allow:" / "Disallow:" lines, one per entry.
         */
        public String toString() {
            isAllowed("x"); // force String[] representation
            StringBuffer buf = new StringBuffer();
            for (int i = 0; i < entries.length; i++)
                if (entries[i].allowed)
                    buf.append("Allow: " + entries[i].prefix + System.getProperty("line.separator"));
                else
                    buf.append("Disallow: " + entries[i].prefix + System.getProperty("line.separator"));
            return buf.toString();
        }
    }

    RobotRulesParser() {
    }

    public RobotRulesParser(Configuration conf) {
        setConf(conf);
    }

    /* ---------------------------------- *
     * <implementation:Configurable> *
     * ---------------------------------- */

    public void setConf(Configuration conf) {
        this.conf = conf;
        allowForbidden = conf.getBoolean("http.robots.403.allow", false);
        //
        // Grab the agent names we advertise to robots files.
        //
        String agentName = conf.get("http.agent.name");
        String agentNames = conf.get("http.robots.agents");
        StringTokenizer tok = new StringTokenizer(agentNames, ",");
        ArrayList agents = new ArrayList();
        while (tok.hasMoreTokens()) {
            agents.add(tok.nextToken().trim());
        }

        setRobotNames((String[]) agents.toArray(new String[agents.size()]));
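
        // Example (illustrative, not from the original source): with
        //   http.robots.agents = "mybot,*" and http.robots.403.allow = false,
        // the names "mybot" and "*" are registered for stanza matching, and a
        // 403 response for /robots.txt is treated as "disallow everything".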
    }

    public Configuration getConf() {
        return conf;
    }

    /* ---------------------------------- *
     * </implementation:Configurable> *
     * ---------------------------------- */

    private void setRobotNames(String[] robotNames) {
        this.robotNames = new HashMap();
        for (int i = 0; i < robotNames.length; i++) {
            this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
        }
        // always make sure "*" is included
        if (!this.robotNames.containsKey("*"))
            this.robotNames.put("*", new Integer(robotNames.length));
    }

    /**
     *  Creates a new <code>RobotRulesParser</code> which will use the
     *  supplied <code>robotNames</code> when choosing which stanza to
     *  follow in <code>robots.txt</code> files.  Any name in the array
     *  may be matched.  The order of the <code>robotNames</code>
     *  determines the precedence- if many names are matched, only the
     *  rules associated with the robot name having the smallest index
     *  will be used.
     */
    RobotRulesParser(String[] robotNames) {
        setRobotNames(robotNames);
    }
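
    // Precedence example (illustrative): new RobotRulesParser(new String[] { "mybot", "fetcher" })
    // maps "mybot" to index 0, "fetcher" to 1, and adds "*" at 2, so a robots.txt
    // stanza matched as "fetcher" takes precedence over one matched only as "*".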

    /**
     * Returns a {@link RobotRuleSet} object which encapsulates the
     * rules parsed from the supplied <code>robotContent</code>.
     */
    RobotRuleSet parseRules(byte[] robotContent) {
        if (robotContent == null)
            return EMPTY_RULES;

        String content = new String(robotContent);

        StringTokenizer lineParser = new StringTokenizer(content, "\n\r");

        RobotRuleSet bestRulesSoFar = null;
        int bestPrecedenceSoFar = NO_PRECEDENCE;

        RobotRuleSet currentRules = new RobotRuleSet();
        int currentPrecedence = NO_PRECEDENCE;

        boolean addRules = false; // in stanza for our robot
        boolean doneAgents = false; // detect multiple agent lines

        while (lineParser.hasMoreTokens()) {
            String line = lineParser.nextToken();

            // trim out comments and whitespace
            int hashPos = line.indexOf("#");
            if (hashPos >= 0)
                line = line.substring(0, hashPos);
            line = line.trim();

            if ((line.length() >= 11) && (line.substring(0, 11).equalsIgnoreCase("User-agent:"))) {

                if (doneAgents) {
                    if (currentPrecedence < bestPrecedenceSoFar) {
                        bestPrecedenceSoFar = currentPrecedence;
                        bestRulesSoFar = currentRules;
                        currentPrecedence = NO_PRECEDENCE;
                        currentRules = new RobotRuleSet();
                    }
                    addRules = false;
                }
                doneAgents = false;

                String agentNames = line.substring(line.indexOf(":") + 1);
                agentNames = agentNames.trim();
                StringTokenizer agentTokenizer = new StringTokenizer(agentNames);

                while (agentTokenizer.hasMoreTokens()) {
                    // for each agent listed, see if it's us:
                    String agentName = agentTokenizer.nextToken().toLowerCase();

                    Integer precedenceInt = (Integer) robotNames.get(agentName);

                    if (precedenceInt != null) {
                        int precedence = precedenceInt.intValue();
                        if ((precedence < currentPrecedence) && (precedence < bestPrecedenceSoFar))
                            currentPrecedence = precedence;
                    }
                }

                if (currentPrecedence < bestPrecedenceSoFar)
                    addRules = true;

            } else if ((line.length() >= 9) && (line.substring(0, 9).equalsIgnoreCase("Disallow:"))) {

                doneAgents = true;
                String path = line.substring(line.indexOf(":") + 1);
                path = path.trim();
                try {
                    path = URLDecoder.decode(path, CHARACTER_ENCODING);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("error parsing robots rules- can't decode path: " + path);
                    }
                }

                if (path.length() == 0) { // "empty rule"
                    if (addRules)
                        currentRules.clearPrefixes();
                } else { // rule with path
                    if (addRules)
                        currentRules.addPrefix(path, false);
                }

            } else if ((line.length() >= 6) && (line.substring(0, 6).equalsIgnoreCase("Allow:"))) {

                doneAgents = true;
                String path = line.substring(line.indexOf(":") + 1);
                path = path.trim();

                if (path.length() == 0) {
                    // "empty rule"- treat same as empty disallow
                    if (addRules)
                        currentRules.clearPrefixes();
                } else { // rule with path
                    if (addRules)
                        currentRules.addPrefix(path, true);
                }
            } else if ((line.length() >= 12) && (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
                doneAgents = true;
                if (addRules) {
                    long crawlDelay = -1;
                    String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
                    if (delay.length() > 0) {
                        try {
                            crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
                        } catch (Exception e) {
                            LOG.info("can not parse Crawl-Delay:" + e.toString());
                        }
                        currentRules.setCrawlDelay(crawlDelay);
                    }
                }
            }
        }

        if (currentPrecedence < bestPrecedenceSoFar) {
            bestPrecedenceSoFar = currentPrecedence;
            bestRulesSoFar = currentRules;
        }

        if (bestPrecedenceSoFar == NO_PRECEDENCE)
            return EMPTY_RULES;
        return bestRulesSoFar;
    }
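
    // Worked example (illustrative, not from the original source). Given
    //
    //   User-agent: *
    //   Disallow: /tmp
    //
    //   User-agent: mybot
    //   Disallow: /private
    //
    // a parser built with robotNames = { "mybot" } returns the "mybot" stanza,
    // because its precedence index (0) beats the "*" stanza's index (1), so
    // isAllowed("/tmp") is true and isAllowed("/private") is false.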

    /**
     *  Returns a <code>RobotRuleSet</code> object appropriate for use
     *  when the <code>robots.txt</code> file is empty or missing; all
     *  requests are allowed.
     */
    static RobotRuleSet getEmptyRules() {
        return EMPTY_RULES;
    }

    /**
     *  Returns a <code>RobotRuleSet</code> object appropriate for use
     *  when the <code>robots.txt</code> file is not fetched due to a
     *  <code>403/Forbidden</code> response; all requests are
     *  disallowed.
     */
    static RobotRuleSet getForbidAllRules() {
        RobotRuleSet rules = new RobotRuleSet();
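        // The empty prefix matches every path, so every request is disallowed.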
        rules.addPrefix("", false);
        return rules;
    }

    public RobotRuleSet getRobotRulesSet(HttpBase http, Text url) {
        URL u = null;
        try {
            u = new URL(url.toString());
        } catch (Exception e) {
            return EMPTY_RULES;
        }
        return getRobotRulesSet(http, u);
    }

    private RobotRuleSet getRobotRulesSet(HttpBase http, URL url) {

        String host = url.getHost().toLowerCase(); // normalize to lower case

        RobotRuleSet robotRules = (RobotRuleSet) CACHE.get(host);

        boolean cacheRule = true;
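        // Parsed rules are cached per host; 5xx responses and fetch failures
        // clear cacheRule so a later request retries the robots.txt fetch.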

        if (robotRules == null) { // cache miss
            URL redir = null;
            if (LOG.isTraceEnabled()) {
                LOG.trace("cache miss " + url);
            }
            try {
                Response response = http.getResponse(new URL(url, "/robots.txt"), new CrawlDatum(), true);
                // try one level of redirection ?
                if (response.getCode() == 301 || response.getCode() == 302) {
                    String redirection = response.getHeader("Location");
                    if (redirection == null) {
                        // some versions of MS IIS are known to mangle this header
                        redirection = response.getHeader("location");
                    }
                    if (redirection != null) {
                        if (!redirection.startsWith("http")) {
                            // RFC says it should be absolute, but apparently it isn't
                            redir = new URL(url, redirection);
                        } else {
                            redir = new URL(redirection);
                        }

                        response = http.getResponse(redir, new CrawlDatum(), true);
                    }
                }

                if (response.getCode() == 200) // found rules: parse them
                    robotRules = parseRules(response.getContent());
                else if ((response.getCode() == 403) && (!allowForbidden))
                    robotRules = FORBID_ALL_RULES; // use forbid all
                else if (response.getCode() >= 500) {
                    cacheRule = false;
                    robotRules = EMPTY_RULES;
                } else
                    robotRules = EMPTY_RULES; // use default rules
            } catch (Throwable t) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
                }
                cacheRule = false;
                robotRules = EMPTY_RULES;
            }

            if (cacheRule) {
                CACHE.put(host, robotRules); // cache rules for host
                if (redir != null && !redir.getHost().equals(host)) {
                    // cache also for the redirected host
                    CACHE.put(redir.getHost(), robotRules);
                }
            }
        }
        return robotRules;
    }

    public boolean isAllowed(HttpBase http, URL url) throws ProtocolException, IOException {
        String path = url.getPath(); // check rules
        if ((path == null) || "".equals(path)) {
            path = "/";
        }

        return getRobotRulesSet(http, url).isAllowed(path);
    }

    public long getCrawlDelay(HttpBase http, URL url) throws ProtocolException, IOException {
        return getRobotRulesSet(http, url).getCrawlDelay();
    }

    private final static int BUFSIZE = 2048;

    /** command-line main for testing */
    public static void main(String[] argv) {
        if (argv.length < 3) {
            System.out.println("Usage:");
            System.out.println("   java RobotRulesParser <robots-file> <url-file> <agent-name>+");
            System.out.println("");
            System.out.println("The <robots-file> will be parsed as a robots.txt file,");
            System.out.println("using the given <agent-name> to select rules.  URLs ");
            System.out.println("will be read (one per line) from <url-file>, and tested");
            System.out.println("against the rules.");
            System.exit(-1);
        }
        try {
            FileInputStream robotsIn = new FileInputStream(argv[0]);
            LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
            String[] robotNames = new String[argv.length - 2];

            for (int i = 0; i < argv.length - 2; i++)
                robotNames[i] = argv[i + 2];

            ArrayList bufs = new ArrayList();
            byte[] buf = new byte[BUFSIZE];
            int totBytes = 0;

            int rsize = robotsIn.read(buf);
            while (rsize >= 0) {
                totBytes += rsize;
                if (rsize != BUFSIZE) {
                    byte[] tmp = new byte[rsize];
                    System.arraycopy(buf, 0, tmp, 0, rsize);
                    bufs.add(tmp);
                } else {
                    bufs.add(buf);
                    buf = new byte[BUFSIZE];
                }
                rsize = robotsIn.read(buf);
            }

            byte[] robotsBytes = new byte[totBytes];
            int pos = 0;

            for (int i = 0; i < bufs.size(); i++) {
                byte[] currBuf = (byte[]) bufs.get(i);
                int currBufLen = currBuf.length;
                System.arraycopy(currBuf, 0, robotsBytes, pos, currBufLen);
                pos += currBufLen;
            }

            RobotRulesParser parser = new RobotRulesParser(robotNames);
            RobotRuleSet rules = parser.parseRules(robotsBytes);
            System.out.println("Rules:");
            System.out.println(rules);
            System.out.println();

            String testPath = testsIn.readLine();
            while (testPath != null) {
                testPath = testPath.trim();
                System.out.println((rules.isAllowed(testPath) ? "allowed" : "not allowed") + ":\t" + testPath);
                testPath = testsIn.readLine();
            }

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
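
The main method above doubles as a small offline tester. With the Nutch classes on the classpath, a typical invocation (the file names here are placeholders) looks like:

java org.apache.nutch.protocol.http.api.RobotRulesParser robots.txt urls.txt mybot

The first file is parsed as a robots.txt file, the rules for the agent name mybot (falling back to "*") are selected, and each path listed in urls.txt, one per line, is printed as allowed or not allowed.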