org.attribyte.api.http.util.RobotsTxt.java Source code

Introduction

Here is the source code for org.attribyte.api.http.util.RobotsTxt.java, a class that parses a robots.txt file and answers whether a given user agent is allowed to fetch a given path.

Source

/*
 * Copyright (C) 2008,2014 Attribyte, LLC  All Rights Reserved.
 * 
 * This software is the confidential and proprietary information of Attribyte, LLC.
 * ("Confidential Information").  You shall not
 * disclose such Confidential Information and shall use it only in
 * accordance with the terms of the license agreement you entered into
 * with Attribyte, LLC
 * 
 * ATTRIBYTE, LLC MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
 * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 * PURPOSE, OR NON-INFRINGEMENT. ATTRIBYTE, LLC SHALL NOT BE LIABLE FOR ANY DAMAGES
 * SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING
 * THIS SOFTWARE OR ITS DERIVATIVES.
 */

package org.attribyte.api.http.util;

import com.google.common.base.Charsets;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.CharStreams;
import com.google.protobuf.ByteString;
import org.attribyte.api.Logger;
import org.attribyte.api.http.Client;
import org.attribyte.api.http.GetRequestBuilder;
import org.attribyte.api.http.Request;
import org.attribyte.api.http.Response;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * A parsed <code>robots.txt</code> file.
 */
public class RobotsTxt {

    /**
     * Creates a <code>RobotsTxt</code> by fetching the file from the standard location (<code>/robots.txt</code>).
     * @param host The hostname. The URL will be created as <code>[host]/robots.txt</code>.
     * @param httpClient The HTTP client for making the request.
     * @param userAgent The <code>User-Agent</code> sent with the request.
     * @param preserveAgents The set of agents to preserve. Agents not contained
     * in this set will be ignored during parse.
     * @param logger A logger for errors. May be <code>null</code>. If specified, HTTP errors during
     * parse will be logged at the <code>warn</code> level.
     * @return The parsed <code>RobotsTxt</code>, or <code>NO_ROBOTS</code> if the file is missing, empty, or unreadable.
     */
    public static RobotsTxt parse(final String host, final Client httpClient, final String userAgent,
            final Set<String> preserveAgents, final Logger logger) {

        String url = host + "/robots.txt";
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            url = "http://" + url;
        }

        try {

            Request request = new GetRequestBuilder(url).addHeader("User-Agent", userAgent).create();
            Response response = httpClient.send(request);
            int responseCode = response.getStatusCode();

            if (responseCode == 200) {
                ByteString body = response.getBody();
                if (body == null) {
                    return NO_ROBOTS;
                } else {
                    byte[] bodyBytes = body.toByteArray();
                    if (bodyBytes.length > 0) {
                        return new RobotsTxt(
                                new InputStreamReader(new ByteArrayInputStream(bodyBytes), Charsets.UTF_8),
                                preserveAgents);
                    } else {
                        return NO_ROBOTS;
                    }
                }

            } else {
                return NO_ROBOTS;
            }

        } catch (IOException ioe) {
            if (logger != null) {
                logger.warn("I/O error during parse of " + url, ioe);
            }
            return NO_ROBOTS;
        } catch (Throwable t) {
            if (logger != null) {
                logger.warn("Error during parse of " + url, t);
            }
            return NO_ROBOTS;
        }
    }

    private RobotsTxt() {
        //Internal-use only...
    }

    @SuppressWarnings("unchecked")
    /**
     * Parse robots.txt from a character stream.
     * @param r A reader from which the <code>robots.txt</code> is read.
     * @param agents A list of user agents that, if listed in the file, should be preserved.
     * The wildcard (*) is always preserved.
     */
    public RobotsTxt(final Reader r, final Set<String> agents) throws IOException {

        Set<String> preserveAgents = null;
        if (agents != null) {
            preserveAgents = Sets.newHashSetWithExpectedSize(agents.size() + 1);
            preserveAgents.add("*");
            for (String agent : agents) {
                preserveAgents.add(agent.toLowerCase().trim());
            }
        }

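        //Each agent maps to a two-element array of lists: index ALLOW (0) holds
        //'Allow' paths and index DISALLOW (1) holds 'Disallow' paths.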
        List<String>[] currRecordLists = null;
        boolean newAgent = false;
        List<String> lines = CharStreams.readLines(r);
        for (String currLine : lines) {

            currLine = currLine.trim();
            if (currLine.length() == 0 || currLine.startsWith("#")) {
                continue;
            }

            currLine = currLine.toLowerCase();

            if (currLine.startsWith("user-agent")) {
                String currAgent = getValue(currLine);
                if (currAgent != null) {
                    if (currRecordLists == null || newAgent) {
                        currRecordLists = (ArrayList<String>[]) new ArrayList[2];
                        newAgent = false;
                    }

                    if (preserveAgents == null || preserveAgents.contains(currAgent)) {
                        agentMap.put(currAgent, currRecordLists);
                    }
                }

            } else if (currLine.startsWith("disallow")) {
                newAgent = true;
                String path = getValue(currLine);
                if (path == null || path.length() == 0) {
                    path = EMPTY_PATH;
                }
                if (currRecordLists != null) { //Agent must have appeared first
                    if (currRecordLists[DISALLOW] == null) {
                        currRecordLists[DISALLOW] = new ArrayList<String>(8);
                    }
                    currRecordLists[DISALLOW].add(path);
                }

            } else if (currLine.startsWith("allow")) {
                newAgent = true;
                String path = getValue(currLine);
                if (path == null || path.length() == 0) {
                    path = EMPTY_PATH;
                }
                if (currRecordLists != null) {
                    if (currRecordLists[ALLOW] == null) {
                        currRecordLists[ALLOW] = new ArrayList<String>(8);
                    }

                    currRecordLists[ALLOW].add(path);
                }

            } else {
                newAgent = true;
                //Ignore
            }
        }
    }

    /**
     * Determines if a user agent is allowed for the specified path.
     * @param userAgent The user agent string.
     * @param path The path.
     * @return Is the user agent allowed?
     */
    public final boolean isAllowed(String userAgent, String path) {
        return isAllowed(userAgent, path, true);
    }

    /**
     * Determines if a user agent is allowed for the specified path.
     * <p>
     * Technically, the treatment of <code>Allow</code> does not follow the draft RFC
     * (http://www.robotstxt.org/wc/norobots-rfc.html): a single list should be processed,
     * matching all records in the order they appear. In practice, however, I have found
     * that many files do things that don't make sense, like disallowing everything and
     * then allowing, so the allow list is checked first here.
     * </p>
     * @param userAgent The user agent string.
     * @param path The path.
     * @param checkWildcard Should the wildcard record be checked? (This gives a way to know if a
     * user agent is explicitly disallowed by name.)
     * @return Is the user agent allowed?
     */
    public final boolean isAllowed(String userAgent, String path, final boolean checkWildcard) {

        path = path == null ? "/" : path.toLowerCase().trim();
        if (path.length() == 0)
            path = "/";

        userAgent = userAgent.toLowerCase().trim();

        List<String>[] agentLists = agentMap.get(userAgent);
        if (agentLists == null && checkWildcard) {
            agentLists = agentMap.get("*");
        }

        if (agentLists == null) //Empty, or no wildcard...
            return true;

        List<String> allowList = agentLists[ALLOW];
        if (allowList != null) {
            for (String matchPath : allowList) {
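                //Reference comparison with EMPTY_PATH is intentional (here and in the
                //disallow loop below): it is a sentinel assigned directly during parse.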
                if (matchPath == EMPTY_PATH) { //Allow none
                    return false;
                }
                if (path.startsWith(matchPath)) { //Explicitly allowed
                    return true;
                }
            }
        }

        List<String> disallowList = agentLists[DISALLOW];
        if (disallowList == null && allowList != null) {
            return false; //If allows are specified - assume these are the only things allowed
        } else if (disallowList == null) {
            return true;
        }

        for (String matchPath : disallowList) {
            if (matchPath == EMPTY_PATH) {
                return true; //Disallow none
            }
            if (path.startsWith(matchPath)) {
                return false;
            }
        }

        return true;
    }

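    /**
     * Gets the value following the first colon on a line, with any end-of-line comment removed.
     * @param currLine The (lower-cased, trimmed) line.
     * @return The trimmed value, or <code>null</code> if there is none.
     */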
    private String getValue(final String currLine) {
        int index = currLine.indexOf(":");
        if (index == -1)
            return null;
        if (index == currLine.length() - 1)
            return null; //No value at end of line

        int endIndex = currLine.indexOf("#", index); //EOL comment (searched after the colon)
        if (endIndex > 0) {
            return currLine.substring(index + 1, endIndex).trim();
        } else {
            return currLine.substring(index + 1).trim();
        }
    }

    /** Maps a lower-cased user agent to its {allow, disallow} record lists. */
    private final Map<String, List<String>[]> agentMap = Maps.newHashMapWithExpectedSize(8);
    private static final int DISALLOW = 1;
    private static final int ALLOW = 0;
    private static final String EMPTY_PATH = ""; //Sentinel: compared by reference.
    public static final RobotsTxt NO_ROBOTS = new RobotsTxt(); //Allows everything.
}
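
Below is a minimal, hypothetical usage sketch (the class name RobotsTxtExample and the agent names are placeholders, not part of the library). It feeds an in-memory robots.txt to the public Reader constructor and queries isAllowed; fetching over HTTP would instead go through the static parse method with a configured org.attribyte.api.http.Client implementation.

import java.io.StringReader;

import com.google.common.collect.Sets;

import org.attribyte.api.http.util.RobotsTxt;

public class RobotsTxtExample {

    public static void main(String[] args) throws Exception {

        //An in-memory robots.txt with a wildcard record and an agent-specific record.
        String file = "User-agent: *\n" +
                "Disallow: /private/\n" +
                "\n" +
                "User-agent: examplebot\n" +
                "Disallow: /\n";

        //Preserve 'examplebot'; the wildcard (*) is always preserved.
        RobotsTxt robots = new RobotsTxt(new StringReader(file), Sets.newHashSet("examplebot"));

        //Unlisted agents fall back to the wildcard record...
        System.out.println(robots.isAllowed("somebot", "/private/page.html")); //false
        System.out.println(robots.isAllowed("somebot", "/public/page.html")); //true

        //...while 'examplebot' is disallowed everywhere by its own record.
        System.out.println(robots.isAllowed("examplebot", "/public/page.html")); //false

        //With checkWildcard = false, only rules that name the agent apply.
        System.out.println(robots.isAllowed("somebot", "/private/page.html", false)); //true
    }
}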