crawlercommons.robots.RobotUtils.java Source code


Introduction

Here is the source code for crawlercommons.robots.RobotUtils.java, a small utility class from the Crawler-Commons project for fetching and parsing robots.txt files with conservative connection settings.

Source

/**
 * Copyright 2016 Crawler-Commons
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package crawlercommons.robots;

import java.net.URL;

import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import crawlercommons.fetcher.FetchedResult;
import crawlercommons.fetcher.HttpFetchException;
import crawlercommons.fetcher.IOFetchException;
import crawlercommons.fetcher.RedirectFetchException;
import crawlercommons.fetcher.http.BaseHttpFetcher;
import crawlercommons.fetcher.http.SimpleHttpFetcher;
import crawlercommons.fetcher.http.UserAgent;

public class RobotUtils {
    private static final Logger LOGGER = LoggerFactory.getLogger(RobotUtils.class);

    // Some robots.txt files are > 64K, amazingly enough.
    private static final int MAX_ROBOTS_SIZE = 128 * 1024;

    // subdomain.domain.com can redirect to domain.com, so if we're simultaneously
    // fetching a bunch of robots.txt files from subdomains that redirect, we'll
    // exceed the default per-host connection limit.
    private static final int MAX_CONNECTIONS_PER_HOST = 20;

    // Crank down default values when fetching robots.txt, as this should be
    // super fast to get back.
    private static final int ROBOTS_CONNECTION_TIMEOUT = 10 * 1000;
    private static final int ROBOTS_SOCKET_TIMEOUT = 10 * 1000;
    private static final int ROBOTS_RETRY_COUNT = 2;

    // TODO KKr - set up min response rate, use it with max size to calc max
    // time for valid download, use it for COMMAND_TIMEOUT

    // Amount of time we'll wait for pending tasks to finish up. This is roughly
    // equal to the max amount of time it might take to fetch a robots.txt file
    // (excluding download time, which we could add).
    // FUTURE KKr - add in time to do the download.
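    // With the defaults below this works out to (10,000 ms + 10,000 ms) * 2 retries = 40,000 ms.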
    private static final long MAX_FETCH_TIME = (ROBOTS_CONNECTION_TIMEOUT + ROBOTS_SOCKET_TIMEOUT)
            * ROBOTS_RETRY_COUNT;

    public static BaseHttpFetcher createFetcher(BaseHttpFetcher fetcher) {
        return createFetcher(fetcher.getUserAgent(), fetcher.getMaxThreads());
    }

    public static BaseHttpFetcher createFetcher(UserAgent userAgent, int maxThreads) {
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, userAgent);
        fetcher.setDefaultMaxContentSize(MAX_ROBOTS_SIZE);
        fetcher.setMaxConnectionsPerHost(MAX_CONNECTIONS_PER_HOST);
        fetcher.setMaxRetryCount(ROBOTS_RETRY_COUNT);
        fetcher.setConnectionTimeout(ROBOTS_CONNECTION_TIMEOUT);
        fetcher.setSocketTimeout(ROBOTS_SOCKET_TIMEOUT);

        return fetcher;
    }

    public static long getMaxFetchTime() {
        return MAX_FETCH_TIME;
    }

    /**
     * Externally visible, static method for use in tools and for testing. Fetch
     * the indicated robots.txt file, parse it, and generate rules.
     * 
     * @param fetcher
     *            Fetcher for downloading the robots.txt file
     * @param parser
     *            Parser used to convert the fetched robots.txt content into rules
     * @param robotsUrl
     *            URL to the robots.txt file
     * @return Robot rules
     */
    public static BaseRobotRules getRobotRules(BaseHttpFetcher fetcher, BaseRobotsParser parser, URL robotsUrl) {

        try {
            String urlToFetch = robotsUrl.toExternalForm();
            FetchedResult result = fetcher.get(urlToFetch);

            // HACK! DANGER! Some sites will redirect the request to the
            // top-level domain page, without returning a 404. So look for a
            // response which has a redirect and whose fetched content is not
            // plain text, and assume it's one of these... which is the same
            // as not having a robots.txt file.

            String contentType = result.getContentType();
            boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
            if ((result.getNumRedirects() > 0) && !isPlainText) {
                return parser.failedFetch(HttpStatus.SC_GONE);
            }

            return parser.parseContent(urlToFetch, result.getContent(), result.getContentType(),
                    fetcher.getUserAgent().getAgentName());
        } catch (HttpFetchException e) {
            return parser.failedFetch(e.getHttpStatus());
        } catch (IOFetchException e) {
            return parser.failedFetch(HttpStatus.SC_INTERNAL_SERVER_ERROR);
        } catch (RedirectFetchException e) {
            // Other sites will have circular redirects, so treat this as a
            // missing robots.txt
            return parser.failedFetch(HttpStatus.SC_GONE);
        } catch (Exception e) {
            LOGGER.error("Unexpected exception fetching robots.txt: " + robotsUrl, e);
            return parser.failedFetch(HttpStatus.SC_INTERNAL_SERVER_ERROR);
        }
    }

}
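
Usage example

Below is a minimal, hypothetical sketch of how the class above might be used. It assumes crawler-commons' SimpleRobotRulesParser as the BaseRobotsParser implementation and a three-argument UserAgent constructor (agent name, contact email, website); adjust the class names and constructor arguments to match the library version you build against.

import java.net.URL;

import crawlercommons.fetcher.http.BaseHttpFetcher;
import crawlercommons.fetcher.http.UserAgent;
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.RobotUtils;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotUtilsExample {
    public static void main(String[] args) throws Exception {
        // Identify the crawler: agent name, contact email, and crawler info page.
        UserAgent userAgent = new UserAgent("mycrawler", "crawler@example.com", "http://example.com/crawler");

        // Build a fetcher preconfigured with the conservative robots.txt settings
        // from RobotUtils (128K max content, 10-second timeouts, 2 retries).
        BaseHttpFetcher fetcher = RobotUtils.createFetcher(userAgent, 1);

        // Fetch and parse the robots.txt file, then check a URL against the rules.
        URL robotsUrl = new URL("http://example.com/robots.txt");
        BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, new SimpleRobotRulesParser(), robotsUrl);

        System.out.println("Allowed: " + rules.isAllowed("http://example.com/some/page.html"));
        System.out.println("Crawl delay (ms): " + rules.getCrawlDelay());
    }
}

Because getRobotRules catches every fetch exception and maps it to parser.failedFetch(), the caller always gets a usable BaseRobotRules object back and can simply check isAllowed() for each candidate URL.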