Java tutorial: fetching and parsing robots.txt with Bixo's RobotUtils
/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.robots;

import java.net.URL;

import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import bixo.config.FetcherPolicy;
import bixo.config.UserAgent;
import bixo.datum.FetchedDatum;
import bixo.datum.ScoredUrlDatum;
import bixo.exceptions.HttpFetchException;
import bixo.exceptions.IOFetchException;
import bixo.exceptions.RedirectFetchException;
import bixo.fetcher.BaseFetcher;
import bixo.fetcher.SimpleHttpFetcher;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.BaseRobotsParser;

public class RobotUtils {
    private static final Logger LOGGER = LoggerFactory.getLogger(RobotUtils.class);

    // Some robots.txt files are > 64K, amazingly enough.
    private static final int MAX_ROBOTS_SIZE = 128 * 1024;

    // subdomain.domain.com can redirect to domain.com, so if we're simultaneously fetching
    // a bunch of robots from subdomains that redirect, we'll exceed the default limit.
    private static final int MAX_CONNECTIONS_PER_HOST = 20;

    // Crank down default values when fetching robots.txt, as this should be super
    // fast to get back.
    private static final int ROBOTS_CONNECTION_TIMEOUT = 10 * 1000;
    private static final int ROBOTS_SOCKET_TIMEOUT = 10 * 1000;
    private static final int ROBOTS_RETRY_COUNT = 2;

    // TODO KKr - set up min response rate, use it with max size to calc max
    // time for valid download, use it for COMMAND_TIMEOUT

    // Amount of time we'll wait for pending tasks to finish up. This is roughly equal
    // to the max amount of time it might take to fetch a robots.txt file (excluding
    // download time, which we could add).
    // FUTURE KKr - add in time to do the download.
    private static final long MAX_FETCH_TIME = (ROBOTS_CONNECTION_TIMEOUT + ROBOTS_SOCKET_TIMEOUT) * ROBOTS_RETRY_COUNT;

    public static BaseFetcher createFetcher(BaseFetcher fetcher) {
        return createFetcher(fetcher.getUserAgent(), fetcher.getMaxThreads());
    }

    public static BaseFetcher createFetcher(UserAgent userAgent, int maxThreads) {
        // TODO KKr - add static createRobotsFetcher method somewhere that
        // I can use here, and also in SimpleGroupingKeyGenerator
        FetcherPolicy policy = new FetcherPolicy();
        policy.setMaxContentSize(MAX_ROBOTS_SIZE);
        policy.setMaxConnectionsPerHost(MAX_CONNECTIONS_PER_HOST);
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, policy, userAgent);
        fetcher.setMaxRetryCount(ROBOTS_RETRY_COUNT);
        fetcher.setConnectionTimeout(ROBOTS_CONNECTION_TIMEOUT);
        fetcher.setSocketTimeout(ROBOTS_SOCKET_TIMEOUT);

        return fetcher;
    }

    public static long getMaxFetchTime() {
        return MAX_FETCH_TIME;
    }

    /**
     * Externally visible, static method for use in tools and for testing.
     * Fetch the indicated robots.txt file, parse it, and generate rules.
     *
     * @param fetcher Fetcher for downloading robots.txt file
     * @param parser Parser for the robots.txt content
     * @param robotsUrl URL to robots.txt file
     * @return Robot rules
     */
    public static BaseRobotRules getRobotRules(BaseFetcher fetcher, BaseRobotsParser parser, URL robotsUrl) {
        try {
            String urlToFetch = robotsUrl.toExternalForm();
            ScoredUrlDatum scoredUrl = new ScoredUrlDatum(urlToFetch);
            FetchedDatum result = fetcher.get(scoredUrl);

            // HACK! DANGER! Some sites will redirect the request to the top-level domain
            // page, without returning a 404. So look for a response which has a redirect,
            // and the fetched content is not plain text, and assume it's one of these...
            // which is the same as not having a robots.txt file.
            String contentType = result.getContentType();
            boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
            if ((result.getNumRedirects() > 0) && !isPlainText) {
                return parser.failedFetch(HttpStatus.SC_GONE);
            }

            return parser.parseContent(urlToFetch, result.getContentBytes(), result.getContentType(),
                            fetcher.getUserAgent().getAgentName());
        } catch (HttpFetchException e) {
            return parser.failedFetch(e.getHttpStatus());
        } catch (IOFetchException e) {
            return parser.failedFetch(HttpStatus.SC_INTERNAL_SERVER_ERROR);
        } catch (RedirectFetchException e) {
            // Other sites will have circular redirects, so treat this as a missing robots.txt
            return parser.failedFetch(HttpStatus.SC_GONE);
        } catch (Exception e) {
            LOGGER.error("Unexpected exception fetching robots.txt: " + robotsUrl, e);
            return parser.failedFetch(HttpStatus.SC_INTERNAL_SERVER_ERROR);
        } catch (Throwable t) {
            LOGGER.error("Unexpected throwable caught while fetching robots.txt: " + robotsUrl, t);
            return parser.failedFetch(HttpStatus.SC_INTERNAL_SERVER_ERROR);
        }
    }
}
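
To show how these pieces fit together, here is a minimal usage sketch, not part of the class above. It assumes crawler-commons' SimpleRobotRulesParser as the BaseRobotsParser implementation; the agent name, contact address, and URLs are placeholders you would replace with your own.

import java.net.URL;

import bixo.config.UserAgent;
import bixo.fetcher.BaseFetcher;
import bixo.robots.RobotUtils;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotUtilsExample {

    public static void main(String[] args) throws Exception {
        // Placeholder crawler identity: agent name, contact email, and crawler info page.
        UserAgent userAgent = new UserAgent("mycrawler", "crawler@example.com", "http://example.com/crawler");

        // Build a fetcher tuned for robots.txt (128K max content, short timeouts, two retries),
        // here with a single fetch thread.
        BaseFetcher fetcher = RobotUtils.createFetcher(userAgent, 1);

        // Fetch and parse the rules for one site.
        URL robotsUrl = new URL("http://www.example.com/robots.txt");
        BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, new SimpleRobotRulesParser(), robotsUrl);

        // Ask the parsed rules whether a page may be crawled, and what crawl delay applies.
        System.out.println("Allowed: " + rules.isAllowed("http://www.example.com/some/page.html"));
        System.out.println("Crawl delay (ms): " + rules.getCrawlDelay());
    }
}

Note that getRobotRules() never throws: fetch failures are converted into failedFetch() rules (for example, a redirect loop is treated like a missing robots.txt), so callers can always query the returned BaseRobotRules directly.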