Java tutorial
/** * This file is part of the Paxle project. * Visit http://www.paxle.net for more information. * Copyright 2007-2010 the original author or authors. * * Licensed under the terms of the Common Public License 1.0 ("CPL 1.0"). * Any use, reproduction or distribution of this program constitutes the recipient's acceptance of this agreement. * The full license text is available under http://www.opensource.org/licenses/cpl1.0.txt * or in the file LICENSE.txt in the root directory of the Paxle distribution. * * Unless required by applicable law or agreed to in writing, this software is distributed * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ package org.paxle.crawler.http.impl; import java.io.IOException; import java.io.InputStream; import java.net.ConnectException; import java.net.NoRouteToHostException; import java.net.SocketTimeoutException; import java.net.URI; import java.net.UnknownHostException; import java.util.Arrays; import java.util.Date; import java.util.Dictionary; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import java.util.zip.ZipInputStream; import org.apache.commons.httpclient.CircularRedirectException; import org.apache.commons.httpclient.ConnectTimeoutException; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.NoHttpResponseException; import org.apache.commons.httpclient.ProxyHost; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.UsernamePasswordCredentials; import org.apache.commons.httpclient.auth.AuthScope; import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.HeadMethod; import org.apache.commons.httpclient.params.HttpConnectionManagerParams; import org.apache.commons.httpclient.protocol.Protocol; import org.apache.commons.httpclient.util.DateParseException; import org.apache.commons.httpclient.util.DateUtil; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.felix.scr.annotations.Activate; import org.apache.felix.scr.annotations.Component; import org.apache.felix.scr.annotations.Deactivate; import org.apache.felix.scr.annotations.Modified; import org.apache.felix.scr.annotations.Property; import org.apache.felix.scr.annotations.Reference; import org.apache.felix.scr.annotations.Service; import org.apache.felix.scr.annotations.Services; import org.osgi.framework.BundleContext; import org.paxle.core.doc.ICrawlerDocument; import org.paxle.core.prefs.IPropertiesStore; import org.paxle.core.prefs.Properties; import org.paxle.crawler.ContentLengthLimitExceededException; import org.paxle.crawler.ICrawlerContext; import org.paxle.crawler.ICrawlerContextAware; import org.paxle.crawler.ICrawlerContextLocal; import org.paxle.crawler.ICrawlerTools; import org.paxle.crawler.ISubCrawler; import org.paxle.crawler.ICrawlerTools.ILimitedRateCopier; @Component(name = HttpCrawler.PID, immediate = true, metatype = false) @Services({ @Service(ISubCrawler.class), @Service(ICrawlerContextAware.class) }) @Property(name = ISubCrawler.PROP_PROTOCOL, value = { "http", "https" }) public class HttpCrawler implements ISubCrawler, ICrawlerContextAware { /* ========================================================= * Config Properties * ========================================================= */ static final String PID = "org.paxle.crawler.http.IHttpCrawler"; @Property(intValue = 15000) static final String PROP_CONNECTION_TIMEOUT = PID + '.' + "connectionTimeout"; @Property(intValue = 15000) static final String PROP_SOCKET_TIMEOUT = PID + '.' + "socketTimeout"; @Property(intValue = 15) static final String PROP_MAXCONNECTIONS_PER_HOST = PID + '.' + "maxConnectionsPerHost"; @Property(intValue = 10485760) static final String PROP_MAXDOWNLOAD_SIZE = PID + '.' + "maxDownloadSize"; @Property(boolValue = true) static final String PROP_ACCEPT_ENCODING = PID + '.' + "acceptEncodings"; @Property(boolValue = true) static final String PROP_SKIP_UNSUPPORTED_MIMETYPES = PID + '.' + "skipUnsupportedMimeTypes"; @Property(intValue = -1) static final String PROP_TRANSFER_LIMIT = PID + '.' + "transferLimit"; // in KB/s @Property(value = CookiePolicy.BROWSER_COMPATIBILITY) static final String PROP_COOKIE_POLICY = PID + '.' + "cookiePolicy"; @Property(value = "Mozilla/5.0 (compatible; ${paxle.userAgent}/${paxle.version}; +http://www.paxle.net/en/bot)") static final String PROP_USER_AGENT = PID + '.' + "userAgent"; @Property(boolValue = false) static final String PROP_PROXY_USE = PID + '.' + "useProxy"; @Property(value = "") static final String PROP_PROXY_HOST = PID + '.' + "proxyHost"; @Property(intValue = 3128) static final String PROP_PROXY_PORT = PID + '.' + "proxyPort"; @Property(value = "") static final String PROP_PROXY_USER = PID + '.' + "proxyUser"; @Property(value = "") static final String PROP_PROXY_PASSWORD = PID + '.' + "proxyPassword"; /* ========================================================= * Constants for HTTP headers * ========================================================= */ private static final String HTTPHEADER_LAST_MODIFIED = "Last-Modified"; private static final String HTTPHEADER_DATE = "Date"; private static final String HTTPHEADER_CONTENT_LANGUAGE = "Content-Language"; private static final String HTTPHEADER_CONTENT_TYPE = "Content-Type"; private static final String HTTPHEADER_CONTENT_LENGTH = "Content-Length"; private static final String HTTPHEADER_CONTENT_ENCODING = "Content-Encoding"; private static final String HTTPHEADER_ACCEPT_ENCODING = "Accept-Encoding"; private static final int PREF_NO_ENCODING = 1; /** * The MIME-type detection of some servers is not that mature, i.e. Apache often tends to * report 'text/plain' for binary files or 'application/x-tar' for compressed tar-archives, * which does not help us at all. This set contains MIME-types, known to be reported erroneously * by servers in general. * Of course this list could be divided further by extending it into a per-server map, but * our means to determine the type (and possibly version) of the servers here are limited, so * this shall suffice for now. */ private static final HashSet<String> ERRONEOUS_MIME_TYPES = new HashSet<String>( Arrays.asList("text/plain", "application/x-tar")); protected ICrawlerContextLocal contextLocal; @Reference protected IPropertiesStore propstore; /** * Connection manager used for http connection pooling */ private MultiThreadedHttpConnectionManager connectionManager = null; /** * http client class */ private HttpClient httpClient = null; /** * Logger class */ private Log logger = LogFactory.getLog(this.getClass()); /** * The maximum size of a file. If set to <code>-1</code>, all files are fetched, otherwise * (if the server provides the file-size) only files with a size equal to or less than this value. */ private int maxDownloadSize = -1; private boolean acceptEncoding = true; private boolean skipUnsupportedMimeTypes = true; private ILimitedRateCopier lrc = null; /** * The Cookie policy to use for crawling * @see CookiePolicy */ private String cookiePolicy = null; /** * The User-Agent header the crawler should use */ private String userAgent = null; private Properties props = null; private ConcurrentHashMap<String, Integer> hostSettings; @Activate public void activate(BundleContext context, Map<String, Object> config) { // reading the configuration this.modified(config); // registering the protocol handler for https Protocol.registerProtocol("https", new Protocol("https", new AllSSLProtocolSocketFactory(), 443)); // getting the component preferences if (this.propstore != null) { this.props = this.propstore.getProperties(context); if (props != null) { final Set<Object> keySet = props.keySet(); this.hostSettings = new ConcurrentHashMap<String, Integer>(keySet.size(), 0.75f, 10); for (final Object o : keySet) { final String key = (String) o; this.hostSettings.put(key, Integer.valueOf(props.getProperty(key))); } } } if (this.hostSettings == null) { this.hostSettings = new ConcurrentHashMap<String, Integer>(10, 0.75f, 10); } } @Deactivate public void deactivate() { this.cleanup(); this.saveProperties(); } public void setCrawlerContextLocal(ICrawlerContextLocal contextLocal) { this.contextLocal = contextLocal; } public void saveProperties() { if (props != null) { for (Map.Entry<String, Integer> e : hostSettings.entrySet()) props.setProperty(e.getKey(), Integer.toString(e.getValue().intValue())); } } private boolean isHostSettingSet(final String host, final int pref) { final Integer i = hostSettings.get(host); if (i == null) return false; return (i.intValue() & pref) != 0; } private void setHostSetting(final String host, final int pref) { final Integer i = hostSettings.get(host); final int val = (i == null) ? pref : (i.intValue() | pref); this.hostSettings.put(host, Integer.valueOf(val)); } /** * cleanup old settings */ private void cleanup() { if (this.connectionManager != null) { this.connectionManager.shutdown(); this.connectionManager = null; } if (this.httpClient != null) { this.httpClient = null; } } @Modified public synchronized void modified(Map<String, Object> configuration) { /* * Cleanup old config */ this.cleanup(); /* * Init with changed configuration */ this.connectionManager = new MultiThreadedHttpConnectionManager(); final HttpConnectionManagerParams connectionManagerParams = connectionManager.getParams(); // configure connections per host final Integer maxConnections = (Integer) configuration.get(PROP_MAXCONNECTIONS_PER_HOST); if (maxConnections != null) { connectionManagerParams.setDefaultMaxConnectionsPerHost(maxConnections.intValue()); } // configuring timeouts final Integer connectionTimeout = (Integer) configuration.get(PROP_CONNECTION_TIMEOUT); if (connectionTimeout != null) { connectionManagerParams.setConnectionTimeout(connectionTimeout.intValue()); } final Integer socketTimeout = (Integer) configuration.get(PROP_SOCKET_TIMEOUT); if (socketTimeout != null) { connectionManagerParams.setSoTimeout(socketTimeout.intValue()); } // set new http client this.httpClient = new HttpClient(connectionManager); // the crawler should request and accept content-encoded data final Boolean acceptEncoding = (Boolean) configuration.get(PROP_ACCEPT_ENCODING); if (acceptEncoding != null) { this.acceptEncoding = acceptEncoding.booleanValue(); } // specifies if the crawler should skipp unsupported-mime-types final Boolean skipUnsupportedMimeTypes = (Boolean) configuration.get(PROP_SKIP_UNSUPPORTED_MIMETYPES); if (skipUnsupportedMimeTypes != null) { this.skipUnsupportedMimeTypes = skipUnsupportedMimeTypes.booleanValue(); } // the cookie policy to use for crawling final String propCookiePolicy = (String) configuration.get(PROP_COOKIE_POLICY); this.cookiePolicy = (propCookiePolicy == null || propCookiePolicy.length() == 0) ? CookiePolicy.BROWSER_COMPATIBILITY : propCookiePolicy; // the http-user-agent string that should be used final String userAgent = (String) configuration.get(PROP_USER_AGENT); if (userAgent != null) { StringBuffer buf = new StringBuffer(); Pattern pattern = Pattern.compile("\\$\\{[^\\}]*}"); Matcher matcher = pattern.matcher(userAgent); // replacing property placeholders with system-properties while (matcher.find()) { String placeHolder = matcher.group(); String propName = placeHolder.substring(2, placeHolder.length() - 1); String propValue = System.getProperty(propName); if (propValue != null) matcher.appendReplacement(buf, propValue); } matcher.appendTail(buf); this.userAgent = buf.toString(); } else { // Fallback this.userAgent = "PaxleFramework"; } // download limit in bytes final Integer maxDownloadSize = (Integer) configuration.get(PROP_MAXDOWNLOAD_SIZE); if (maxDownloadSize != null) { this.maxDownloadSize = maxDownloadSize.intValue(); } // limit data transfer rate final Integer transferLimit = (Integer) configuration.get(PROP_TRANSFER_LIMIT); int limitKBps = 0; if (transferLimit != null) limitKBps = transferLimit.intValue(); this.logger.debug("transfer rate limit: " + limitKBps + " kb/s"); // TODO: lrc = (limitKBps > 0) ? new CrawlerTools.LimitedRateCopier(limitKBps) : null; // proxy configuration final Boolean useProxyVal = (Boolean) configuration.get(PROP_PROXY_USE); final String host = (String) configuration.get(PROP_PROXY_HOST); final Integer portVal = (Integer) configuration.get(PROP_PROXY_PORT); if (useProxyVal != null && useProxyVal.booleanValue() && host != null && host.length() > 0 && portVal != null) { this.logger.info(String.format("Proxy is enabled: %s:%d", host, portVal)); final int port = portVal.intValue(); final ProxyHost proxyHost = new ProxyHost(host, port); this.httpClient.getHostConfiguration().setProxyHost(proxyHost); final String user = (String) configuration.get(PROP_PROXY_HOST); final String pwd = (String) configuration.get(PROP_PROXY_PASSWORD); if (user != null && user.length() > 0 && pwd != null && pwd.length() > 0) this.httpClient.getState().setProxyCredentials(new AuthScope(host, port), new UsernamePasswordCredentials(user, pwd)); } else { this.logger.info("Proxy is disabled"); this.httpClient.getHostConfiguration().setProxyHost(null); this.httpClient.getState().clearCredentials(); } } /** * This method is synchronized with {@link #modified(Dictionary)} to avoid * problems during configuration update. * * @return the {@link HttpClient} to use */ private synchronized HttpClient getHttpClient() { return this.httpClient; } /** * Initializes the {@link HttpMethod} with common attributes for all requests this crawler * initiates. * <p> * Currently the following attributes (represented as HTTP header values in the final request) * are set: * <ul> * <li>the cookie policy to use ({@link #PROP_COOKIE_POLICY})</li> * <li> * if enabled, content-transformation using <code>compress</code>, <code>gzip</code> and * <code>deflate</code> is supported</li> * </li> * </ul> * * @param method the method to set the standard attributes on */ private void initRequestMethod(final HttpMethod method) throws URIException { method.getParams().setCookiePolicy( this.cookiePolicy == null ? CookiePolicy.BROWSER_COMPATIBILITY : this.cookiePolicy); if (acceptEncoding && !isHostSettingSet(method.getURI().getHost(), PREF_NO_ENCODING)) method.setRequestHeader(HTTPHEADER_ACCEPT_ENCODING, "compress, gzip, identity, deflate"); // see RFC 2616, section 14.3 // set some additional http headers if (this.userAgent != null) { method.setRequestHeader("User-Agent", this.userAgent); } } /** * This method handles the <code>Content-Type</code> {@link Header} of a HTTP-request. * If available, the header is used to set the MIME-type of the {@link ICrawlerDocument} * as well as the character set. The return value determines whether processing of the * URL shall be continued or not * <p> * In case of a positive result, the <code>doc</code>'s MIME-type and - if available - * charset will be set. Otherwise it's <code>result</code> will be set to * {@link ICrawlerDocument.Status#UNKNOWN_FAILURE} and a warning message will be logged. * * @see #HTTPHEADER_CONTENT_TYPE * @param contentTypeHeader the HTTP-{@link Header}'s <code>Content-Type</code> attribute * @param doc the {@link ICrawlerDocument} the resulting MIME-type and charset shall be set in * @return <code>true</code> if proceeding with the URL may continue or <code>false</code> if * it shall be aborted due to an unsupported MIME-type of the requested document */ boolean handleContentTypeHeader(final Header contentTypeHeader, final ICrawlerDocument doc) { if (contentTypeHeader == null) { // might be ok, might be not, we don't know yet return true; } // separate MIME-type and charset from the content-type specification String contentMimeType = null; String contentCharset = null; contentMimeType = contentTypeHeader.getValue(); int idx = contentMimeType.indexOf(";"); if (idx != -1) { contentCharset = contentMimeType.substring(idx + 1).trim(); contentMimeType = contentMimeType.substring(0, idx); if (contentCharset.toLowerCase().startsWith("charset=")) { contentCharset = contentCharset.substring("charset=".length()).trim(); idx = contentCharset.indexOf(";"); if (idx != -1) contentCharset = contentCharset.substring(0, idx); if (contentCharset.matches("^['\"].*")) { contentCharset = contentCharset.substring(1); } if (contentCharset.matches(".*['\"]$")) { contentCharset = contentCharset.substring(0, contentCharset.length() - 1); } doc.setCharset(contentCharset.trim()); } else { contentCharset = null; } } // check against common MIME-types wrongly attributed to files by servers // if this is one of them, we just ignore the MIME-type and let the MimeType-bundle do the job if (!ERRONEOUS_MIME_TYPES.contains(contentMimeType)) { doc.setMimeType(contentMimeType); // getting the crawler-context final ICrawlerContext context = this.contextLocal.getCurrentContext(); // check if there is any parser installed, supporting this mime-type if (this.skipUnsupportedMimeTypes) { if (!context.getSupportedMimeTypes().contains(contentMimeType)) { // abort String msg = String.format( "Mimetype '%s' of resource '%s' not supported by any parser installed on the system.", contentMimeType, doc.getLocation()); this.logger.warn(msg); doc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, msg); return false; } } } // continue return true; } /** * This method handles the <code>Content-Length</code> {@link Header} of a HTTP-request. * If given, and this {@link HttpCrawler} has a valid {@link #maxDownloadSize} set, both values * are being compared and if the further exceeds the latter, <code>false</code> is retured, * <code>true</code> otherwise * <p> * No values are set in the {@link ICrawlerDocument} in case of a positive result, otherwise * the document's <code>result</code> is set to {@link ICrawlerDocument.Status#UNKNOWN_FAILURE} * and a warning message is logged. * * @see #maxDownloadSize * @param contentTypeLength the HTTP-{@link Header}'s <code>Content-Length</code> attribute * @param doc the {@link ICrawlerDocument} the resulting MIME-type and charset shall be set in * @return <code>true</code> if proceeding with the URL may continue or <code>false</code> if * it shall be aborted due to an exceeded maximum file-size of the document */ private boolean handleContentLengthHeader(final Header contentTypeLength, final ICrawlerDocument doc) { if (contentTypeLength == null) { // no Content-Length given, continue return true; } final int maxDownloadSize = this.maxDownloadSize; if (maxDownloadSize < 0) { // no maximum specified, continue return true; } // extract the content length in bytes final String lengthString = contentTypeLength.getValue(); if (lengthString.length() > 0 && lengthString.matches("\\d+")) { final int contentLength = Integer.parseInt(lengthString); if (contentLength > maxDownloadSize) { // reject the document final String msg = String.format( "Content-length '%d' of resource '%s' is larger than the max. allowed size of '%d' bytes.", Integer.valueOf(contentLength), doc.getLocation(), Integer.valueOf(maxDownloadSize)); this.logger.warn(msg); doc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, msg); return false; } } // continue return true; } public ICrawlerDocument request(URI requestUri) { if (requestUri == null) throw new NullPointerException("URL was null"); this.logger.debug(String.format("Crawling URL '%s' ...", requestUri)); ICrawlerDocument doc = null; HttpMethod method = null; try { final ICrawlerContext ctx = this.contextLocal.getCurrentContext(); // creating an empty crawler-document doc = ctx.createDocument(); doc.setLocation(requestUri); final String uriAsciiString = requestUri.toASCIIString(); /* ============================================================================== * HTTP HEAD request * * first use the HEAD method to determine whether the MIME-type is supported * and to compare the content-length with the maximum allowed download size * (both only if the server provides this information, if not, the file is * fetched) * ============================================================================== */ method = new HeadMethod(uriAsciiString); // automatically follows redirects this.initRequestMethod(method); int statusCode = this.getHttpClient().executeMethod(method); final boolean headUnsupported = (statusCode == HttpStatus.SC_METHOD_FAILURE || statusCode == HttpStatus.SC_METHOD_NOT_ALLOWED); if (!headUnsupported) { if (statusCode != HttpStatus.SC_OK) { // RFC 2616 states that the GET and HEAD methods _must_ be supported by any // general purpose servers (which are in fact the ones we are connecting to here) if (statusCode == HttpStatus.SC_NOT_FOUND) { doc.setStatus(ICrawlerDocument.Status.NOT_FOUND); } else { doc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, String.format("Server returned: %s", method.getStatusLine())); } this.logger.warn(String.format("Crawling of URL '%s' failed. Server returned: %s", requestUri, method.getStatusLine())); return doc; } // getting the mimetype and charset Header contentTypeHeader = method.getResponseHeader(HTTPHEADER_CONTENT_TYPE); if (!handleContentTypeHeader(contentTypeHeader, doc)) return doc; // reject the document if content-length is above our limit Header contentLengthHeader = method.getResponseHeader(HTTPHEADER_CONTENT_LENGTH); if (!handleContentLengthHeader(contentLengthHeader, doc)) return doc; // FIXME: we've been redirected, re-enqueue the new URL and abort processing //if (!requestUri.equals(method.getURI())) ; } /* ============================================================================== * HTTP GET request * * secondly - if everything is alright up to now - proceed with getting the * actual document * ============================================================================== */ HttpMethod getMethod = new GetMethod(uriAsciiString); // automatically follows redirects method.releaseConnection(); method = getMethod; this.initRequestMethod(method); // send the request to the server statusCode = this.getHttpClient().executeMethod(method); // check the response status code if (statusCode != HttpStatus.SC_OK) { if (statusCode == HttpStatus.SC_NOT_FOUND) { doc.setStatus(ICrawlerDocument.Status.NOT_FOUND); } else { doc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, String.format("Server returned: %s", method.getStatusLine())); } this.logger.warn(String.format("Crawling of URL '%s' failed. Server returned: %s", requestUri, method.getStatusLine())); return doc; } // FIXME: we've been redirected, re-enqueue the new URL and abort processing // if (!requestUri.equals(method.getURI())) ; /* * HTTP Content-Type * - getting the mimetype and charset */ Header contentTypeHeader = method.getResponseHeader(HTTPHEADER_CONTENT_TYPE); if (!handleContentTypeHeader(contentTypeHeader, doc)) return doc; /* * HTTP Content-Length * - Reject the document if content-length is above our limit * * We do this a second time here because some servers may have set the content-length * of the head response to <code>0</code> */ Header contentLengthHeader = method.getResponseHeader(HTTPHEADER_CONTENT_LENGTH); if (!handleContentLengthHeader(contentLengthHeader, doc)) return doc; extractHttpHeaders(method, doc); // externalised into this method to cleanup here a bit // getting the response body InputStream respBody = method.getResponseBodyAsStream(); // handle the content-encoding, i.e. decompress the server's response Header contentEncodingHeader = method.getResponseHeader(HTTPHEADER_CONTENT_ENCODING); try { respBody = handleContentEncoding(contentEncodingHeader, respBody); /* Limit the max allowed length of the content to copy. -1 is used for no limit. * * We need to set a limit if: * a) the user has configured a max-download-size AND * b) the server returned no content-length header */ int copyLimit = (this.maxDownloadSize <= 0 || contentLengthHeader != null) ? -1 : this.maxDownloadSize; // copy the content to file final ICrawlerTools crawlerTools = ctx.getCrawlerTools(); crawlerTools.saveInto(doc, respBody, lrc, copyLimit); doc.setStatus(ICrawlerDocument.Status.OK); this.logger.debug(String.format("Crawling of URL '%s' finished.", requestUri)); } catch (IOException e) { String msg = e.getMessage(); if (msg == null || !msg.equals("Corrupt GZIP trailer")) throw e; setHostSetting(method.getURI().getHost(), PREF_NO_ENCODING); msg = String.format("server sent a corrupt gzip trailer at URL '%s'", requestUri); logger.warn(msg); // FIXME re-enqueue command doc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, msg); } finally { respBody.close(); } } catch (NoRouteToHostException e) { this.logger.warn(String.format("Error crawling %s: %s", requestUri, e.getMessage())); doc.setStatus(ICrawlerDocument.Status.NOT_FOUND, e.getMessage()); } catch (UnknownHostException e) { this.logger.warn(String.format("Error crawling %s: Unknown host.", requestUri)); doc.setStatus(ICrawlerDocument.Status.NOT_FOUND, e.getMessage()); } catch (ConnectException e) { this.logger.warn(String.format("Error crawling %s: Unable to connect to host.", requestUri)); doc.setStatus(ICrawlerDocument.Status.NOT_FOUND, e.getMessage()); } catch (ConnectTimeoutException e) { this.logger.warn(String.format("Error crawling %s: %s.", requestUri, e.getMessage())); doc.setStatus(ICrawlerDocument.Status.NOT_FOUND, e.getMessage()); } catch (SocketTimeoutException e) { this.logger.warn(String.format("Error crawling %s: Connection timeout.", requestUri)); doc.setStatus(ICrawlerDocument.Status.NOT_FOUND, e.getMessage()); } catch (CircularRedirectException e) { this.logger.warn(String.format("Error crawling %s: %s", requestUri, e.getMessage())); doc.setStatus(ICrawlerDocument.Status.NOT_FOUND, e.getMessage()); } catch (NoHttpResponseException e) { this.logger.warn(String.format("Error crawling %s: %s", requestUri, e.getMessage())); doc.setStatus(ICrawlerDocument.Status.NOT_FOUND, e.getMessage()); } catch (ContentLengthLimitExceededException e) { this.logger.warn(String.format("Error crawling %s: %s", requestUri, e.getMessage())); doc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, e.getMessage()); } catch (Throwable e) { String errorMsg; if (e instanceof HttpException) { errorMsg = "Unrecovered protocol exception: [%s] %s"; } else if (e instanceof IOException) { errorMsg = "Transport exceptions: [%s] %s"; } else { errorMsg = "Unexpected exception: [%s] %s"; } errorMsg = String.format(errorMsg, e.getClass().getName(), e.getMessage()); this.logger.error(String.format("Error crawling %s: %s", requestUri, errorMsg)); doc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, errorMsg); e.printStackTrace(); } finally { if (method != null) method.releaseConnection(); } return doc; } private static void extractHttpHeaders(final HttpMethod method, final ICrawlerDocument doc) throws IOException { // getting the document languages Header contentLanguageHeader = method.getResponseHeader(HTTPHEADER_CONTENT_LANGUAGE); if (contentLanguageHeader != null) { String contentLanguage = contentLanguageHeader.getValue(); String[] languages = contentLanguage.split(","); doc.setLanguages(languages); } // crawling Date Date crawlingDate = null; Header crawlingDateHeader = method.getResponseHeader(HTTPHEADER_DATE); if (crawlingDateHeader == null) { crawlingDate = new Date(); } else try { String dateStr = crawlingDateHeader.getValue(); crawlingDate = DateUtil.parseDate(dateStr); } catch (DateParseException e) { crawlingDate = new Date(); } doc.setCrawlerDate(crawlingDate); // last mod date Date lastModDate = null; Header lastModDateHeader = method.getResponseHeader(HTTPHEADER_LAST_MODIFIED); if (lastModDateHeader != null) try { String dateStr = lastModDateHeader.getValue(); lastModDate = DateUtil.parseDate(dateStr); } catch (DateParseException e) { lastModDate = crawlingDate; } doc.setLastModDate(lastModDate); // ETAG // XXX: this is protocol specific. How to store this in a generic crawler-document? // Header etageHeader = method.getResponseHeader(HTTPHEADER_ETAG); // if (etageHeader != null) { // String etag = etageHeader.getValue(); // doc.setEtag(etag); // } } private static InputStream handleContentEncoding(final Header contentEncodingHeader, InputStream responseBody) throws IOException { if (contentEncodingHeader == null) return responseBody; final String contentEncoding = contentEncodingHeader.getValue(); final StringTokenizer st = new StringTokenizer(contentEncoding, ","); // apply decompression methods in the order given, see RFC 2616, section 14.11 while (st.hasMoreTokens()) { String encoding = st.nextToken().trim(); // the "identity"-encoding does not need any transformation if (encoding.equals("deflate")) { responseBody = new ZipInputStream(responseBody); } else { // support for the recommendation of the W3C, see http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.5 if (encoding.startsWith("x-")) encoding = encoding.substring("x-".length()); if (encoding.equals("gzip") || encoding.equals("compress")) { responseBody = new GZIPInputStream(responseBody); } } } return responseBody; } }