com.nanocrawler.fetcher.PageFetcher.java Source code

Introduction

Here is the source code for com.nanocrawler.fetcher.PageFetcher.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Based on crawler4j project by Yasser Ganjisaffar
 */
package com.nanocrawler.fetcher;

import com.nanocrawler.data.CustomFetchStatus;
import com.nanocrawler.data.PageFetchResult;
import com.nanocrawler.urlmanipulation.URLCanonicalizer;
import com.nanocrawler.urlmanipulation.WebURL;
import com.nanocrawler.util.CrawlConfig;
import java.io.IOException;
import java.util.Date;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HttpContext;
import org.apache.log4j.Logger;

// Page fetcher class
public class PageFetcher {

    protected static final Logger logger = Logger.getLogger(PageFetcher.class);

    protected DefaultHttpClient httpClient;
    protected final Object mutex = new Object();
    protected long lastFetchTime = 0;

    protected PoolingClientConnectionManager connectionManager;
    protected IdleConnectionMonitorThread connectionMonitorThread = null;

    private final CrawlConfig config;

    private static PageFetcher instance = null;

    // Returns HTTP client used in fetching pages
    public HttpClient getHttpClient() {
        return httpClient;
    }

    // Constructor
    public PageFetcher(CrawlConfig config) {
        this.config = config;

        HttpParams params = new BasicHttpParams();
        HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
        paramsBean.setVersion(HttpVersion.HTTP_1_1);
        paramsBean.setContentCharset("UTF-8");
        paramsBean.setUseExpectContinue(false);
        params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);
        params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
        params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
        params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());
        params.setBooleanParameter("http.protocol.handle-redirects", false);

        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        if (config.isIncludeHttpsPages()) {
            schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
        }

        connectionManager = new PoolingClientConnectionManager(schemeRegistry);
        connectionManager.setMaxTotal(config.getMaxTotalConnections());
        connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
        httpClient = new DefaultHttpClient(connectionManager, params);
    }

    // Initializes fetcher and starts connection monitoring
    public void initialize() {
        if (config.getProxyHost() != null) {
            if (config.getProxyUsername() != null) {
                httpClient.getCredentialsProvider().setCredentials(
                        new AuthScope(config.getProxyHost(), config.getProxyPort()),
                        new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
            }

            HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
            httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        }

        // Since fetcher is manually working on HTTP responses, gzipped response data has to be decompressed manually
        httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
            @Override
            public void process(final HttpResponse response, final HttpContext context)
                    throws HttpException, IOException {
                HttpEntity entity = response.getEntity();
                Header contentEncoding = entity.getContentEncoding();

                if (contentEncoding != null) {
                    HeaderElement[] codecs = contentEncoding.getElements();
                    for (HeaderElement codec : codecs) {
                        if (codec.getName().equalsIgnoreCase("gzip")) {
                            response.setEntity(new GzipDecompressingEntity(response.getEntity()));
                            return;
                        }
                    }
                }
            }
        });

        if (connectionMonitorThread == null) {
            connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
        }
        connectionMonitorThread.start();
    }

    // Waits for the next fetch start - single fetcher, called from several crawlers, hence mutex object is used
    private void waitForFetchStart() throws InterruptedException {
        synchronized (mutex) {
            long now = (new Date()).getTime();
            // TO_DO: CRAWL DELAY HERE!
            if (now - lastFetchTime < config.getPolitenessDelay()) {
                Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
            }
            lastFetchTime = (new Date()).getTime();
        }
    }

    // Checks the header and returns true / false depending on status code from the server
    private boolean checkHeader(PageFetchResult fetchResult, HttpResponse response, String toFetchURL,
            HttpGet get) {
        boolean headerOk = false;

        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode != HttpStatus.SC_OK) {
            if (statusCode != HttpStatus.SC_NOT_FOUND) {
                if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                        || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
                    Header header = response.getFirstHeader("Location");
                    if (header != null) {
                        String movedToUrl = header.getValue();
                        movedToUrl = URLCanonicalizer.getCanonicalURL(movedToUrl, toFetchURL);
                        fetchResult.setMovedToUrl(movedToUrl);
                    }
                    fetchResult.setStatusCode(statusCode);
                }
                logger.info("Failed: " + response.getStatusLine().toString() + ", while fetching " + toFetchURL);
            }
            fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
        } else {
            fetchResult.setFetchedUrl(toFetchURL);
            String uri = get.getURI().toString();
            if (!uri.equals(toFetchURL)) {
                if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
                    fetchResult.setFetchedUrl(uri);
                }
            }

            headerOk = true;
        }

        return headerOk;
    }

    // Checks content length of the the body of the response (and if there is one)
    private boolean checkBody(PageFetchResult fetchResult, HttpResponse response) {
        boolean bodyOk = false;
        if (fetchResult.getEntity() != null) {
            long size = fetchResult.getEntity().getContentLength();
            if (size == -1) {
                Header length = response.getLastHeader("Content-Length");
                if (length == null) {
                    length = response.getLastHeader("Content-length");
                }
                if (length != null) {
                    size = Integer.parseInt(length.getValue());
                } else {
                    size = -1;
                }
            }

            if (size > config.getMaxDownloadSize()) {
                fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
            } else {
                fetchResult.setStatusCode(HttpStatus.SC_OK);
                bodyOk = true;
            }
        }
        return bodyOk;
    }

    // Fetches header of a page given the URL
    public PageFetchResult fetchHeader(WebURL webUrl) {
        PageFetchResult fetchResult = new PageFetchResult();
        String toFetchURL = webUrl.getURL();
        HttpGet get = null;

        try {
            get = new HttpGet(toFetchURL);
            get.addHeader("Accept-Encoding", "gzip");

            waitForFetchStart();

            HttpResponse response = httpClient.execute(get);
            fetchResult.setEntity(response.getEntity());
            fetchResult.setResponseHeaders(response.getAllHeaders());

            if (checkHeader(fetchResult, response, toFetchURL, get)) {
                if (checkBody(fetchResult, response)) {
                    // All good, this page checks out with regards to header & body parameters
                } else {
                    get.abort();
                }
            } else {
                get.abort();
            }

            return fetchResult;
        } catch (IOException e) {
            logger.error("Fatal transport error: " + e.getMessage() + " while fetching " + toFetchURL
                    + " (link found in doc #" + webUrl.getParentDocid() + ")");
            fetchResult.setStatusCode(CustomFetchStatus.FatalTransportError);
            get.abort();
            return fetchResult;
        } catch (IllegalStateException e) {
            // Ignoring exceptions that occur because of not registering https and other schemes
        } catch (Exception e) {
            if (e.getMessage() == null) {
                logger.error("Error while fetching " + webUrl.getURL());
            } else {
                logger.error(e.getMessage() + " while fetching " + webUrl.getURL());
            }
        }

        fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
        return fetchResult;
    }

    // Shuts down the connection manager 
    public synchronized void shutDown() {
        if (connectionMonitorThread != null) {
            connectionManager.shutdown();
            connectionMonitorThread.shutdown();
        }
    }
}