org.aliuge.crawler.fetcher.DefaultFetcher.java Source code

Java tutorial

Introduction

Here is the source code for org.aliuge.crawler.fetcher.DefaultFetcher.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.aliuge.crawler.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.zip.GZIPInputStream;

import org.aliuge.crawler.jobconf.FetchConfig;
import org.aliuge.crawler.page.PageFetchResult;
import org.aliuge.crawler.url.WebURL;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecProvider;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BestMatchSpecFactory;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.impl.cookie.BrowserCompatSpecFactory;
import org.apache.http.protocol.HttpContext;

public class DefaultFetcher extends Fetcher {

    protected static final Log log = LogFactory.getLog(DefaultFetcher.class);

    protected static PoolingHttpClientConnectionManager connectionManager;

    protected CloseableHttpClient httpClient;

    protected RequestConfig defaultRequestConfig;

    protected final Object mutex = new Object();

    protected long lastFetchTime = 0;

    protected static IdleConnectionMonitorThread connectionMonitorThread = null;

    public DefaultFetcher() {
        createFetcher(null);
    }

    public DefaultFetcher(FetchConfig config) {

        super(config);
        createFetcher(config);
    }

    public DefaultFetcher createFetcher(FetchConfig config) {
        // 
        connectionManager = new PoolingHttpClientConnectionManager();

        BasicCookieStore cookieStore = new BasicCookieStore();
        CookieSpecProvider easySpecProvider = new CookieSpecProvider() {
            public CookieSpec create(HttpContext context) {

                return new BrowserCompatSpec() {
                    @Override
                    public void validate(Cookie cookie, CookieOrigin origin) throws MalformedCookieException {
                        // Oh, I am easy
                    }
                };
            }

        };
        Registry<CookieSpecProvider> r = RegistryBuilder.<CookieSpecProvider>create()
                .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
                .register(CookieSpecs.BROWSER_COMPATIBILITY, new BrowserCompatSpecFactory())
                .register("easy", easySpecProvider).build();

        // Create global request configuration
        defaultRequestConfig = RequestConfig.custom().setCookieSpec("easy").setSocketTimeout(10000)
                .setConnectTimeout(10000).build();

        connectionManager.setMaxTotal(config.getMaxTotalConnections());
        connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());

        // Create an HttpClient with the given custom dependencies and
        // configuration.
        httpClient = HttpClients.custom().setConnectionManager(connectionManager).setDefaultCookieStore(cookieStore)
                .setDefaultCookieSpecRegistry(r)
                /* .setProxy(new HttpHost("myproxy", 8080)) */
                .setDefaultRequestConfig(defaultRequestConfig).build();

        if (connectionMonitorThread == null) {
            connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
        }
        /*
         * connectionMonitorThread.start(); try {
         * connectionMonitorThread.join(); } catch (InterruptedException e) { //
         * TODO Auto-generated catch block e.printStackTrace(); }
         */
        return this;
    }

    public PageFetchResult fetch(WebURL webUrl) {
        return fetch(webUrl, false);
    }

    public PageFetchResult fetch(WebURL webUrl, boolean proxy) {
        PageFetchResult fetchResult = new PageFetchResult();

        String toFetchURL = webUrl.getUrl();
        HttpGet get = new HttpGet(toFetchURL);
        get.addHeader("Accept-Encoding", "gzip");
        get.addHeader("User-Agent", config.getAgent());

        RequestConfig requestConfig = null;
        CloseableHttpResponse response = null;

        synchronized (mutex) {
            long now = (new Date()).getTime();
            if (now - lastFetchTime < ((FetchConfig) config).getDelayBetweenRequests()) {
                try {
                    Thread.sleep(((FetchConfig) config).getDelayBetweenRequests() - (now - lastFetchTime));
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
            lastFetchTime = (new Date()).getTime();
        }
        int statusCode = 0;
        int count = 5;

        while (statusCode != HttpStatus.SC_OK && count-- > 0) {
            HttpHost proxyHost = null;
            if (proxy) {
                proxyHost = getProxyIp();

                if (proxyHost != null)
                    requestConfig = RequestConfig.copy(defaultRequestConfig).setSocketTimeout(10000)
                            .setConnectTimeout(10000).setProxy(proxyHost).build();
            }

            get.setConfig(requestConfig);

            try {
                response = httpClient.execute(get);
                statusCode = response.getStatusLine().getStatusCode();
                fetchResult.setEntity(response.getEntity());
                fetchResult.setResponseHeaders(response.getAllHeaders());
            } catch (IOException e) {
                // e.printStackTrace();
                // log.info("Fatal transport error: " + e.getMessage()+
                // " while fetching " + toFetchURL + " (link found in doc #"+
                // webUrl.getParentDocid() + ")");
                addFailedProxy(proxyHost.toHostString());
                /*
                 * if (null != get) get.abort();
                 * fetchResult.setStatusCode(CustomFetchStatus
                 * .FatalTransportError);
                 */
                // return fetchResult;
            }
        }

        fetchResult.setStatusCode(statusCode);

        fetchResult.setFetchedUrl(toFetchURL);

        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {

            long size = fetchResult.getEntity().getContentLength();
            if (size == -1) {
                Header length = response.getLastHeader("Content-Length");
                if (length == null) {
                    length = response.getLastHeader("Content-length");
                }
                if (length != null) {
                    size = Integer.parseInt(length.getValue());
                } else {
                    size = -1;
                }
            }
            if (size > ((FetchConfig) config).getMaxDownloadSizePerPage()) {
                fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
                get.abort();
                return fetchResult;
            }
            // fetchResult.setStatusCode(HttpStatus.SC_OK);
            return fetchResult;
        }
        get.abort();

        fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
        return fetchResult;
    }

    public synchronized void shutDown() {
        if (connectionMonitorThread != null) {
            connectionManager.shutdown();
            connectionMonitorThread.shutdown();
        }
    }

    public HttpClient getHttpClient() {
        return httpClient;
    }

    @SuppressWarnings("unused")
    private static class GzipDecompressingEntity extends HttpEntityWrapper {

        public GzipDecompressingEntity(final HttpEntity entity) {
            super(entity);
        }

        @Override
        public InputStream getContent() throws IOException, IllegalStateException {

            // the wrapped entity's getContent() decides about repeatability
            InputStream wrappedin = wrappedEntity.getContent();

            return new GZIPInputStream(wrappedin);
        }

        @Override
        public long getContentLength() {
            // length of ungzipped content is not known
            return -1;
        }

    }
}