com.soulgalore.crawler.guice.HttpClientProvider.java Source code

Java tutorial

Introduction

Here is the source code for com.soulgalore.crawler.guice.HttpClientProvider.java

Source

/******************************************************
 * Web crawler
 * 
 * 
 * Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com)
 * 
 ****************************************************** 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 * 
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 * 
 ******************************************************* 
 */
package com.soulgalore.crawler.guice;

import java.util.Set;
import java.util.StringTokenizer;

import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.protocol.RequestAcceptEncoding;
import org.apache.http.client.protocol.ResponseContentEncoding;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecFactory;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BestMatchSpec;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;

import com.google.inject.Inject;
import com.google.inject.Provider;
import com.google.inject.name.Named;
import com.soulgalore.crawler.core.CrawlerConfiguration;
import com.soulgalore.crawler.util.Auth;
import com.soulgalore.crawler.util.AuthUtil;
import com.soulgalore.crawler.util.HTTPSFaker;
import com.soulgalore.crawler.util.HeaderUtil;

/**
 * Guice {@link Provider} that builds the {@link HttpClient} used by the crawler.
 *
 * <p>
 * Every call to {@link #get()} creates a new client on top of its own
 * {@link ThreadSafeClientConnManager}. The client is configured with the injected timeouts, the
 * content-encoding interceptors, a cookie policy that logs rejected cookies together with the URL
 * they were rejected for, and, when configured, a proxy and a set of credentials.
 */
public class HttpClientProvider implements Provider<HttpClient> {

    /**
     * Name under which the logging cookie spec is registered and selected as the cookie policy.
     */
    private static final String COOKIE_SPEC_NAME = "bestmatchwithurl";

    /**
     * Separator between the protocol, host and port parts of the proxy configuration string.
     */
    private static final String PROXY_SEPARATOR = ":";

    /**
     * Number of parts a valid proxy configuration consists of: protocol, host and port.
     */
    private static final int PROXY_PARTS = 3;

    /**
     * The maximum total number of connections the connection manager may keep open at the same
     * time.
     */
    private final int nrOfThreads;

    /**
     * The maximum number of connections that can be open to the same route. It is set to the same
     * value as {@link #nrOfThreads} so that all connections can be used even when only one route
     * is crawled.
     */
    private final int maxToRoute;

    /**
     * The time in ms before a socket timeout.
     */
    private final int socketTimeout;

    /**
     * The time in ms before a connection timeout.
     */
    private final int connectionTimeout;

    /**
     * The credentials (scope, port, user name and password) to register on every created client.
     */
    private final Set<Auth> auths;

    /**
     * The proxy configuration in the form {@code protocol:host:port}. An empty string (or
     * {@code null}) means that no proxy is used.
     */
    private final String proxy;

    /**
     * Create a provider.
     * 
     * @param maxNrOfThreads the max number of connections in the client, also used as the max
     *        number of connections per route
     * @param theSocketTimeout the socket timeout time in ms
     * @param theConnectionTimeout the connection timeout time in ms
     * @param authAsString the auth string, parsed by {@link AuthUtil#createAuthsFromString}
     * @param theProxy the proxy in the form {@code protocol:host:port}, or an empty string for no
     *        proxy
     */
    @Inject
    public HttpClientProvider(@Named(CrawlerConfiguration.MAX_THREADS_PROPERTY_NAME) int maxNrOfThreads,
            @Named(CrawlerConfiguration.SOCKET_TIMEOUT_PROPERTY_NAME) int theSocketTimeout,
            @Named(CrawlerConfiguration.CONNECTION_TIMEOUT_PROPERTY_NAME) int theConnectionTimeout,
            @Named(CrawlerConfiguration.AUTH_PROPERTY_NAME) String authAsString,
            @Named(CrawlerConfiguration.PROXY_PROPERTY_NAME) String theProxy) {
        nrOfThreads = maxNrOfThreads;
        maxToRoute = maxNrOfThreads;
        connectionTimeout = theConnectionTimeout;
        socketTimeout = theSocketTimeout;
        auths = AuthUtil.getInstance().createAuthsFromString(authAsString);
        proxy = theProxy;
    }

    /**
     * Get the client.
     * 
     * <p>
     * A new connection manager and a new client are created on every call. An invalid proxy
     * configuration is reported on {@code System.err} and the client is returned without a proxy.
     * 
     * @return the fully configured client
     */
    public HttpClient get() {
        final ThreadSafeClientConnManager cm = new ThreadSafeClientConnManager();
        cm.setMaxTotal(nrOfThreads);
        cm.setDefaultMaxPerRoute(maxToRoute);
        final DefaultHttpClient client = HTTPSFaker.getClientThatAllowAnyHTTPS(cm);

        client.getParams().setParameter("http.socket.timeout", socketTimeout);
        client.getParams().setParameter("http.connection.timeout", connectionTimeout);

        client.addRequestInterceptor(new RequestAcceptEncoding());
        client.addResponseInterceptor(new ResponseContentEncoding());

        registerLoggingCookieSpec(client);
        configureProxy(client);
        configureCredentials(client);

        return client;
    }

    /**
     * Register the cookie spec that logs rejected cookies and make it the client's cookie policy.
     * 
     * @param client the client to configure
     */
    private static void registerLoggingCookieSpec(DefaultHttpClient client) {
        final CookieSpecFactory factory = new CookieSpecFactory() {
            public CookieSpec newInstance(HttpParams params) {
                return new BestMatchSpecWithURLErrorLog();
            }
        };

        client.getCookieSpecs().register(COOKIE_SPEC_NAME, factory);
        client.getParams().setParameter(ClientPNames.COOKIE_POLICY, COOKIE_SPEC_NAME);
    }

    /**
     * Set the default proxy of the client from the {@code protocol:host:port} configuration.
     * Nothing is done when no proxy is configured. A configuration that does not consist of
     * exactly three parts, or whose port is not a number, is reported on {@code System.err} and
     * otherwise ignored.
     * 
     * @param client the client to configure
     */
    private void configureProxy(DefaultHttpClient client) {
        // A null proxy is treated like the empty string; the tokenizer would throw on null.
        if (proxy == null || "".equals(proxy)) {
            return;
        }

        final StringTokenizer tokens = new StringTokenizer(proxy, PROXY_SEPARATOR);
        if (tokens.countTokens() != PROXY_PARTS) {
            System.err.println("Invalid proxy configuration: " + proxy);
            return;
        }

        final String proxyProtocol = tokens.nextToken();
        final String proxyHost = tokens.nextToken();
        final String proxyPortAsString = tokens.nextToken();

        final int proxyPort;
        try {
            proxyPort = Integer.parseInt(proxyPortAsString);
        } catch (NumberFormatException e) {
            // Handle a bad port the same way as any other malformed proxy string instead of
            // letting the exception escape from the provider.
            System.err.println("Invalid proxy configuration: " + proxy);
            return;
        }

        client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,
                new HttpHost(proxyHost, proxyPort, proxyProtocol));
    }

    /**
     * Register the user name and password of every configured {@link Auth} on the client, scoped
     * to the host and port of that auth.
     * 
     * @param client the client to configure
     */
    private void configureCredentials(DefaultHttpClient client) {
        for (Auth authObject : auths) {
            client.getCredentialsProvider().setCredentials(
                    new AuthScope(authObject.getScope(), authObject.getPort()),
                    new UsernamePasswordCredentials(authObject.getUserName(), authObject.getPassword()));
        }
    }

    /**
     * A {@link BestMatchSpec} that writes the URL, the error and the cookie to {@code System.err}
     * when a cookie fails validation, and then rethrows the original exception.
     * 
     * <p>
     * The class is static because it does not use any state of the enclosing provider.
     */
    private static class BestMatchSpecWithURLErrorLog extends BestMatchSpec {
        @Override
        public void validate(Cookie cookie, CookieOrigin origin) throws MalformedCookieException {
            try {
                super.validate(cookie, origin);
            } catch (MalformedCookieException e) {
                // The port is only printed when it differs from 80.
                System.err.println("Cookie rejected for url: " + origin.getHost()
                        + (origin.getPort() != 80 ? ":" + origin.getPort() : "") + origin.getPath() + " the error:"
                        + e.getMessage() + " for cookie:" + cookie.toString());
                throw e;
            }
        }
    }
}