Source code

Java tutorial


Here is the source code for com.soulgalore.crawler.guice.HttpClientProvider.java


/*
 * Web crawler
 * Copyright (C) 2012 by Peter Hedenskog
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package com.soulgalore.crawler.guice;

import java.util.Set;
import java.util.StringTokenizer;

import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.protocol.RequestAcceptEncoding;
import org.apache.http.client.protocol.ResponseContentEncoding;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecFactory;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BestMatchSpec;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;

import com.soulgalore.crawler.core.CrawlerConfiguration;
import com.soulgalore.crawler.util.Auth;
import com.soulgalore.crawler.util.AuthUtil;
import com.soulgalore.crawler.util.HTTPSFaker;
import com.soulgalore.crawler.util.HeaderUtil;

 * Provide a HTTPClient.
public class HttpClientProvider implements Provider<HttpClient> {

     * The number of threads used in the HTTP Client Manager, meaning we can have this number of HTTP
     * connections open at the same time.
    private final int nrOfThreads;

     * The number of connections that can be open to the same route. Setting this to the same number
     * as the number of HTTP threads, will ensure that we will use all the thread, even if we only are
     * using one route.
    private final int maxToRoute;

     * The number in ms before a socket timeout.
    private final int socketTimeout;

     * The number in ms before a connection timeout.
    private final int connectionTimeout;

    private final Set<Auth> auths;

    private final String proxy;

     * Create a provider.
     * @param maxNrOfThreads the max number of threads in the client
     * @param theSocketTimeout the socket timeout time
     * @param theConnectionTimeout the connection timeout time
     * @param authAsString the auth string
     * @param theProxy the proxy
    public HttpClientProvider(@Named(CrawlerConfiguration.MAX_THREADS_PROPERTY_NAME) int maxNrOfThreads,
            @Named(CrawlerConfiguration.SOCKET_TIMEOUT_PROPERTY_NAME) int theSocketTimeout,
            @Named(CrawlerConfiguration.CONNECTION_TIMEOUT_PROPERTY_NAME) int theConnectionTimeout,
            @Named(CrawlerConfiguration.AUTH_PROPERTY_NAME) String authAsString,
            @Named(CrawlerConfiguration.PROXY_PROPERTY_NAME) String theProxy) {
        nrOfThreads = maxNrOfThreads;
        maxToRoute = maxNrOfThreads;
        connectionTimeout = theConnectionTimeout;
        socketTimeout = theSocketTimeout;
        auths = AuthUtil.getInstance().createAuthsFromString(authAsString);
        proxy = theProxy;

     * Get the client.
     * @return the client
    public HttpClient get() {
        final ThreadSafeClientConnManager cm = new ThreadSafeClientConnManager();
        final DefaultHttpClient client = HTTPSFaker.getClientThatAllowAnyHTTPS(cm);

        client.getParams().setParameter("http.socket.timeout", socketTimeout);
        client.getParams().setParameter("http.connection.timeout", connectionTimeout);

        client.addRequestInterceptor(new RequestAcceptEncoding());
        client.addResponseInterceptor(new ResponseContentEncoding());

        CookieSpecFactory csf = new CookieSpecFactory() {
            public CookieSpec newInstance(HttpParams params) {
                return new BestMatchSpecWithURLErrorLog();

        client.getCookieSpecs().register("bestmatchwithurl", csf);
        client.getParams().setParameter(ClientPNames.COOKIE_POLICY, "bestmatchwithurl");

        if (!"".equals(proxy)) {
            StringTokenizer token = new StringTokenizer(proxy, ":");

            if (token.countTokens() == 3) {
                String proxyProtocol = token.nextToken();
                String proxyHost = token.nextToken();
                int proxyPort = Integer.parseInt(token.nextToken());

                HttpHost proxy = new HttpHost(proxyHost, proxyPort, proxyProtocol);
                client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
            } else
                System.err.println("Invalid proxy configuration: " + proxy);

        if (auths.size() > 0) {

            for (Auth authObject : auths) {
                        new AuthScope(authObject.getScope(), authObject.getPort()),
                        new UsernamePasswordCredentials(authObject.getUserName(), authObject.getPassword()));

        return client;

    private class BestMatchSpecWithURLErrorLog extends BestMatchSpec {
        public void validate(Cookie cookie, CookieOrigin origin) throws MalformedCookieException {
            try {
                super.validate(cookie, origin);
            } catch (MalformedCookieException e) {
                System.err.println("Cookie rejected for url: " + origin.getHost()
                        + (origin.getPort() != 80 ? ":" + origin.getPort() : "") + origin.getPath() + " the error:"
                        + e.getMessage() + " for cookie:" + cookie.toString());
                throw e;