org.eclipse.smila.connectivity.framework.crawler.web.http.Http.java Source code

Java tutorial

Introduction

Here is the source code for org.eclipse.smila.connectivity.framework.crawler.web.http.Http.java

Source

/***********************************************************************************************************************
 * Copyright (c) 2008 empolis GmbH and brox IT Solutions GmbH. All rights reserved. This program and the accompanying
 * materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this distribution,
 * and is available at http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors: Dmitry Hazin (brox IT Solutions GmbH) - initial creator 
 * Sebastian Voigt (brox IT Solutions GmbH) 
 * This File is based on the Http.java from Nutch 0.8.1 (see below the licene). The original File was modified by the 
 * Smila Team
 **********************************************************************************************************************/
/**
 * Copyright 2005 The Apache Software Foundation
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package org.eclipse.smila.connectivity.framework.crawler.web.http;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.httpclient.Credentials;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.UsernamePasswordCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.contrib.ssl.AuthSSLProtocolSocketFactory;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.connectivity.framework.crawler.web.auth.Rfc2617Authentication;
import org.eclipse.smila.connectivity.framework.crawler.web.auth.SslCertificateAuthentication;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.Configuration;
import org.eclipse.smila.connectivity.framework.crawler.web.configuration.HttpProperties;
import org.eclipse.smila.connectivity.framework.crawler.web.filter.FilterProcessor;

/**
 * Holds the HTTP protocol-specific options.
 */
public class Http extends HttpBase {

    /** The connection manager. */
    private static MultiThreadedHttpConnectionManager s_connectionManager = new MultiThreadedHttpConnectionManager();

    /** The Log. */
    private static final Log LOG = LogFactory.getLog(Http.class);

    /** The Constant COLON. */
    private static final String COLON = ":";

    /** The Constant SEMICOLON. */
    private static final String SEMICOLON = ";";

    /** Since the Configuration has not yet been set, then not configured client is returned. */
    private static HttpClient s_client = new HttpClient(s_connectionManager);

    /** The HTTPS protocol. */
    private Protocol _https;

    /**
     * Creates a new instance of HTTP.
     */
    public Http() {
        super();
    }

    /**
     * Gets the client.
     * 
     * @return the client
     */
    static synchronized HttpClient getClient() {
        return s_client;
    }

    /**
     * Loads HttpBase and Client configurations.
     * 
     * @param conf
     *          Configuration
     */
    @Override
    public void setConf(Configuration conf) {
        super.setConf(conf);
        configureClient();
    }

    /**
     * Returns HttpResponse for the given URL.
     * 
     * @param urlString
     *          String
     * 
     * @return HttpResponse
     * 
     * @throws IOException
     *           if there was a error retrieving URL.
     */
    @Override
    protected Response getResponse(String urlString) throws IOException {
        return getResponse(urlString, null);
    }

    /**
     * Returns HttpResponse for the given URL and filter processor.
     * 
     * @param filterProcessor
     *          filterProcessor implementation
     * @param urlString
     *          the url string
     * 
     * @return HttpResponse
     * 
     * @throws IOException
     *           if there was a error retrieving URL.
     */
    @Override
    protected Response getResponse(String urlString, FilterProcessor filterProcessor) throws IOException {
        return new HttpResponse(this, urlString, filterProcessor);
    }

    /**
     * Loads HTTP client configuration for this web site.
     */
    private void configureClient() {
        final HttpConnectionManagerParams params = s_connectionManager.getParams();
        if (_timeout != 0) {
            params.setConnectionTimeout(_timeout);
            params.setSoTimeout(_timeout);
        } else {
            params.setConnectionTimeout(_connectTimeout);
            params.setSoTimeout(_readTimeout);
        }
        params.setSendBufferSize(BUFFER_SIZE);
        params.setReceiveBufferSize(BUFFER_SIZE);
        final HostConfiguration hostConf = s_client.getHostConfiguration();
        final List<Header> headers = new ArrayList<Header>();
        // prefer English
        headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
        // prefer UTF-8
        headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
        // prefer understandable formats
        headers.add(new Header("Accept",
                "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8"));
        // accept GZIP content
        headers.add(new Header("Accept-Encoding", "x-gzip, gzip"));
        final String[] webSiteHeaders = getConf().get(HttpProperties.HEADERS).split(SEMICOLON);
        for (String header : webSiteHeaders) {
            final String[] headerInformation = header.split(COLON);
            if (headerInformation.length > 2) {
                headers.add(new Header(headerInformation[0].trim(), headerInformation[1].trim()));
            }
        }
        hostConf.getParams().setParameter("http.default-headers", headers);
        if (_useProxy) {
            hostConf.setProxy(_proxyHost, _proxyPort);
            if (_proxyLogin.length() > 0) {
                final Credentials proxyCreds = new UsernamePasswordCredentials(_proxyLogin, _proxyPassword);
                s_client.getState().setProxyCredentials(new AuthScope(AuthScope.ANY), proxyCreds);
            }
        }
        final List<Rfc2617Authentication> httpAuthentications = _authentication.getRfc2617Authentications();

        for (Rfc2617Authentication auth : httpAuthentications) {
            s_client.getState().setCredentials(
                    new AuthScope(auth.getHost(), Integer.valueOf(auth.getPort()), auth.getRealm()),
                    new UsernamePasswordCredentials(auth.getLogin(), auth.getPassword()));
        }

        final SslCertificateAuthentication sslAuth = _authentication.getSslCertificateAuthentication();
        if (sslAuth != null) {
            try {
                final URL truststoreURL = new File(sslAuth.getTruststoreUrl()).toURL();
                final URL keystoreURL = new File(sslAuth.getKeystoreUrl()).toURL();

                final ProtocolSocketFactory sslFactory = new AuthSSLProtocolSocketFactory(keystoreURL,
                        sslAuth.getKeystorePassword(), truststoreURL, sslAuth.getTruststorePassword());
                _https = new Protocol(sslAuth.getProtocolName(), sslFactory, Integer.valueOf(sslAuth.getPort()));
                Protocol.registerProtocol(sslAuth.getProtocolName(), _https);
            } catch (MalformedURLException exception) {
                LOG.error("unable to bind https protocol" + exception.toString());
            }
        }
    }

}