org.archive.modules.fetcher.HeritrixProtocolSocketFactory.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.modules.fetcher.HeritrixProtocolSocketFactory.java

Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.fetcher;

import java.io.IOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;

import org.apache.commons.httpclient.ConnectTimeoutException;
import org.apache.commons.httpclient.params.HttpConnectionParams;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;

/**
 * Version of protocol socket factory that tries to get IP from heritrix IP
 * cache -- if its been set into the HttpConnectionParameters.
 * 
 * Copied the guts of DefaultProtocolSocketFactory.  This factory gets
 * setup by {@link FetchHTTP}.
 * 
 * @author stack
 * @version $Date$, $Revision$
 */
public class HeritrixProtocolSocketFactory implements ProtocolSocketFactory {
    /**
     * Constructor.
     */
    public HeritrixProtocolSocketFactory() {
        super();
    }

    /**
     * @see #createSocket(java.lang.String,int,java.net.InetAddress,int)
     */
    public Socket createSocket(String host, int port, InetAddress localAddress, int localPort)
            throws IOException, UnknownHostException {
        return new Socket(host, port, localAddress, localPort);
    }

    /**
     * Attempts to get a new socket connection to the given host within the
     * given time limit.
     * <p>
     * This method employs several techniques to circumvent the limitations
     * of older JREs that do not support connect timeout. When running in
     * JRE 1.4 or above reflection is used to call
     * Socket#connect(SocketAddress endpoint, int timeout) method. When
     * executing in older JREs a controller thread is executed. The
     * controller thread attempts to create a new socket within the given
     * limit of time. If socket constructor does not return until the
     * timeout expires, the controller terminates and throws an
     * {@link ConnectTimeoutException}
     * </p>
     *
     * @param host the host name/IP
     * @param port the port on the host
     * @param localAddress the local host name/IP to bind the socket to
     * @param localPort the port on the local machine
     * @param params {@link HttpConnectionParams Http connection parameters}
     *
     * @return Socket a new socket
     *
     * @throws IOException if an I/O error occurs while creating the socket
     * @throws UnknownHostException if the IP address of the host cannot be
     * @throws IOException if an I/O error occurs while creating the socket
     * @throws UnknownHostException if the IP address of the host cannot be
     * determined
     * @throws ConnectTimeoutException if socket cannot be connected within the
     *  given time limit
     *
     * @since 3.0
     */
    public Socket createSocket(final String host, final int port, final InetAddress localAddress,
            final int localPort, final HttpConnectionParams params)
            throws IOException, UnknownHostException, ConnectTimeoutException {
        // Below code is from the DefaultSSLProtocolSocketFactory#createSocket
        // method only it has workarounds to deal with pre-1.4 JVMs.  I've
        // cut these out.
        if (params == null) {
            throw new IllegalArgumentException("Parameters may not be null");
        }
        Socket socket = null;
        int timeout = params.getConnectionTimeout();
        if (timeout == 0) {
            socket = createSocket(host, port, localAddress, localPort);
        } else {
            socket = new Socket();

            InetAddress hostAddress;
            Thread current = Thread.currentThread();
            if (current instanceof HostResolver) {
                HostResolver resolver = (HostResolver) current;
                hostAddress = resolver.resolve(host);
            } else {
                hostAddress = null;
            }
            InetSocketAddress address = (hostAddress != null) ? new InetSocketAddress(hostAddress, port)
                    : new InetSocketAddress(host, port);
            socket.bind(new InetSocketAddress(localAddress, localPort));
            try {
                socket.connect(address, timeout);
            } catch (SocketTimeoutException e) {
                // Add timeout info. to the exception.
                throw new SocketTimeoutException(
                        e.getMessage() + ": timeout set at " + Integer.toString(timeout) + "ms.");
            }
            assert socket.isConnected() : "Socket not connected " + host;
        }
        return socket;
    }

    /**
     * Get host address using first the heritrix cache of addresses, then,
     * failing that, go to the dnsjava cache.
     * 
     * Default access and static so can be used by other classes in this
     * package.
     *
     * @param host Host whose address we're to fetch.
     * @return an IP address for this host or null if one can't be found
     * in caches.
     * @exception IOException If we fail to get host IP from ServerCache.
     */
    /*
    static InetAddress getHostAddress(final ServerCache cache,
        final String host) throws IOException {
    InetAddress result = null;
    if (cache != null) {
       CrawlHost ch = cache.getHostFor(host);
        if (ch != null) {
            result = ch.getIP();
        }
    }
    if (result ==  null) {
        throw new IOException("Failed to get host " + host +
            " address from ServerCache");
    }
    return result;
    }
    */

    /**
     * @see ProtocolSocketFactory#createSocket(java.lang.String,int)
     */
    public Socket createSocket(String host, int port) throws IOException, UnknownHostException {
        return new Socket(host, port);
    }

    /**
     * All instances of DefaultProtocolSocketFactory are the same.
     * @param obj Object to compare.
     * @return True if equal
     */
    public boolean equals(Object obj) {
        return ((obj != null) && obj.getClass().equals(HeritrixProtocolSocketFactory.class));
    }

    /**
     * All instances of DefaultProtocolSocketFactory have the same hash code.
     * @return Hash code for this object.
     */
    public int hashCode() {
        return HeritrixProtocolSocketFactory.class.hashCode();
    }
}