org.archive.crawler.fetcher.OptimizeFetchHTTP.java Source code

Introduction

Here is the source code for org.archive.crawler.fetcher.OptimizeFetchHTTP.java
Source

/* FetchHTTP.java
 *
 * $Id: FetchHTTP.java 6803 2010-04-02 01:03:46Z gojomo $
 *
 * Created on Jun 5, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.crawler.fetcher;

import it.unimi.dsi.mg4j.util.MutableString;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpMethodBase;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.HttpVersion;
import org.apache.commons.httpclient.auth.AuthChallengeParser;
import org.apache.commons.httpclient.auth.AuthScheme;
import org.apache.commons.httpclient.auth.BasicScheme;
import org.apache.commons.httpclient.auth.DigestScheme;
import org.apache.commons.httpclient.auth.MalformedChallengeException;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.log4j.Logger;
import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlServer;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.ServerCache;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.datamodel.credential.CredentialAvatar;
import org.archive.crawler.datamodel.credential.Rfc2617Credential;
import org.archive.crawler.deciderules.DecideRule;
import org.archive.crawler.deciderules.DecideRuleSequence;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.settings.SettingsHandler;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.crawler.settings.Type;
import org.archive.httpclient.ConfigurableX509TrustManager;
import org.archive.httpclient.HttpRecorderGetMethod;
import org.archive.httpclient.HttpRecorderMethod;
import org.archive.httpclient.HttpRecorderPostMethod;
import org.archive.httpclient.SingleHttpConnectionManager;
import org.archive.io.ObjectPlusFilesInputStream;
import org.archive.io.RecorderLengthExceededException;
import org.archive.io.RecorderTimeoutException;
import org.archive.io.RecorderTooMuchHeaderException;
import org.archive.util.ArchiveUtils;
import org.archive.util.HttpRecorder;
import org.archive.util.bdbje.EnhancedEnvironment;

import st.ata.util.AList;

import com.netease.backend.collector.rss.common.util.IPSwitcher;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;

/**
 * HTTP fetcher that uses <a
 * href="http://jakarta.apache.org/commons/httpclient/">Apache Jakarta Commons
 * HttpClient</a> library.
 *
 * @author Gordon Mohr
 * @author Igor Ranitovic
 * @author others
 * @version $Id: FetchHTTP.java 6803 2010-04-02 01:03:46Z gojomo $
 */
public class OptimizeFetchHTTP extends Processor
        implements CoreAttributeConstants, FetchStatusCodes, CrawlStatusListener {
    // be robust against trivial implementation changes
    private static final long serialVersionUID = ArchiveUtils.classnameBasedUID(OptimizeFetchHTTP.class, 1);

    private static Logger logger = Logger.getLogger(OptimizeFetchHTTP.class.getName());

    public static final String ATTR_HTTP_PROXY_HOST = A_HTTP_PROXY_HOST;
    public static final String ATTR_HTTP_PROXY_PORT = A_HTTP_PROXY_PORT;
    public static final String ATTR_TIMEOUT_SECONDS = "timeout-seconds";
    public static final String ATTR_SOTIMEOUT_MS = "sotimeout-ms";
    public static final String ATTR_MAX_LENGTH_BYTES = "max-length-bytes";
    public static final String ATTR_LOAD_COOKIES = "load-cookies-from-file";
    public static final String ATTR_SAVE_COOKIES = "save-cookies-to-file";
    public static final String ATTR_ACCEPT_HEADERS = "accept-headers";
    public static final String ATTR_DEFAULT_ENCODING = "default-encoding";
    public static final String ATTR_DIGEST_CONTENT = "digest-content";
    public static final String ATTR_DIGEST_ALGORITHM = "digest-algorithm";
    public static final String ATTR_FETCH_BANDWIDTH_MAX = "fetch-bandwidth";

    public static final String DESC_DIGEST_CONTENT = "Whether or not to"
            + " perform an on-the-fly digest hash of retrieved content-bodies.";
    public static final String DESC_DIGEST_ALGORITHM = "Which algorithm (for"
            + " example MD5 or SHA-1) to use to perform an on-the-fly digest hash"
            + " of retrieved content-bodies.";

    /**
     * SSL trust level setting attribute name.
     */
    public static final String ATTR_TRUST = "trust-level";

    private static Integer DEFAULT_TIMEOUT_SECONDS = new Integer(1200);
    private static Integer DEFAULT_SOTIMEOUT_MS = new Integer(20000);
    private static Long DEFAULT_MAX_LENGTH_BYTES = new Long(0);
    private static Integer DEFAULT_FETCH_BANDWIDTH_MAX = 0;

    /**
     * This is the default value pre-1.4. Needs special handling else
     * treated as negative number doing math later in processing.
     */
    private static long OLD_DEFAULT_MAX_LENGTH_BYTES = 9223372036854775807L;

    /**
     * Default character encoding to use for pages that do not specify.
     */
    private static String DEFAULT_CONTENT_CHARSET = Heritrix.DEFAULT_ENCODING;

    /**
     * Default whether to perform on-the-fly digest hashing of content-bodies.
     */
    static Boolean DEFAULT_DIGEST_CONTENT = new Boolean(true);

    /**
     * The different digest algorithms to choose between, 
     * SHA-1 or MD-5 at the moment. 
     */
    public static final String SHA1 = "sha1";
    public static final String MD5 = "md5";
    public static String[] DIGEST_ALGORITHMS = { SHA1, MD5 };

    /**
     * Default algorithm to use for message disgesting.
     */
    public static final String DEFAULT_DIGEST_ALGORITHM = SHA1;

    private transient ThreadLocal<HttpClient> httpInstance = new ThreadLocal<HttpClient>();

    /**
     * How many 'instant retries' of HttpRecoverableExceptions have occurred
     * 
     * Would like it to be 'long', but longs aren't atomic
     */
    private int recoveryRetries = 0;

    /**
     * Count of crawl uris handled.
     * Would like to be 'long', but longs aren't atomic
     */
    private int curisHandled = 0;

    /**
     * Rules to apply mid-fetch, just after receipt of the response
     * headers before we start to download body.
     */
    public static final String ATTR_MIDFETCH_DECIDE_RULES = "midfetch-decide-rules";

    /**
     * What to log if midfetch abort.
     */
    private static final String MIDFETCH_ABORT_LOG = "midFetchAbort";

    public static final String ATTR_SEND_CONNECTION_CLOSE = "send-connection-close";
    private static final Header HEADER_SEND_CONNECTION_CLOSE = new Header("Connection", "close");
    public static final String ATTR_SEND_REFERER = "send-referer";
    public static final String ATTR_SEND_RANGE = "send-range";
    public static final String ATTR_SEND_IF_MODIFIED_SINCE = "send-if-modified-since";
    public static final String ATTR_SEND_IF_NONE_MATCH = "send-if-none-match";
    public static final String REFERER = "Referer";
    public static final String RANGE = "Range";
    public static final String RANGE_PREFIX = "bytes=0-";
    public static final String HTTP_SCHEME = "http";
    public static final String HTTPS_SCHEME = "https";

    public static final String ATTR_IGNORE_COOKIES = "ignore-cookies";
    private static Boolean DEFAULT_IGNORE_COOKIES = new Boolean(false);

    public static final String ATTR_BDB_COOKIES = "use-bdb-for-cookies";
    private static Boolean DEFAULT_BDB_COOKIES = new Boolean(true);

    public static final String ATTR_HTTP_BIND_ADDRESS = A_HTTP_BIND_ADDRESS;

    /**
     * Database backing cookie map, if using BDB
     */
    protected Database cookieDb;
    /**
     * Name of cookie BDB Database
     */
    public static final String COOKIEDB_NAME = "http_cookies";

    static {
        Protocol.registerProtocol("http", new Protocol("http", new HeritrixProtocolSocketFactory(), 80));
        try {
            Protocol.registerProtocol("https",
                    new Protocol("https", ((ProtocolSocketFactory) new HeritrixSSLProtocolSocketFactory()), 443));
        } catch (KeyManagementException e) {
            e.printStackTrace();
        } catch (KeyStoreException e) {
            e.printStackTrace();
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
        }
    }
    static final String SERVER_CACHE_KEY = "heritrix.server.cache";
    static final String SSL_FACTORY_KEY = "heritrix.ssl.factory";

    /***
     * Socket factory that has the configurable trust manager installed.
     */
    private SSLSocketFactory sslfactory = null;

    /**
     * Constructor.
     *
     * @param name Name of this processor.
     */
    public OptimizeFetchHTTP(String name) {
        super(name, "Optimize HTTP Fetcher");

        addElementToDefinition(new DecideRuleSequence(ATTR_MIDFETCH_DECIDE_RULES,
                "DecideRules which, if final decision is REJECT, "
                        + "abort fetch after headers before all content is" + "read."));

        addElementToDefinition(new SimpleType(ATTR_TIMEOUT_SECONDS,
                "If the fetch is not completed in this number of seconds, "
                        + "even if it is making progress, give up. The URI will be "
                        + "annotated as timeTrunc. Set to zero for no timeout. "
                        + "(This is not recommended: threads could wait indefinitely " + "for the fetch to end.)",
                DEFAULT_TIMEOUT_SECONDS));
        Type e = addElementToDefinition(new SimpleType(ATTR_SOTIMEOUT_MS,
                "If a socket is unresponsive for this number of milliseconds, "
                        + "give up on that connects/read. (This does not necessarily give "
                        + "up on the fetch immediately; connects are subject to retries "
                        + "and reads will be retried until " + ATTR_TIMEOUT_SECONDS
                        + " have elapsed. Set to zero for no socket timeout. (This is "
                        + "note recommended: a socket operation could hand indefinitely.",
                DEFAULT_SOTIMEOUT_MS));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(
                ATTR_FETCH_BANDWIDTH_MAX, "The maximum KB/sec to use when fetching data from a server. "
                        + "0 means no maximum.  Default: " + DEFAULT_FETCH_BANDWIDTH_MAX + ".",
                DEFAULT_FETCH_BANDWIDTH_MAX));
        e.setExpertSetting(true);
        e.setOverrideable(true);
        addElementToDefinition(new SimpleType(ATTR_MAX_LENGTH_BYTES,
                "Maximum length in bytes to fetch.\n"
                        + "Fetch is truncated at this length. A value of 0 means no limit.",
                DEFAULT_MAX_LENGTH_BYTES));
        e = addElementToDefinition(
                new SimpleType(ATTR_IGNORE_COOKIES, "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(
                new SimpleType(ATTR_BDB_COOKIES, "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES, "File to preload cookies from", ""));
        e.setExpertSetting(true);
        e = addElementToDefinition(
                new SimpleType(ATTR_SAVE_COOKIES, "When crawl finishes save cookies to this file", ""));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_TRUST,
                "SSL certificate trust level.  Range is from the default 'open'"
                        + " (trust all certs including expired, selfsigned, and those for"
                        + " which we do not have a CA) through 'loose' (trust all valid"
                        + " certificates including selfsigned), 'normal' (all valid"
                        + " certificates not including selfsigned) to 'strict' (Cert is"
                        + " valid and DN must match servername)",
                ConfigurableX509TrustManager.DEFAULT, ConfigurableX509TrustManager.LEVELS_AS_ARRAY));
        e.setOverrideable(false);
        e.setExpertSetting(true);
        e = addElementToDefinition(
                new StringList(ATTR_ACCEPT_HEADERS, "Accept Headers to include in each request. Each must be the"
                        + " complete header, e.g., 'Accept-Language: en'"));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST, "Proxy host IP (set only if needed).", ""));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT, "Proxy port (set only if needed)", ""));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING,
                "The character encoding to use for files that do not have one"
                        + " specified in the HTTP response headers.  Default: " + DEFAULT_CONTENT_CHARSET + ".",
                DEFAULT_CONTENT_CHARSET));
        e.setExpertSetting(true);
        e = addElementToDefinition(
                new SimpleType(ATTR_DIGEST_CONTENT, DESC_DIGEST_CONTENT, DEFAULT_DIGEST_CONTENT));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_DIGEST_ALGORITHM, DESC_DIGEST_ALGORITHM,
                DEFAULT_DIGEST_ALGORITHM, DIGEST_ALGORITHMS));
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_MODIFIED_SINCE,
                "Send 'If-Modified-Since' header, if previous 'Last-Modified' "
                        + "fetch history information is available in URI history.",
                new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(
                new SimpleType(ATTR_SEND_IF_NONE_MATCH, "Send 'If-None-Match' header, if previous 'Etag' fetch "
                        + "history information is available in URI history.", new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE,
                "Send 'Connection: close' header with every request.", new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER,
                "Send 'Referer' header with every request.\n"
                        + "The 'Referer' header contans the location the crawler came " + " from, "
                        + "the page the current URI was discovered in. The 'Referer' " + "usually is "
                        + "logged on the remote server and can be of assistance to "
                        + "webmasters trying to figure how a crawler got to a " + "particular area on a site.",
                new Boolean(true)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE,
                "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES + ") on document size.\n"
                        + "Be polite to the HTTP servers and send the 'Range' header,"
                        + "stating that you are only interested in the first n bytes. " + "Only pertinent if "
                        + ATTR_MAX_LENGTH_BYTES + " > 0. " + "Sending the 'Range' header results in a "
                        + "'206 Partial Content' status response, which is better than "
                        + "just cutting the response mid-download. On rare occasion, " + " sending 'Range' will "
                        + "generate '416 Request Range Not Satisfiable' response.",
                new Boolean(false)));
        e.setOverrideable(true);
        e.setExpertSetting(true);
        e = addElementToDefinition(
                new SimpleType(ATTR_HTTP_BIND_ADDRESS,
                        "Local IP address or hostname to use when making connections "
                                + "(binding sockets). When not specified, uses default local" + "address(es).",
                        ""));
        e.setExpertSetting(true);
    }

    /*protected boolean shouldFetchBody(final CrawlURI curi, HttpMethodBase method) {
       boolean ret = true;
       long modifyTime = 0;
           
       Header header = method.getResponseHeader(A_LAST_MODIFIED_HEADER);
      if (header != null) {
     modifyTime = Date.parse(header.getValue());
      }
          
      ret = CrawlerService.getInstance().shouldFetchHttpBody(curi, modifyTime);
           
       return ret;
    }*/

    protected void innerProcess(final CrawlURI curi) throws InterruptedException {
        if (!canFetch(curi)) {
            // Cannot fetch this, due to protocol, retries, or other problems
            return;
        }

        HttpClient http = this.getClient();
        setLocalIP(http);

        this.curisHandled++;

        // Note begin time
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Get a reference to the HttpRecorder that is set into this ToeThread.
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Shall we get a digest on the content downloaded?
        boolean digestContent = ((Boolean) getUncheckedAttribute(curi, ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String) getUncheckedAttribute(curi, ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // clear
            rec.getRecordedInput().setDigest((MessageDigest) null);
        }

        // Below we do two inner classes that add check of midfetch
        // filters just as we're about to receive the response body.
        String curiString = curi.getUURI().toString();
        HttpMethodBase method = null;
        if (curi.isPost()) {
            method = new HttpRecorderPostMethod(curiString, rec) {
                protected void readResponseBody(HttpState state, HttpConnection conn)
                        throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        } else {
            method = new HttpRecorderGetMethod(curiString, rec) {
                protected void readResponseBody(HttpState state, HttpConnection conn)
                        throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        }

        HostConfiguration customConfigOrNull = configureMethod(curi, method);

        // Set httpRecorder into curi. Subsequent code both here and later
        // in extractors expects to find the HttpRecorder in the CrawlURI.
        curi.setHttpRecorder(rec);

        // Populate credentials. Set config so auth. is not automatic.
        boolean addedCredentials = populateCredentials(curi, method);
        method.setDoAuthentication(addedCredentials);

        // set hardMax on bytes (if set by operator)
        long hardMax = getMaxLength(curi);
        // set overall timeout (if set by operator)
        long timeoutMs = 1000 * getTimeout(curi);
        // Get max fetch rate (bytes/ms). It comes in in KB/sec
        long maxRateKBps = getMaxFetchRate(curi);
        rec.getRecordedInput().setLimits(hardMax, timeoutMs, maxRateKBps);

        try {
            http.executeMethod(customConfigOrNull, method);
        } catch (RecorderTooMuchHeaderException ex) {
            // when too much header material, abort like other truncations
            doAbort(curi, method, HEADER_TRUNC);
        } catch (IOException e) {
            failedExecuteCleanup(method, curi, e);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions in native
            // code... see
            // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            failedExecuteCleanup(method, curi, e);
            return;
        }

        // set softMax on bytes to get (if implied by content-length) 
        long softMax = method.getResponseContentLength();

        try {
            if (!curi.isSeed() && curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED) {
                logger.debug(curi.getUURI().toString() + " is not modify");
                curi.skipToProcessorChain(getController().getPostprocessorChain());
            } else if (!method.isAborted()) {
                // Force read-to-end, so that any socket hangs occur here,
                // not in later modules.
                rec.getRecordedInput().readFullyOrUntil(softMax);
            }
        } catch (RecorderTimeoutException ex) {
            doAbort(curi, method, TIMER_TRUNC);
        } catch (RecorderLengthExceededException ex) {
            doAbort(curi, method, LENGTH_TRUNC);
        } catch (IOException e) {
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions from native code
            // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } finally {
            // ensure recording has stopped
            rec.closeRecorders();
            logger.debug("cloase backup file.&uri= " + curi.getCrawlURIString());
            if (!method.isAborted()) {
                method.releaseConnection();
            }
            // Note completion time
            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
            // Set the response charset into the HttpRecord if available.
            setCharacterEncoding(rec, method);
            setSizes(curi, rec);
        }

        if (digestContent) {
            curi.setContentDigest(algorithm, rec.getRecordedInput().getDigestValue());
        }

        logger.info((curi.isPost() ? "POST" : "GET") + " " + curi.getUURI().toString() + " "
                + method.getStatusCode() + " " + rec.getRecordedInput().getSize() + " " + curi.getContentType());

        if (curi.isSuccess() && addedCredentials) {
            // Promote the credentials from the CrawlURI to the CrawlServer
            // so they are available for all subsequent CrawlURIs on this
            // server.
            promoteCredentials(curi);
            if (logger.isDebugEnabled()) {
                // Print out the cookie.  Might help with the debugging.
                Header setCookie = method.getResponseHeader("set-cookie");
                if (setCookie != null) {
                    logger.debug(setCookie.toString().trim());
                }
            }
        } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
            // 401 is not 'success'.
            handle401(method, curi);
        }

        if (rec.getRecordedInput().isOpen()) {
            logger.error(curi.toString() + " RIS still open. Should have" + " been closed by method release: "
                    + Thread.currentThread().getName());
            try {
                rec.getRecordedInput().close();
            } catch (IOException e) {
                logger.error("second-chance RIS close failed", e);
            }
        }
    }

    /**
     * Update CrawlURI internal sizes based on current transaction (and
     * in the case of 304s, history) 
     * 
     * @param curi CrawlURI
     * @param rec HttpRecorder
     */
    protected void setSizes(final CrawlURI curi, HttpRecorder rec) {
        // set reporting size
        curi.setContentSize(rec.getRecordedInput().getSize());
        // special handling for 304-not modified
        if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED && curi.containsKey(A_FETCH_HISTORY)) {
            AList history[] = curi.getAList().getAListArray(A_FETCH_HISTORY);
            if (history[0] != null && history[0].containsKey(CoreAttributeConstants.A_REFERENCE_LENGTH)) {
                long referenceLength = history[0].getLong(A_REFERENCE_LENGTH);
                // carry-forward previous 'reference-length' for future
                curi.putLong(A_REFERENCE_LENGTH, referenceLength);
                // increase content-size to virtual-size for reporting
                curi.setContentSize(rec.getRecordedInput().getSize() + referenceLength);
            }
        }
    }

    protected void doAbort(CrawlURI curi, HttpMethod method, String annotation) {
        curi.addAnnotation(annotation);
        curi.getHttpRecorder().close();
        method.abort();
    }

    protected boolean checkMidfetchAbort(CrawlURI curi, HttpRecorderMethod method, HttpConnection conn) {
        if (curi.isPrerequisite() || rulesAccept(getMidfetchRule(curi), curi)) {
            return false;
        }
        method.markContentBegin(conn);
        return true;
    }

    protected DecideRule getMidfetchRule(Object o) {
        try {
            return (DecideRule) getAttribute(o, ATTR_MIDFETCH_DECIDE_RULES);
        } catch (AttributeNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * This method populates <code>curi</code> with response status and
     * content type.
     * @param curi CrawlURI to populate.
     * @param method Method to get response status and headers from.
     */
    protected void addResponseContent(HttpMethod method, CrawlURI curi) {
        curi.setFetchStatus(method.getStatusCode());
        Header ct = method.getResponseHeader("content-type");
        curi.setContentType((ct == null) ? null : ct.getValue());
        // Save method into curi too.  Midfetch filters may want to leverage
        // info in here.
        curi.putObject(A_HTTP_TRANSACTION, method);
    }

    /**
     * Set the character encoding based on the result headers or default.
     *
     * The HttpClient returns its own default encoding ("ISO-8859-1") if one
     * isn't specified in the Content-Type response header. We give the user
     * the option of overriding this, so we need to detect the case where the
     * default is returned.
     *
     * Now, it may well be the case that the default returned by HttpClient
     * and the default defined by the user are the same.
     * 
     * @param rec Recorder for this request.
     * @param method Method used for the request.
     */
    private void setCharacterEncoding(final HttpRecorder rec, final HttpMethod method) {
        String encoding = null;

        try {
            encoding = ((HttpMethodBase) method).getResponseCharSet();
            if (encoding == null || encoding.equals(DEFAULT_CONTENT_CHARSET)) {
                encoding = (String) getAttribute(ATTR_DEFAULT_ENCODING);
            }
        } catch (Exception e) {
            logger.warn("Failed get default encoding: " + e.getLocalizedMessage());
        }
        rec.setCharacterEncoding(encoding);
    }

    /**
     * Cleanup after a failed method execute.
     * @param curi CrawlURI we failed on.
     * @param method Method we failed on.
     * @param exception Exception we failed with.
     */
    private void failedExecuteCleanup(final HttpMethod method, final CrawlURI curi, final Exception exception) {
        cleanup(curi, exception, "executeMethod", (method.isRequestSent() ? S_CONNECT_LOST : S_CONNECT_FAILED));
        method.releaseConnection();
    }

    /**
     * Cleanup after a failed method execute.
     * @param curi CrawlURI we failed on.
     * @param exception Exception we failed with.
     * @param message Message to log with failure.
     * @param status Status to set on the fetch.
     */
    private void cleanup(final CrawlURI curi, final Exception exception, final String message, final int status) {
        curi.addLocalizedError(this.getName(), exception, message);
        curi.setFetchStatus(status);
        curi.getHttpRecorder().close();
    }

    /**
     * Can this processor fetch the given CrawlURI. May set a fetch
     * status if this processor would usually handle the CrawlURI,
     * but cannot in this instance.
     *
     * @param curi
     * @return True if processor can fetch.
     */
    private boolean canFetch(CrawlURI curi) {
        if (curi.getFetchStatus() < 0) {
            // already marked as errored, this pass through
            // skip to end
            curi.skipToProcessorChain(getController().getPostprocessorChain());
            return false;
        }
        String scheme = curi.getUURI().getScheme();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            // handles only plain http and https
            return false;
        }
        CrawlHost host = getController().getServerCache().getHostFor(curi);
        // make sure the dns lookup succeeded
        if (host.getIP() == null && host.hasBeenLookedUp()) {
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            return false;
        }
        return true;
    }

    /**
     * Configure the HttpMethod setting options and headers.
     *
     * @param curi CrawlURI from which we pull configuration.
     * @param method The Method to configure.
     * @return HostConfiguration copy customized for this CrawlURI
     */
    protected HostConfiguration configureMethod(CrawlURI curi, HttpMethod method) {
        // Don't auto-follow redirects
        method.setFollowRedirects(false);

        //        // set soTimeout
        //        method.getParams().setSoTimeout(
        //                ((Integer) getUncheckedAttribute(curi, ATTR_SOTIMEOUT_MS))
        //                        .intValue());

        // Set cookie policy.
        method.getParams()
                .setCookiePolicy((((Boolean) getUncheckedAttribute(curi, ATTR_IGNORE_COOKIES)).booleanValue())
                        ? CookiePolicy.IGNORE_COOKIES
                        : CookiePolicy.BROWSER_COMPATIBILITY);

        // Use only HTTP/1.0 (to avoid receiving chunked responses)
        method.getParams().setVersion(HttpVersion.HTTP_1_0);

        CrawlOrder order = getSettingsHandler().getOrder();
        String userAgent = curi.getUserAgent();
        if (userAgent == null) {
            userAgent = order.getUserAgent(curi);
        }
        method.setRequestHeader("User-Agent", userAgent);
        method.setRequestHeader("From", order.getFrom(curi));

        // Set retry handler.
        method.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new HeritrixHttpMethodRetryHandler());

        final long maxLength = getMaxLength(curi);
        if (maxLength > 0 && ((Boolean) getUncheckedAttribute(curi, ATTR_SEND_RANGE)).booleanValue()) {
            method.addRequestHeader(RANGE, RANGE_PREFIX.concat(Long.toString(maxLength - 1)));
        }

        if (((Boolean) getUncheckedAttribute(curi, ATTR_SEND_CONNECTION_CLOSE)).booleanValue()) {
            method.addRequestHeader(HEADER_SEND_CONNECTION_CLOSE);
        }

        if (((Boolean) getUncheckedAttribute(curi, ATTR_SEND_REFERER)).booleanValue()
                && (curi.getViaContext() == null || !Link.PREREQ_MISC.equals(curi.getViaContext().toString()))) {
            // RFC2616 says no referer header if referer is https and the url
            // is not
            String via = curi.flattenVia();
            if (via != null && via.length() > 0
                    && !(via.startsWith(HTTPS_SCHEME) && curi.getUURI().getScheme().equals(HTTP_SCHEME))) {
                method.setRequestHeader(REFERER, via);
            }
        }

        /*if(!curi.isPrerequisite() && curi.containsKey(URLInfo.MODIFY_TIME) &&
           (Boolean)getUncheckedAttribute(curi, ATTR_SEND_IF_MODIFIED_SINCE)) {
          long modifyTime = curi.getLong(URLInfo.MODIFY_TIME);
          if (modifyTime != 0) {
          Date date = new Date(modifyTime);
          method.setRequestHeader("If-Modified-Since", date.toString());
          logger.debug(curi.getUURI().toString() + " send header modifyTime:" + date.toGMTString());
          }
              
          setConditionalGetHeader(curi, method, ATTR_SEND_IF_MODIFIED_SINCE, 
               CoreAttributeConstants.A_LAST_MODIFIED_HEADER, "If-Modified-Since");
          setConditionalGetHeader(curi, method, ATTR_SEND_IF_NONE_MATCH, 
            CoreAttributeConstants.A_ETAG_HEADER, "If-None-Match");
        }*/

        // TODO: What happens if below method adds a header already
        // added above: e.g. Connection, Range, or Referer?
        setAcceptHeaders(curi, method);

        HttpClient http = getClient();
        HostConfiguration config = new HostConfiguration(http.getHostConfiguration());
        configureProxy(curi, config);
        configureBindAddress(curi, config);
        return config;
    }

    /**
     * Set the given conditional-GET header, if the setting is enabled and
     * a suitable value is available in the URI history. 
     * @param curi source CrawlURI
     * @param method HTTP operation pending
     * @param setting true/false enablement setting name to consult
     * @param sourceHeader header to consult in URI history
     * @param targetHeader header to set if possible
     */
    protected void setConditionalGetHeader(CrawlURI curi, HttpMethod method, String setting, String sourceHeader,
            String targetHeader) {
        if (((Boolean) getUncheckedAttribute(curi, setting))) {
            try {
                int previousStatus = curi.getAList().getAListArray(A_FETCH_HISTORY)[0].getInt(A_STATUS);
                if (previousStatus <= 0) {
                    // do not reuse headers from any broken fetch
                    return;
                }
                String previousValue = curi.getAList().getAListArray(A_FETCH_HISTORY)[0].getString(sourceHeader);
                if (previousValue != null) {
                    method.setRequestHeader(targetHeader, previousValue);
                }
            } catch (RuntimeException e) {
                // for absent key, bad index, etc. just do nothing
            }
        }
    }

    /**
     * Setup proxy, based on attributes in CrawlURI and settings, 
     * in the given HostConfiguration
     */
    private void configureProxy(CrawlURI curi, HostConfiguration config) {
        String proxy = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_HOST);
        int port = -1;
        if (proxy.length() == 0) {
            proxy = null;
        } else {
            String portString = (String) getAttributeEither(curi, ATTR_HTTP_PROXY_PORT);
            port = portString.length() > 0 ? Integer.parseInt(portString) : -1;
        }
        if (proxy != null) {
            config.setProxy(proxy, port);
        }
    }

    /**
     * Setup local bind address, based on attributes in CrawlURI and settings, 
     * in the given HostConfiguration 
     */
    private void configureBindAddress(CrawlURI curi, HostConfiguration config) {
        String addressString = (String) getAttributeEither(curi, ATTR_HTTP_BIND_ADDRESS);
        if (addressString != null && addressString.length() > 0) {
            try {
                InetAddress localAddress = InetAddress.getByName(addressString);
                config.setLocalAddress(localAddress);
            } catch (UnknownHostException e) {
                // Convert all to RuntimeException so get an exception out
                // if initialization fails.
                throw new RuntimeException("Unknown host " + addressString + " in " + ATTR_HTTP_BIND_ADDRESS);
            }
        }
    }

    /**
     * Get a value either from inside the CrawlURI instance, or from 
     * settings (module attributes). 
     * 
     * @param curi CrawlURI to consult
     * @param key key to lookup
     * @return value from either CrawlURI (preferred) or settings
     */
    protected Object getAttributeEither(CrawlURI curi, String key) {
        Object obj = curi != null ? curi.getObject(key) : null;
        if (obj == null) {
            obj = getUncheckedAttribute(curi, key);
        }
        return obj;
    }

    private HttpClient getClient() {
        HttpClient http = httpInstance.get();
        if (http == null) {
            http = configureHttp();
            httpInstance.set(http);
        }
        return http;
    }

    private void setLocalIP(HttpClient http) {
        String localIP = IPSwitcher.getLocalIP();
        if (localIP != null) {
            try {
                http.getHostConfiguration().setLocalAddress(InetAddress.getByName(localIP));
            } catch (UnknownHostException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Add credentials if any to passed <code>method</code>.
     *
     * Do credential handling.  Credentials are in two places.  1. Credentials
     * that succeeded are added to the CrawlServer (Or rather, avatars for
     * credentials are whats added because its not safe to keep around
     * references to credentials).  2. Credentials to be tried are in the curi.
     * Returns true if found credentials to be tried.
     *
     * @param curi Current CrawlURI.
     * @param method The method to add to.
     * @return True if prepopulated <code>method</code> with credentials AND the
     * credentials came from the <code>curi</code>, not from the CrawlServer.
     * The former is  special in that if the <code>curi</curi> credentials
     * succeed, then the caller needs to promote them from the CrawlURI to the
     * CrawlServer so they are available for all subsequent CrawlURIs on this
     * server.
     */
    private boolean populateCredentials(CrawlURI curi, HttpMethod method) {
        // First look at the server avatars. Add any that are to be volunteered
        // on every request (e.g. RFC2617 credentials).  Every time creds will
        // return true when we call 'isEveryTime().
        HttpClient http = this.getClient();
        CrawlServer server = getController().getServerCache().getServerFor(curi);
        if (server.hasCredentialAvatars()) {
            Set avatars = server.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar) i.next();
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                if (c.isEveryTime()) {
                    c.populate(curi, http, method, ca.getPayload());
                }
            }
        }

        boolean result = false;

        // Now look in the curi.  The Curi will have credentials loaded either
        // by the handle401 method if its a rfc2617 or it'll have been set into
        // the curi by the preconditionenforcer as this login uri came through.
        if (curi.hasCredentialAvatars()) {
            Set avatars = curi.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar) i.next();
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                if (c.populate(curi, http, method, ca.getPayload())) {
                    result = true;
                }
            }
        }

        return result;
    }

    /**
     * Promote successful credential to the server.
     *
     * @param curi CrawlURI whose credentials we are to promote.
     */
    private void promoteCredentials(final CrawlURI curi) {
        if (!curi.hasCredentialAvatars()) {
            logger.error("No credentials to promote when there should be " + curi);
        } else {
            Set avatars = curi.getCredentialAvatars();
            for (Iterator i = avatars.iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar) i.next();
                curi.removeCredentialAvatar(ca);
                // The server to attach too may not be the server that hosts
                // this passed curi.  It might be of another subdomain.
                // The avatar needs to be added to the server that is dependent
                // on this precondition.  Find it by name.  Get the name from
                // the credential this avatar represents.
                Credential c = ca.getCredential(getSettingsHandler(), curi);
                String cd = null;
                try {
                    cd = c.getCredentialDomain(curi);
                } catch (AttributeNotFoundException e) {
                    logger.error("Failed to get cred domain for " + curi + " for " + ca + ": " + e.getMessage());
                }
                if (cd != null) {
                    CrawlServer cs = getController().getServerCache().getServerFor(cd);
                    if (cs != null) {
                        cs.addCredentialAvatar(ca);
                    }
                }
            }
        }
    }

    /**
     * Server is looking for basic/digest auth credentials (RFC2617). If we have
     * any, put them into the CrawlURI and have it come around again. Presence
     * of the credential serves as flag to frontier to requeue promptly. If we
     * already tried this domain and still got a 401, then our credentials are
     * bad. Remove them and let this curi die.
     *
     * @param method Method that got a 401.
     * @param curi CrawlURI that got a 401.
     */
    protected void handle401(final HttpMethod method, final CrawlURI curi) {
        AuthScheme authscheme = getAuthScheme(method, curi);
        if (authscheme == null) {
            return;
        }
        String realm = authscheme.getRealm();

        // Look to see if this curi had rfc2617 avatars loaded.  If so, are
        // any of them for this realm?  If so, then the credential failed
        // if we got a 401 and it should be let die a natural 401 death.
        Set curiRfc2617Credentials = getCredentials(getSettingsHandler(), curi, Rfc2617Credential.class);
        Rfc2617Credential extant = Rfc2617Credential.getByRealm(curiRfc2617Credentials, realm, curi);
        if (extant != null) {
            // Then, already tried this credential.  Remove ANY rfc2617
            // credential since presence of a rfc2617 credential serves
            // as flag to frontier to requeue this curi and let the curi
            // die a natural death.
            extant.detachAll(curi);
            logger.warn("Auth failed (401) though supplied realm " + realm + " to " + curi.toString());
        } else {
            // Look see if we have a credential that corresponds to this
            // realm in credential store.  Filter by type and credential
            // domain.  If not, let this curi die. Else, add it to the
            // curi and let it come around again. Add in the AuthScheme
            // we got too.  Its needed when we go to run the Auth on
            // second time around.
            CredentialStore cs = CredentialStore.getCredentialStore(getSettingsHandler());
            if (cs == null) {
                logger.error("No credential store for " + curi);
            } else {
                CrawlServer server = getController().getServerCache().getServerFor(curi);
                Set storeRfc2617Credentials = cs.subset(curi, Rfc2617Credential.class, server.getName());
                if (storeRfc2617Credentials == null || storeRfc2617Credentials.size() <= 0) {
                    logger.info("No rfc2617 credentials for " + curi);
                } else {
                    Rfc2617Credential found = Rfc2617Credential.getByRealm(storeRfc2617Credentials, realm, curi);
                    if (found == null) {
                        logger.info("No rfc2617 credentials for realm " + realm + " in " + curi);
                    } else {
                        found.attach(curi, authscheme.getRealm());
                        logger.info("Found credential for realm " + realm + " in store for " + curi.toString());
                    }
                }
            }
        }
    }

    /**
     * @param method Method that got a 401.
     * @param curi CrawlURI that got a 401.
     * @return Returns first wholesome authscheme found else null.
     */
    protected AuthScheme getAuthScheme(final HttpMethod method, final CrawlURI curi) {
        Header[] headers = method.getResponseHeaders("WWW-Authenticate");
        if (headers == null || headers.length <= 0) {
            logger.info("We got a 401 but no WWW-Authenticate challenge: " + curi.toString());
            return null;
        }

        Map authschemes = null;
        try {
            authschemes = AuthChallengeParser.parseChallenges(headers);
        } catch (MalformedChallengeException e) {
            logger.info("Failed challenge parse: " + e.getMessage());
        }
        if (authschemes == null || authschemes.size() <= 0) {
            logger.info("We got a 401 and WWW-Authenticate challenge" + " but failed parse of the header "
                    + curi.toString());
            return null;
        }

        AuthScheme result = null;
        // Use the first auth found.
        for (Iterator i = authschemes.keySet().iterator(); result == null && i.hasNext();) {
            String key = (String) i.next();
            String challenge = (String) authschemes.get(key);
            if (key == null || key.length() <= 0 || challenge == null || challenge.length() <= 0) {
                logger.warn("Empty scheme: " + curi.toString() + ": " + headers);
            }
            AuthScheme authscheme = null;
            if (key.equals("basic")) {
                authscheme = new BasicScheme();
            } else if (key.equals("digest")) {
                authscheme = new DigestScheme();
            } else {
                logger.info("Unsupported scheme: " + key);
                continue;
            }

            try {
                authscheme.processChallenge(challenge);
            } catch (MalformedChallengeException e) {
                logger.info(e.getMessage() + " " + curi + " " + headers);
                continue;
            }
            if (authscheme.isConnectionBased()) {
                logger.info("Connection based " + authscheme);
                continue;
            }

            if (authscheme.getRealm() == null || authscheme.getRealm().length() <= 0) {
                logger.info("Empty realm " + authscheme + " for " + curi);
                continue;
            }
            result = authscheme;
        }

        return result;
    }

    /**
     * @param handler Settings Handler.
     * @param curi CrawlURI that got a 401.
     * @param type Class of credential to get from curi.
     * @return Set of credentials attached to this curi.
     */
    private Set<Credential> getCredentials(SettingsHandler handler, CrawlURI curi, Class type) {
        Set<Credential> result = null;

        if (curi.hasCredentialAvatars()) {
            for (Iterator i = curi.getCredentialAvatars().iterator(); i.hasNext();) {
                CredentialAvatar ca = (CredentialAvatar) i.next();
                if (ca.match(type)) {
                    if (result == null) {
                        result = new HashSet<Credential>();
                    }
                    result.add(ca.getCredential(handler, curi));
                }
            }
        }
        return result;
    }

    public void initialTasks() {
        super.initialTasks();
        this.getController().addCrawlStatusListener(this);
        //configureHttp();

        // load cookies from a file if specified in the order file.
        loadCookies();

        // I tried to get the default KeyManagers but doesn't work unless you
        // point at a physical keystore. Passing null seems to do the right
        // thing so we'll go w/ that.
        try {
            SSLContext context = SSLContext.getInstance("SSL");
            context.init(null,
                    new TrustManager[] { new ConfigurableX509TrustManager((String) getAttribute(ATTR_TRUST)) },
                    null);
            this.sslfactory = context.getSocketFactory();
        } catch (Exception e) {
            logger.warn("Failed configure of ssl context " + e.getMessage(), e);
        }
    }

    public void finalTasks() {
        // At the end save cookies to the file specified in the order file.
        saveCookies();
        cleanupHttp();
        super.finalTasks();
    }

    /**
     * Perform any final cleanup related to the HttpClient instance.
     */
    protected void cleanupHttp() {
        if (cookieDb != null) {
            try {
                cookieDb.sync();
                cookieDb.close();
            } catch (DatabaseException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }

    protected HttpClient configureHttp() throws RuntimeException {
        // Get timeout.  Use it for socket and for connection timeout.
        int timeout = (getSoTimeout(null) > 0) ? getSoTimeout(null) : 0;

        // HttpConnectionManager cm = new ThreadLocalHttpConnectionManager();
        HttpConnectionManager cm = new SingleHttpConnectionManager();

        // TODO: The following settings should be made in the corresponding
        // HttpConnectionManager, not here.
        HttpConnectionManagerParams hcmp = cm.getParams();
        hcmp.setConnectionTimeout(timeout);
        hcmp.setStaleCheckingEnabled(true);
        // Minimizes bandwidth usage.  Setting to true disables Nagle's
        // algorithm.  IBM JVMs < 142 give an NPE setting this boolean
        // on ssl sockets.
        hcmp.setTcpNoDelay(false);

        HttpClient http = new HttpClient(cm);
        HttpClientParams hcp = http.getParams();
        // Set default socket timeout.
        hcp.setSoTimeout(timeout);
        // Set client to be version 1.0.
        hcp.setVersion(HttpVersion.HTTP_1_0);

        configureHttpCookies(http);

        // Configure how we want the method to act.
        http.getParams().setParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, new Boolean(true));
        http.getParams().setParameter(HttpMethodParams.UNAMBIGUOUS_STATUS_LINE, new Boolean(false));
        http.getParams().setParameter(HttpMethodParams.STRICT_TRANSFER_ENCODING, new Boolean(false));
        http.getParams().setIntParameter(HttpMethodParams.STATUS_LINE_GARBAGE_LIMIT, 10);

        // modify the default config with any global settings
        HostConfiguration config = http.getHostConfiguration();
        configureProxy(null, config);
        configureBindAddress(null, config);

        // Use our own protocol factory, one that gets IP to use from
        // heritrix cache (They're cached in CrawlHost instances).
        final ServerCache cache = getController().getServerCache();
        hcmp.setParameter(SERVER_CACHE_KEY, cache);
        hcmp.setParameter(SSL_FACTORY_KEY, this.sslfactory);

        return http;
    }

    /**
     * Set the HttpClient HttpState instance to use a BDB-backed
     * StoredSortedMap for cookie storage, if that option is chosen.
     */
    private void configureHttpCookies(HttpClient http) {
        // If Bdb-backed cookies chosen, replace map in HttpState
        if (((Boolean) getUncheckedAttribute(null, ATTR_BDB_COOKIES)).booleanValue()) {
            try {
                EnhancedEnvironment env = getController().getBdbEnvironment();
                StoredClassCatalog classCatalog = env.getClassCatalog();
                DatabaseConfig dbConfig = new DatabaseConfig();
                dbConfig.setTransactional(false);
                dbConfig.setAllowCreate(true);
                dbConfig.setDeferredWrite(true);
                cookieDb = env.openDatabase(null, COOKIEDB_NAME, dbConfig);
                StoredSortedMap cookiesMap = new StoredSortedMap(cookieDb, new StringBinding(),
                        new SerialBinding(classCatalog, Cookie.class), true);
                http.getState().setCookiesMap(cookiesMap);
            } catch (DatabaseException e) {
                // TODO Auto-generated catch block
                logger.error(e.getMessage());
                e.printStackTrace();
            }
        }
    }

    /**
     * @param curi Current CrawlURI.  Used to get context.
     * @return Socket timeout value.
     */
    private int getSoTimeout(CrawlURI curi) {
        Integer res = null;
        try {
            res = (Integer) getAttribute(ATTR_SOTIMEOUT_MS, curi);
        } catch (Exception e) {
            res = DEFAULT_SOTIMEOUT_MS;
        }
        return res.intValue();
    }

    /**
     * @param curi Current CrawlURI.  Used to get context.
     * @return Timeout value for total request.
     */
    private int getTimeout(CrawlURI curi) {
        Integer res;
        try {
            res = (Integer) getAttribute(ATTR_TIMEOUT_SECONDS, curi);
        } catch (Exception e) {
            res = DEFAULT_TIMEOUT_SECONDS;
        }
        return res.intValue();
    }

    private int getMaxFetchRate(CrawlURI curi) {
        Integer res;
        try {
            res = (Integer) getAttribute(ATTR_FETCH_BANDWIDTH_MAX, curi);
        } catch (Exception e) {
            res = DEFAULT_FETCH_BANDWIDTH_MAX;
        }
        return res.intValue();
    }

    private long getMaxLength(CrawlURI curi) {
        Long res;
        try {
            res = (Long) getAttribute(ATTR_MAX_LENGTH_BYTES, curi);
            if (res.longValue() == OLD_DEFAULT_MAX_LENGTH_BYTES) {
                res = DEFAULT_MAX_LENGTH_BYTES;
            }
        } catch (Exception e) {
            res = DEFAULT_MAX_LENGTH_BYTES;
        }
        return res.longValue();
    }

    /**
     * Load cookies from a file before the first fetch.
     * <p>
     * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
     * Example entry of cookies.txt file:<br>
     * <br>
     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
     * <br>
     * Each line has 7 tab-separated fields:<br>
     * <li>1. DOMAIN: The domain that created and have access to the cookie
     * value.
     * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
     * domain can access the cookie value.
     * <li>3. PATH: The path within the domain that the cookie value is valid
     * for.
     * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
     * connection to access the cookie value.
     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
     * <li>6. NAME: The name of the cookie value
     * <li>7. VALUE: The cookie value
     *
     * @param cookiesFile file in the Netscape's 'cookies.txt' format.
     */
    public void loadCookies(String cookiesFile) {
        // Do nothing if cookiesFile is not specified.
        if (cookiesFile == null || cookiesFile.length() <= 0) {
            return;
        }
        RandomAccessFile raf = null;
        try {
            raf = new RandomAccessFile(cookiesFile, "r");
            String[] cookieParts;
            String line;
            Cookie cookie = null;
            while ((line = raf.readLine()) != null) {
                // Line that starts with # is commented line, therefore skip it.
                if (!line.startsWith("#")) {
                    cookieParts = line.split("\\t");
                    if (cookieParts.length == 7) {
                        // Create cookie with not expiration date (-1 value).
                        // TODO: add this as an option.
                        cookie = new Cookie(cookieParts[0], cookieParts[5], cookieParts[6], cookieParts[2], -1,
                                Boolean.valueOf(cookieParts[3]).booleanValue());

                        if (cookieParts[1].toLowerCase().equals("true")) {
                            cookie.setDomainAttributeSpecified(true);
                        } else {
                            cookie.setDomainAttributeSpecified(false);
                        }
                        HttpClient http = this.getClient();
                        http.getState().addCookie(cookie);
                        logger.debug("Adding cookie: " + cookie.toExternalForm());
                    }
                }
            }
        } catch (FileNotFoundException e) {
            // We should probably throw FatalConfigurationException.
            System.out.println("Could not find file: " + cookiesFile + " (Element: " + ATTR_LOAD_COOKIES + ")");

        } catch (IOException e) {
            // We should probably throw FatalConfigurationException.
            e.printStackTrace();
        } finally {
            try {
                if (raf != null) {
                    raf.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: org.archive.crawler.fetcher.OptimizeFetchHTTP\n");
        ret.append("  Function:          Fetch HTTP URIs\n");
        ret.append("  CrawlURIs handled: " + this.curisHandled + "\n");
        ret.append("  Recovery retries:   " + this.recoveryRetries + "\n\n");

        return ret.toString();
    }

    /**
     * Load cookies from the file specified in the order file.
     *
     * <p>
     * The file is a text file in the Netscape's 'cookies.txt' file format.<br>
     * Example entry of cookies.txt file:<br>
     * <br>
     * www.archive.org FALSE / FALSE 1074567117 details-visit texts-cralond<br>
     * <br>
     * Each line has 7 tab-separated fields:<br>
     * <li>1. DOMAIN: The domain that created and have access to the cookie
     * value.
     * <li>2. FLAG: A TRUE or FALSE value indicating if hosts within the given
     * domain can access the cookie value.
     * <li>3. PATH: The path within the domain that the cookie value is valid
     * for.
     * <li>4. SECURE: A TRUE or FALSE value indicating if to use a secure
     * connection to access the cookie value.
     * <li>5. EXPIRATION: The expiration time of the cookie value (unix style.)
     * <li>6. NAME: The name of the cookie value
     * <li>7. VALUE: The cookie value
     */
    public void loadCookies() {
        try {
            loadCookies((String) getAttribute(ATTR_LOAD_COOKIES));
        } catch (MBeanException e) {
            logger.warn(e.getLocalizedMessage());
        } catch (ReflectionException e) {
            logger.warn(e.getLocalizedMessage());
        } catch (AttributeNotFoundException e) {
            logger.warn(e.getLocalizedMessage());
        }
    }

    /**
     * Saves cookies to the file specified in the order file.
     *
     * Output file is in the Netscape 'cookies.txt' format.
     *
     */
    public void saveCookies() {
        try {
            saveCookies((String) getAttribute(ATTR_SAVE_COOKIES));
        } catch (MBeanException e) {
            logger.warn(e.getLocalizedMessage());
        } catch (ReflectionException e) {
            logger.warn(e.getLocalizedMessage());
        } catch (AttributeNotFoundException e) {
            logger.warn(e.getLocalizedMessage());
        }
    }

    /**
     * Saves cookies to a file.
     *
     * Output file is in the Netscape 'cookies.txt' format.
     *
     * @param saveCookiesFile output file.
     */
    public void saveCookies(String saveCookiesFile) {
        // Do nothing if cookiesFile is not specified.
        if (saveCookiesFile == null || saveCookiesFile.length() <= 0) {
            return;
        }

        FileOutputStream out = null;
        try {
            out = new FileOutputStream(new File(saveCookiesFile));
            @SuppressWarnings("unchecked")
            HttpClient http = this.getClient();
            Map<String, Cookie> cookies = http.getState().getCookiesMap();
            String tab = "\t";
            out.write("# Heritrix Cookie File\n".getBytes());
            out.write("# This file is the Netscape cookies.txt format\n\n".getBytes());
            for (Cookie cookie : cookies.values()) {
                MutableString line = new MutableString(1024 * 2 /*Guess an initial size*/);
                line.append(cookie.getDomain());
                line.append(tab);
                line.append(cookie.isDomainAttributeSpecified() == true ? "TRUE" : "FALSE");
                line.append(tab);
                line.append(cookie.getPath());
                line.append(tab);
                line.append(cookie.getSecure() == true ? "TRUE" : "FALSE");
                line.append(tab);
                line.append(cookie.getName());
                line.append(tab);
                line.append((null == cookie.getValue()) ? "" : cookie.getValue());
                line.append("\n");
                out.write(line.toString().getBytes());
            }
        } catch (FileNotFoundException e) {
            // We should probably throw FatalConfigurationException.
            System.out.println("Could not find file: " + saveCookiesFile + " (Element: " + ATTR_SAVE_COOKIES + ")");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (out != null) {
                    out.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
     */
    protected void listUsedFiles(List<String> list) {
        // List the cookies files
        // Add seed file
        try {
            String tmp = (String) getAttribute(ATTR_LOAD_COOKIES);
            if (tmp != null && tmp.length() > 0) {
                File file = getSettingsHandler().getPathRelativeToWorkingDirectory(tmp);
                list.add(file.getAbsolutePath());
            }
            tmp = (String) getAttribute(ATTR_SAVE_COOKIES);
            if (tmp != null && tmp.length() > 0) {
                File file = getSettingsHandler().getPathRelativeToWorkingDirectory(tmp);
                list.add(file.getAbsolutePath());
            }
        } catch (AttributeNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (MBeanException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (ReflectionException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    private void setAcceptHeaders(CrawlURI curi, HttpMethod get) {
        try {
            StringList accept_headers = (StringList) getAttribute(ATTR_ACCEPT_HEADERS, curi);
            if (!accept_headers.isEmpty()) {
                for (ListIterator i = accept_headers.listIterator(); i.hasNext();) {
                    String hdr = (String) i.next();
                    String[] nvp = hdr.split(": +");
                    if (nvp.length == 2) {
                        get.setRequestHeader(nvp[0], nvp[1]);
                    } else {
                        logger.warn("Invalid accept header: " + hdr);
                    }
                }
            }
        } catch (AttributeNotFoundException e) {
            logger.error(e.getMessage());
        }
    }

    // custom serialization
    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.defaultWriteObject();
        // save cookies
        HttpClient http = this.getClient();
        @SuppressWarnings("unchecked")
        Collection<Cookie> c = http.getState().getCookiesMap().values();
        Cookie[] cookies = c.toArray(new Cookie[c.size()]);
        stream.writeObject(cookies);
    }

    private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        Cookie cookies[] = (Cookie[]) stream.readObject();
        ObjectPlusFilesInputStream coistream = (ObjectPlusFilesInputStream) stream;
        coistream.registerFinishTask(new PostRestore(cookies));
    }

    class PostRestore implements Runnable {
        Cookie cookies[];

        public PostRestore(Cookie cookies[]) {
            this.cookies = cookies;
        }

        public void run() {
            configureHttp();
            for (int i = 0; i < cookies.length; i++) {
                getClient().getState().addCookie(cookies[i]);
            }
        }
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
     */
    public void crawlStarted(String message) {
        // TODO Auto-generated method stub
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
     */
    public void crawlCheckpoint(File checkpointDir) {
        if (cookieDb != null) {
            try {
                cookieDb.sync();
            } catch (DatabaseException e) {
                // TODO Auto-generated catch block
                throw new RuntimeException(e);
            }
        }
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnding(java.lang.String)
     */
    public void crawlEnding(String sExitMessage) {
        // TODO Auto-generated method stub
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlEnded(java.lang.String)
     */
    public void crawlEnded(String sExitMessage) {
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlPausing(java.lang.String)
     */
    public void crawlPausing(String statusMessage) {
        // TODO Auto-generated method stub
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlPaused(java.lang.String)
     */
    public void crawlPaused(String statusMessage) {
        // TODO Auto-generated method stub
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlResuming(java.lang.String)
     */
    public void crawlResuming(String statusMessage) {
        // TODO Auto-generated method stub
    }
}