com.cyberway.issue.crawler.datamodel.CrawlServer.java Source code


Here is the source code for com.cyberway.issue.crawler.datamodel.CrawlServer.java

Source

/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * CrawlServer.java
 * Created on Apr 17, 2003
 *
 * $Header$
 */
package com.cyberway.issue.crawler.datamodel;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.Checksum;

import org.apache.commons.httpclient.URIException;
import com.cyberway.issue.crawler.datamodel.credential.CredentialAvatar;
import com.cyberway.issue.crawler.settings.CrawlerSettings;
import com.cyberway.issue.crawler.settings.SettingsHandler;
import com.cyberway.issue.io.ReplayInputStream;
import com.cyberway.issue.net.UURIFactory;

/**
 * Represents a single remote "server".
 *
 * A server is a service on a host. There might be more than one service on a
 * host differentiated by a port number.
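 * For example, "www.example.com" and "www.example.com:8080" name two
 * distinct servers on the same host.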
 *
 * @author gojomo
 */
public class CrawlServer implements Serializable, CrawlSubstats.HasCrawlSubstats {

    private static final long serialVersionUID = -989714570750970369L;

    public static final long ROBOTS_NOT_FETCHED = -1;
    /** Only check whether a robots.txt refetch would be superfluous
     * (e.g. the honoring policy is IGNORE) after this many failed tries. */
    public static final long MIN_ROBOTS_RETRIES = 2;

    private final String server; // actually, host+port in the https case
    private int port;
    private transient SettingsHandler settingsHandler;
    private RobotsExclusionPolicy robots;
    long robotsFetched = ROBOTS_NOT_FETCHED;
    boolean validRobots = false;
    Checksum robotstxtChecksum;
    CrawlSubstats substats = new CrawlSubstats();

    // how many consecutive connection errors have been encountered;
    // used to drive exponentially increasing retry timeout or decision
    // to 'freeze' entire class (queue) of URIs
    protected int consecutiveConnectionErrors = 0;

    /**
     * Set of credential avatars.
     */
    private transient Set<CredentialAvatar> avatars = null;

    /**
     * Creates a new CrawlServer object.
     *
     * @param h the host string for the server.
     */
    public CrawlServer(String h) {
        // TODO: possibly check for illegal host string
        server = h;
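        // Split an optional trailing ":port" off the server string; port
        // stays -1 when absent or unparseable (the protocol default applies).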
        int colonIndex = server.lastIndexOf(":");
        if (colonIndex < 0) {
            port = -1;
        } else {
            try {
                port = Integer.parseInt(server.substring(colonIndex + 1));
            } catch (NumberFormatException e) {
                port = -1;
            }
        }
    }

    /** Get the robots exclusion policy for this server.
     *
     * @return the robots exclusion policy for this server.
     */
    public RobotsExclusionPolicy getRobots() {
        return robots;
    }

    /** Set the robots exclusion policy for this server.
     *
     * @param policy the policy to set.
     */
    public void setRobots(RobotsExclusionPolicy policy) {
        robots = policy;
    }

    @Override
    public String toString() {
        return "CrawlServer(" + server + ")";
    }

    /** Update the robots exclusion policy from a fetched robots.txt.
     *
     * @param curi the crawl URI containing the fetched robots.txt
     */
    public void updateRobots(CrawlURI curi) {
        RobotsHonoringPolicy honoringPolicy = settingsHandler.getOrder().getRobotsHonoringPolicy();

        robotsFetched = System.currentTimeMillis();

        boolean gotSomething = curi.getFetchStatus() > 0 && curi.isHttpTransaction();
        if (!gotSomething && curi.getFetchAttempts() < MIN_ROBOTS_RETRIES) {
            // robots.txt lookup failed, no reason to consider IGNORE yet
            validRobots = false;
            return;
        }

        CrawlerSettings settings = getSettings(curi);
        int type = honoringPolicy.getType(settings);
        if (type == RobotsHonoringPolicy.IGNORE) {
            // IGNORE = ALLOWALL
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        if (!gotSomething) {
            // robots.txt lookup failed and policy not IGNORE
            validRobots = false;
            return;
        }

        if (!curi.is2XXSuccess()) {
            // Anything but a status code in the 2xx range (including 404
            // Not Found) is treated as granting access to all of the
            // site's content. This is the prevailing practice of Google,
            // since 4xx responses on robots.txt usually indicate a
            // misconfiguration or blanket block, not an intentional
            // indicator of partial blocking.
            // TODO: consider handling server errors, redirects differently
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            return;
        }

        ReplayInputStream contentBodyStream = null;
        try {
            try {
                BufferedReader reader;
                if (type == RobotsHonoringPolicy.CUSTOM) {
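                    // CUSTOM policy: read operator-supplied robots text
                    // from the settings instead of the fetched body.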
                    reader = new BufferedReader(new StringReader(honoringPolicy.getCustomRobots(settings)));
                } else {
                    contentBodyStream = curi.getHttpRecorder().getRecordedInput().getContentReplayInputStream();

                    contentBodyStream.setToResponseBodyStart();
                    reader = new BufferedReader(new InputStreamReader(contentBodyStream));
                }
                robots = RobotsExclusionPolicy.policyFor(settings, reader, honoringPolicy);
                validRobots = true;
            } finally {
                if (contentBodyStream != null) {
                    contentBodyStream.close();
                }
            }
        } catch (IOException e) {
            robots = RobotsExclusionPolicy.ALLOWALL;
            validRobots = true;
            curi.addLocalizedError(getName(), e, "robots.txt parsing IOException");
        }
    }

    /**
     * @return the time robots.txt was last fetched, or ROBOTS_NOT_FETCHED
     *         if it never has been.
     */
    public long getRobotsFetchedTime() {
        return robotsFetched;
    }

    /**
     * @return The server string which might include a port number.
     */
    public String getName() {
        return server;
    }

    /** Get the port number for this server.
     *
     * @return the port number or -1 if not known (uses default for protocol)
     */
    public int getPort() {
        return port;
    }

    /** 
     * Called when object is being deserialized.
     * In addition to the default java deserialization, this method
     * re-establishes the references to settings handler and robots honoring
     * policy.
     *
     * @param stream the stream to deserialize from.
     * @throws IOException if I/O errors occur
     * @throws ClassNotFoundException If the class for an object being restored
     *         cannot be found.
     */
    private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        settingsHandler = SettingsHandler.getThreadContextSettingsHandler();
        postDeserialize();
    }

    private void postDeserialize() {
        if (this.robots != null) {
            RobotsHonoringPolicy honoringPolicy = settingsHandler.getOrder().getRobotsHonoringPolicy();
            this.robots.honoringPolicy = honoringPolicy;
        }
    }

    /** Get the settings handler.
     *
     * @return the settings handler.
     */
    public SettingsHandler getSettingsHandler() {
        return this.settingsHandler;
    }

    /** Get the settings object in effect for this server.
     *
     * @param curi the URI to look up settings for.
     * @return the settings object in effect for this server, or null if a
     *         URIException occurs during the lookup.
     */
    private CrawlerSettings getSettings(CandidateURI curi) {
        try {
            return this.settingsHandler.getSettings(curi.getUURI().getReferencedHost(), curi.getUURI());
        } catch (URIException e) {
            return null;
        }
    }

    /** Set the settings handler to be used by this server.
     *
     * @param settingsHandler the settings handler to be used by this server.
     */
    public void setSettingsHandler(SettingsHandler settingsHandler) {
        this.settingsHandler = settingsHandler;
    }

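    /** Note one more consecutive connection error against this server. */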
    public void incrementConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors++;
    }

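    /** Clear the consecutive connection error count, e.g. on success. */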
    public void resetConsecutiveConnectionErrors() {
        this.consecutiveConnectionErrors = 0;
    }

    /**
     * @return Credential avatars for this server. Returns null if none.
     */
    public Set<CredentialAvatar> getCredentialAvatars() {
        return this.avatars;
    }

    /**
     * @return True if there are avatars attached to this instance.
     */
    public boolean hasCredentialAvatars() {
        return this.avatars != null && !this.avatars.isEmpty();
    }

    /**
     * Add an avatar.
     *
     * @param ca Credential avatar to add to set of avatars.
     */
    public void addCredentialAvatar(CredentialAvatar ca) {
        if (this.avatars == null) {
            this.avatars = new HashSet<CredentialAvatar>();
        }
        this.avatars.add(ca);
    }

    /**
     * If true then valid robots.txt information has been retrieved. If false,
     * either no attempt has been made to fetch robots.txt or the attempt
     * failed.
     *
     * @return the validRobots value.
     */
    public boolean isValidRobots() {
        return validRobots;
    }

    /**
     * Get key to use doing lookup on server instances.
     * @param cauri CandidateURI we're to get server key for.
     * @return String to use as server key.
     * @throws URIException
     */
    public static String getServerKey(CandidateURI cauri) throws URIException {
        // TODO: evaluate if this is really necessary -- why not 
        // make the server of a dns CandidateURI the looked-up domain,
        // also simplifying FetchDNS?
        String key = cauri.getUURI().getAuthorityMinusUserinfo();
        if (key == null) {
            // Fallback for cases where getAuthority() fails, e.g. 'dns:'
            // URIs, which carry the 'domain' in the path rather than in
            // the authority.
            key = cauri.getUURI().getCurrentHierPath();
            if (key != null && !key.matches("[-_\\w\\.:]+")) {
                // Not just word chars and dots and colons and dashes and
                // underscores; throw away
                key = null;
            }
        }
        if (key != null && cauri.getUURI().getScheme().equals(UURIFactory.HTTPS)) {
            // If https and no port specified, add the default https port to
            // distinguish the https server from an http server on the same
            // host without a port.
            if (!key.matches(".+:[0-9]+")) {
                key += UURIFactory.HTTPS_PORT;
            }
        }
        return key;
    }

    /* (non-Javadoc)
     * @see com.cyberway.issue.crawler.datamodel.CrawlSubstats.HasCrawlSubstats#getSubstats()
     */
    public CrawlSubstats getSubstats() {
        return substats;
    }
}
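
Usage example

A minimal, hypothetical sketch of how the class above might be exercised. It assumes only the constructor and accessors shown in the source; the class name CrawlServerDemo and the host names are invented for illustration.

import com.cyberway.issue.crawler.datamodel.CrawlServer;

public class CrawlServerDemo {
    public static void main(String[] args) {
        // No explicit port: getPort() reports -1, meaning the protocol
        // default applies.
        CrawlServer plain = new CrawlServer("www.example.com");
        System.out.println(plain.getName()); // www.example.com
        System.out.println(plain.getPort()); // -1

        // A trailing ":8080" is parsed off the server string.
        CrawlServer withPort = new CrawlServer("www.example.com:8080");
        System.out.println(withPort.getPort()); // 8080

        // Robots information is unknown until updateRobots() has been
        // called with a fetched robots.txt.
        System.out.println(withPort.isValidRobots()); // false
    }
}

Note that a host with an explicit port and the same host without one are distinct CrawlServer instances; this is the same distinction getServerKey() preserves for https URIs by appending the default https port.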