Source code for org.commoncrawl.service.crawler.CrawlHostImpl.java

Java tutorial

Introduction

Here is the source code for org.commoncrawl.service.crawler.CrawlHostImpl.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.crawler;

import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.common.Environment;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.service.crawler.RobotRulesParser.RobotRuleSet;
import org.commoncrawl.service.crawler.filters.IPAddressBlockFilter;
import org.commoncrawl.service.crawler.filters.Filter.FilterResult;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.IntrusiveList;
import org.commoncrawl.util.HttpCookieUtils.CookieStore;
import org.commoncrawl.util.IntrusiveList.IntrusiveListElement;

/**
 * 
 * @author rana
 *
 * The common CrawlHost implementation shared by CrawlList and CrawlQueue
 */
public final class CrawlHostImpl implements CrawlListHost, CrawlQueueHost {

    /** default politeness delay (in milliseconds) between fetches against the same host **/
    private static final int DEFAULT_CRAWL_DELAY = 2000;

    /** super-host thresholds / delays - currently unused (see commented-out logic in getCrawlDelay) **/
    private static final int SUPER_HOST_THRESHOLD_1 = 25;
    private static final int SUPER_HOST_THRESHOLD_2 = 100;
    private static final int SUPERHOST_CRAWL_DELAY_1 = 3000;
    private static final int SUPERHOST_CRAWL_DELAY_2 = 2000;

    /** fail the host once this many consecutive io errors have been observed **/
    private static final int HOST_FAIL_ON_CONSECUTIVE_IO_ERRORS_THRESHOLD = 100;
    /** time to keep a host in failed state after an io error condition is detected **/
    private static final int HOST_FAIL_RESET_TIMER = 10 * 60 * 1000; // 10 minutes .. 

    /** logging - bound to this class (was previously mislabeled as CrawlListHost.class) **/
    private static final Log LOG = LogFactory.getLog(CrawlHostImpl.class);

    private int _ipAddress; // host's ip address ...

    /** the crawl lists scheduled against this host; the head of the list is the active one **/
    private IntrusiveList<CrawlList> _crawlLists = new IntrusiveList<CrawlList>();

    private CrawlQueue _queue;
    private boolean _idle = false;
    private boolean _zombie = false;
    private boolean _isBlackListedHost = false;
    /** paused by a master crawl controller **/
    private boolean _isPaused = false;
    /** pause state timestamp **/
    private int _pauseStateTimestamp = -1;

    private long _blackListStatusUpdateTime = -1;
    private long _lastFetchStartTime = -1;
    private long _lastDispositionChangeTime = -1;
    /** absolute timer expiration time, or -1 when no timer is active **/
    private long _waitTime = -1;
    private int _failedDomainCount = 0;
    private int _uniqueDomainCount = 0;
    private int _successfulGETCount = 0;
    private int _http200Count = 0;
    private int _robotsExcludedCount = 0;
    private int _403Count = 0;
    private int _failedGETCount = 0;
    /** re-entrancy guard for feedQueue **/
    boolean _inFeedQueue = false;
    private short _consecutiveIOErrors = 0;
    private long _lastIOErrorTime = -1;
    private CookieStore _cookieStore = new CookieStore();

    // private boolean       _skipRobots = false;
    // private String            _resolvedHostName = null;

    /** maximum number of robots rule sets cached per host **/
    private static int MAX_ROBOTS_CACHE_ENTRIES = 20;

    /** one cached robots rule set, keyed by the crc of the robots file it was parsed from **/
    private static class RobotRuleSetCacheItem extends IntrusiveListElement<RobotRuleSetCacheItem> {
        public long _crc;
        public RobotRuleSet _ruleSetObject;
        public long _lastTouched;
    }

    /** robots rule set cache **/
    private IntrusiveList<RobotRuleSetCacheItem> _robotsRuleSetCache = new IntrusiveList<RobotRuleSetCacheItem>();

    /**
     * @param queue     owning crawl queue (may be null in unit test scenarios)
     * @param ipAddress host ip address packed into an int
     */
    public CrawlHostImpl(CrawlQueue queue, int ipAddress) {
        _ipAddress = ipAddress;
        _queue = queue;
    }

    public CrawlerServer getServer() {
        return getQueue().getEngine().getServer();
    }

    /** the active list is always the head of the crawl list queue **/
    public CrawlList getActiveList() {
        return getHead();
    }

    /** get access to the cookie store associated with this host 
     *  may be null 
     * **/
    public CookieStore getCookieStore() {
        return _cookieStore;
    }

    /** returns true if this host has no list that could produce further work **/
    public boolean noActiveLists() {
        // if there are no lists present then, yes there are no active lists ... 
        if (getHead() == null) {
            return true;
        } else {
            // otherwise ... special case, only one list present and it is the high priority list ... 
            if (_crawlLists.size() == 1
                    && getHead().getListId() == CrawlerServer.getServer().getHighPriorityListId()) {
                // check to see if it's disposition is QueueEmpty 
                return getHead().getDisposition() == CrawlList.Disposition.QueueEmpty;
            }
            // otherwise ... no, there are active lists present 
            return false;
        }
    }

    String getActiveListName() {
        if (getHead() != null)
            return getHead().getListName();
        return "null";
    }

    String getActiveListDisposition() {
        if (getHead() != null)
            return getHead().getDisposition().toString();
        return "null";
    }

    @Override
    public String getIPAddressAsString() {
        String ipAddress = "UNKNOWN";
        try {
            ipAddress = IPAddressUtils.IntegerToInetAddress(getIPAddress()).toString();
        } catch (UnknownHostException e) {
            // intentionally ignored - fall back to the "UNKNOWN" placeholder
        }
        return ipAddress;
    }

    public int getIPAddress() {
        return _ipAddress;
    }

    public CrawlQueue getQueue() {
        return _queue;
    }

    /** total pending + offline url count summed across all lists on this host **/
    public int getQueuedURLCount() {
        int countOut = 0;
        for (CrawlList list : _crawlLists) {
            synchronized (list) {
                countOut += list.getPendingURLCount() + list.getOfflineURLCount();
            }
        }
        return countOut;
    }

    /** find the crawl list with the given id, creating (and queueing) it on demand **/
    public CrawlList getCrawlList(int listId) {
        CrawlList domainOut = null;

        for (CrawlList domain : _crawlLists) {
            if (domain.getListId() == listId) {
                domainOut = domain;
                break;
            }
        }

        if (domainOut == null) {
            domainOut = new CrawlList(this, listId);
            _crawlLists.addTail(domainOut);
            // increment unique domain count ... 
            _uniqueDomainCount++;
        }

        return domainOut;
    }

    public boolean isTimerActive() {
        return _waitTime != -1;
    }

    /** arm the host timer to fire at the given absolute time **/
    public void setTimer(long expireTime) {
        _idle = false;
        _waitTime = expireTime;
        // check for null for unit test scenario
        if (getQueue() != null) {
            getQueue().setTimer(this, expireTime);
        }
    }

    /** cancel any active host timer (no-op when no timer is armed) **/
    public void killTimer() {
        if (_waitTime != -1) {
            _waitTime = -1;
            // check for null for unit test scenario
            if (getQueue() != null) {
                getQueue().killTimer(this);
            }
        }
    }

    @Override
    public void updateLastFetchStartTime(long newFetchStartTime) {
        _lastFetchStartTime = newFetchStartTime;
    }

    void incFailedDomainCount() {
        _failedDomainCount++;
    }

    /** increments the consecutive io errors counter for this host 
     *  used to track failed servers 
     * **/
    public void incConsecutiveIOErrorCount() {
        _consecutiveIOErrors++;
        _lastIOErrorTime = System.currentTimeMillis();
        if (_consecutiveIOErrors >= HOST_FAIL_ON_CONSECUTIVE_IO_ERRORS_THRESHOLD) {
            LOG.error(
                    "### HOST: Failed Host:" + getIPAddressAsString() + " due to too many consecutive IO Errors!");
        }
    }

    /** reset IOError Counter ... whenever we succesfully retrieve a document from this host **/
    public void resetIOErrorCount() {
        _consecutiveIOErrors = 0;
    }

    /** is this a blacklisted host **/
    public boolean isBlackListedHost() {
        // check blacklisted host status ... refresh only when the filter has been updated 
        // since we last evaluated it (result is cached in _isBlackListedHost)
        if (_blackListStatusUpdateTime < getServer().getFilterUpdateTime()) {

            // if black list status out of date ... validate blacklist status 
            IPAddressBlockFilter filter = getServer().getIPAddressFilter();

            if (filter != null) {
                CrawlURLMetadata metadata = new CrawlURLMetadata();
                metadata.setServerIP(getIPAddress());
                _isBlackListedHost = filter.filterItem(null, null, null, metadata,
                        null) == FilterResult.Filter_Reject;

                if (_isBlackListedHost) {
                    if (Environment.detailLogEnabled())
                        LOG.info("### FILTER Host:" + getIPAddressAsString() + " has been blacklisted.");
                }
            }

            _blackListStatusUpdateTime = getServer().getFilterUpdateTime();

        }

        return _isBlackListedHost;
    }

    /** check to see if we have marked this server as a failed host (too many consecutive io errors) **/
    public boolean isFailedServer() {

        if (isBlackListedHost()) {
            return true;
        } else {
            // a host stays failed until HOST_FAIL_RESET_TIMER has elapsed since the last io error
            if (_consecutiveIOErrors >= HOST_FAIL_ON_CONSECUTIVE_IO_ERRORS_THRESHOLD) {
                if (System.currentTimeMillis() - _lastIOErrorTime < HOST_FAIL_RESET_TIMER) {
                    return true;
                }
            }
            return false;
        }
    }

    @Override
    public int getCrawlDelay() {
        /*
            if (_uniqueDomainCount >= SUPER_HOST_THRESHOLD_1 && _uniqueDomainCount < SUPER_HOST_THRESHOLD_2)
              return SUPERHOST_CRAWL_DELAY_1;
            else if (_uniqueDomainCount >= SUPER_HOST_THRESHOLD_2)
              return SUPERHOST_CRAWL_DELAY_2;
            else
        */
        return DEFAULT_CRAWL_DELAY;
    }

    /*
    boolean skipRobots() { 
      return _skipRobots ;
    }
    */

    /** earliest absolute time at which it is polite to start fetching from the next list,
     *  or -1 when no fetch has been issued against this host yet **/
    long getDomainTransitionWaitTime() {
        if (_lastFetchStartTime != -1) {
            return _lastFetchStartTime + getCrawlDelay();
        }
        return -1;
    }

    public long getLastFetchStartTime() {
        return _lastFetchStartTime;
    }

    private final CrawlList getHead() {
        return _crawlLists.getHead();
    }

    /**
     * Pump the host: pull the next fetch target from the active list and hand it to
     * the queue's fetcher, rotating / removing exhausted lists and arming timers as
     * dispositions dictate.  Re-entrant calls are suppressed via _inFeedQueue.
     */
    public void feedQueue() {

        if (getQueue() != null) {

            if (!_inFeedQueue) {

                // refuse to pump while the active list is mid-fetch or waiting on a timer
                if (getHead() != null && (getHead().getDisposition() == CrawlList.Disposition.WaitingOnCompletion
                        || getHead().getDisposition() == CrawlList.Disposition.WaitingOnTime)) {
                    LOG.warn("FeedQueue called on Host with Active List:" + getHead().toString());
                    return;
                }

                _inFeedQueue = true;

                boolean exitLoop = false;

                while (!exitLoop && getHead() != null) {

                    CrawlList currentList = getHead();

                    setIdle(false);

                    CrawlTarget target = null;

                    if (currentList.getDisposition() != CrawlList.Disposition.QueueEmpty) {
                        // get the next target for the current domain ...
                        target = currentList.getNextTarget();
                    }

                    if (target != null) {
                        // log it for now ...
                        if (Environment.detailLogEnabled())
                            LOG.info("Host: " + getIPAddressAsString() + " FeedQueue returned Target:"
                                    + target.getOriginalURL());
                        // and queue it up with the fetcher 
                        getQueue().fetchItem(target);

                        // break out here ... 
                        exitLoop = true;
                    } else {

                        // figure out what to do next ... 
                        if (currentList.getDisposition() == CrawlList.Disposition.WaitingOnTime) {
                            if (Environment.detailLogEnabled())
                                LOG.info("Feed Queue for List:" + currentList.getListName()
                                        + " Returned WaitingOnTime");
                            setTimer(currentList.calculateNextWaitTime());
                            exitLoop = true;
                        } else if (currentList.getDisposition() == CrawlList.Disposition.QueueEmpty) {

                            if (currentList.getListId() != CrawlerServer.getServer().getHighPriorityListId()) {
                                if (Environment.detailLogEnabled())
                                    LOG.info("Feed Detected List:" + currentList.getListName()
                                            + " is in QueueEmpty state - removing...");
                                // remove the domain from the list ... 
                                _crawlLists.removeElement(currentList);
                            } else {
                                // the high priority list is never discarded; rotate it to the tail ... 
                                _crawlLists.removeElement(currentList);
                                _crawlLists.addTail(currentList);
                                // if it came straight back to the head, it was the only list - idle out
                                if (_crawlLists.getHead() == currentList) {
                                    if (Environment.detailLogEnabled())
                                        LOG.info("FeedQueue Detected HighPriority List:" + currentList.getListName()
                                                + " is IDLE. Exiting.");
                                    // and exit loop 
                                    exitLoop = true;
                                }
                            }

                            // if there is a next domain ... 
                            if (!exitLoop && getHead() != null) {
                                // setup a transition timer ... 
                                setTimer(getDomainTransitionWaitTime());
                                // and exit loop 
                                exitLoop = true;
                            }

                        } else {

                            // any other disposition here is a programming error - dump state and abort
                            StringWriter writer = new StringWriter();

                            writer.write("Invalid Domain State Encountered for List:" + currentList.getListName()
                                    + " Disposition:" + currentList.getDisposition() + "\n");
                            try {
                                dumpDetails(writer);
                            } catch (IOException e) {
                                LOG.error(CCStringUtils.stringifyException(e));
                            }
                            LOG.fatal(writer.toString());
                            throw new RuntimeException(writer.toString());
                        }
                    }
                }

                if (getHead() == null) {
                    setIdle(true);
                }
                _inFeedQueue = false;
            }
        }
    }

    /**
     * Callback fired by the active list whenever its disposition changes; updates
     * timers / feed state accordingly.  Only the head (active) list may fire this.
     *
     * @throws RuntimeException if invoked by a non-active list
     */
    @Override
    public void listDispositionChanged(CrawlList list, CrawlList.Disposition oldDisposition,
            CrawlList.Disposition newDisposition) {

        boolean originallyIdle = isIdled();

        updateLastModifiedTime(System.currentTimeMillis());
        list.updateLastModifiedTime(System.currentTimeMillis());

        // if this is the active domain ... 
        if (getHead() == list) {

            // if we were previously waiting on time ... 
            if (oldDisposition == CrawlList.Disposition.WaitingOnTime) {
                if (Environment.detailLogEnabled())
                    LOG.info("Host:" + getIPAddressAsString() + " List:" + list.getListName() + " timer FIRED");
                if (isTimerActive()) {
                    killTimer();
                }
            }

            if (newDisposition == CrawlList.Disposition.WaitingOnTime) {
                if (Environment.detailLogEnabled())
                    LOG.info("Host:" + getIPAddressAsString() + " List:" + list.getListName() + " timer SET");
                setTimer(list.calculateNextWaitTime());
            } else if (newDisposition == CrawlList.Disposition.ItemAvailable
                    || newDisposition == CrawlList.Disposition.QueueEmpty) {
                if (Environment.detailLogEnabled())
                    LOG.info("Host:" + getIPAddressAsString() + " List:" + list.getListName()
                            + " triggered feedQueue");
                feedQueue();
            }
        }
        // otherwise if state change occured on a non-active domain ... 
        else {
            throw new RuntimeException("List Disposition Change Happened in non Active List:" + list.getListName()
                    + " CurrentList:" + getHead());
        }

        // if we became idled as a result of this state transition ... 
        if (isIdled() && !originallyIdle) {
            if (_queue != null) {
                _queue.idleHost(this);
            }
        }
    }

    /** clear the host's wait state and propagate the clear to the active list
     *
     * @throws RuntimeException if no active list is present
     */
    public void clearWaitState() {

        if (_waitTime == -1) {
            LOG.error("Host: " + getIPAddressAsString() + " clearWaitState called while _waitTime == -1");
            return;
        }

        _waitTime = -1;

        // check to see if the active domain is in a time wait state  
        if (getHead() != null) {
            // clear the item's wait state ... 
            getHead().clearWaitState();
        } else {
            throw new RuntimeException("clearWaitState called while head was null!");
        }
    }

    /** tear down timers and release every queued list **/
    @Override
    public void purgeReferences() {
        if (_waitTime != -1) {
            killTimer();
        }

        while (_crawlLists.getHead() != null) {
            CrawlList headElement = _crawlLists.removeHead();
            headElement.clear();
        }
    }

    @Override
    public void setIdle(boolean isIdle) {
        _idle = isIdle;
    }

    public boolean isIdled() {
        return _idle;
    }

    @Override
    public void updateLastModifiedTime(long time) {
        _lastDispositionChangeTime = time;
        _zombie = false;
    }

    @Override
    public long getLastModifiedTime() {
        return _lastDispositionChangeTime;
    }

    public long getWaitTime() {
        return _waitTime;
    }

    void incSuccessfulGETCount() {
        ++_successfulGETCount;
    }

    void incHttp200Count() {
        ++_http200Count;
    }

    void incFailedGETCount() {
        ++_failedGETCount;
    }

    void incRobotsExcludedCount() {
        ++_robotsExcludedCount;
    }

    void inc403Count() {
        ++_403Count;
    }

    /** shared formatter - SimpleDateFormat is NOT thread safe, so all access is
     *  synchronized on the formatter instance **/
    private static final SimpleDateFormat _formatter = new SimpleDateFormat("yyyy.MM.dd 'at' hh:mm:ss z");

    /** format a millisecond timestamp for the details dump; -1 maps to the empty string **/
    private static final String dateStringFromTimeValue(long timeValue) {

        if (timeValue != -1) {
            Date theDate = new Date(timeValue);
            synchronized (_formatter) {
                return _formatter.format(theDate);
            }
        }
        return "";
    }

    /** cache a robots file by crc **/
    public void cacheRobotsFile(RobotRuleSet ruleSet, long robotsCRC) {
        RobotRuleSetCacheItem oldestItem = null;
        boolean found = false;
        for (RobotRuleSetCacheItem item : _robotsRuleSetCache) {
            if (item._crc == robotsCRC) {
                item._lastTouched = System.currentTimeMillis();
                found = true;
                // the eviction candidate is only consulted on a miss, so stop scanning
                break;
            }
            // track the least-recently-touched entry as the eviction candidate
            oldestItem = (oldestItem == null) ? item
                    : (oldestItem._lastTouched > item._lastTouched) ? item : oldestItem;
        }

        if (!found) {
            // evict the LRU entry once the cache is at (or somehow over) capacity
            if (_robotsRuleSetCache.size() >= MAX_ROBOTS_CACHE_ENTRIES && oldestItem != null) {
                _robotsRuleSetCache.removeElement(oldestItem);
            }
            RobotRuleSetCacheItem cacheItem = new RobotRuleSetCacheItem();

            cacheItem._crc = robotsCRC;
            cacheItem._lastTouched = System.currentTimeMillis();
            cacheItem._ruleSetObject = ruleSet;
            _robotsRuleSetCache.addHead(cacheItem);
        }
    }

    /** check for a cached robots entry via the given crc value **/
    public RobotRuleSet getCachedRobotsEntry(long crcValue) {
        for (RobotRuleSetCacheItem cacheItem : _robotsRuleSetCache) {
            if (cacheItem._crc == crcValue)
                return cacheItem._ruleSetObject;
        }
        return null;
    }

    /** dump a human-readable state snapshot of this host (and its lists) to the writer **/
    @Override
    public void dumpDetails(Writer out) throws IOException {

        StringBuffer sb = new StringBuffer();

        sb.append("Failed:" + isFailedServer() + "\n");
        sb.append("Idle:" + _idle + "\n");
        sb.append("Paused:" + _isPaused + "\n");
        sb.append("Zombie:" + _zombie + "\n");
        sb.append("LastFetchTime:" + dateStringFromTimeValue(_lastFetchStartTime) + "\n");
        sb.append("LastDispChangeTime:" + dateStringFromTimeValue(_lastDispositionChangeTime) + "\n");
        sb.append("WaitTime:" + dateStringFromTimeValue(_waitTime) + "\n");
        // NOTE(review): this reports the remaining wait interval, not getCrawlDelay()
        sb.append("CrawlDelay:" + ((_lastFetchStartTime != -1) ? (Math.max(0, _waitTime - _lastFetchStartTime)) : 0)
                + "\n");
        sb.append("UniqueDomainCount:" + _uniqueDomainCount + "\n");
        sb.append("SuccessfulGETs:" + _successfulGETCount + "\n");
        sb.append("HTTP-200-Count:" + _http200Count + "\n");
        sb.append("HTTP-403-Count:" + _403Count + "\n");
        sb.append("RobotsExcludedCount:" + _robotsExcludedCount + "\n");
        sb.append("FailedGETs:" + _failedGETCount + "\n");
        // fixed: the newline used to bind to the "NULL" branch only, so the line never
        // terminated when an active list was present
        sb.append("ActiveList:" + ((getHead() != null) ? getHead().getListName() : "NULL") + "\n");
        sb.append("ActiveList-LastFetchTime:" + ((getHead() != null) ? getHead().getLastRequestFetchTime() : 0)
                + "\n");
        sb.append("ActiveList-NextCrawlInterface:" + ((getHead() != null) ? getHead().getNextCrawlInterface() : 0)
                + "\n");
        sb.append("\n\n<b>ActiveDomain Details:</b>\n");
        if (getHead() != null) {
            getHead().dumpDetailsToHTML(sb);
        }
        if (_crawlLists.size() > 1) {
            sb.append("\n\n<B>Other Domains:</B>\n");
            for (CrawlList domain : _crawlLists) {
                if (domain != getHead()) {
                    domain.dumpDetailsToHTML(sb);
                    sb.append("\n");
                }
            }
        }
        out.write(sb.toString());
    }

    @Override
    public String getScheme() {
        return CrawlQueue.protocolToScheme(_queue.getProtocol());
    }

    @Override
    public void incrementCounter(CounterId counter, int amount) {
        switch (counter) {
        case RobotsExcludedCount:
            _robotsExcludedCount += amount;
            break;
        case SuccessfullGetCount:
            _successfulGETCount += amount;
            break;
        case Http200Count:
            _http200Count += amount;
            break;
        case Http403Count:
            _403Count += amount;
            break;
        case FailedDomainCount:
            _failedDomainCount += amount;
            break;
        case FailedGetCount:
            _failedGETCount += amount;
            break;
        case ConsecutiveIOErrorCount:
            _consecutiveIOErrors += amount;
            break;
        }
    }

    @Override
    public void resetCounter(CounterId counter) {
        switch (counter) {
        case RobotsExcludedCount:
            _robotsExcludedCount = 0;
            break;
        case SuccessfullGetCount:
            _successfulGETCount = 0;
            break;
        case Http200Count:
            _http200Count = 0;
            break;
        case Http403Count:
            _403Count = 0;
            break;
        case FailedDomainCount:
            _failedDomainCount = 0;
            break;
        case FailedGetCount:
            _failedGETCount = 0;
            break;
        case ConsecutiveIOErrorCount:
            _consecutiveIOErrors = 0;
            break;
        }
    }

    /** lazily refreshes the cached pause state from the server whenever the server's
     *  pause-state serial timestamp changes **/
    @Override
    public boolean isPaused() {
        // check to see if our pause state needs to be updated ...
        int currentServerPauseStateTimestamp = CrawlerServer.getServer().getPauseStateSerialTimestamp();
        if (currentServerPauseStateTimestamp != _pauseStateTimestamp) {
            // yes it does ... 
            // ask server for current state
            _isPaused = CrawlerServer.getServer().isHostPaused(this);
            // and update our timestamp ... 
            _pauseStateTimestamp = currentServerPauseStateTimestamp;
        }
        return _isPaused;
    }
}