Java tutorial
/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.crawler; import java.io.IOException; import java.io.StringWriter; import java.io.Writer; import java.net.UnknownHostException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.commoncrawl.common.Environment; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.service.crawler.RobotRulesParser.RobotRuleSet; import org.commoncrawl.service.crawler.filters.IPAddressBlockFilter; import org.commoncrawl.service.crawler.filters.Filter.FilterResult; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.IPAddressUtils; import org.commoncrawl.util.IntrusiveList; import org.commoncrawl.util.HttpCookieUtils.CookieStore; import org.commoncrawl.util.IntrusiveList.IntrusiveListElement; /** * * @author rana * * The common CrawlHost implementation shared by CrawlList and CrawlQueue */ public final class CrawlHostImpl implements CrawlListHost, CrawlQueueHost { private static final int DEFAULT_CRAWL_DELAY = 2000; private static final int SUPER_HOST_THRESHOLD_1 = 25; private static final int SUPER_HOST_THRESHOLD_2 = 100; private static final int SUPERHOST_CRAWL_DELAY_1 = 3000; private static final int SUPERHOST_CRAWL_DELAY_2 = 2000; /** fail the host after 5 consecutive io errors **/ private static final int HOST_FAIL_ON_CONSECUTIVE_IO_ERRORS_THRESHOLD = 100; /** time to keep a host in failed state after an io error condition is detected **/ private static final int HOST_FAIL_RESET_TIMER = 10 * 60 * 1000; // 10 minutes .. /** logging **/ private static final Log LOG = LogFactory.getLog(CrawlListHost.class); private int _ipAddress; // host's ip address ... private IntrusiveList<CrawlList> _crawlLists = new IntrusiveList<CrawlList>(); private CrawlQueue _queue; private boolean _idle = false; private boolean _zombie = false; private boolean _isBlackListedHost = false; /** paused by a master crawl controller **/ private boolean _isPaused = false; /** pause state timestamp **/ private int _pauseStateTimestamp = -1; private long _blackListStatusUpdateTime = -1; private long _lastFetchStartTime = -1; private long _lastDispositionChangeTime = -1; private long _waitTime = -1; private int _failedDomainCount = 0; private int _uniqueDomainCount = 0; private int _successfulGETCount = 0; private int _http200Count = 0; private int _robotsExcludedCount = 0; private int _403Count = 0; private int _failedGETCount = 0; boolean _inFeedQueue = false; private short _consecutiveIOErrors = 0; private long _lastIOErrorTime = -1; private CookieStore _cookieStore = new CookieStore(); // private boolean _skipRobots = false; // private String _resolvedHostName = null; private static int MAX_ROBOTS_CACHE_ENTIRES = 20; private static class RobotRuleSetCacheItem extends IntrusiveListElement<RobotRuleSetCacheItem> { public long _crc; public RobotRuleSet _ruleSetObject; public long _lastTouched; } /** robots rule set cache **/ private IntrusiveList<RobotRuleSetCacheItem> _robotsRuleSetCache = new IntrusiveList<RobotRuleSetCacheItem>(); public CrawlHostImpl(CrawlQueue queue, int ipAddress) { _ipAddress = ipAddress; _queue = queue; } public CrawlerServer getServer() { return getQueue().getEngine().getServer(); } public CrawlList getActiveList() { return getHead(); } /** get access to the cookie store associated with this host * may be null * **/ public CookieStore getCookieStore() { return _cookieStore; } public boolean noActiveLists() { // if there are no lists present then, yes there are no active lists ... if (getHead() == null) { return true; } else { // otherwise ... special case, only one list present and it is the high priority list ... if (_crawlLists.size() == 1 && getHead().getListId() == CrawlerServer.getServer().getHighPriorityListId()) { // check to see if it's disposition is QueueEmpty return getHead().getDisposition() == CrawlList.Disposition.QueueEmpty; } // otherwise ... no, there are active lists present return false; } } String getActiveListName() { if (getHead() != null) return getHead().getListName(); return "null"; } String getActiveListDisposition() { if (getHead() != null) return getHead().getDisposition().toString(); return "null"; } @Override public String getIPAddressAsString() { String ipAddress = "UNKNOWN"; try { ipAddress = IPAddressUtils.IntegerToInetAddress(getIPAddress()).toString(); } catch (UnknownHostException e) { } return ipAddress; } public int getIPAddress() { return _ipAddress; } public CrawlQueue getQueue() { return _queue; } public int getQueuedURLCount() { int countOut = 0; for (CrawlList list : _crawlLists) { synchronized (list) { countOut += list.getPendingURLCount() + list.getOfflineURLCount(); } } return countOut; } public CrawlList getCrawlList(int listId) { CrawlList domainOut = null; for (CrawlList domain : _crawlLists) { if (domain.getListId() == listId) { domainOut = domain; break; } } if (domainOut == null) { domainOut = new CrawlList(this, listId); _crawlLists.addTail(domainOut); // increment unique domain count ... _uniqueDomainCount++; } return domainOut; } public boolean isTimerActive() { return _waitTime != -1; } public void setTimer(long expireTime) { _idle = false; _waitTime = expireTime; // check for null for unit test scenario if (getQueue() != null) { getQueue().setTimer(this, expireTime); } } public void killTimer() { if (_waitTime != -1) { _waitTime = -1; // check for null for unit test scenario if (getQueue() != null) { getQueue().killTimer(this); } } } @Override public void updateLastFetchStartTime(long newFetchStartTime) { _lastFetchStartTime = newFetchStartTime; } void incFailedDomainCount() { _failedDomainCount++; } /** increments the consecutive io errors counter for this host * used to track failed servers * **/ public void incConsecutiveIOErrorCount() { _consecutiveIOErrors++; _lastIOErrorTime = System.currentTimeMillis(); if (_consecutiveIOErrors >= HOST_FAIL_ON_CONSECUTIVE_IO_ERRORS_THRESHOLD) { LOG.error( "### HOST: Failed Host:" + getIPAddressAsString() + " due to too many consecutive IO Errors!"); } } /** reset IOError Counter ... whenever we succesfully retrieve a document from this host **/ public void resetIOErrorCount() { _consecutiveIOErrors = 0; } /** is this a blacklisted host **/ public boolean isBlackListedHost() { // check blacklisted host status ... if (_blackListStatusUpdateTime < getServer().getFilterUpdateTime()) { // if black list status out of date ... validate blacklist status IPAddressBlockFilter filter = getServer().getIPAddressFilter(); if (filter != null) { CrawlURLMetadata metadata = new CrawlURLMetadata(); metadata.setServerIP(getIPAddress()); _isBlackListedHost = filter.filterItem(null, null, null, metadata, null) == FilterResult.Filter_Reject; if (_isBlackListedHost) { if (Environment.detailLogEnabled()) LOG.info("### FILTER Host:" + getIPAddressAsString() + " has been blacklisted."); } } _blackListStatusUpdateTime = getServer().getFilterUpdateTime(); } return _isBlackListedHost; } /** check to see if we have marked this server as a failed host (too many consecutive io errors) **/ public boolean isFailedServer() { if (isBlackListedHost()) { return true; } else { if (_consecutiveIOErrors >= HOST_FAIL_ON_CONSECUTIVE_IO_ERRORS_THRESHOLD) { if (System.currentTimeMillis() - _lastIOErrorTime < HOST_FAIL_RESET_TIMER) { return true; } } return false; } } @Override public int getCrawlDelay() { /* if (_uniqueDomainCount >= SUPER_HOST_THRESHOLD_1 && _uniqueDomainCount < SUPER_HOST_THRESHOLD_2) return SUPERHOST_CRAWL_DELAY_1; else if (_uniqueDomainCount >= SUPER_HOST_THRESHOLD_2) return SUPERHOST_CRAWL_DELAY_2; else */ return DEFAULT_CRAWL_DELAY; } /* boolean skipRobots() { return _skipRobots ; } */ long getDomainTransitionWaitTime() { if (_lastFetchStartTime != -1) { return _lastFetchStartTime + getCrawlDelay(); } return -1; } public long getLastFetchStartTime() { return _lastFetchStartTime; } private final CrawlList getHead() { return _crawlLists.getHead(); } public void feedQueue() { if (getQueue() != null) { if (!_inFeedQueue) { if (getHead() != null && (getHead().getDisposition() == CrawlList.Disposition.WaitingOnCompletion || getHead().getDisposition() == CrawlList.Disposition.WaitingOnTime)) { LOG.warn("FeedQueue called on Host with Active List:" + getHead().toString()); return; } _inFeedQueue = true; boolean exitLoop = false; while (!exitLoop && getHead() != null) { CrawlList currentList = getHead(); setIdle(false); CrawlTarget target = null; if (currentList.getDisposition() != CrawlList.Disposition.QueueEmpty) { // get the next target for the current domain ... target = currentList.getNextTarget(); } if (target != null) { // log it for now ... if (Environment.detailLogEnabled()) LOG.info("Host: " + getIPAddressAsString() + " FeedQueue returned Target:" + target.getOriginalURL()); // and queue it up with the fetcher getQueue().fetchItem(target); // break out here ... exitLoop = true; } else { // figure out what to do next ... if (currentList.getDisposition() == CrawlList.Disposition.WaitingOnTime) { if (Environment.detailLogEnabled()) LOG.info("Feed Queue for List:" + currentList.getListName() + " Returned WaitingOnTime"); setTimer(currentList.calculateNextWaitTime()); exitLoop = true; } else if (currentList.getDisposition() == CrawlList.Disposition.QueueEmpty) { if (currentList.getListId() != CrawlerServer.getServer().getHighPriorityListId()) { if (Environment.detailLogEnabled()) LOG.info("Feed Detected List:" + currentList.getListName() + " is in QueueEmpty state - removing..."); // remove the domain from the list ... _crawlLists.removeElement(currentList); } else { // remove the list to the tail end of the queue ... _crawlLists.removeElement(currentList); _crawlLists.addTail(currentList); if (_crawlLists.getHead() == currentList) { if (Environment.detailLogEnabled()) LOG.info("FeedQueue Detected HighPriority List:" + currentList.getListName() + " is IDLE. Exiting."); // and exit loop exitLoop = true; } } // if there is a next domain ... if (!exitLoop && getHead() != null) { // setup a transition timer ... setTimer(getDomainTransitionWaitTime()); // and exit loop exitLoop = true; } } else { StringWriter writer = new StringWriter(); writer.write("Invalid Domain State Encountered for List:" + currentList.getListName() + " Disposition:" + currentList.getDisposition() + "\n"); try { dumpDetails(writer); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } LOG.fatal(writer.toString()); throw new RuntimeException(writer.toString()); } } } if (getHead() == null) { setIdle(true); } _inFeedQueue = false; } } } @Override public void listDispositionChanged(CrawlList list, CrawlList.Disposition oldDisposition, CrawlList.Disposition newDisposition) { boolean originallyIdle = isIdled(); updateLastModifiedTime(System.currentTimeMillis()); list.updateLastModifiedTime(System.currentTimeMillis()); // if this is the active domain ... if (getHead() == list) { // if we were previously waiting on time ... if (oldDisposition == CrawlList.Disposition.WaitingOnTime) { if (Environment.detailLogEnabled()) LOG.info("Host:" + getIPAddressAsString() + " List:" + list.getListName() + " timer FIRED"); if (isTimerActive()) { killTimer(); } } if (newDisposition == CrawlList.Disposition.WaitingOnTime) { if (Environment.detailLogEnabled()) LOG.info("Host:" + getIPAddressAsString() + " List:" + list.getListName() + " timer SET"); setTimer(list.calculateNextWaitTime()); } else if (newDisposition == CrawlList.Disposition.ItemAvailable || newDisposition == CrawlList.Disposition.QueueEmpty) { if (Environment.detailLogEnabled()) LOG.info("Host:" + getIPAddressAsString() + " List:" + list.getListName() + " triggered feedQueue"); feedQueue(); } } // otherwise if state change occured on a non-active domain ... else { throw new RuntimeException("List Disposition Change Happened in non Active List:" + list.getListName() + " CurrentList:" + getHead()); } // if we became idled as a result of this state transition ... if (isIdled() && !originallyIdle) { if (_queue != null) { _queue.idleHost(this); } } } public void clearWaitState() { if (_waitTime == -1) { LOG.error("Host: " + getIPAddressAsString() + " clearWaitState called while _waitTime == -1"); return; } _waitTime = -1; // check to see if the active domain is in a time wait state if (getHead() != null) { // clear the item's wait state ... getHead().clearWaitState(); } else { throw new RuntimeException("clearWaitState called while head was null!"); } } @Override public void purgeReferences() { if (_waitTime != -1) { killTimer(); } while (_crawlLists.getHead() != null) { CrawlList headElement = _crawlLists.removeHead(); headElement.clear(); } } @Override public void setIdle(boolean isIdle) { _idle = isIdle; } public boolean isIdled() { return _idle; } @Override public void updateLastModifiedTime(long time) { _lastDispositionChangeTime = time; _zombie = false; } @Override public long getLastModifiedTime() { return _lastDispositionChangeTime; } public long getWaitTime() { return _waitTime; } void incSuccessfulGETCount() { ++_successfulGETCount; } void incHttp200Count() { ++_http200Count; } void incFailedGETCount() { ++_failedGETCount; } void incRobotsExcludedCount() { ++_robotsExcludedCount; } void inc403Count() { ++_403Count; } private static SimpleDateFormat _formatter = new SimpleDateFormat("yyyy.MM.dd 'at' hh:mm:ss z"); private static final String dateStringFromTimeValue(long timeValue) { if (timeValue != -1) { Date theDate = new Date(timeValue); return _formatter.format(theDate); } return ""; } /** cache a robots file by crc **/ public void cacheRobotsFile(RobotRuleSet ruleSet, long robotsCRC) { RobotRuleSetCacheItem oldestItem = null; boolean found = false; for (RobotRuleSetCacheItem item : _robotsRuleSetCache) { if (item._crc == robotsCRC) { item._lastTouched = System.currentTimeMillis(); found = true; } oldestItem = (oldestItem == null) ? item : (oldestItem._lastTouched > item._lastTouched) ? item : oldestItem; } if (!found) { if (_robotsRuleSetCache.size() == MAX_ROBOTS_CACHE_ENTIRES) { _robotsRuleSetCache.removeElement(oldestItem); } RobotRuleSetCacheItem cacheItem = new RobotRuleSetCacheItem(); cacheItem._crc = robotsCRC; cacheItem._lastTouched = System.currentTimeMillis(); cacheItem._ruleSetObject = ruleSet; _robotsRuleSetCache.addHead(cacheItem); } } /** check for a cached robots entry via the given crc value **/ public RobotRuleSet getCachedRobotsEntry(long crcValue) { for (RobotRuleSetCacheItem cacheItem : _robotsRuleSetCache) { if (cacheItem._crc == crcValue) return cacheItem._ruleSetObject; } return null; } @Override public void dumpDetails(Writer out) throws IOException { StringBuffer sb = new StringBuffer(); sb.append("Failed:" + isFailedServer() + "\n"); sb.append("Idle:" + _idle + "\n"); sb.append("Paused:" + _isPaused + "\n"); sb.append("Zombie:" + _zombie + "\n"); sb.append("LastFetchTime:" + dateStringFromTimeValue(_lastFetchStartTime) + "\n"); sb.append("LastDispChangeTime:" + dateStringFromTimeValue(_lastDispositionChangeTime) + "\n"); sb.append("WaitTime:" + dateStringFromTimeValue(_waitTime) + "\n"); sb.append("CrawlDelay:" + ((_lastFetchStartTime != -1) ? (Math.max(0, _waitTime - _lastFetchStartTime)) : 0) + "\n"); sb.append("UniqueDomainCount:" + _uniqueDomainCount + "\n"); sb.append("SuccessfulGETs:" + _successfulGETCount + "\n"); sb.append("HTTP-200-Count:" + _http200Count + "\n"); sb.append("HTTP-403-Count:" + _403Count + "\n"); sb.append("RobotsExcludedCount:" + _robotsExcludedCount + "\n"); sb.append("FailedGETs:" + _failedGETCount + "\n"); sb.append("ActiveList:" + ((getHead() != null) ? getHead().getListName() : "NULL" + "\n")); sb.append("ActiveList-LastFetchTime:" + ((getHead() != null) ? getHead().getLastRequestFetchTime() : 0) + "\n"); sb.append("ActiveList-NextCrawlInterface:" + ((getHead() != null) ? getHead().getNextCrawlInterface() : 0) + "\n"); sb.append("\n\n<b>ActiveDomain Details:</b>\n"); if (getHead() != null) { getHead().dumpDetailsToHTML(sb); } if (_crawlLists.size() > 1) { sb.append("\n\n<B>Other Domains:</B>\n"); for (CrawlList domain : _crawlLists) { if (domain != getHead()) { domain.dumpDetailsToHTML(sb); sb.append("\n"); } } } out.write(sb.toString()); } @Override public String getScheme() { return CrawlQueue.protocolToScheme(_queue.getProtocol()); } @Override public void incrementCounter(CounterId counter, int amount) { switch (counter) { case RobotsExcludedCount: _robotsExcludedCount += amount; break; case SuccessfullGetCount: _successfulGETCount += amount; break; case Http200Count: _http200Count += amount; break; case Http403Count: _403Count += amount; break; case FailedDomainCount: _failedDomainCount += amount; break; case FailedGetCount: _failedGETCount += amount; break; case ConsecutiveIOErrorCount: _consecutiveIOErrors += amount; break; } } @Override public void resetCounter(CounterId counter) { switch (counter) { case RobotsExcludedCount: _robotsExcludedCount = 0; break; case SuccessfullGetCount: _successfulGETCount = 0; break; case Http200Count: _http200Count = 0; break; case Http403Count: _403Count = 0; break; case FailedDomainCount: _failedDomainCount = 0; break; case FailedGetCount: _failedGETCount = 0; break; case ConsecutiveIOErrorCount: _consecutiveIOErrors = 0; break; } } @Override public boolean isPaused() { // check to see if our pause state needs to be updated ... int currentServerPauseStateTimestamp = CrawlerServer.getServer().getPauseStateSerialTimestamp(); if (currentServerPauseStateTimestamp != _pauseStateTimestamp) { // yes it does ... // ask server for current state _isPaused = CrawlerServer.getServer().isHostPaused(this); // and update our timestamp ... _pauseStateTimestamp = currentServerPauseStateTimestamp; } return _isPaused; } }