Java tutorial
/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.crawlhistory; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.TreeSet; import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.record.Buffer; import org.commoncrawl.async.Timer; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.db.RecordStore; import org.commoncrawl.db.RecordStore.RecordStoreException; import org.commoncrawl.protocol.BulkItemHistoryQuery; import org.commoncrawl.protocol.BulkItemHistoryQueryResponse; import org.commoncrawl.protocol.BulkUpdateData; import org.commoncrawl.protocol.CrawlHistoryStatus; import org.commoncrawl.protocol.CrawlerHistoryService; import org.commoncrawl.protocol.SingleItemHistoryQueryResponse; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.protocol.CrawlHistoryStatus.CheckpointState; import org.commoncrawl.rpc.base.internal.AsyncClientChannel; import org.commoncrawl.rpc.base.internal.AsyncContext; import org.commoncrawl.rpc.base.internal.AsyncServerChannel; import org.commoncrawl.rpc.base.internal.NullMessage; import org.commoncrawl.rpc.base.internal.AsyncRequest.Status; import org.commoncrawl.rpc.base.shared.RPCException; import org.commoncrawl.server.CommonCrawlServer; import org.commoncrawl.service.crawler.CrawlSegmentLog; import org.commoncrawl.service.crawlhistory.HistoryServerState; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.ImmutableBuffer; import org.commoncrawl.util.URLFPBloomFilter; import org.commoncrawl.util.BitUtils.BitStream; import org.commoncrawl.util.time.Hour; /** * * * @author rana * */ public class CrawlHistoryServer extends CommonCrawlServer implements CrawlerHistoryService, AsyncServerChannel.ConnectionCallback { private int _numElements = -1; private int _numHashFunctions = -1; private int _bitsPerElement = -1; private int _crawlNumber = -1; private URLFPBloomFilter _bloomFilter = null; /** primary crawler database **/ RecordStore _recordStore = new RecordStore(); /** server state record key **/ String HistoryServerStateKey = "HistoryServerState"; /** server state object **/ HistoryServerState _state; private static final Log LOG = LogFactory.getLog(CrawlHistoryServer.class); @Override protected String getDefaultDataDir() { return CrawlEnvironment.DEFAULT_DATA_DIR; } @Override protected String getDefaultHttpInterface() { return CrawlEnvironment.DEFAULT_HTTP_INTERFACE; } @Override protected int getDefaultHttpPort() { return CrawlEnvironment.DEFAULT_CRAWLER_HISTORY_HTTP_PORT; } @Override protected String getDefaultLogFileName() { return "historyserver.log"; } @Override protected String getDefaultRPCInterface() { return CrawlEnvironment.DEFAULT_RPC_INTERFACE; } @Override protected int getDefaultRPCPort() { return CrawlEnvironment.DEFAULT_CRAWLER_HISTORY_RPC_PORT; } @Override protected String getWebAppName() { return CrawlEnvironment.CRAWLER_HISTORY_WEBAPP_NAME; } @Override protected boolean initServer() { File dataPath = getDataDirectory(); File dbPath = new File(dataPath, CrawlEnvironment.CRAWLER_HISTORY_DB); // now initialize the recorstore ... try { // initialize database _recordStore.initialize(dbPath, null); _state = new HistoryServerState(); _state.setCurrentCheckpointState(CheckpointState.ACTIVE); _state.setCurrentCrawlNumber(_crawlNumber); updateState(); // load bloom filter from disk if possible loadBloomFilter(); // create server channel ... AsyncServerChannel channel = new AsyncServerChannel(this, this.getEventLoop(), this.getServerAddress(), this); // register RPC services it supports ... registerService(channel, CrawlerHistoryService.spec); // start the checkpoint thread ... startCheckpointThread(CrawlEnvironment.getDefaultFileSystem()); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } return true; } /** do a clean shutdown (if possible) **/ @Override public void stop() { // ok, wait to grab the checkpoint thread semaphore LOG.info("Server Shutdown Detected. Waiting on checkpoint thread"); _shutdownFlag = true; _checkpointThreadSemaphore.acquireUninterruptibly(); LOG.info("Checkpoint thread semaphore acquired. Joining checkpoint thread ... "); if (_checkpointThread != null) { try { _checkpointThread.join(); } catch (Exception e) { LOG.error("Exception while waiting for Checkpoint Thread shutdown:" + CCStringUtils.stringifyException(e)); } } // ok safe to call super now ... super.stop(); } /** update and persist state data structure **/ private void updateState() throws RecordStoreException { _recordStore.beginTransaction(); _recordStore.updateRecordByKey(HistoryServerStateKey, _state); _recordStore.commitTransaction(); } @Override protected boolean parseArguements(String[] argv) { for (int i = 0; i < argv.length; ++i) { if (argv[i].equalsIgnoreCase("--numElements")) { if (i + 1 < argv.length) { _numElements = Integer.parseInt(argv[++i]); } } else if (argv[i].equalsIgnoreCase("--numHashFunctions")) { if (i + 1 < argv.length) { _numHashFunctions = Integer.parseInt(argv[++i]); } } else if (argv[i].equalsIgnoreCase("--numBitsPerElement")) { if (i + 1 < argv.length) { _bitsPerElement = Integer.parseInt(argv[++i]); } } else if (argv[i].equalsIgnoreCase("--crawlNumber")) { _crawlNumber = Integer.parseInt(argv[++i]); } } return (_numElements != -1 && _numHashFunctions != -1 && _bitsPerElement != -1 && _crawlNumber != -1); } @Override protected void printUsage() { } @Override protected boolean startDaemons() { return true; } @Override protected void stopDaemons() { } @Override public void checkpoint(AsyncContext<CrawlHistoryStatus, CrawlHistoryStatus> rpcContext) throws RPCException { LOG.info("Received Checkpoint Command with CrawlerNumber:" + rpcContext.getInput().getActiveCrawlNumber()); if (_state.getCurrentCrawlNumber() < rpcContext.getInput().getActiveCrawlNumber() || _bloomFilter == null) { rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.setErrorDesc("Incoming Version:" + rpcContext.getInput().getActiveCrawlNumber() + " Expected:" + _state.getCurrentCrawlNumber()); rpcContext.completeRequest(); } else { if (_state.getCurrentCheckpointState() == CrawlHistoryStatus.CheckpointState.ACTIVE && rpcContext .getInput().getCheckpointState() == CrawlHistoryStatus.CheckpointState.TRANSITIONING) { LOG.info("Moving to Transitioning State"); moveToTransitioningState(rpcContext); } else { LOG.info( "Current Crawl Number equals Checkpoint Command Crawl Number. Ignoring Checkpoint Command"); //NOOP ... just ignore the request ...and echo current crawl number ... rpcContext.getOutput().setActiveCrawlNumber(_state.getCurrentCrawlNumber()); rpcContext.getOutput().setCheckpointState(_state.getCurrentCheckpointState()); rpcContext.completeRequest(); } } } @Override public void bulkItemQuery(AsyncContext<BulkItemHistoryQuery, BulkItemHistoryQueryResponse> rpcContext) throws RPCException { LOG.info("Received BulkItemQueryRequest"); ImmutableBuffer inputBuffer = rpcContext.getInput().getFingerprintList(); if (inputBuffer.getCount() != 0) { try { if (_bloomFilter == null) { throw new IOException("BloomFilter Not Initilized. Invalid Server State!"); } DataInputStream inputStream = new DataInputStream( new ByteArrayInputStream(inputBuffer.getReadOnlyBytes(), 0, inputBuffer.getCount())); BitStream bitStreamOut = new BitStream(); URLFPV2 fingerprint = new URLFPV2(); int itemsPresent = 0; while (inputStream.available() != 0) { fingerprint.setDomainHash(WritableUtils.readVLong(inputStream)); fingerprint.setUrlHash(WritableUtils.readVLong(inputStream)); if (_bloomFilter.isPresent(fingerprint)) { bitStreamOut.addbit(1); ++itemsPresent; } else { bitStreamOut.addbit(0); } } LOG.info("Received BulkItemQueryRequest Completed with " + itemsPresent + " items found"); rpcContext.getOutput() .setResponseList(new Buffer(bitStreamOut.bits, 0, (bitStreamOut.nbits + 7) / 8)); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.setErrorDesc(CCStringUtils.stringifyException(e)); } rpcContext.completeRequest(); } } @Override public void singleItemQuery(AsyncContext<URLFPV2, SingleItemHistoryQueryResponse> rpcContext) throws RPCException { try { if (_bloomFilter == null) { throw new IOException("BloomFilter Not Initilized. Invalid Server State!"); } rpcContext.getOutput().setWasCrawled(_bloomFilter.isPresent(rpcContext.getInput())); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.setErrorDesc(CCStringUtils.stringifyException(e)); } rpcContext.completeRequest(); } @Override public void updateHistory(AsyncContext<URLFPV2, NullMessage> rpcContext) throws RPCException { try { if (_bloomFilter == null) { throw new IOException("BloomFilter Not Initilized. Invalid Server State!"); } _bloomFilter.add(rpcContext.getInput()); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.setErrorDesc(CCStringUtils.stringifyException(e)); } rpcContext.completeRequest(); } private final Path getDataFileBasePath() { return new Path(CrawlEnvironment.HDFS_HistoryServerBase, getHostName()); } private final Path getDataFileFinalPath() { return new Path(CrawlEnvironment.HDFS_HistoryServerBase, getHostName() + ".data"); } private final Path getDataFileCheckpointPath() { return new Path(CrawlEnvironment.HDFS_HistoryServerBase, getHostName() + ".checkpoint"); } private final Path getCheckpointMutexPath() { Hour hour = new Hour(new Date()); Path checkpointPath = new Path(CrawlEnvironment.HDFS_HistoryServerBase + CrawlEnvironment.HDFS_HistoryServerCheckpointMutex + "." + hour.getFirstMillisecond()); return checkpointPath; } private List<Path> reloadActiveHistory() throws IOException { ArrayList<Path> paths = new ArrayList<Path>(); FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); // create scan pattern Path hdfsScanPath = new Path( CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + _state.getCurrentCrawlNumber() + "/*/" + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName())); // scan hdfs for log files FileStatus candidates[]; LOG.info("Scanning For Cadnidates in:" + hdfsScanPath); candidates = fs.globStatus(hdfsScanPath); // iterate candidates for (FileStatus candidate : candidates) { // ok found a candidate we can work on LOG.info("Found Candidate:" + candidate.getPath()); final URLFPV2 placeHolderFP = new URLFPV2(); CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(), new CrawlSegmentLog.LogFileItemCallback() { @Override public void processItem(long domainHash, long urlFingerprint) { placeHolderFP.setDomainHash(domainHash); placeHolderFP.setUrlHash(urlFingerprint); // add item for bloom filter _bloomFilter.add(placeHolderFP); } }); LOG.info("Finished Processing Candidate:" + candidate.getPath()); paths.add(candidate.getPath()); } return paths; } private void reloadLaggingHistory(int previousCrawlNumber) throws IOException { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); // create scan pattern Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + previousCrawlNumber + "/*/" + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName())); // scan hdfs for log files FileStatus candidates[]; LOG.info("Scanning For Cadnidates in:" + hdfsScanPath); candidates = fs.globStatus(hdfsScanPath); // iterate candidates for (FileStatus candidate : candidates) { // ok found a candidate we can work on LOG.info("Found Candidate:" + candidate.getPath()); final URLFPV2 placeHolderFP = new URLFPV2(); CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(), new CrawlSegmentLog.LogFileItemCallback() { @Override public void processItem(long domainHash, long urlFingerprint) { placeHolderFP.setDomainHash(domainHash); placeHolderFP.setUrlHash(urlFingerprint); // add item for bloom filter _bloomFilter.add(placeHolderFP); } }); LOG.info("Finished Processing Candidate:" + candidate.getPath()); } } private void moveToTransitioningState(final AsyncContext<CrawlHistoryStatus, CrawlHistoryStatus> rpcContext) { // start a thread an wait for checkpoint thread to purge all log files ... new Thread(new Runnable() { @Override public void run() { // create scan pattern Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + _state.getCurrentCrawlNumber() + "/*/" + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName())); LOG.info("Scanning Log Directory at Path:" + hdfsScanPath + " for Log Files"); // scan hdfs for log files while (true) { try { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); if (fs.globStatus(hdfsScanPath).length != 0) { LOG.info( "Waiting for CheckpointThread to Purge All Existing Log Files for Crawl Number:" + _state.getCurrentCrawlNumber()); try { Thread.sleep(5000); } catch (InterruptedException e) { } } else { break; } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } LOG.info("Acquiring Checkpoint Thread Semaphore"); _checkpointThreadSemaphore.acquireUninterruptibly(); LOG.info("Acquired Checkpoint Thread Semaphore - Scheduling Async Callback"); // ok now we can safely reset state, shift back to async thread ... getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { try { LOG.info("Updating State to Transitioning"); // set server to appropriate state _state.setCurrentCheckpointState(CrawlHistoryStatus.CheckpointState.TRANSITIONING); LOG.info("Serializing Database State"); updateState(); rpcContext.getOutput().setActiveCrawlNumber(_state.getCurrentCrawlNumber()); rpcContext.getOutput().setCheckpointState(_state.getCurrentCheckpointState()); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.setErrorDesc(CCStringUtils.stringifyException(e)); } // complete the request ... try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } finally { _checkpointThreadSemaphore.release(); } } })); } }).start(); } Thread _resetThread = null; private void resetBloomFilter(final AsyncContext<CrawlHistoryStatus, CrawlHistoryStatus> rpcContext) { LOG.info("Got Reset BloomFilter RPC"); if (_resetThread != null) { rpcContext.setErrorDesc("Reset Already In Progress!"); rpcContext.setStatus(Status.Error_RequestFailed); try { rpcContext.completeRequest(); } catch (RPCException e) { } } else { // ok, first we need to stop the checkpoint thread ... // so, start a new thread and have it block on the checkpoint semaphore ... _resetThread = new Thread(new Runnable() { @Override public void run() { LOG.info("Waiting for Checkpoint Thread to IDLE"); // ok, now in the thread's context, safely block to acquire the checkpoint semaphore _checkpointThreadSemaphore.acquireUninterruptibly(); LOG.info("Checkpoint Thread to IDLE - Proceeding with BloomFilter Reset"); // ok now we can safely reset state, shift back to async thread ... getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() { @Override public void timerFired(Timer timer) { try { // ok now we are back in the async thread context ... try { LOG.info("Deleting Existing Checkpoint Files"); // safely delete checkpoint files ... FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); fs.delete(getDataFileCheckpointPath(), false); fs.delete(getDataFileFinalPath(), false); LOG.info("Reseting BloomFilter"); // safely reset bloom filter _bloomFilter = null; _bloomFilter = new URLFPBloomFilter(_numElements, _numHashFunctions, _bitsPerElement); // ok reload any lagging history ... LOG.info("Reloading Lagging History"); reloadLaggingHistory(rpcContext.getInput().getActiveCrawlNumber() - 1); // and write out bloom filter Path finalPath = getDataFileFinalPath(); LOG.info("Writing BloomFilter Data"); // serialize the filter ... serializeBloomFilter(finalPath); LOG.info("Update Disk State"); _state.setCurrentCrawlNumber(rpcContext.getInput().getActiveCrawlNumber()); _state.setCurrentCheckpointState(CrawlHistoryStatus.CheckpointState.ACTIVE); // write state to disk ... updateState(); LOG.info("Transition to new CrawlNumber:" + rpcContext.getInput().getActiveCrawlNumber() + " complete"); // update status rpcContext.getOutput().setActiveCrawlNumber(_state.getCurrentCrawlNumber()); rpcContext.getOutput().setCheckpointState(_state.getCurrentCheckpointState()); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.setErrorDesc(CCStringUtils.stringifyException(e)); } // complete the request ... try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } finally { // reset scan variables _lastCheckpointScanTime = -1; _lastCheckpointFlushTime = 1; _checkpointThreadSemaphore.release(); _resetThread = null; } } })); } }); _resetThread.start(); } } private void serializeBloomFilter(Path checkpointPath) throws IOException { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); // delete existing ... fs.delete(checkpointPath, false); FSDataOutputStream outputStream = fs.create(checkpointPath); try { DataOutputStream dataOut = new DataOutputStream(outputStream); dataOut.writeInt(0); // version dataOut.writeInt(_state.getCurrentCrawlNumber()); // crawl number ... // serialize bloom filter contents ... _bloomFilter.serialize(outputStream); } finally { if (outputStream != null) { outputStream.flush(); outputStream.close(); } } } private void deSerializeBloomFilter(Path checkpointPath) throws IOException { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); FSDataInputStream stream = fs.open(checkpointPath); try { stream.readInt(); // version stream.readInt(); // crawl number ... // serialize bloom filter contents ... _bloomFilter = URLFPBloomFilter.load(stream); } finally { stream.close(); } } private boolean validateOnDiskVersion() throws IOException { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); Path dataFilePath = getDataFileFinalPath(); LOG.info("Loading BloomFilter From Disk at Path:" + dataFilePath); if (fs.exists(dataFilePath)) { FSDataInputStream stream = null; try { stream = fs.open(dataFilePath); DataInputStream dataInput = new DataInputStream(stream); // skip version dataInput.readInt(); // read crawl version ... int serializedCrawlVersion = dataInput.readInt(); LOG.info("BloomFilter From On Disk has CrawlVersion:" + serializedCrawlVersion); if (serializedCrawlVersion < _state.getCurrentCrawlNumber()) { LOG.error("skipping load because serial crawl number is less than current crawl"); stream.close(); stream = null; fs.rename(dataFilePath, new Path(dataFilePath.getParent(), dataFilePath.getName() + "-V-" + serializedCrawlVersion)); return false; } return true; } finally { if (stream != null) stream.close(); } } return false; } private void loadBloomFilter() throws IOException { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); Path dataFilePath = getDataFileFinalPath(); LOG.info("Potentially Loading BloomFilter From Disk at Path:" + dataFilePath); if (!validateOnDiskVersion()) { LOG.info("On Disk Verison Not Valid. Allocating New BloomFilter at Path:" + dataFilePath); LOG.info("Allocating NEW BloomFilter"); _bloomFilter = new URLFPBloomFilter(_numElements, _numHashFunctions, _bitsPerElement); } else { LOG.info("Loading BloomFilter From Disk"); deSerializeBloomFilter(dataFilePath); } List<Path> paths = reloadActiveHistory(); if (paths.size() != 0) { LOG.info("Loaded Some History Via Log Files - Writing Back to Disk"); serializeBloomFilter(dataFilePath); for (Path historyFile : paths) { fs.delete(historyFile, false); } } } private Thread _checkpointThread = null; private Semaphore _checkpointThreadSemaphore = new Semaphore(1); private boolean _shutdownFlag = false; /** checkpoint paths **/ private TreeSet<Path> _processedPaths = new TreeSet<Path>(); /** last checkpoint time **/ private long _lastCheckpointScanTime = -1; private long _lastCheckpointFlushTime = -1; private static final int CHECKPOINT_MUTEX_ACQUISITON_DELAY = 60000 * 2; /** urls process since last checkpoint **/ private AtomicInteger _urlsProcessedSinceCheckpoint = new AtomicInteger(); /** checkpoint scan interval **/ private static final int CHECKPOINT_SCAN_INTERVAL = 60000; // every minute /** checkpoint flush interval **/ private static final int CHECKPOINT_FLUSH_INTERVAL = 15 * 60 * 1000; // 15 minutes private void startCheckpointThread(final FileSystem fs) { _checkpointThread = new Thread(new Runnable() { @Override public void run() { // ok check point thread run in perpetuty while (!_shutdownFlag) { if (_lastCheckpointScanTime == -1 || _lastCheckpointFlushTime == -1 || (System.currentTimeMillis() - _lastCheckpointScanTime) >= CHECKPOINT_SCAN_INTERVAL || (System.currentTimeMillis() - _lastCheckpointFlushTime) >= CHECKPOINT_FLUSH_INTERVAL) { //LOG.info("Checkpoint Thread Grabbing Semaphore"); // grab checkpoint thread semaphore _checkpointThreadSemaphore.acquireUninterruptibly(); //LOG.info("Checkpoint Thread Grabbed Semaphore"); try { // create scan pattern Path hdfsScanPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + _state.getCurrentCrawlNumber() + "/*/" + CrawlEnvironment.buildCrawlSegmentLogCheckpointWildcardString(getHostName())); // scan hdfs for log files FileStatus candidates[]; try { LOG.info("Checkpoint Thread Scanning For Cadnidates in:" + hdfsScanPath); candidates = fs.globStatus(hdfsScanPath); // iterate candidates for (FileStatus candidate : candidates) { // check candidate against processed path list ... if (!_processedPaths.contains(candidate.getPath())) { int urlCountBeforeProcessing = _urlsProcessedSinceCheckpoint.get(); // ok found a candidate we can work on LOG.info("Checkpoint Thread Found Candidate:" + candidate.getPath()); final URLFPV2 placeHolderFP = new URLFPV2(); CrawlSegmentLog.walkFingerprintsInLogFile(fs, candidate.getPath(), new CrawlSegmentLog.LogFileItemCallback() { @Override public void processItem(long domainHash, long urlFingerprint) { placeHolderFP.setDomainHash(domainHash); placeHolderFP.setUrlHash(urlFingerprint); // add item for bloom filter _bloomFilter.add(placeHolderFP); // inrement urls processed count ... _urlsProcessedSinceCheckpoint.addAndGet(1); } }); _processedPaths.add(candidate.getPath()); LOG.info("Finished Processing Candidate:" + candidate.getPath()); } } // update scan time ... _lastCheckpointScanTime = System.currentTimeMillis(); // see if can do a full checkpoint ... if (_lastCheckpointFlushTime == -1 || System.currentTimeMillis() - _lastCheckpointFlushTime >= CHECKPOINT_FLUSH_INTERVAL) { int approximateItemsToFlush = _urlsProcessedSinceCheckpoint.get(); // ok at this point we are read to initialize a checkpoint if (approximateItemsToFlush != 0) { Path checkpointMutexPath = getCheckpointMutexPath(); if (fs.createNewFile(checkpointMutexPath)) { try { LOG.info("Checkpoint Thread Starting Checkpoint"); // get the checkpoint path ... Path checkpointPath = getDataFileCheckpointPath(); Path finalPath = getDataFileFinalPath(); LOG.info("Checkpoint Thread Writing BloomFilter Data"); // serialize the filter ... serializeBloomFilter(checkpointPath); LOG.info("Checkpoint Thread Deleting Old Checkpoint Data"); // ok now everything seems to have gone fine ... delete existing data file fs.delete(finalPath, false); LOG.info("Checkpoint Thread ReWriting New Checkpoint Data"); // rename checkpoint to final ... fs.rename(checkpointPath, finalPath); if (_state .getCurrentCheckpointState() != CrawlHistoryStatus.CheckpointState.TRANSITIONING) { LOG.info("Checkpoint Thread Deleting Processed Files"); // ok safely delete all processed files for (Path processedFilePath : _processedPaths) { fs.delete(processedFilePath, false); } _processedPaths.clear(); } else { LOG.info( "Skipping Processed Files Purge because we are in Transitioning State"); } _urlsProcessedSinceCheckpoint.addAndGet(-approximateItemsToFlush); } finally { LOG.info( "Checkpoint Thread Releasing Mutex:" + checkpointMutexPath); fs.delete(checkpointMutexPath, false); } } else { int delay = (int) (Math.random() * CHECKPOINT_MUTEX_ACQUISITON_DELAY); LOG.info("Checkpoint thread failed to acquire Mutex:" + checkpointMutexPath + " Waiting " + delay + "(MS) before retry"); try { Thread.sleep(delay); } catch (InterruptedException e) { } } } // update last checkpoint no time no matter what ... _lastCheckpointFlushTime = System.currentTimeMillis(); } } catch (IOException e) { LOG.error("Checkpoint Thread Bloom Filter Checkpoint Failed with Exception:" + CCStringUtils.stringifyException(e)); try { Thread.sleep(60000); } catch (InterruptedException e1) { } } } finally { LOG.info("Checkpoint Thread Releasing Checkpoint Semaphore"); _checkpointThreadSemaphore.release(); } } else { try { //LOG.info("Checkpoint Thread IDLE"); Thread.sleep(100); } catch (InterruptedException e) { } } } } }); _checkpointThread.start(); } @Override public void IncomingClientConnected(AsyncClientChannel channel) { // TODO Auto-generated method stub } @Override public void IncomingClientDisconnected(AsyncClientChannel channel) { // TODO Auto-generated method stub } @Override public void queryStatus(AsyncContext<NullMessage, CrawlHistoryStatus> rpcContext) throws RPCException { rpcContext.getOutput().setActiveCrawlNumber(_state.getCurrentCrawlNumber()); rpcContext.getOutput().setCheckpointState(_state.getCurrentCheckpointState()); rpcContext.completeRequest(); } @Override public void sync(final AsyncContext<CrawlHistoryStatus, NullMessage> rpcContext) throws RPCException { LOG.info("Received Sync From Crawler"); // validate crawl number if (_state.getCurrentCrawlNumber() == rpcContext.getInput().getActiveCrawlNumber()) { // snapshot current time final long startTime = System.currentTimeMillis(); // ok reset resync variable on checkpoint thread _lastCheckpointScanTime = -1; // now set a timer to poll periodically for resync to complete getEventLoop().setTimer(new Timer(100, true, new Timer.Callback() { @Override public void timerFired(Timer timer) { // ok check to see if resync happened ... if (_lastCheckpointScanTime >= startTime) { getEventLoop().cancelTimer(timer); try { rpcContext.completeRequest(); } catch (RPCException e) { LOG.error(CCStringUtils.stringifyException(e)); } } } })); } else { LOG.error("Crawler CrawlNumber and HistoryServer CrawlNumber don't match! - Aborting Sync"); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext .setErrorDesc("Crawler CrawlNumber and HistoryServer CrawlNumber don't match! - Aborting Sync"); rpcContext.completeRequest(); } } @Override public void bulkUpdateHistory(AsyncContext<BulkUpdateData, NullMessage> rpcContext) throws RPCException { LOG.info("Received BulkUpdate Request"); ImmutableBuffer inputBuffer = rpcContext.getInput().getFingerprintList(); if (inputBuffer.getCount() != 0) { try { if (_bloomFilter == null) { throw new IOException("BloomFilter Not Initilized. Invalid Server State!"); } DataInputStream inputStream = new DataInputStream( new ByteArrayInputStream(inputBuffer.getReadOnlyBytes(), 0, inputBuffer.getCount())); URLFPV2 fingerprint = new URLFPV2(); int itemsAdded = 0; while (inputStream.available() != 0) { fingerprint.setDomainHash(WritableUtils.readVLong(inputStream)); fingerprint.setUrlHash(WritableUtils.readVLong(inputStream)); _bloomFilter.add(fingerprint); ++itemsAdded; } _urlsProcessedSinceCheckpoint.addAndGet(itemsAdded); LOG.info("Finished Processed BulkUpdate Request. " + itemsAdded + " items processed."); } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); rpcContext.setStatus(Status.Error_RequestFailed); rpcContext.setErrorDesc(CCStringUtils.stringifyException(e)); } rpcContext.completeRequest(); } } }