Java tutorial
/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.listcrawler; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.DataInput; import java.io.DataOutput; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.io.RandomAccessFile; import java.nio.channels.FileLock; import java.nio.channels.OverlappingFileLockException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.WritableUtils; import org.apache.log4j.BasicConfigurator; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter; import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator; import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter; import org.commoncrawl.mapred.ProxyCrawlHistoryItem; import org.commoncrawl.protocol.CrawlURL; import org.commoncrawl.protocol.URLFP; import org.commoncrawl.rpc.base.shared.BinaryProtocol; import org.commoncrawl.service.crawler.util.URLFPBloomFilter; import org.commoncrawl.service.listcrawler.CrawlListDomainItem; import org.commoncrawl.service.listcrawler.CrawlListMetadata; import org.commoncrawl.service.listcrawler.CrawlHistoryManager.ItemUpdater; import org.commoncrawl.util.GoogleURL; import org.commoncrawl.util.URLFingerprint; import org.commoncrawl.util.URLUtils; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.CRC16; import org.commoncrawl.util.FileUtils; import org.junit.Assert; import com.google.gson.stream.JsonWriter; /** * A list of urls that need to be crawled * @author rana * */ public final class CrawlList implements ItemUpdater { // default refresh interval is 60 days ... public static final int DEFAULT_REFRESH_INTERVAL_IN_SECS = 86400 * 60; /** * events generated by the CrawlList * * @author rana * */ public static interface CrawlListEvents { public void itemUpdated(URLFP itemFingerprint); } public static final Log LOG = LogFactory.getLog(CrawlList.class); public static final int ValueFlag_HasRedirect = 1 << 0; File _listURLDataFile = null; File _fixedDataFile = null; File _variableDataFile = null; File _bloomFilterData = null; File _listMetadataFile = null; File _subDomainMetadataFile = null; URLFPBloomFilter _bloomFilter = null; long _listId; CrawlHistoryStorage _manager; CrawlListMetadata _metadata = new CrawlListMetadata(); CrawlListEvents _eventListener; byte[] _tempFixedDataBuffer = null; int _tempFixedDataBufferSize = 0; DataOutputBuffer _tempOutputBuffer = new DataOutputBuffer(OnDiskCrawlHistoryItem.ON_DISK_SIZE); TreeMap<Long, CrawlListMetadata> _transientSubDomainStats = new TreeMap<Long, CrawlListMetadata>(); DataOutputBuffer _offsetLookupTable = null; Exception _exception; public enum LoadState { UNINITIALIZED, QUEUED_FOR_LOADING, REALLY_LOADING, LOADED, ERROR } LoadState _listState = LoadState.UNINITIALIZED; public enum QueueState { WAITING, QUEUEING, QUEUED, ERROR } QueueState _queueState = QueueState.WAITING; /** * internal factory constructor */ private CrawlList(CrawlHistoryStorage manager, long listId, LoadState state) { _manager = manager; //establish file names initializeListFileNames(); _listId = listId; _listState = state; } /** * internal factory constructor */ private CrawlList(CrawlHistoryStorage manager, long listId, Exception e) { _manager = manager; //establish file names initializeListFileNames(); _listId = listId; _listState = LoadState.ERROR; _exception = e; } /** is list loaded * */ public boolean isListLoaded() { return _listState == LoadState.LOADED; } /** mark list as loading * * */ public void markListAsReallyLoading() { _listState = LoadState.REALLY_LOADING; } // get the list's load state public LoadState getLoadState() { return _listState; } // get the last caught exception (if list is in error state) public Exception getLastException() { return _exception; } /** get the list id * */ public long getListId() { return _listId; } /** set the event listener hook ** * * @param eventListener */ public synchronized void setEventListener(CrawlListEvents eventListener) { _eventListener = eventListener; } public synchronized CrawlListEvents getEventListener() { return _eventListener; } /** get metadata * */ public CrawlListMetadata getMetadata() { CrawlListMetadata metadataOut = null; synchronized (_metadata) { try { metadataOut = (CrawlListMetadata) _metadata.clone(); } catch (CloneNotSupportedException e) { } } return metadataOut; } /** * * @return the path to the url data file (source for the urls in this list) */ public File getListURLDataFile() { return _listURLDataFile; } /** * Initialize a CrawlList in an error state .. */ public static CrawlList createListWithLoadErrorState(CrawlHistoryStorage manager, long listId, Exception e) { return new CrawlList(manager, listId, e); } /** * Initialize a CrawlList in an laoding state .. */ public static CrawlList createListLoadingInLoadingState(CrawlHistoryStorage manager, long listId, File dataFile, int refreshInterval) { CrawlList listOut = new CrawlList(manager, listId, LoadState.QUEUED_FOR_LOADING); listOut.getMetadata().setRefreshInterval(refreshInterval); listOut._listURLDataFile = dataFile; return listOut; } /** * Load a CrawlList from previously stored disk state * * @param manager - reference to the crawl list history manager * @param listId - the list id (the timestamp) for the given list to load from disk state */ public CrawlList(CrawlHistoryStorage storage, long listId) throws IOException { _listId = listId; _manager = storage; //establish file names initializeListFileNames(); LOG.info("Initilaizing pre-existing List with Id:" + listId); LOG.info("Loading BloomFilterData for List:" + listId); FileInputStream bloomFilterData = new FileInputStream(_bloomFilterData); try { // load bloom filter _bloomFilter = URLFPBloomFilter.load(bloomFilterData); } finally { bloomFilterData.close(); } // load list metadata from disk loadMetadataFromDisk(); // reset queued counts ... _metadata.setQueuedItemCount(0); // write it back writeMetadataToDisk(); // load sub domain metadata from disk ... loadSubDomainMetadataFromDisk(); // reset queued count ... resetSubDomainCounts(); _listState = LoadState.LOADED; } /** * Initialize a new CrawlList object from a given input stream of urls * * @param manager - reference to the crawl history log manager * @param urlInputStream - the input stream containing the list of urls that we should add to this list ... * @throws IOException */ public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval) throws IOException { _manager = manager; _listState = LoadState.REALLY_LOADING; // initialize a new list id _listId = listId; LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath()); //establish file names initializeListFileNames(); sourceURLFile.renameTo(_listURLDataFile); FileInputStream urlInputStream = new FileInputStream(_listURLDataFile); try { // set we will use to hold all fingerprints generated TreeSet<URLFP> urlSet = new TreeSet<URLFP>(); // create temp files ... File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId)); // create mergesortspillwriter SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>( FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(), new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null, false); try { MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>( CrawlEnvironment.getHadoopConfig(), spillwriter, FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), new Path(manager.getLocalDataDir().getAbsolutePath()), null, new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() { DataInputBuffer _key1Buffer = new DataInputBuffer(); DataInputBuffer _key2Buffer = new DataInputBuffer(); @Override public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length, byte[] value2Data, int value2Offset, int value2Length) throws IOException { _key1Buffer.reset(key1Data, key1Offset, key1Length); _key2Buffer.reset(key2Data, key2Offset, key2Length); _key1Buffer.skip(2); // skip verison, and 1 byte id _key2Buffer.skip(2); // skip verison, and 1 byte id int domainHash1 = WritableUtils.readVInt(_key1Buffer); int domainHash2 = WritableUtils.readVInt(_key2Buffer); _key1Buffer.skip(1); // skip 1 byte id _key2Buffer.skip(1); // skip 1 byte id long fingerprint1 = WritableUtils.readVLong(_key1Buffer); long fingerprint2 = WritableUtils.readVLong(_key2Buffer); int result = ((Integer) domainHash1).compareTo(domainHash2); if (result == 0) { result = ((Long) fingerprint1).compareTo(fingerprint2); } return result; } @Override public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2, ProxyCrawlHistoryItem value2) { return key1.compareTo(key2); } }, URLFP.class, ProxyCrawlHistoryItem.class, false, null); try { LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List"); BufferedReader reader = new BufferedReader( new InputStreamReader(urlInputStream, Charset.forName("UTF-8"))); String line = null; int lineNumber = 0; ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem(); while ((line = reader.readLine()) != null) { ++lineNumber; if (line.length() != 0 && !line.startsWith("#")) { URLFP fingerprint = URLUtils.getURLFPFromURL(line, true); if (fingerprint != null) { if (!urlSet.contains(fingerprint)) { // and add fingerprint to set urlSet.add(fingerprint); // initialize item item.clear(); item.setOriginalURL(line); // and spill to merger / sorter .. merger.spillRecord(fingerprint, item); } } else { LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:" + lineNumber + " URL" + line); } } } LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS"); } finally { merger.close(); } } finally { if (spillwriter != null) spillwriter.close(); } LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys"); // generate bloom filter ... _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10); for (URLFP fingerprint : urlSet) { _bloomFilter.add(fingerprint); } LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter"); // serialize it FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData); try { _bloomFilter.serialize(bloomFilterStream); } finally { bloomFilterStream.flush(); bloomFilterStream.close(); } LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile); // now initialize value map and string maps based on output sequence file ... SequenceFile.Reader reader = new SequenceFile.Reader( FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig()); LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)); // OK, Allocate room for fixed data file upfront DataOutputBuffer valueStream = new DataOutputBuffer( urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE); LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED"); try { //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile)); RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw"); try { URLFP urlFP = new URLFP(); ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem(); // read fingerprints ... while (reader.next(urlFP, item)) { // write out fixed data structure and strings writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream); } } finally { //valueStream.flush(); //valueStream.close(); stringsStream.close(); } } finally { reader.close(); } LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk"); LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength() + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)); if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) { throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength() + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)); } // initialize temp data buffer variables _tempFixedDataBuffer = valueStream.getData(); _tempFixedDataBufferSize = valueStream.getLength(); // update metadata _metadata.setRefreshInterval(refreshInterval); _metadata.setUrlCount(urlSet.size()); // setup version _metadata.setVersion(1); // and write to disk writeMetadataToDisk(); // mark state as loaded ... _listState = LoadState.LOADED; LOG.info("*** LIST:" + getListId() + " SYNCING"); // reconcile with history log _manager.syncList(this.getListId(), urlSet, this); LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE"); // write metdata to disk again writeMetadataToDisk(); LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA"); // and finally flush fixed data to disk FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile); try { synchronized (this) { int blockSize = 1 << 20; long bytesCopied = 0; for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) { int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset); finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy); bytesCopied += bytesToCopy; } // validate bytes copied if (bytesCopied != _tempFixedDataBufferSize) { throw new IOException("Buffer Size:" + _tempFixedDataBufferSize + " Does not Match BytesCopied:" + bytesCopied); } // ok release the buffer _tempFixedDataBuffer = null; _tempFixedDataBufferSize = 0; LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE"); } } finally { finalDataStream.flush(); finalDataStream.close(); } // load sub domain metadata from disk ... loadSubDomainMetadataFromDisk(); } catch (IOException e) { LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:" + CCStringUtils.stringifyException(e)); _fixedDataFile.delete(); _variableDataFile.delete(); _bloomFilterData.delete(); _listState = LoadState.ERROR; throw e; } finally { urlInputStream.close(); } } /** * update list state of a recently crawled item * * @param fingerprint - the fingerprint of the updated item * @param newData - the updated crawl history data for the given item * @throws IOException */ @Override public void updateItemState(URLFP fingerprint, ProxyCrawlHistoryItem newData) throws IOException { if (_listState == LoadState.LOADED) { // check for membership ... if (_bloomFilter.isPresent(fingerprint)) { //LOG.info("UpdateItemState Called for URL:" + newData.getOriginalURL() + " List:" + getListId()); //LOG.info("UpdateItemState Loading OnDisk Item for URL:" + newData.getOriginalURL() + " List:" + getListId()); // extract existing item from disk OnDiskCrawlHistoryItem originalItem = loadOnDiskItemForURLFP(fingerprint); //if present (null if false cache hit) if (originalItem != null) { // build an on disk item data structure for any potential changes ... OnDiskCrawlHistoryItem newItem = onDiskItemFromHistoryItem(fingerprint, newData); // set inital offset information newItem._fileOffset = originalItem._fileOffset; newItem._stringsOffset = originalItem._stringsOffset; // LOG.info("UpdateItemState Comparing OnDisk Item to New Item for URL:" + newData.getOriginalURL() + " List:" + getListId()); // compare the two items ... if (!newItem.equals(originalItem)) { //LOG.info("UpdateItemState Items Don't Match for URL:" + newData.getOriginalURL() + " List:" + getListId()); // ok items do not match ... figure out if strings are different ... if (newItem._stringsCRC != originalItem._stringsCRC) { RandomAccessFile stringsFile = new RandomAccessFile(_variableDataFile, "rw"); try { // seek to end stringsFile.seek(stringsFile.length()); // update offset info newItem._stringsOffset = stringsFile.length(); // write out string data length WritableUtils.writeVInt(stringsFile, _stringBuffer1.getLength()); // write strings to log file stringsFile.write(_stringBuffer1.getData(), 0, _stringBuffer1.getLength()); } finally { stringsFile.close(); } } // otherwise take the offset from old item else { newItem._stringsOffset = originalItem._stringsOffset; } //LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + newItem._urlFingerprint); // ok, different paths depending on wether this is an in memory update or not ... boolean wroteToMemory = false; synchronized (this) { if (_tempFixedDataBuffer != null) { wroteToMemory = true; // reset output buffer _tempOutputBuffer.reset(); // serizlie to output buffer newItem.serialize(_tempOutputBuffer); // copy to appropriate location System.arraycopy(_tempOutputBuffer.getData(), 0, _tempFixedDataBuffer, (int) originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE); } } if (!wroteToMemory) { // write to disk RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw"); try { while (true) { try { //LOG.info("*** TRYING UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset); FileLock lock = file.getChannel().tryLock(originalItem._fileOffset, OnDiskCrawlHistoryItem.ON_DISK_SIZE, false); try { //LOG.info("*** GOT UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset); file.seek(originalItem._fileOffset); newItem.serialize(file); //LOG.info("Updated Data File for OnDiskItem for Fingerprint:" + originalItem._urlFingerprint); break; } finally { //LOG.info("*** RELEASED UPDATE LOCK FOR OFFSET:" + originalItem._fileOffset); lock.release(); } } catch (OverlappingFileLockException e) { LOG.error("###LockConflict(RETRY):" + CCStringUtils.stringifyException(e)); } } } finally { file.close(); } } // ok now update metadata ... synchronized (_metadata) { int updateFlags = calculateUpdateFlags(originalItem, newItem); if (updateFlags != 0) { int metadataDirtyFlags = updateMetadata(newItem, _metadata, 0); // only write metadata to disk if temp data buffer is null if (metadataDirtyFlags != 0 && !wroteToMemory) { if ((metadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) { _metadata.setQueuedItemCount(_metadata.getQueuedItemCount() - 1); } writeMetadataToDisk(); } // if not writing to memory then update subdomain metadata if (!wroteToMemory) { synchronized (_subDomainMetadataFile) { CrawlListMetadata subDomainMetadata = getSubDomainMetadataByURL( newData.getOriginalURL()); int subDomainMetadataDirtyFlags = updateMetadata(newItem, subDomainMetadata, processFileOffsets); if (subDomainMetadataDirtyFlags != 0 && !wroteToMemory) { if ((subDomainMetadataDirtyFlags & MetadataUpdateFlag_ModifiedCrawlStatus) != 0) { subDomainMetadata.setQueuedItemCount( subDomainMetadata.getQueuedItemCount() - 1); } writeSubDomainMetadataToDisk(subDomainMetadata); } } } } } synchronized (this) { if (_eventListener != null) { _eventListener.itemUpdated(fingerprint); } } } } } } } private static final int processOrignalStatus = 1 << 0; private static final int processOriginalResult = 1 << 1; private static final int processRedirectStatus = 1 << 2; private static final int processRedirectResult = 1 << 3; private static final int processFileOffsets = 1 << 4; private static final int processAllItems = Integer.MAX_VALUE; private static int calculateUpdateFlags(OnDiskCrawlHistoryItem originalItem, OnDiskCrawlHistoryItem newItem) { int updateFlags = 0; if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) { updateFlags |= processOrignalStatus; } if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE)) { updateFlags |= processOriginalResult; } if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) { updateFlags |= processRedirectStatus; } if (newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE) && !originalItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE)) { updateFlags |= processRedirectResult; } return updateFlags; } private static final int MetadataUpdateFlag_ModifiedCrawlStatus = 1 << 0; private static final int MetadataUpdateFlag_ModifiedRedirectStatus = 1 << 1; private static final int MetadataUpdateFlag_ModifiedOffsets = 1 << 1; private static int updateMetadata(OnDiskCrawlHistoryItem newItem, CrawlListMetadata metadata, int updateFlags) { int metadataDirtyFlags = 0; if (!newItem.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) { //if ((updateFlags & processOrignalStatus) != 0) { // LOG.info("### Updating OriginalCrawlStatus for Item:" + newData.getOriginalURL()); // status changed ... if (newItem._crawlStatus != 0) { switch (newItem._crawlStatus) { case CrawlURL.FailureReason.RobotsExcluded: metadata.setRobotsExcludedCount(metadata.getRobotsExcludedCount() + 1); break; case CrawlURL.FailureReason.Timeout: metadata.setTimeoutErrorCount(metadata.getTimeoutErrorCount() + 1); break; case CrawlURL.FailureReason.IOException: metadata.setIOExceptionCount(metadata.getIOExceptionCount() + 1); break; case CrawlURL.FailureReason.DNSFailure: metadata.setDNSErrorCount(metadata.getDNSErrorCount() + 1); break; default: metadata.setOtherErrorCount(metadata.getOtherErrorCount() + 1); } metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus; } //} //if ((updateFlags & processOriginalResult) != 0) { // LOG.info("### Updating OriginalResultCode for Item:" + newData.getOriginalURL()); if (newItem._crawlStatus == 0) { if (newItem._httpResultCode == 200) metadata.setHttp200Count(metadata.getHttp200Count() + 1); else if (newItem._httpResultCode == 301) metadata.setHttp301Count(metadata.getHttp301Count() + 1); else if (newItem._httpResultCode == 403) metadata.setHttp403Count(metadata.getHttp403Count() + 1); else if (newItem._httpResultCode == 404) metadata.setHttp404Count(metadata.getHttp404Count() + 1); else if (newItem._httpResultCode >= 500 && newItem._httpResultCode < 600) metadata.setHttp500Count(metadata.getHttp500Count() + 1); else if (newItem._httpResultCode >= 600) metadata.setHttpOtherCount(metadata.getHttpOtherCount() + 1); metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus; } //} } else { //if ((updateFlags & processRedirectStatus) != 0) { // status changed ... if (newItem._redirectStatus != 0) { switch (newItem._redirectStatus) { case CrawlURL.FailureReason.RobotsExcluded: metadata.setRobotsExcludedCount(metadata.getRobotsExcludedCount() + 1); break; case CrawlURL.FailureReason.Timeout: metadata.setTimeoutErrorCount(metadata.getTimeoutErrorCount() + 1); break; case CrawlURL.FailureReason.IOException: metadata.setIOExceptionCount(metadata.getIOExceptionCount() + 1); break; case CrawlURL.FailureReason.DNSFailure: metadata.setDNSErrorCount(metadata.getDNSErrorCount() + 1); break; default: metadata.setOtherErrorCount(metadata.getOtherErrorCount() + 1); } metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus; } //} //if ((updateFlags & processRedirectResult) != 0) { if (newItem._redirectStatus == 0) { if (newItem._redirectHttpResult == 200) metadata.setHttp200Count(metadata.getHttp200Count() + 1); else if (newItem._redirectHttpResult == 301) metadata.setHttp301Count(metadata.getHttp301Count() + 1); else if (newItem._redirectHttpResult == 403) metadata.setHttp403Count(metadata.getHttp403Count() + 1); else if (newItem._redirectHttpResult == 404) metadata.setHttp404Count(metadata.getHttp404Count() + 1); else if (newItem._redirectHttpResult >= 500 && newItem._redirectHttpResult < 600) metadata.setHttp500Count(metadata.getRedirectHttp500Count() + 1); else if (newItem._redirectHttpResult >= 600) metadata.setRedirectHttpOtherCount(metadata.getHttpOtherCount() + 1); metadataDirtyFlags |= MetadataUpdateFlag_ModifiedCrawlStatus; } //} } if ((updateFlags & processFileOffsets) != 0) { if (!metadata.isFieldDirty(CrawlListMetadata.Field_FIRSTRECORDOFFSET) || metadata.getFirstRecordOffset() > newItem._fileOffset) { metadata.setFirstRecordOffset(newItem._fileOffset); metadataDirtyFlags = MetadataUpdateFlag_ModifiedOffsets; } if (!metadata.isFieldDirty(CrawlListMetadata.Field_LASTRECORDOFFSET) || metadata.getLastRecordOffset() < newItem._fileOffset) { metadata.setLastRecordOffset(newItem._fileOffset); metadataDirtyFlags = MetadataUpdateFlag_ModifiedOffsets; } } return metadataDirtyFlags; } /** * * @return the queued (all urls queued for crawling or not) state of this list */ public QueueState getQueuedState() { return _queueState; } private int lastDomainHash = -1; private String lastRootDomainName = null; private CrawlListMetadata lastRootDomainMetadata = null; private int domainQueuedCount = 0; private void updateSubDomainMetadataForItemDuringLoad(OnDiskCrawlHistoryItem item, String itemURL, URLFP itemFP, boolean isQueued) throws IOException { // ok unfortunately, we need to update stats for the subdomain here if (item._domainHash != lastDomainHash) { // update last domain hash ... lastDomainHash = item._domainHash; // extract root domain name GoogleURL urlObject = new GoogleURL(itemURL); String rootDomainName = URLUtils.extractRootDomainName(urlObject.getHost()); // if root domain name different than last root domain name ... if (rootDomainName != lastRootDomainName) { // flush last entry flushCachedSubDomainMetadata(); // load new entry if (rootDomainName != null) { lastRootDomainName = rootDomainName; lastRootDomainMetadata = new CrawlListMetadata(); } } if (lastRootDomainMetadata != null) { if (isQueued) { lastRootDomainMetadata.setQueuedItemCount(lastRootDomainMetadata.getQueuedItemCount() + 1); } else { updateMetadata(item, lastRootDomainMetadata, 0); } } if (lastRootDomainName != null) { updateSubDomainQueueStatus(lastRootDomainName, domainQueuedCount); } } } private void flushCachedSubDomainMetadata() throws IOException { if (lastRootDomainMetadata != null) { // ok get the latest version of the metadata from disk synchronized (_subDomainMetadataFile) { // get from disk CrawlListMetadata metadataOnDisk = getSubDomainMetadataByRootDomain(lastRootDomainName); // update on disk version ... metadataOnDisk.setHttp200Count( metadataOnDisk.getHttp200Count() + lastRootDomainMetadata.getHttp200Count()); metadataOnDisk.setHttp301Count( metadataOnDisk.getHttp301Count() + lastRootDomainMetadata.getHttp301Count()); metadataOnDisk.setHttp403Count( metadataOnDisk.getHttp403Count() + lastRootDomainMetadata.getHttp403Count()); metadataOnDisk.setHttp404Count( metadataOnDisk.getHttp404Count() + lastRootDomainMetadata.getHttp404Count()); metadataOnDisk.setHttp500Count( metadataOnDisk.getHttp500Count() + lastRootDomainMetadata.getHttp500Count()); metadataOnDisk.setHttpOtherCount( metadataOnDisk.getHttpOtherCount() + lastRootDomainMetadata.getHttpOtherCount()); metadataOnDisk.setRobotsExcludedCount( metadataOnDisk.getRobotsExcludedCount() + lastRootDomainMetadata.getRobotsExcludedCount()); metadataOnDisk.setTimeoutErrorCount( metadataOnDisk.getTimeoutErrorCount() + lastRootDomainMetadata.getTimeoutErrorCount()); metadataOnDisk.setIOExceptionCount( metadataOnDisk.getIOExceptionCount() + lastRootDomainMetadata.getIOExceptionCount()); metadataOnDisk.setDNSErrorCount( metadataOnDisk.getDNSErrorCount() + lastRootDomainMetadata.getDNSErrorCount()); metadataOnDisk.setOtherErrorCount( metadataOnDisk.getOtherErrorCount() + lastRootDomainMetadata.getOtherErrorCount()); metadataOnDisk.setQueuedItemCount( metadataOnDisk.getQueuedItemCount() + lastRootDomainMetadata.getQueuedItemCount()); // ok write it back to disk writeSubDomainMetadataToDisk(metadataOnDisk); } lastRootDomainMetadata = null; lastRootDomainName = null; lastDomainHash = -1; } } /** queue uncrawled urls via the CrawlQueueLoader * * @param loader */ public void queueUnCrawledItems(CrawlQueueLoader loader) throws IOException { _queueState = QueueState.QUEUEING; int metadataVersion = getMetadata().getVersion(); synchronized (_metadata) { // reset metadata PERIOD int urlCount = _metadata.getUrlCount(); _metadata.clear(); _metadata.setUrlCount(urlCount); } RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw"); RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw"); try { OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem(); URLFP fingerprint = new URLFP(); while (fixedDataReader.getFilePointer() != fixedDataReader.length()) { long position = fixedDataReader.getFilePointer(); //LOG.info("*** TRYING READ LOCK FOR OFFSET:" + position); while (true) { // get read lock on position ... try { FileLock lock = fixedDataReader.getChannel().tryLock(position, OnDiskCrawlHistoryItem.ON_DISK_SIZE, false); try { //LOG.info("*** GOT READ LOCK FOR OFFSET:" + position); item.deserialize(fixedDataReader); break; } finally { lock.release(); //LOG.info("*** RELEASED READ LOCK FOR OFFSET:" + position); } } catch (OverlappingFileLockException e) { LOG.error("*** LOCK CONTENTION AT:" + position + " Exception:" + CCStringUtils.stringifyException(e)); } } // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // and read primary string String url = stringDataReader.readUTF(); // setup fingerprint fingerprint.setDomainHash(item._domainHash); fingerprint.setUrlHash(item._urlFingerprint); // first, if it has not been crawled ever, crawl it not matter what ... boolean crawlItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS); // if it has been crawled ... check list metadata version ... if (!crawlItem && metadataVersion >= 1) { // ok this is newer version of the list ... // check refresh time if specified ... int refreshIntervalInSeconds = DEFAULT_REFRESH_INTERVAL_IN_SECS; if (getMetadata().getRefreshInterval() != 0) { refreshIntervalInSeconds = getMetadata().getRefreshInterval(); } if (item._updateTimestamp > 0) { long timeSinceLastCrawl = item._updateTimestamp; if (System.currentTimeMillis() - timeSinceLastCrawl >= (refreshIntervalInSeconds * 1000)) { crawlItem = true; } } } if (crawlItem) { loader.queueURL(fingerprint, url); synchronized (_metadata) { // update queued item count _metadata.setQueuedItemCount(_metadata.getQueuedItemCount() + 1); } } else { updateMetadata(item, _metadata, 0); } // ok update subdomain stats updateSubDomainMetadataForItemDuringLoad(item, url, fingerprint, crawlItem); } flushCachedSubDomainMetadata(); loader.flush(); _queueState = QueueState.QUEUED; } catch (IOException e) { LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e)); _queueState = QueueState.ERROR; } finally { fixedDataReader.close(); stringDataReader.close(); } } /** resubmit failed items * * @param loader */ public void requeueFailedItems(CrawlQueueLoader loader) throws IOException { synchronized (this) { _queueState = QueueState.QUEUEING; } RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw"); RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw"); try { OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem(); URLFP fingerprint = new URLFP(); while (fixedDataReader.getFilePointer() != fixedDataReader.length()) { item.deserialize(fixedDataReader); boolean queueItem = false; if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) { if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) { queueItem = (item._redirectStatus != 0); if (!queueItem) { if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) { queueItem = true; } } } else { queueItem = (item._crawlStatus != 0); if (!queueItem) { if (item._httpResultCode != 200 && item._httpResultCode != 404) { queueItem = true; } } } if (queueItem) { // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // and read primary string String url = stringDataReader.readUTF(); // and spill fingerprint.setDomainHash(item._domainHash); fingerprint.setUrlHash(item._urlFingerprint); loader.queueURL(fingerprint, url); } } } } catch (IOException e) { LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e)); _queueState = QueueState.QUEUED; } finally { fixedDataReader.close(); stringDataReader.close(); } } /** * * @param localLogFileDir * @param listId * @return */ public static boolean allFilesPresent(File localLogFileDir, long listId) { //establish file names File urlDataFile = new File(localLogFileDir, LIST_URL_DATA_PREFIX + Long.toString(listId)); File fixedDataFile = new File(localLogFileDir, LIST_VALUE_MAP_PREFIX + Long.toString(listId)); File variableDataFile = new File(localLogFileDir, LIST_STRING_MAP_PREFIX + Long.toString(listId)); File bloomFilterFile = new File(localLogFileDir, LIST_BLOOM_DATA_PREFIX + Long.toString(listId)); if (urlDataFile.exists() && fixedDataFile.exists() && variableDataFile.exists() && bloomFilterFile.exists()) { return true; } return false; } public static final String LIST_URL_DATA_PREFIX = "listURLS-"; public static final String LIST_VALUE_MAP_PREFIX = "listValueMap-"; public static final String LIST_STRING_MAP_PREFIX = "listStringMap-"; public static final String LIST_BLOOM_DATA_PREFIX = "listBloomFilter-"; public static final String LIST_METADATA_PREFIX = "listMetadata-"; public static final String LIST_SUBDOMAIN_METADATA_PREFIX = "listSubDomainMetadata-"; private void initializeListFileNames() { //establish file names _listURLDataFile = new File(_manager.getLocalDataDir(), LIST_URL_DATA_PREFIX + Long.toString(_listId)); _fixedDataFile = new File(_manager.getLocalDataDir(), LIST_VALUE_MAP_PREFIX + Long.toString(_listId)); _variableDataFile = new File(_manager.getLocalDataDir(), LIST_STRING_MAP_PREFIX + Long.toString(_listId)); _bloomFilterData = new File(_manager.getLocalDataDir(), LIST_BLOOM_DATA_PREFIX + Long.toString(_listId)); _listMetadataFile = new File(_manager.getLocalDataDir(), LIST_METADATA_PREFIX + Long.toString(_listId)); _subDomainMetadataFile = new File(_manager.getLocalDataDir(), LIST_SUBDOMAIN_METADATA_PREFIX + Long.toString(_listId)); } private static class OnDiskCrawlHistoryItem { public long _fileOffset = -1; int _domainHash = -1; // 4 long _urlFingerprint = -1; // 8 int _stringsCRC = -1; // 4 long _stringsOffset = -1; // 8 byte _flags = 0; // 1 byte _crawlStatus = -1; // 1 short _httpResultCode = -1; // 2 byte _redirectStatus = -1; // 1 short _redirectHttpResult = -1; // 2 long _updateTimestamp = -1; // 8 //__ // 39 bytes public static final int ON_DISK_SIZE = 39; public static final int FLAG_HAS_CRAWL_STATUS = 1; public static final int FLAG_HAS_ORIGINAL_RESULT_CODE = 2; public static final int FLAG_HAS_REDIRECT_URL = 4; public static final int FLAG_HAS_REDIRECT_STATUS = 8; public static final int FLAG_HAS_REDIRECT_RESULT_CODE = 16; public static final int FLAG_HAS_LASTMODIFIED_TIME = 32; public int compareFingerprints(URLFP fp) { int result = ((Integer) _domainHash).compareTo(fp.getDomainHash()); if (result == 0) { result = ((Long) _urlFingerprint).compareTo(fp.getUrlHash()); } return result; } @Override public boolean equals(Object obj) { if (obj instanceof OnDiskCrawlHistoryItem) { OnDiskCrawlHistoryItem other = (OnDiskCrawlHistoryItem) obj; if (_domainHash == other._domainHash && _urlFingerprint == other._urlFingerprint && _stringsCRC == other._stringsCRC && _flags == other._flags && _crawlStatus == other._crawlStatus && _httpResultCode == other._httpResultCode && _redirectStatus == other._redirectStatus && _redirectHttpResult == other._redirectHttpResult) { return true; } } return false; } public void setFlag(int flag) { _flags |= flag; } public boolean isFlagSet(int flag) { return ((_flags & flag) != 0); } public void serialize(DataOutput out) throws IOException { out.writeInt(_domainHash); out.writeLong(_urlFingerprint); out.writeInt(_stringsCRC); out.writeLong(_stringsOffset); out.write(_flags); out.writeByte(_crawlStatus); out.writeShort(_httpResultCode); out.writeByte(_redirectStatus); out.writeShort(_redirectHttpResult); out.writeLong(_updateTimestamp); } public void deserialize(DataInput in) throws IOException { _domainHash = in.readInt(); _urlFingerprint = in.readLong(); _stringsCRC = in.readInt(); _stringsOffset = in.readLong(); _flags = in.readByte(); _crawlStatus = in.readByte(); _httpResultCode = in.readShort(); _redirectStatus = in.readByte(); _redirectHttpResult = in.readShort(); _updateTimestamp = in.readLong(); } } DataOutputBuffer _stringBuffer1 = new DataOutputBuffer(); DataOutputBuffer _stringBuffer2 = new DataOutputBuffer(); CRC16 _stringCRC = new CRC16(); private OnDiskCrawlHistoryItem onDiskItemFromHistoryItem(URLFP fingerprint, ProxyCrawlHistoryItem item) throws IOException { OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem(); itemOut._domainHash = fingerprint.getDomainHash(); itemOut._urlFingerprint = fingerprint.getUrlHash(); itemOut._stringsCRC = calculateStringCRC(item, _stringBuffer1); if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS)) { itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS); itemOut._crawlStatus = (byte) item.getCrawlStatus(); } if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE)) { itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE); itemOut._httpResultCode = (short) item.getHttpResultCode(); } if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) { itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL); } if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTSTATUS)) { itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS); itemOut._redirectStatus = (byte) item.getRedirectStatus(); } if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT)) { itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE); itemOut._redirectHttpResult = (short) item.getRedirectHttpResult(); } // update last modified time if present .... if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_LASTMODIFIEDTIME) && item.getLastModifiedTime() > 0) { itemOut._updateTimestamp = Math.max(itemOut._updateTimestamp, item.getLastModifiedTime()); itemOut.setFlag(OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME); } return itemOut; } private int calculateStringCRC(ProxyCrawlHistoryItem item, DataOutputBuffer stringBuffer) throws IOException { stringBuffer.reset(); stringBuffer.writeUTF(item.getOriginalURL()); if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) { stringBuffer.writeUTF(item.getRedirectURL()); } _stringCRC.reset(); _stringCRC.update(stringBuffer.getData(), 0, stringBuffer.getLength()); return (int) _stringCRC.getValue(); } private void writeInitialOnDiskItem(URLFP fp, ProxyCrawlHistoryItem historyItem, DataOutputStream valueStreamOut, RandomAccessFile stringStream) throws IOException { OnDiskCrawlHistoryItem itemOut = onDiskItemFromHistoryItem(fp, historyItem); // update string offset ... itemOut._stringsOffset = stringStream.length(); // write out string data length WritableUtils.writeVInt(stringStream, _stringBuffer1.getLength()); // write strings to log file stringStream.write(_stringBuffer1.getData(), 0, _stringBuffer1.getLength()); // update timestamp ... itemOut._updateTimestamp = -1; // and write to disk itemOut.serialize(valueStreamOut); } private void dumpFixedDataFile() { try { RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw"); try { OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem(); int index = 0; while (fixedDataReader.getFilePointer() != fixedDataReader.length()) { item.deserialize(fixedDataReader); LOG.info("Item at Index:" + index++ + " Domain:" + item._domainHash + " URLFP:" + item._urlFingerprint); } } finally { fixedDataReader.close(); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } private OnDiskCrawlHistoryItem loadOnDiskItemForURLFP(URLFP fingerprint) throws IOException { // see if state is cached in memory ... boolean loadedFromMemory = false; synchronized (this) { if (_tempFixedDataBuffer != null) { loadedFromMemory = true; int low = 0; int high = (int) (_tempFixedDataBufferSize / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1; OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem(); DataInputBuffer inputBuffer = new DataInputBuffer(); int iterationNumber = 0; while (low <= high) { ++iterationNumber; int mid = low + ((high - low) / 2); inputBuffer.reset(_tempFixedDataBuffer, 0, _tempFixedDataBufferSize); inputBuffer.skip(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE); // deserialize itemOut.deserialize(inputBuffer); // now compare it against desired hash value ... int comparisonResult = itemOut.compareFingerprints(fingerprint); if (comparisonResult > 0) high = mid - 1; else if (comparisonResult < 0) low = mid + 1; else { // cache offset itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE; // LOG.info("Found Match. Took:"+ iterationNumber + " iterations"); // and return item return itemOut; } } //LOG.error("Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations"); } } if (!loadedFromMemory) { //load from disk //LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + fingerprint.getUrlHash()); RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw"); // allocate buffer upfront byte[] onDiskItemBuffer = new byte[OnDiskCrawlHistoryItem.ON_DISK_SIZE]; DataInputBuffer inputStream = new DataInputBuffer(); //LOG.info("Opened Data File. Searching for match"); try { int low = 0; int high = (int) (file.length() / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1; OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem(); int iterationNumber = 0; while (low <= high) { ++iterationNumber; int mid = low + ((high - low) / 2); // seek to proper location file.seek(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE); // read the data structure file.readFully(onDiskItemBuffer, 0, onDiskItemBuffer.length); // map location in file //MappedByteBuffer memoryBuffer = file.getChannel().map(MapMode.READ_ONLY,mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE,OnDiskCrawlHistoryItem.ON_DISK_SIZE); //DataInputStream inputStream = new DataInputStream(new ByteBufferInputStream(memoryBuffer)); inputStream.reset(onDiskItemBuffer, 0, OnDiskCrawlHistoryItem.ON_DISK_SIZE); // deserialize itemOut.deserialize(inputStream); // memoryBuffer = null; //inputStream = null; // now compare it against desired hash value ... int comparisonResult = itemOut.compareFingerprints(fingerprint); if (comparisonResult > 0) high = mid - 1; else if (comparisonResult < 0) low = mid + 1; else { // cache offset itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE; // LOG.info("Found Match. Took:"+ iterationNumber + " iterations"); // and return item return itemOut; } } //LOG.error("******Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations"); //DEBUG ONLY ! // dumpFixedDataFile(); } finally { file.close(); } } return null; } private ProxyCrawlHistoryItem getHistoryItemFromURLFP(URLFP fingerprint) throws IOException { OnDiskCrawlHistoryItem item = loadOnDiskItemForURLFP(fingerprint); if (item != null) { return getHistoryItemFromOnDiskItem(item); } return null; } private ProxyCrawlHistoryItem getHistoryItemFromOnDiskItem(OnDiskCrawlHistoryItem item) throws IOException { ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem(); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS) != 0) itemOut.setCrawlStatus(item._crawlStatus); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_ORIGINAL_RESULT_CODE) != 0) itemOut.setHttpResultCode(item._httpResultCode); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS) != 0) itemOut.setRedirectStatus(item._redirectStatus); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_RESULT_CODE) != 0) itemOut.setRedirectHttpResult(item._redirectHttpResult); if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_LASTMODIFIED_TIME) != 0) itemOut.setLastModifiedTime(item._updateTimestamp); // now attept to get the string offset RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw"); try { // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // now populate original url ... itemOut.setOriginalURL(stringDataReader.readUTF()); // now if redirect url is present if ((item._flags & OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_URL) != 0) { itemOut.setRedirectURL(stringDataReader.readUTF()); } } finally { stringDataReader.close(); } return itemOut; } /** * deserialize metadata from disk * * @throws IOException */ void loadMetadataFromDisk() throws IOException { // skip metadata load if sub-domain metadata file is missing... // in this case, metadata will be rebuilt during subdomain metadata rescan ... if (_subDomainMetadataFile.exists()) { RandomAccessFile file = new RandomAccessFile(_listMetadataFile, "rw"); try { _metadata.deserialize(file, new BinaryProtocol()); int urlCount = _metadata.getUrlCount(); _metadata.clear(); _metadata.setUrlCount(urlCount); } finally { file.close(); } } } /** * serialize metadata to disk * @throws IOException */ void writeMetadataToDisk() throws IOException { synchronized (_metadata) { RandomAccessFile file = new RandomAccessFile(_listMetadataFile, "rw"); try { file.seek(0); _metadata.serialize(file, new BinaryProtocol()); } finally { file.close(); } } } public static void generateTestURLFile(File outputFile, String... urlList) throws IOException { PrintWriter writer = new PrintWriter(outputFile, "UTF-8"); for (String url : urlList) { writer.println(url); } writer.flush(); writer.close(); } private static void validateListCode(final File dataDirectory, long listId) throws IOException { final String urlList[] = new String[] { "http://www.yahoo.com/1", "http://www.google.com/1", "http://www.cnn.com/1", "http://www.yahoo.com/2", "http://www.google.com/2", "http://www.cnn.com/2" }; File tempFile = File.createTempFile("CrawlList", "validateListInit"); File localTempFile = new File(dataDirectory, tempFile.getName()); generateTestURLFile(localTempFile, urlList); final TreeMap<String, URLFP> urlToFPMap = new TreeMap<String, URLFP>(); final TreeMap<URLFP, String> urlFPToString = new TreeMap<URLFP, String>(); for (String url : urlList) { URLFP fp = URLUtils.getURLFPFromURL(url, true); urlToFPMap.put(url, fp); urlFPToString.put(fp, url); } final TreeMap<URLFP, ProxyCrawlHistoryItem> itemsToMarkComplete = new TreeMap<URLFP, ProxyCrawlHistoryItem>(); ProxyCrawlHistoryItem item1 = new ProxyCrawlHistoryItem(); item1.setCrawlStatus(CrawlURL.FailureReason.RobotsExcluded); item1.setOriginalURL(urlList[1]); ProxyCrawlHistoryItem item2 = new ProxyCrawlHistoryItem(); item2.setCrawlStatus(0); item2.setOriginalURL(urlList[3]); item2.setHttpResultCode(301); item2.setRedirectURL("http://www.yahoo.com/3"); item2.setRedirectStatus(0); item2.setRedirectHttpResult(200); ProxyCrawlHistoryItem item3 = new ProxyCrawlHistoryItem(); item3.setCrawlStatus(0); item3.setOriginalURL(urlList[4]); item3.setHttpResultCode(301); item3.setRedirectURL("http://www.google.com/3"); item3.setRedirectStatus(CrawlURL.FailureReason.IOException); itemsToMarkComplete.put(urlToFPMap.get(item1.getOriginalURL()), item1); itemsToMarkComplete.put(urlToFPMap.get(item2.getOriginalURL()), item2); itemsToMarkComplete.put(urlToFPMap.get(item3.getOriginalURL()), item3); final Set<URLFP> itemsToMarkCompleteFPSet = itemsToMarkComplete.keySet(); final Set<URLFP> itemsNotMarked = new TreeSet<URLFP>(urlToFPMap.values()); itemsNotMarked.removeAll(itemsToMarkCompleteFPSet); CrawlHistoryStorage storage = new CrawlHistoryStorage() { @Override public void syncList(long listId, TreeSet<URLFP> matchCriteria, ItemUpdater targetList) throws IOException { for (URLFP matchItem : matchCriteria) { if (itemsToMarkCompleteFPSet.contains(matchItem)) { targetList.updateItemState(matchItem, itemsToMarkComplete.get(matchItem)); } } } @Override public File getLocalDataDir() { return dataDirectory; } }; CrawlList list1 = new CrawlList(storage, listId, localTempFile, 0); for (int pass = 0; pass < 2; ++pass) { CrawlList list = null; if (pass == 0) { System.out.println("Pass 0 - Initialize from URLList"); list = list1; } else { System.out.println("Pass 1 - Initialize from OnDisk Data"); list = new CrawlList(storage, listId); } // iterate fingerprints for (URLFP fingerprint : urlToFPMap.values()) { ProxyCrawlHistoryItem itemRetrieved = list.getHistoryItemFromURLFP(fingerprint); if (itemsToMarkCompleteFPSet.contains(fingerprint)) { ProxyCrawlHistoryItem itemExpected = itemsToMarkComplete.get(fingerprint); Assert.assertTrue(itemExpected.equals(itemRetrieved)); } else { Assert.assertTrue(itemRetrieved.getOriginalURL().equals(urlFPToString.get(fingerprint)) && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_CRAWLSTATUS) && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_HTTPRESULTCODE) && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTHTTPRESULT) && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTSTATUS) && !itemRetrieved.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)); } } } // validate string code does not update when strings have not changed item3.setRedirectStatus(0); item3.setRedirectHttpResult(200); long variableDataLength = list1._variableDataFile.length(); long fixedDataLength = list1._fixedDataFile.length(); list1.updateItemState(urlToFPMap.get(item3.getOriginalURL()), item3); Assert.assertTrue(fixedDataLength == list1._fixedDataFile.length()); Assert.assertTrue(variableDataLength == list1._variableDataFile.length()); list1.queueUnCrawledItems(new CrawlQueueLoader() { @Override public void queueURL(URLFP urlfp, String url) { Assert.assertTrue(itemsNotMarked.contains(urlfp)); Assert.assertTrue(urlFPToString.get(urlfp).equals(url)); } @Override public void flush() { // TODO Auto-generated method stub } }); } public static void testmain(String[] args) { // initialize ... Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); BasicConfigurator.configure(); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); CrawlEnvironment.setHadoopConfig(conf); CrawlEnvironment.setDefaultHadoopFSURI("file:///"); File testDirectory = new File("/tmp/CrawlListTests"); FileUtils.recursivelyDeleteFile(testDirectory); testDirectory.mkdir(); try { validateListCode(testDirectory, System.currentTimeMillis()); } catch (IOException e) { e.printStackTrace(); } } private static final int OFFSET_TABLE_ENTRY_SIZE = 12; private final int getOffsetForSubDomainData(long domainHash) throws IOException { DataInputBuffer inputBuffer = new DataInputBuffer(); int low = 0; int high = (int) (_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) - 1; while (low <= high) { int mid = low + ((high - low) / 2); inputBuffer.reset(_offsetLookupTable.getData(), _offsetLookupTable.getLength()); inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE); // deserialize long hash = inputBuffer.readLong(); // now compare it against desired hash value ... int comparisonResult = ((Long) hash).compareTo(domainHash); if (comparisonResult > 0) high = mid - 1; else if (comparisonResult < 0) low = mid + 1; else { return inputBuffer.readInt(); } } throw new IOException("NOT-FOUND!"); } void updateSubDomainQueueStatus(String rootDomainName, int deltaQueuedCount) throws IOException { long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName); synchronized (_subDomainMetadataFile) { CrawlListMetadata metadata = new CrawlListMetadata(); RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw"); try { int dataOffset = getOffsetForSubDomainData(domainHash); if (dataOffset == 0) { throw new IOException("Data Offset Zero for host:" + rootDomainName); } file.seek(dataOffset); metadata.readFields(file); // set the data offset on the way out so that updates write to the proper location metadata.setQueuedItemCount(metadata.getQueuedItemCount() + deltaQueuedCount); // ok reseek to data offset file.seek(dataOffset); // rewrite the data structure metadata.write(file); } finally { file.close(); } } } public CrawlListMetadata getSubDomainMetadataByURL(String originalURL) throws IOException { GoogleURL urlObject = new GoogleURL(originalURL); return getSubDomainMetadataByDomain(urlObject.getHost()); } public CrawlListMetadata getSubDomainMetadataByDomain(String hostName) throws IOException { String rootDomainName = URLUtils.extractRootDomainName(hostName); if (rootDomainName != null) { return getSubDomainMetadataByRootDomain(rootDomainName); } throw new IOException("Unable to Extract RootDomainName for host:" + hostName); } public CrawlListMetadata getSubDomainMetadataByRootDomain(String rootDomainName) throws IOException { long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName); CrawlListMetadata metadata = new CrawlListMetadata(); synchronized (_subDomainMetadataFile) { RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw"); try { int dataOffset = getOffsetForSubDomainData(domainHash); if (dataOffset == 0) { throw new IOException("Data Offset Zero for host:" + rootDomainName); } file.seek(dataOffset); metadata.readFields(file); // set the data offset on the way out so that updates write to the proper location metadata.setSubDomainDataOffset(dataOffset); } finally { file.close(); } } return metadata; } // get subdomain metadata CrawlListMetadata getTransientSubDomainMetadata(String originalURL) throws IOException { GoogleURL urlObject = new GoogleURL(originalURL); String rootDomainName = URLUtils.extractRootDomainName(urlObject.getHost()); if (rootDomainName != null) { long domainHash = URLFingerprint.generate64BitURLFPrint(rootDomainName); CrawlListMetadata metadata = _transientSubDomainStats.get(domainHash); if (metadata == null) { metadata = new CrawlListMetadata(); _transientSubDomainStats.put(domainHash, metadata); metadata.setDomainName(rootDomainName); metadata.setDomainHash(domainHash); } return metadata; } throw new IOException("Unable to Extract RootDomainName for url:" + originalURL); } /** * serialize metadata to disk * @throws IOException */ void writeSubDomainMetadataToDisk(CrawlListMetadata subDomainData) throws IOException { DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize); subDomainData.serialize(outputBuffer, new BinaryProtocol()); if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) { LOG.error("ListMetadata Serialize for List:" + subDomainData.getDomainName() + " > FixedDataSize!!!"); outputBuffer.reset(); subDomainData.setDomainName("<<CORRUPT>>"); subDomainData.serialize(outputBuffer, new BinaryProtocol()); } synchronized (_subDomainMetadataFile) { RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw"); try { if (subDomainData.getSubDomainDataOffset() == 0) { throw new IOException("Data Offset Zero during write!"); } file.seek(subDomainData.getSubDomainDataOffset()); file.write(outputBuffer.getData(), 0, outputBuffer.getLength()); } finally { file.close(); } } } void writeInitialSubDomainMetadataToDisk() throws IOException { RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw"); try { file.writeByte(0); // version file.writeInt(_transientSubDomainStats.size()); ArrayList<CrawlListMetadata> sortedMetadata = new ArrayList<CrawlListMetadata>(); sortedMetadata.addAll(_transientSubDomainStats.values()); _transientSubDomainStats = null; CrawlListMetadata metadataArray[] = sortedMetadata.toArray(new CrawlListMetadata[0]); Arrays.sort(metadataArray, new Comparator<CrawlListMetadata>() { @Override public int compare(CrawlListMetadata o1, CrawlListMetadata o2) { int result = ((Integer) o2.getUrlCount()).compareTo(o1.getUrlCount()); if (result == 0) { result = o1.getDomainName().compareTo(o2.getDomainName()); } return result; } }); DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize); TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>(); for (CrawlListMetadata entry : metadataArray) { // reset output buffer outputBuffer.reset(); // write item to disk entry.serialize(outputBuffer, new BinaryProtocol()); if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) { LOG.fatal("Metadata Serialization for List:" + getListId() + " SubDomain:" + entry.getDomainName()); System.out.println("Metadata Serialization for List:" + getListId() + " SubDomain:" + entry.getDomainName()); } // save offset idToOffsetMap.put(entry.getDomainHash(), (int) file.getFilePointer()); // write out fixed data size file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize); } // write lookup table _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE); for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) { _offsetLookupTable.writeLong(entry.getKey()); _offsetLookupTable.writeInt(entry.getValue()); } } finally { file.close(); } _transientSubDomainStats = null; } void resetSubDomainCounts() throws IOException { LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts."); if (_subDomainMetadataFile.exists()) { LOG.info("*** LIST:" + getListId() + " FILE EXISTS ."); RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw"); DataInputBuffer inputBuffer = new DataInputBuffer(); DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize); try { // skip version file.read(); // read item count int itemCount = file.readInt(); LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount); CrawlListMetadata newMetadata = new CrawlListMetadata(); for (int i = 0; i < itemCount; ++i) { long orignalPos = file.getFilePointer(); file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize); inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize); try { newMetadata.deserialize(inputBuffer, new BinaryProtocol()); } catch (Exception e) { LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:" + CCStringUtils.stringifyException(e)); } // ok reset everything except hashes and first/last url pointers int urlCount = newMetadata.getUrlCount(); long firstRecordOffset = newMetadata.getFirstRecordOffset(); long lastRecordOffset = newMetadata.getLastRecordOffset(); String domainName = newMetadata.getDomainName(); long domainHash = newMetadata.getDomainHash(); // reset newMetadata.clear(); // restore newMetadata.setUrlCount(urlCount); newMetadata.setFirstRecordOffset(firstRecordOffset); newMetadata.setLastRecordOffset(lastRecordOffset); newMetadata.setDomainName(domainName); newMetadata.setDomainHash(domainHash); // serialize it ... outputBuffer.reset(); newMetadata.serialize(outputBuffer, new BinaryProtocol()); // write it back to disk file.seek(orignalPos); // and rewrite it ... file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize); } } finally { file.close(); } LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS"); } } void loadSubDomainMetadataFromDisk() throws IOException { LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... "); if (_subDomainMetadataFile.exists()) { LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK."); RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw"); DataInputBuffer inputBuffer = new DataInputBuffer(); byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize]; try { // skip version file.read(); // read item count int itemCount = file.readInt(); LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount); CrawlListMetadata newMetadata = new CrawlListMetadata(); TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>(); for (int i = 0; i < itemCount; ++i) { long orignalPos = file.getFilePointer(); file.readFully(fixedDataBlock, 0, fixedDataBlock.length); inputBuffer.reset(fixedDataBlock, fixedDataBlock.length); try { newMetadata.deserialize(inputBuffer, new BinaryProtocol()); } catch (Exception e) { LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:" + CCStringUtils.stringifyException(e)); } idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos); } // write lookup table _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE); for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) { _offsetLookupTable.writeLong(entry.getKey()); _offsetLookupTable.writeInt(entry.getValue()); } } finally { file.close(); } LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK"); } else { LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH"); RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw"); RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw"); try { //ok rebuild top level metadata as well _metadata.clear(); OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem(); int processedCount = 0; while (fixedDataReader.getFilePointer() != fixedDataReader.length()) { long position = fixedDataReader.getFilePointer(); // store offset in item item._fileOffset = position; // load from disk item.deserialize(fixedDataReader); try { // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // and read primary string String url = stringDataReader.readUTF(); // get metadata object for subdomain CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url); // increment url count subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1); // increment top level metadata count _metadata.setUrlCount(_metadata.getUrlCount() + 1); // update top level metadata .. updateMetadata(item, _metadata, 0); // update sub-domain metadata object from item data updateMetadata(item, subDomainMetadata, 0); ++processedCount; } catch (IOException e) { LOG.error("Exception Reading String Data For Item:" + (processedCount + 1)); LOG.error("Exception:" + CCStringUtils.stringifyException(e)); LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:" + stringDataReader.getFilePointer()); } if (processedCount % 10000 == 0) { LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items"); } } // ok commit top level metadata to disk as well writeMetadataToDisk(); } catch (IOException e) { LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:" + CCStringUtils.stringifyException(e)); LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:" + stringDataReader.getFilePointer()); _queueState = QueueState.QUEUED; } finally { fixedDataReader.close(); stringDataReader.close(); } LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK"); // write metadat to disk writeInitialSubDomainMetadataToDisk(); LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE"); } } public int getSubDomainItemCount() { synchronized (_metadata) { return _offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE; } } public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) { synchronized (_metadata) { ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>(); try { synchronized (_subDomainMetadataFile) { RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw"); DataInputBuffer inputBuffer = new DataInputBuffer(); byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize]; try { // skip version file.read(); // read item count int itemCount = file.readInt(); int i = offset; int end = Math.min(i + count, itemCount); LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount); if (i < itemCount) { file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset)); CrawlListMetadata newMetadata = new CrawlListMetadata(); for (; i < end; ++i) { long orignalPos = file.getFilePointer(); file.readFully(fixedDataBlock, 0, fixedDataBlock.length); inputBuffer.reset(fixedDataBlock, fixedDataBlock.length); newMetadata.deserialize(inputBuffer, new BinaryProtocol()); itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata)); } } } finally { file.close(); } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK"); return itemsOut; } } private static CrawlListDomainItem buildSubDomainSummary(String domainName, CrawlListMetadata metadata) { CrawlListDomainItem domainItem = new CrawlListDomainItem(); domainItem.setDomainName(domainName); int robotsExcludedItemsCount = 0; int errorItemsCount = 0; int otherHTTPResultsCount = 0; metadata.getHttp200Count(); metadata.getRobotsExcludedCount(); errorItemsCount += metadata.getTimeoutErrorCount(); errorItemsCount += metadata.getIOExceptionCount(); errorItemsCount += metadata.getDNSErrorCount(); errorItemsCount += metadata.getOtherErrorCount(); otherHTTPResultsCount += metadata.getHttp403Count(); otherHTTPResultsCount += metadata.getHttp404Count(); otherHTTPResultsCount += metadata.getHttp500Count(); otherHTTPResultsCount += metadata.getHttpOtherCount(); domainItem.setUrlCount(metadata.getUrlCount()); domainItem.setUrlsCrawled(metadata.getHttp200Count() + otherHTTPResultsCount); domainItem.setHttp200Count(metadata.getHttp200Count()); domainItem.setInCacheItemsCount(0); domainItem.setRobotsExcludedCount(robotsExcludedItemsCount); domainItem.setErrorCount(errorItemsCount); domainItem.setFirstItemOffset(metadata.getFirstRecordOffset()); domainItem.setLastItemOffset(metadata.getLastRecordOffset()); domainItem.setHashCode((int) metadata.getDomainHash()); domainItem.setQueuedCount(metadata.getQueuedItemCount()); return domainItem; } /* public CrawlListMetadata getSubDomainMetadata() { synchronized (_metadata) { ImmutableSortedSet.Builder<String> builder = ImmutableSortedSet.naturalOrder(); builder.addAll(_subDomainNameToStatsMap.keySet()); return builder.build(); } } */ /**********************************************************************/ public static void main(String[] args) throws IOException { if (args[0].equalsIgnoreCase("dump")) { File dataDir = new File(args[1]); long listId = Long.parseLong(args[2]); File outputPath = new File(args[3]); dumpUnCrawledItems(dataDir, listId, outputPath, true); } } public static void dumpUnCrawledItems(File dataDir, long listId, File outputFilePath, boolean includeRobotsExcludedItems) throws IOException { File fixedDataFile = new File(dataDir, LIST_VALUE_MAP_PREFIX + Long.toString(listId)); File variableDataFile = new File(dataDir, LIST_STRING_MAP_PREFIX + Long.toString(listId)); LOG.info("FixedDataFile is:" + fixedDataFile); LOG.info("VariableDataFile is:" + variableDataFile); RandomAccessFile fixedDataReader = new RandomAccessFile(fixedDataFile, "r"); RandomAccessFile stringDataReader = new RandomAccessFile(variableDataFile, "r"); JsonWriter writer = new JsonWriter(new BufferedWriter(new FileWriter(outputFilePath), 1024 * 1024 * 10)); writer.setIndent(" "); try { writer.beginObject(); writer.name("urls"); writer.beginArray(); try { OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem(); URLFP fingerprint = new URLFP(); while (fixedDataReader.getFilePointer() != fixedDataReader.length()) { long position = fixedDataReader.getFilePointer(); item.deserialize(fixedDataReader); // seek to string data stringDataReader.seek(item._stringsOffset); // and skip buffer length WritableUtils.readVInt(stringDataReader); // and read primary string String url = stringDataReader.readUTF(); // setup fingerprint fingerprint.setDomainHash(item._domainHash); fingerprint.setUrlHash(item._urlFingerprint); // any item that has not been crawled needs to be queued boolean queueItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS); // if item is not queued, check to see if we need to retry the item if (!queueItem && item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) { if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) { queueItem = (item._redirectStatus != 0); if (!queueItem) { if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) { queueItem = true; } } } else { queueItem = (item._crawlStatus != 0); if (!queueItem) { if (item._httpResultCode != 200 && item._httpResultCode != 404) { queueItem = true; } } } } if (queueItem) { // ok if queue item is set ... writer.beginObject(); writer.name("url"); writer.value(url); writer.name("redirected"); writer.value((boolean) item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)); writer.name("lastStatus"); if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) { if (item._redirectStatus == 0) { writer.value("HTTP-" + item._redirectHttpResult); } else { writer.value(CrawlURL.FailureReason.toString(item._redirectHttpResult)); } } else { if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) { if (item._crawlStatus == 0) { writer.value("HTTP-" + item._httpResultCode); } else { writer.value(CrawlURL.FailureReason.toString(item._crawlStatus)); } } else { writer.value("UNCRAWLED"); } } writer.name("updateTime"); writer.value(item._updateTimestamp); writer.endObject(); } } } catch (IOException e) { LOG.error("Encountered Exception Queueing Items for List:" + listId + " Exception:" + CCStringUtils.stringifyException(e)); } finally { fixedDataReader.close(); stringDataReader.close(); } writer.endArray(); writer.endObject(); } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); throw new IOException(e); } finally { writer.flush(); writer.close(); } } }