/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.ec2.postprocess.crawldb;

import java.io.IOException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.ComponentId;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.ByteArrayUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;

import static org.commoncrawl.util.JSONUtils.*;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Map-reduce job that produces a crawldb given the link graph / crawl status data emitted
 * by both the LinkGraphDataEmitter job and previous runs of the CrawlDBWriter itself.
 *
 * @author rana
 *
 */
public class CrawlDBMergingReducer implements Reducer<TextBytes, TextBytes, TextBytes, TextBytes>, CrawlDBCommon {

  static final Log LOG = LogFactory.getLog(CrawlDBMergingReducer.class);

  // The crawldb job emits data in the form of a JSON data structure.
  // The top-level JSON object optionally contains a link_status object, a summary object,
  // and a source_url string.
  // The summary object has the properties defined by the SUMMARYRECORD_ constant prefix.
  // The link status object has the properties defined by the LINKSTATUS_ prefix.
  // The summary object can contain zero to N CrawlDetail objects, one for each
  // crawl attempt. The properties of a CrawlDetail object are prefixed with
  // the CRAWLDETAIL_ prefix.
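
  // For orientation, a hand-written sketch of what a merged record for a single URL might look
  // like. The concrete property names are defined as constants in CrawlDBCommon (not shown here),
  // so the literal keys and values below are illustrative assumptions only, not output of this code:
  //
  //   {
  //     "source_url": "http://example.com/page",
  //     "summary": {
  //       "attempt_count": ...,
  //       "latest_attempt": ...,
  //       "http_result": 200,
  //       "crawl_detail": [ { "attempt_time": ..., "http_result": 200, "mime_type": "text/html" } ]
  //     },
  //     "link_status": {
  //       "intradomain_sources": ...,
  //       "extradomain_sources": ...,
  //       "type_and_rels": [ "html:text/html" ]
  //     }
  //   }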

  ///////////////////////////////////////////////////////////////////////////
  // EC2 PATHS
  ///////////////////////////////////////////////////////////////////////////
  static final String S3N_BUCKET_PREFIX = "s3n://aws-publicdatasets";
  static final String MERGE_INTERMEDIATE_OUTPUT_PATH = "/common-crawl/crawl-db/intermediate/";
  static final String MERGE_DB_PATH = "/common-crawl/crawl-db/mergedDB/";

  ///////////////////////////////////////////////////////////////////////////
  // CONSTANTS
  ///////////////////////////////////////////////////////////////////////////
  static final int MAX_TYPE_SAMPLES = 5;
  static final int DEFAULT_OUTGOING_URLS_BUFFER_SIZE = 1 << 18; // 262K
  static final int DEFAULT_OUTGOING_URLS_BUFFER_PAD_AMOUNT = 16384;
  static final int DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_SIZE = 1 << 27; // 134 MB
  static final int DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT = 16384;
  static final int MAX_EXTERNALLY_REFERENCED_URLS = 100;

  //private int OUTGOING_URLS_BUFFER_SIZE = DEFAULT_OUTGOING_URLS_BUFFER_SIZE;
  //private int OUTGOING_URLS_BUFFER_PAD_AMOUNT = DEFAULT_OUTGOING_URLS_BUFFER_PAD_AMOUNT;
  private int EXT_SOURCE_SAMPLE_BUFFER_SIZE = DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_SIZE;
  private int EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT = DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT;

  ///////////////////////////////////////////////////////////////////////////
  // Counters
  ///////////////////////////////////////////////////////////////////////////
  enum Counters {
    FAILED_TO_GET_LINKS_FROM_HTML, NO_HREF_FOR_HTML_LINK, EXCEPTION_IN_MAP, GOT_HTML_METADATA,
    GOT_FEED_METADATA, EMITTED_ATOM_LINK, EMITTED_HTML_LINK, EMITTED_RSS_LINK,
    GOT_PARSED_AS_ATTRIBUTE, GOT_LINK_OBJECT, NULL_CONTENT_OBJECT, NULL_LINKS_ARRAY,
    FP_NULL_IN_EMBEDDED_LINK, SKIPPED_ALREADY_EMITTED_LINK, FOUND_HTTP_DATE_HEADER,
    FOUND_HTTP_AGE_HEADER, FOUND_HTTP_LAST_MODIFIED_HEADER, FOUND_HTTP_EXPIRES_HEADER,
    FOUND_HTTP_CACHE_CONTROL_HEADER, FOUND_HTTP_PRAGMA_HEADER, REDUCER_GOT_LINK,
    REDUCER_GOT_STATUS, ONE_REDUNDANT_LINK_IN_REDUCER, TWO_REDUNDANT_LINKS_IN_REDUCER,
    THREE_REDUNDANT_LINKS_IN_REDUCER, GT_THREE_REDUNDANT_LINKS_IN_REDUCER,
    ONE_REDUNDANT_STATUS_IN_REDUCER, TWO_REDUNDANT_STATUS_IN_REDUCER,
    THREE_REDUNDANT_STATUS_IN_REDUCER, GT_THREE_REDUNDANT_STATUS_IN_REDUCER, GOT_RSS_FEED,
    GOT_ATOM_FEED, GOT_ALTERNATE_LINK_FOR_ATOM_ITEM, GOT_CONTENT_FOR_ATOM_ITEM,
    GOT_ITEM_LINK_FROM_RSS_ITEM, GOT_TOP_LEVEL_LINK_FROM_RSS_ITEM,
    GOT_TOP_LEVEL_LINK_FROM_ATOM_ITEM, EMITTED_REDIRECT_RECORD, DISCOVERED_NEW_LINK,
    GOT_LINK_FOR_ITEM_WITH_STATUS, FAILED_TO_GET_SOURCE_HREF, GOT_CRAWL_STATUS_RECORD,
    GOT_EXTERNAL_DOMAIN_SOURCE, NO_SOURCE_URL_FOR_CRAWL_STATUS, OUTPUT_KEY_FROM_INTERNAL_LINK,
    OUTPUT_KEY_FROM_EXTERNAL_LINK, GOT_HTTP_200_CRAWL_STATUS, GOT_REDIRECT_CRAWL_STATUS,
    BAD_REDIRECT_URL, GOT_MERGED_RECORD, MERGED_OBJECT_FIRST_OBJECT,
    ADOPTED_SOURCE_SUMMARY_RECORD, MERGED_SOURCE_SUMMARY_RECORD_INTO_DEST,
    ADOPTED_SOURCE_LINKSUMMARY_RECORD, MERGED_SOURCE_LINKSUMMARY_RECORD_INTO_DEST,
    ALLOCATED_TOP_LEVEL_OBJECT_IN_FLUSH, ENCOUNTERED_EXISTING_TOP_LEVEL_OBJECT_IN_FLUSH,
    ENCOUNTERED_SUMMARY_RECORD_IN_FLUSH, ENCOUNTERED_LINKSUMMARY_RECORD_IN_FLUSH,
    EMITTED_SOURCEINPUTS_RECORD, GOT_NULL_REDIRECT_URL, INTERDOMAIN_LINKS_LTEQ_100,
    INTERDOMAIN_LINKS_LTEQ_1000, INTERDOMAIN_LINKS_GT_1000,
    EMITTED_SOURCEINPUTS_DATA_BYTES_EMITTED, INPUT_RECORD_COUNT,
    ADOPTED_NEW_BLEKKO_METADATA_RECORD, BLEKKO_METADATA_WITH_NO_SOURCE_CC_RECORD,
    MERGE_RECORD_HAS_BLEKKO_METADATA, EMITTED_RECORD_WITH_BLEKKO_METADATA,
    BLEKKO_RECORD_ALREADY_IN_DATABASE, BLEKKO_CRAWLED_CC_CRAWLED, BLEKKO_NOT_CRAWLED_CC_CRAWLED
  }

  ///////////////////////////////////////////////////////////////////////////
  // Data Members
  ///////////////////////////////////////////////////////////////////////////
  public static final int NUM_HASH_FUNCTIONS = 10;
  public static final int NUM_BITS = 11;
  public static final int NUM_ELEMENTS = 1 << 26;
  public static final int FLUSH_INTERVAL = 1 << 17;

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  // parser
  JsonParser _parser = new JsonParser();
  // the top level object
  JsonObject _topLevelJSONObject;
  // the current summary record ...
  JsonObject _summaryRecord = null;
  // the current link summary record
  JsonObject _linkSummaryRecord = null;
  // collection of types detected for current url
  HashSet<String> _types = new HashSet<String>();
  // collection of externally referenced urls in current document
  HashSet<String> _extHrefs = new HashSet<String>();
  // the url string to use as the output key ...
  String _outputKeyString = null;
  // freeze url key ...
  boolean _urlKeyForzen = false;
  // url object representing the current key
  GoogleURL _outputKeyURLObj = null;
  // source inputs tracking bloomfilter
  URLFPBloomFilter _sourceInputsTrackingFilter;
  // a count of the number of urls processed
  long _urlsProcessed = 0;
  // key used to test bloomfilter
  URLFPV2 _bloomFilterKey = new URLFPV2();
  // captured job conf
  JobConf _conf;
  // file system
  FileSystem _fs;
  // partition id
  int _partitionId;
  //SequenceFile.Writer _redirectWriter = null;
  // input buffer used to collect referencing urls
  DataOutputBuffer _sourceInputsBuffer;
  // count of referencing domains
  int _sourceSampleSize = 0;
  // current input key
  URLFPV2 _currentKey = null;
  // temporary key used to transition input keys
  URLFPV2 _tempKey = new URLFPV2();
  // cached collector pointer ...
  OutputCollector<TextBytes, TextBytes> _outputCollector;
  Reporter _reporter;

  @Override
  public void reduce(TextBytes keyBytes, Iterator<TextBytes> values,
      OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {

    if (_outputCollector == null) {
      _outputCollector = output;
      _reporter = reporter;
    }

    // potentially transition to new url
    readFPCheckForTransition(keyBytes, output, reporter);

    // extract link type ..
    long linkType = CrawlDBKey.getLongComponentFromKey(keyBytes, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

    while (values.hasNext()) {
      reporter.incrCounter(Counters.INPUT_RECORD_COUNT, 1);

      TextBytes valueBytes = values.next();
      //LOG.debug("ValueBytes:" + valueBytes.toString());

      if (linkType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
        reporter.incrCounter(Counters.GOT_MERGED_RECORD, 1);
        JsonObject mergedObject = _parser.parse(valueBytes.toString()).getAsJsonObject();
        if (mergedObject != null) {
          setSourceURLFromJSONObject(mergedObject, linkType);
          processMergedRecord(mergedObject, _currentKey, reporter);
        }
      } else if (linkType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
        reporter.incrCounter(Counters.GOT_CRAWL_STATUS_RECORD, 1);
        try {
          JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
          if (object != null) {
            // update url key if necessary ...
            setSourceURLFromJSONObject(object, linkType);
            // emit a redirect record if necessary ...
            JsonElement redirectObject = object.get("redirect_from");
            if (redirectObject != null) {
              emitRedirectRecord(object, redirectObject.getAsJsonObject(), output, reporter);
            }
            // get latest crawl time
            long latestCrawlTime = (_summaryRecord != null) ?
                safeGetLong(_summaryRecord, SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY) : -1;
            long attemptTime = safeGetLong(object, "attempt_time");
            // if this is the latest crawl event, then we want to track the links associated with this crawl status ...
            HashSet<String> extHrefs = (attemptTime > latestCrawlTime) ? _extHrefs : null;
            // create a crawl detail record from incoming JSON
            JsonObject crawlDetail = crawlDetailRecordFromCrawlStatusRecord(object, _currentKey, extHrefs, reporter);
            // add to our list of crawl detail records ...
            safeAddCrawlDetailToSummaryRecord(crawlDetail);
            // ok, now update summary stats based on incoming crawl detail record ...
            updateSummaryRecordFromCrawlDetailRecord(crawlDetail, _currentKey, reporter);
          }
        } catch (Exception e) {
          LOG.error("Error Parsing JSON:" + valueBytes.toString());
          throw new IOException(e);
        }
        break;
      } else if (linkType >= CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()
          && linkType <= CrawlDBKey.Type.KEY_TYPE_RSS_LINK.ordinal()) {
        JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
        if (object != null) {
          setSourceURLFromJSONObject(object, linkType);
          // LOG.debug("Got LinkData:" + JSONUtils.prettyPrintJSON(object));
          // ok this is a link ...
          updateLinkStatsFromLinkJSONObject(object, _currentKey, reporter);
        }
      } else if (linkType == CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE.ordinal()) {
        importLinkSourceData(_currentKey, valueBytes);
      }
      reporter.progress();
    }
  }

  @Override
  public void configure(JobConf job) {
    _sourceInputsBuffer = new DataOutputBuffer(EXT_SOURCE_SAMPLE_BUFFER_SIZE);
    _sourceInputsTrackingFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);

    _conf = job;
    try {
      _fs = FileSystem.get(_conf);
      _partitionId = _conf.getInt("mapred.task.partition", 0);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  @Override
  public void close() throws IOException {
    flushCurrentRecord(_outputCollector, _reporter);
  }

  /**
   * internal helper - emit a redirect record given a source crawl status record
   *
   * @param jsonObject
   * @param redirectObj
   * @param output
   * @param reporter
   * @throws IOException
   */
  void emitRedirectRecord(JsonObject jsonObject, JsonObject redirectObj,
      OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {
    // ok first things first, generate a fingerprint for redirect SOURCE
    URLFPV2 redirectFP = URLUtils.getURLFPV2FromURL(redirectObj.get("source_url").getAsString());
    if (redirectFP == null) {
      reporter.incrCounter(Counters.BAD_REDIRECT_URL, 1);
    } else {
      int httpResult = redirectObj.get("http_result").getAsInt();

      JsonObject redirectJSON = new JsonObject();
      redirectJSON.addProperty("disposition", "SUCCESS");
      redirectJSON.addProperty("http_result", httpResult);
      redirectJSON.addProperty("server_ip", redirectObj.get("server_ip").getAsString());
      redirectJSON.addProperty("attempt_time", jsonObject.get("attempt_time").getAsLong());
      redirectJSON.addProperty("target_url", jsonObject.get("source_url").getAsString());
      redirectJSON.addProperty("source_url", redirectObj.get("source_url").getAsString());

      // ok emit the redirect record ...
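      // The record below is keyed on the fingerprint of the redirect *source* URL, so a hop from,
      // say, http://foo.example/ to http://www.foo.example/ (a purely illustrative pair) leaves a
      // synthetic crawl-status entry under the original URL whose "target_url" points at the URL
      // the crawler actually ended up fetching.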
      TextBytes key = CrawlDBKey.generateKey(redirectFP, CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS,
          jsonObject.get("attempt_time").getAsLong());

      LOG.debug("!!!!!!Emitting Redirect Record:" + redirectJSON.toString());
      output.collect(key, new TextBytes(redirectJSON.toString()));
      reporter.incrCounter(Counters.EMITTED_REDIRECT_RECORD, 1);
      //_redirectWriter.append(new TextBytes(redirectObj.get("source_url").getAsString()), new TextBytes(redirectJSON.toString()));
    }
  }

  /**
   * grab date headers and incorporate them into the crawl detail object
   *
   * @param jsonObject
   * @param crawlStatsJSON
   */
  static void populateDateHeadersFromJSONObject(JsonObject jsonObject, JsonObject crawlStatsJSON) {
    JsonObject headers = jsonObject.getAsJsonObject("http_headers");
    if (headers != null) {
      JsonElement httpDate = headers.get("date");
      JsonElement age = headers.get("age");
      JsonElement lastModified = headers.get("last-modified");
      JsonElement expires = headers.get("expires");
      JsonElement cacheControl = headers.get("cache-control");
      JsonElement pragma = headers.get("pragma");
      JsonElement etag = headers.get("etag");

      if (httpDate != null) {
        crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_DATE_PROPERTY,
            HttpHeaderInfoExtractor.getTime(httpDate.getAsString()));
      }
      if (age != null) {
        crawlStatsJSON.add(CRAWLDETAIL_HTTP_AGE_PROPERTY, age);
      }
      if (lastModified != null) {
        crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_LAST_MODIFIED_PROPERTY,
            HttpHeaderInfoExtractor.getTime(lastModified.getAsString()));
      }
      if (expires != null) {
        crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_EXPIRES_PROPERTY,
            HttpHeaderInfoExtractor.getTime(expires.getAsString()));
      }
      if (cacheControl != null) {
        crawlStatsJSON.add(CRAWLDETAIL_HTTP_CACHE_CONTROL_PROPERTY, cacheControl);
      }
      if (pragma != null) {
        crawlStatsJSON.add(CRAWLDETAIL_HTTP_PRAGMA_PROPERTY, pragma);
      }
      if (etag != null) {
        crawlStatsJSON.add(CRAWLDETAIL_HTTP_ETAG_PROPERTY, etag);
      }
    }
  }

  /**
   * compute min/max publication times across feed items and record them in the crawl detail object
   *
   * @param contentObj
   * @param crawlStatsJSON
   */
  static void addMinMaxFeedItemTimes(JsonObject contentObj, JsonObject crawlStatsJSON) {
    JsonArray items = contentObj.getAsJsonArray("items");
    if (items != null) {
      long minPubDate = -1L;
      long maxPubDate = -1L;
      int itemCount = 0;

      for (JsonElement item : items) {
        long pubDateValue = -1;
        JsonElement pubDate = item.getAsJsonObject().get("published");
        if (pubDate != null) {
          pubDateValue = pubDate.getAsLong();
        }
        JsonElement updateDate = item.getAsJsonObject().get("updated");
        if (updateDate != null) {
          if (updateDate.getAsLong() > pubDateValue) {
            pubDateValue = updateDate.getAsLong();
          }
        }
        if (minPubDate == -1L || pubDateValue < minPubDate) {
          minPubDate = pubDateValue;
        }
        if (maxPubDate == -1L || pubDateValue > maxPubDate) {
          maxPubDate = pubDateValue;
        }
        itemCount++;
      }
      crawlStatsJSON.addProperty(RSS_MIN_PUBDATE_PROPERTY, minPubDate);
      crawlStatsJSON.addProperty(RSS_MAX_PUBDATE_PROPERTY, maxPubDate);
      crawlStatsJSON.addProperty(RSS_ITEM_COUNT_PROPERTY, itemCount);
    }
  }

  /**
   * we need to extract the source url from the JSON because it is not available via
   * the key
   *
   * @param jsonObject
   * @param keyType
   */
  void setSourceURLFromJSONObject(JsonObject jsonObject, long keyType) {
    if (!_urlKeyForzen) {
      JsonElement sourceElement = jsonObject.get("source_url");

      if (keyType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
        _outputKeyString = sourceElement.getAsString();
        _outputKeyURLObj = new GoogleURL(_outputKeyString);
        JsonElement httpResultElem = jsonObject.get("http_result");
        if (httpResultElem != null) {
          int httpResult = httpResultElem.getAsInt();
          if (httpResult >= 200 && httpResult <= 299) {
            if (sourceElement != null && _outputKeyString == null) {
              _outputKeyString = sourceElement.getAsString();
              _outputKeyURLObj = new GoogleURL(_outputKeyString);
              if (_outputKeyURLObj.isValid())
                _urlKeyForzen = true;
            }
          }
        }
      } else if (keyType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
        _outputKeyString = sourceElement.getAsString();
        _outputKeyURLObj = new GoogleURL(_outputKeyString);
        _urlKeyForzen = true;
      } else if (keyType >= CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()
          && keyType <= CrawlDBKey.Type.KEY_TYPE_RSS_LINK.ordinal()) {
        if (_outputKeyString == null) {
          JsonElement hrefElement = jsonObject.get("href");
          if (sourceElement != null && hrefElement != null) {
            GoogleURL hrefSource = new GoogleURL(sourceElement.getAsString());
            if (hrefSource.isValid()) {
              _outputKeyString = hrefElement.getAsString();
              _outputKeyURLObj = new GoogleURL(_outputKeyString);
            }
          }
        }
      }
    }
  }

  void mergeBlekkoMetadata(JsonObject newBlekkoMetadata, JsonObject existingTopLevelObj, Reporter reporter) {
    if (newBlekkoMetadata != null) {
      if (!existingTopLevelObj.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
        existingTopLevelObj.add(TOPLEVEL_BLEKKO_METADATA_PROPERTY, newBlekkoMetadata);
      } else {
        JsonObject existingBlkkoMetadata = existingTopLevelObj.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY);
        long existingTimestamp = existingBlkkoMetadata.get(BLEKKO_METADATA_TIMESTAMP_PROPERTY).getAsLong();
        long newTimestamp = newBlekkoMetadata.get(BLEKKO_METADATA_TIMESTAMP_PROPERTY).getAsLong();
        if (newTimestamp > existingTimestamp) {
          existingTopLevelObj.add(TOPLEVEL_BLEKKO_METADATA_PROPERTY, newBlekkoMetadata);
          reporter.incrCounter(Counters.ADOPTED_NEW_BLEKKO_METADATA_RECORD, 1);
        }
      }
    }
  }

  void mergeLinkRecords(JsonObject sourceRecord, JsonObject topLevelJSONObject, Reporter reporter) {
    JsonElement destRecord = topLevelJSONObject.get(TOPLEVEL_LINKSTATUS_PROPERTY);
    if (destRecord == null) {
      if (sourceRecord != null) {
        reporter.incrCounter(Counters.ADOPTED_SOURCE_LINKSUMMARY_RECORD, 1);
        topLevelJSONObject.add(TOPLEVEL_LINKSTATUS_PROPERTY, sourceRecord);
        JsonArray typeAndRels = sourceRecord.getAsJsonArray(LINKSTATUS_TYPEANDRELS_PROPERTY);
        if (typeAndRels != null) {
          for (JsonElement typeAndRel : typeAndRels) {
            _types.add(typeAndRel.getAsString());
          }
        }
      }
    } else {
      if (sourceRecord != null) {
        reporter.incrCounter(Counters.MERGED_SOURCE_LINKSUMMARY_RECORD_INTO_DEST, 1);
        safeIncrementJSONCounter(destRecord.getAsJsonObject(), LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY,
            sourceRecord.get(LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY));
        safeIncrementJSONCounter(destRecord.getAsJsonObject(), LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY,
            sourceRecord.get(LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY));
        safeSetMinLongValue(destRecord.getAsJsonObject(), LINKSTATUS_EARLIEST_DATE_PROPERTY,
            sourceRecord.get(LINKSTATUS_EARLIEST_DATE_PROPERTY));
        safeSetMaxLongValue(destRecord.getAsJsonObject(), LINKSTATUS_LATEST_DATE_PROPERTY,
            sourceRecord.get(LINKSTATUS_LATEST_DATE_PROPERTY));
        JsonArray typeAndRels = sourceRecord.getAsJsonArray(LINKSTATUS_TYPEANDRELS_PROPERTY);
        if (typeAndRels != null) {
          for (JsonElement typeAndRel : typeAndRels) {
            _types.add(typeAndRel.getAsString());
          }
        }
      }
    }
  }

  /**
   * merge two crawl summary records
   *
   * @param incomingRecord
   * @param topLevelJSONObject
   * @param reporter
   * @throws IOException
   */
  void mergeSummaryRecords(JsonObject incomingRecord, JsonObject topLevelJSONObject, Reporter reporter)
      throws IOException {
    JsonObject destinationSummaryRecord = topLevelJSONObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);
    if (destinationSummaryRecord == null) {
      if (incomingRecord != null) {
        reporter.incrCounter(Counters.ADOPTED_SOURCE_SUMMARY_RECORD, 1);
        // adopt source ...
        topLevelJSONObject.add(TOPLEVEL_SUMMARYRECORD_PROPRETY, incomingRecord);
        _summaryRecord = incomingRecord;
      }
    } else {
      if (incomingRecord != null) {
        reporter.incrCounter(Counters.MERGED_SOURCE_SUMMARY_RECORD_INTO_DEST, 1);
        // walk crawl detail records in incoming record and merge them into destination record ...
        JsonElement crawlStatsArray = incomingRecord.get(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
        if (crawlStatsArray != null) {
          for (JsonElement crawlDetail : crawlStatsArray.getAsJsonArray()) {
            // add to our list of crawl detail records ...
            safeAddCrawlDetailToSummaryRecord(crawlDetail.getAsJsonObject());
            // ok, now update summary stats based on incoming crawl detail record ...
            updateSummaryRecordFromCrawlDetailRecord(crawlDetail.getAsJsonObject(), _currentKey, reporter);
          }
        }
      }
    }
  }

  /**
   * for the current url, merge the currently accumulated information with a previously generated crawl summary record
   *
   * @param jsonObject
   * @param destFP
   * @param reporter
   * @throws IOException
   */
  void processMergedRecord(JsonObject jsonObject, URLFPV2 destFP, Reporter reporter) throws IOException {
    if (jsonObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
      reporter.incrCounter(Counters.MERGE_RECORD_HAS_BLEKKO_METADATA, 1);
    }

    if (_topLevelJSONObject == null) {
      reporter.incrCounter(Counters.MERGED_OBJECT_FIRST_OBJECT, 1);
      _topLevelJSONObject = jsonObject;
      _summaryRecord = jsonObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);
      _linkSummaryRecord = jsonObject.getAsJsonObject(TOPLEVEL_LINKSTATUS_PROPERTY);
      if (_linkSummaryRecord != null) {
        // read in type and rels collection ...
        safeJsonArrayToStringCollection(_linkSummaryRecord, LINKSTATUS_TYPEANDRELS_PROPERTY, _types);
      }
      // and ext hrefs ..
      if (_summaryRecord != null) {
        safeJsonArrayToStringCollection(_summaryRecord, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS, _extHrefs);
      }
      // special blekko import stats
      if (_topLevelJSONObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
        if (_summaryRecord == null && _linkSummaryRecord == null) {
          reporter.incrCounter(Counters.BLEKKO_METADATA_WITH_NO_SOURCE_CC_RECORD, 1);
        }
      }
    } else {
      mergeSummaryRecords(jsonObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY), _topLevelJSONObject, reporter);
      mergeLinkRecords(jsonObject.getAsJsonObject(TOPLEVEL_LINKSTATUS_PROPERTY), _topLevelJSONObject, reporter);
      mergeBlekkoMetadata(jsonObject.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY), _topLevelJSONObject, reporter);
    }
  }

  /**
   * given an incoming link record, track the link source, update stats, and
   * capture document type information (if available via the href).
   *
   * @param jsonObject
   * @param destFP
   * @param reporter
   * @throws IOException
   */
  void updateLinkStatsFromLinkJSONObject(JsonObject jsonObject, URLFPV2 destFP, Reporter reporter) throws IOException {
    JsonElement sourceElement = jsonObject.get("source_url");
    JsonElement hrefElement = jsonObject.get("href");

    if (sourceElement != null && hrefElement != null) {
      //LOG.info("source:" + sourceElement.getAsString() + " href:" + hrefElement.getAsString());
      GoogleURL sourceURLObj = new GoogleURL(sourceElement.getAsString());
      if (sourceURLObj.isValid()) {
        if (_linkSummaryRecord == null) {
          _linkSummaryRecord = new JsonObject();
        }
        // ok, first compare known host name with incoming link host name ...
        // if not a match then ...
        if (!_outputKeyURLObj.getHost().equals(sourceURLObj.getHost())) {
          // ok now deeper check ...
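          // The host-string comparison above only separates same-host links from everything else;
          // hosts that differ textually (for example "www.example.com" vs. "example.com", a purely
          // illustrative pair) can still share a root domain, so the root-domain-hash comparison
          // below decides whether the source is also sampled as a cross-root-domain referrer.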
          URLFPV2 sourceFP = URLUtils.getURLFPV2FromURLObject(sourceURLObj);
          if (sourceFP != null) {
            reporter.incrCounter(Counters.GOT_EXTERNAL_DOMAIN_SOURCE, 1);
            // increment external source count
            safeIncrementJSONCounter(_linkSummaryRecord, LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY);
            //LOG.info("sourceFP:" + sourceFP.getKey() + " hrefFP:" + destFP.getKey());
            // ok track sources if from a different root domain (for now)
            if (sourceFP.getRootDomainHash() != destFP.getRootDomainHash()) {
              trackPotentialLinkSource(sourceFP, sourceElement.getAsString(), destFP);
            }
          }
        }
        // otherwise, count it as an internal link
        else {
          // internal for sure ...
          safeIncrementJSONCounter(_linkSummaryRecord, LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY);
        }

        JsonObject sourceHeaders = jsonObject.getAsJsonObject("source_headers");
        if (sourceHeaders != null) {
          long httpDate = safeGetHttpDate(sourceHeaders, "date");
          long lastModified = safeGetHttpDate(sourceHeaders, "last-modified");
          if (lastModified != -1 && lastModified < httpDate)
            httpDate = lastModified;
          if (httpDate != -1L) {
            safeSetMinLongValue(_linkSummaryRecord, LINKSTATUS_EARLIEST_DATE_PROPERTY, httpDate);
            safeSetMaxLongValue(_linkSummaryRecord, LINKSTATUS_LATEST_DATE_PROPERTY, httpDate);
          }
        }

        JsonElement typeElement = jsonObject.get("type");
        JsonElement relElement = jsonObject.get("rel");

        String sourceTypeAndRel = jsonObject.get("source_type").getAsString() + ":";
        if (typeElement != null) {
          sourceTypeAndRel += typeElement.getAsString();
        }
        if (relElement != null) {
          sourceTypeAndRel += ":" + relElement.getAsString();
        }
        if (_types.size() < MAX_TYPE_SAMPLES)
          _types.add(sourceTypeAndRel);
      }
    }
  }

  /**
   * take linking href data and add it to our list of incoming hrefs
   * (used during the intermediate merge process)
   *
   * @param destFP
   * @param inputData
   * @throws IOException
   */
  void importLinkSourceData(URLFPV2 destFP, TextBytes inputData) throws IOException {
    TextBytes urlText = new TextBytes();

    int curpos = inputData.getOffset();
    int endpos = inputData.getOffset() + inputData.getLength();

    byte lfPattern[] = { 0xA };
    byte tabPattern[] = { 0x9 };

    while (curpos != endpos) {
      int tabIndex = ByteArrayUtils.indexOf(inputData.getBytes(), curpos, endpos - curpos, tabPattern);
      if (tabIndex == -1) {
        break;
      } else {
        int lfIndex = ByteArrayUtils.indexOf(inputData.getBytes(), tabIndex + 1, endpos - (tabIndex + 1), lfPattern);
        if (lfIndex == -1) {
          break;
        } else {
          long sourceDomainHash = ByteArrayUtils.parseLong(inputData.getBytes(), curpos, tabIndex - curpos, 10);
          urlText.set(inputData.getBytes(), tabIndex + 1, lfIndex - (tabIndex + 1));
          URLFPV2 bloomKey = sourceKeyFromSourceAndDest(sourceDomainHash, destFP.getUrlHash());
          if (!_sourceInputsTrackingFilter.isPresent(bloomKey)) {
            // if not, check to see that we are not about to overflow sample buffer ...
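            // Each incoming sample record is assumed to be a single "<sourceDomainHash>\t<sourceURL>\n"
            // line, which is what the tab/LF scan above extracts and what trackPotentialLinkSource()
            // below writes; a purely illustrative line would be:
            //   1234567890123<TAB>http://news.example.org/story<LF>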
            if (_sourceInputsBuffer.getLength() < EXT_SOURCE_SAMPLE_BUFFER_SIZE - EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT) {
              _sourceInputsBuffer.write(inputData.getBytes(), curpos, (lfIndex + 1) - curpos);
              _sourceSampleSize++;
            }
          }
          curpos = lfIndex + 1;
        }
      }
    }
  }

  /**
   * given an incoming link for a given url, store it in an accumulation buffer IFF we have not
   * seen a url from the given domain before
   *
   * @param sourceFP
   * @param sourceURL
   * @param destFP
   * @throws IOException
   */
  void trackPotentialLinkSource(URLFPV2 sourceFP, String sourceURL, URLFPV2 destFP) throws IOException {
    URLFPV2 bloomKey = sourceKeyFromSourceAndDest(sourceFP.getDomainHash(), destFP.getUrlHash());

    // check to see if we have collected a sample for this source domain / destination url combo or not ...
    if (!_sourceInputsTrackingFilter.isPresent(bloomKey)) {
      LOG.debug("sourceFP:" + sourceFP.getKey() + " passed BloomFilter Test");
      // if not, check to see that we are not about to overflow sample buffer ...
      if (_sourceInputsBuffer.getLength() < EXT_SOURCE_SAMPLE_BUFFER_SIZE - EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT) {
        // ok store the external reference sample ...
        // write source domain hash
        _sourceInputsBuffer.write(Long.toString(sourceFP.getDomainHash()).getBytes());
        // delimiter
        _sourceInputsBuffer.write(0x09); // TAB
        // and source url ...
        _sourceInputsBuffer.write(sourceURL.getBytes(Charset.forName("UTF-8")));
        _sourceInputsBuffer.write(0x0A); // LF
        _sourceSampleSize++;
        // add to bloom filter ...
        _sourceInputsTrackingFilter.add(bloomKey);
      }
    } else {
      LOG.debug("sourceFP:" + sourceFP.getKey() + " failed BloomFilter Test");
    }
  }

  /**
   * construct a (hacked) fingerprint key consisting of the source domain and destination
   * url fingerprint, to be used for the purposes of setting bits in a bloomfilter
   *
   * @param sourceDomain
   * @param destURLHash
   * @return
   */
  private URLFPV2 sourceKeyFromSourceAndDest(long sourceDomain, long destURLHash) {
    _bloomFilterKey.setDomainHash(sourceDomain);
    _bloomFilterKey.setUrlHash(destURLHash);
    return _bloomFilterKey;
  }

  /**
   * construct crawl detail record from incoming crawl status JSON
   *
   * @param jsonObject
   * @param fpSource
   * @param extHRefs
   * @param reporter
   * @return
   * @throws IOException
   */
  static JsonObject crawlDetailRecordFromCrawlStatusRecord(JsonObject jsonObject, URLFPV2 fpSource,
      HashSet<String> extHRefs, Reporter reporter) throws IOException {
    String disposition = jsonObject.get("disposition").getAsString();
    long attemptTime = jsonObject.get("attempt_time").getAsLong();

    // inject all the details into a JSONObject
    JsonObject crawlStatsJSON = new JsonObject();
    crawlStatsJSON.addProperty(CRAWLDETAIL_ATTEMPT_TIME_PROPERTY, attemptTime);

    if (disposition.equals("SUCCESS")) {
      // basic stats ... starting with crawl time ...
      int httpResult = jsonObject.get("http_result").getAsInt();
      crawlStatsJSON.addProperty(CRAWLDETAIL_HTTPRESULT_PROPERTY, httpResult);
      crawlStatsJSON.addProperty(CRAWLDETAIL_SERVERIP_PROPERTY, jsonObject.get("server_ip").getAsString());
      //populate date headers ...
      populateDateHeadersFromJSONObject(jsonObject, crawlStatsJSON);
      // if http 200 ...
      if (httpResult >= 200 && httpResult <= 299) {
        reporter.incrCounter(Counters.GOT_HTTP_200_CRAWL_STATUS, 1);

        crawlStatsJSON.addProperty(CRAWLDETAIL_CONTENTLEN_PROPERTY, jsonObject.get("content_len").getAsInt());
        if (jsonObject.get("mime_type") != null) {
          crawlStatsJSON.addProperty(CRAWLDETAIL_MIMETYPE_PROPERTY, jsonObject.get("mime_type").getAsString());
        }
        if (jsonObject.get("md5") != null) {
          crawlStatsJSON.addProperty(CRAWLDETAIL_MD5_PROPERTY, jsonObject.get("md5").getAsString());
        }
        if (jsonObject.get("text_simhash") != null) {
          crawlStatsJSON.addProperty(CRAWLDETAIL_TEXTSIMHASH_PROPERTY, jsonObject.get("text_simhash").getAsLong());
        }

        JsonElement parsedAs = jsonObject.get("parsed_as");
        if (parsedAs != null) {
          // populate some info based on type ...
          crawlStatsJSON.addProperty(CRAWLDETAIL_PARSEDAS_PROPERTY, parsedAs.getAsString());

          String parsedAsString = parsedAs.getAsString();
          // if html ...
          if (parsedAsString.equals("html")) {
            JsonObject content = jsonObject.get("content").getAsJsonObject();
            if (content != null) {
              JsonElement titleElement = content.get("title");
              JsonElement metaElement = content.get("meta_tags");
              if (titleElement != null) {
                crawlStatsJSON.add(CRAWLDETAIL_TITLE_PROPERTY, titleElement);
              }
              if (metaElement != null) {
                crawlStatsJSON.add(CRAWLDETAIL_METATAGS_PROPERTY, metaElement);
              }
              // collect link stats for json ...
              updateLinkStatsFromHTMLContent(crawlStatsJSON, jsonObject, extHRefs, fpSource, reporter);
            }
          }
          // if feed ...
          else if (parsedAsString.equals("feed")) {
            // get content ...
            JsonObject content = jsonObject.get("content").getAsJsonObject();
            JsonElement titleElement = content.get("title");
            if (titleElement != null) {
              crawlStatsJSON.add(CRAWLDETAIL_TITLE_PROPERTY, titleElement);
            }
            // set update time ...
            long updateTime = safeGetLong(content, "updated");
            if (updateTime != -1) {
              crawlStatsJSON.addProperty(CRAWLDETAIL_UPDATED_PROPERTY, updateTime);
            }
            addMinMaxFeedItemTimes(content, crawlStatsJSON);
          }
        }
      }
      // redirect ...
      else if (httpResult >= 300 && httpResult <= 399) {
        reporter.incrCounter(Counters.GOT_REDIRECT_CRAWL_STATUS, 1);
        // get the target url ...
        JsonElement targetURL = jsonObject.get("target_url");
        if (targetURL != null) {
          // redirect details ...
          crawlStatsJSON.addProperty(CRAWLDETAIL_REDIRECT_URL, targetURL.getAsString());
        } else {
          reporter.incrCounter(Counters.GOT_NULL_REDIRECT_URL, 1);
        }
      }
    } else {
      // crawl attempt failed ... record failure reason / detail
      crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE, true);
      crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE_REASON, safeGetStringFromElement(jsonObject, "failure_reason"));
      crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE_DETAIL, safeGetStringFromElement(jsonObject, "failure_detail"));
    }
    return crawlStatsJSON;
  }

  /**
   * given a crawl detail json record, update summary record stats
   *
   * @param crawlDetailRecord
   * @param fpSource
   * @param reporter
   * @throws IOException
   */
  void updateSummaryRecordFromCrawlDetailRecord(JsonObject crawlDetailRecord, URLFPV2 fpSource, Reporter reporter)
      throws IOException {
    if (_summaryRecord == null) {
      _summaryRecord = new JsonObject();
    }

    boolean failure = safeGetBoolean(crawlDetailRecord, CRAWLDETAIL_FAILURE);
    long attemptTime = crawlDetailRecord.get(CRAWLDETAIL_ATTEMPT_TIME_PROPERTY).getAsLong();

    // set latest attempt time ...
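    // safeSetMaxLongValue is a JSONUtils helper (statically imported above); it is assumed here to
    // store max(existing value, attemptTime) under the property and return that maximum, which is
    // how the comparison just below detects whether this crawl detail is the most recent attempt.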
    long latestAttemptTime = safeSetMaxLongValue(_summaryRecord, SUMMARYRECORD_LATEST_ATTEMPT_PROPERTY, attemptTime);
    // increment attempt count
    safeIncrementJSONCounter(_summaryRecord, SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY);

    // if this is the latest attempt ...
    if (latestAttemptTime == attemptTime) {
      // add latest http result to summary
      if (!failure && crawlDetailRecord.has(CRAWLDETAIL_HTTPRESULT_PROPERTY)) {
        int httpResult = crawlDetailRecord.get(CRAWLDETAIL_HTTPRESULT_PROPERTY).getAsInt();
        // set last http result
        _summaryRecord.addProperty(SUMMARYRECORD_HTTP_RESULT_PROPERTY, httpResult);

        if (httpResult >= 200 && httpResult <= 299) {
          // update the crawl timestamp
          _summaryRecord.addProperty(SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY, attemptTime);
          // and the crawl count ....
          safeIncrementJSONCounter(_summaryRecord, SUMMARYRECORD_CRAWLCOUNT_PROPERTY);
          // update parsed as
          if (crawlDetailRecord.has(CRAWLDETAIL_PARSEDAS_PROPERTY)) {
            _summaryRecord.addProperty(SUMMARYRECORD_PARSEDAS_PROPERTY,
                safeGetStringFromElement(crawlDetailRecord, CRAWLDETAIL_PARSEDAS_PROPERTY));
          }
        } else if (httpResult >= 300 && httpResult <= 399) {
          if (crawlDetailRecord.has(CRAWLDETAIL_REDIRECT_URL)) {
            _summaryRecord.addProperty(SUMMARYRECORD_REDIRECT_URL_PROPERTY,
                safeGetStringFromElement(crawlDetailRecord, CRAWLDETAIL_REDIRECT_URL));
          }
        }
      }
    }
  }

  /**
   * given html content (json object), extract out-of-domain hrefs, cache them,
   * and update stats
   *
   * @param crawlStats
   * @param incomingJSONObject
   * @param extHRefs
   * @param fpSource
   * @param reporter
   */
  static void updateLinkStatsFromHTMLContent(JsonObject crawlStats, JsonObject incomingJSONObject,
      HashSet<String> extHRefs, URLFPV2 fpSource, Reporter reporter) {
    JsonArray links = incomingJSONObject.getAsJsonArray("links");
    if (links == null) {
      reporter.incrCounter(Counters.NULL_LINKS_ARRAY, 1);
    } else {
      // clear our snapshot of externally referenced urls
      // we only want to capture this information from
      // the links extracted via the latest content
      if (extHRefs != null)
        extHRefs.clear();

      int intraDomainLinkCount = 0;
      int intraRootLinkCount = 0;
      int interDomainLinkCount = 0;

      for (JsonElement link : links) {
        JsonObject linkObj = link.getAsJsonObject();
        if (linkObj != null && linkObj.has("href")) {
          String href = linkObj.get("href").getAsString();
          GoogleURL urlObject = new GoogleURL(href);
          if (urlObject.isValid()) {
            URLFPV2 linkFP = URLUtils.getURLFPV2FromURLObject(urlObject);
            if (linkFP != null) {
              if (linkFP.getRootDomainHash() == fpSource.getRootDomainHash()) {
                if (linkFP.getDomainHash() == fpSource.getDomainHash()) {
                  intraDomainLinkCount++;
                } else {
                  intraRootLinkCount++;
                }
              } else {
                interDomainLinkCount++;
                // track domains we link to
                if (extHRefs != null) {
                  if (extHRefs.size() <= MAX_EXTERNALLY_REFERENCED_URLS) {
                    extHRefs.add(urlObject.getCanonicalURL());
                  }
                }
              }
            }
          }
        }
      }
      // update counts in crawl stats data structure ...
      crawlStats.addProperty(CRAWLDETAIL_INTRADOMAIN_LINKS, intraDomainLinkCount);
      crawlStats.addProperty(CRAWLDETAIL_INTRAROOT_LINKS, intraRootLinkCount);
      crawlStats.addProperty(CRAWLDETAIL_INTERDOMAIN_LINKS, interDomainLinkCount);

      if (interDomainLinkCount <= 100) {
        reporter.incrCounter(Counters.INTERDOMAIN_LINKS_LTEQ_100, 1);
      } else if (interDomainLinkCount <= 1000) {
        reporter.incrCounter(Counters.INTERDOMAIN_LINKS_LTEQ_1000, 1);
      } else {
        reporter.incrCounter(Counters.INTERDOMAIN_LINKS_GT_1000, 1);
      }
    }
  }

  /**
   * flush currently accumulated JSON record
   *
   * @param output
   * @param reporter
   * @throws IOException
   */
  private void flushCurrentRecord(OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {
    _urlsProcessed++;

    if (_outputKeyString == null || !_outputKeyURLObj.isValid()) {
      if (reporter != null) {
        reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
      }
    } else {
      if (_topLevelJSONObject != null || _summaryRecord != null || _linkSummaryRecord != null) {
        if (_topLevelJSONObject == null) {
          reporter.incrCounter(Counters.ALLOCATED_TOP_LEVEL_OBJECT_IN_FLUSH, 1);
          _topLevelJSONObject = new JsonObject();
          _topLevelJSONObject.addProperty(TOPLEVEL_SOURCE_URL_PROPRETY, _outputKeyString);
        } else {
          reporter.incrCounter(Counters.ENCOUNTERED_EXISTING_TOP_LEVEL_OBJECT_IN_FLUSH, 1);
        }

        if (_summaryRecord != null) {
          _summaryRecord.remove(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS);
          _summaryRecord.remove(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS_TRUNCATED);
          if (_extHrefs.size() != 0) {
            // output links in the top level object ...
            stringCollectionToJsonArrayWithMax(_summaryRecord, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS, _extHrefs,
                MAX_EXTERNALLY_REFERENCED_URLS);
            if (_extHrefs.size() > MAX_EXTERNALLY_REFERENCED_URLS) {
              _summaryRecord.addProperty(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS_TRUNCATED, true);
            }
          }
          reporter.incrCounter(Counters.ENCOUNTERED_SUMMARY_RECORD_IN_FLUSH, 1);
          _topLevelJSONObject.add(TOPLEVEL_SUMMARYRECORD_PROPRETY, _summaryRecord);
        }

        if (_linkSummaryRecord != null) {
          reporter.incrCounter(Counters.ENCOUNTERED_LINKSUMMARY_RECORD_IN_FLUSH, 1);
          if (_types != null && _types.size() != 0) {
            stringCollectionToJsonArray(_linkSummaryRecord, LINKSTATUS_TYPEANDRELS_PROPERTY, _types);
          }
          _topLevelJSONObject.add(TOPLEVEL_LINKSTATUS_PROPERTY, _linkSummaryRecord);
        }

        //System.out.println("Emitting Key:" + CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, 0));

        if (_topLevelJSONObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
          JsonObject blekkoMetadata = _topLevelJSONObject.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY);
          reporter.incrCounter(Counters.EMITTED_RECORD_WITH_BLEKKO_METADATA, 1);
          if (_linkSummaryRecord != null || _summaryRecord != null) {
            reporter.incrCounter(Counters.BLEKKO_RECORD_ALREADY_IN_DATABASE, 1);
            if (_summaryRecord != null) {
              if (_summaryRecord.has(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY)
                  && _summaryRecord.get(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY).getAsInt() != 0) {
                String status = blekkoMetadata.get(BLEKKO_METADATA_STATUS).getAsString();
                if (status.equalsIgnoreCase("crawled")) {
                  reporter.incrCounter(Counters.BLEKKO_CRAWLED_CC_CRAWLED, 1);
                } else {
                  reporter.incrCounter(Counters.BLEKKO_NOT_CRAWLED_CC_CRAWLED, 1);
                }
              }
            }
          }
        }

        // output top level record ...
        output.collect(CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, 0),
            new TextBytes(_topLevelJSONObject.toString()));

        // if there is link status available ...
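        // The sample emitted below is the accumulated "<sourceDomainHash>\t<sourceURL>\n" buffer
        // built up by trackPotentialLinkSource(); it is keyed as KEY_TYPE_INCOMING_URLS_SAMPLE so
        // that a subsequent merge pass can fold it back in via importLinkSourceData() above.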
        if (_sourceSampleSize != 0) {
          reporter.incrCounter(Counters.EMITTED_SOURCEINPUTS_RECORD, 1);
          TextBytes sourceInputsText = new TextBytes();
          sourceInputsText.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());
          //System.out.println("Emitting Key:" + CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE, 0));
          output.collect(CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE, 0),
              sourceInputsText);
          reporter.incrCounter(Counters.EMITTED_SOURCEINPUTS_DATA_BYTES_EMITTED, sourceInputsText.getLength());
        }
      }

      if (_urlsProcessed % FLUSH_INTERVAL == 0) {
        _sourceInputsTrackingFilter.clear();
      }
    }

    _sourceInputsBuffer.reset();
    _sourceSampleSize = 0;
    _topLevelJSONObject = null;
    _summaryRecord = null;
    _linkSummaryRecord = null;
    _types.clear();
    _extHrefs.clear();
    _outputKeyString = null;
    _urlKeyForzen = false;
    _outputKeyURLObj = null;
  }

  /**
   * Extract the fingerprint from the incoming key and potentially trigger a flush if it is indicative of a
   * primary key transition
   *
   * @param key
   * @param output
   * @param reporter
   * @throws IOException
   */
  private void readFPCheckForTransition(TextBytes key, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {
    if (_tempKey == null) {
      _tempKey = new URLFPV2();
    }
    _tempKey.setRootDomainHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
    _tempKey.setDomainHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.DOMAIN_HASH_COMPONENT_ID));
    _tempKey.setUrlHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.URL_HASH_COMPONENT_ID));

    if (_currentKey == null) {
      _currentKey = _tempKey;
      _tempKey = null;
    } else {
      // check for key transition ...
      if (_currentKey.compareTo(_tempKey) != 0) {
        // transition
        flushCurrentRecord(output, reporter);
        // swap keys ...
        URLFPV2 oldKey = _currentKey;
        _currentKey = _tempKey;
        _tempKey = oldKey;
      }
    }
  }

  /**
   * add crawl detail to summary record. construct a summary record if none exists ...
   *
   * @param crawlStatsJSON
   */
  void safeAddCrawlDetailToSummaryRecord(JsonObject crawlStatsJSON) {
    if (_summaryRecord == null) {
      _summaryRecord = new JsonObject();
    }
    // construct crawl stats array if necessary
    JsonArray crawlStatsArray = _summaryRecord.getAsJsonArray(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
    if (crawlStatsArray == null) {
      crawlStatsArray = new JsonArray();
      _summaryRecord.add(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY, crawlStatsArray);
    }
    // add crawl stats to it
    crawlStatsArray.add(crawlStatsJSON);
  }

  /**
   * scan the merge db path and find the latest crawl database timestamp
   *
   * @param fs
   * @param conf
   * @return
   * @throws IOException
   */
  static long findLatestMergeDBTimestamp(FileSystem fs, Configuration conf) throws IOException {
    long timestampOut = -1L;

    FileStatus files[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_DB_PATH, "[0-9]*"));
    for (FileStatus candidate : files) {
      Path successPath = new Path(candidate.getPath(), "_SUCCESS");
      if (fs.exists(successPath)) {
        long timestamp = Long.parseLong(candidate.getPath().getName());
        timestampOut = Math.max(timestamp, timestampOut);
      }
    }
    return timestampOut;
  }

  /**
   * iterate the intermediate link graph data and extract the unmerged set ...
   *
   * @param fs
   * @param conf
   * @param latestMergeDBTimestamp
   * @return
   * @throws IOException
   */
  static List<Path> filterMergeCandidtes(FileSystem fs, Configuration conf, long latestMergeDBTimestamp)
      throws IOException {
    ArrayList<Path> list = new ArrayList<Path>();

    FileStatus candidates[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH, "[0-9]*"));
    for (FileStatus candidate : candidates) {
      LOG.info("Found Merge Candidate:" + candidate.getPath());
      long candidateTimestamp = Long.parseLong(candidate.getPath().getName());
      if (candidateTimestamp > latestMergeDBTimestamp) {
        Path successPath = new Path(candidate.getPath(), "_SUCCESS");
        if (fs.exists(successPath)) {
          list.add(candidate.getPath());
        } else {
          LOG.info("Rejected Merge Candidate:" + candidate.getPath());
        }
      }
    }
    return list;
  }

  ///////////////////////////////////////////////////////////////////////////
  // TEST CODE
  ///////////////////////////////////////////////////////////////////////////

  /*
  // PARK THIS CODE FOR NOW SINCE WE ARE TRANSFERRING DATA PROCESSING TO EC2
  if (_skipPartition) return;

  // collect all incoming paths first
  Vector<Path> incomingPaths = new Vector<Path>();
  while (values.hasNext()) {
    String path = values.next().toString();
    LOG.info("Found Incoming Path:" + path);
    incomingPaths.add(new Path(path));
  }

  FlexBuffer scanArray[] = LinkKey.allocateScanArray();

  // set up merge attributes
  Configuration localMergeConfig = new Configuration(_conf);
  localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
      RawComparator.class);
  localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class, WritableComparable.class);

  // ok now spawn merger
  MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
      _fs, incomingPaths, localMergeConfig);

  TextBytes keyBytes = new TextBytes();
  TextBytes valueBytes = new TextBytes();
  DataInputBuffer inputBuffer = new DataInputBuffer();
  int processedKeysCount = 0;

  Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

  while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
    urlsProcessed++;
    _sourceInputsBuffer.reset();
    _sourceSampleSize = 0;
    summaryRecord = null;
    linkSummaryRecord = null;
    types.clear();
    outputKeyString = null;
    outputKeyFromInternalLink = false;
    outputKeyURLObj = null;
    extLinkedDomains.clear();

    int statusCount = 0;
    int linkCount = 0;

    // scan key components
    LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

    // pick up source fp from key ...
    URLFPV2 fpSource = new URLFPV2();
    fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
    fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
    fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray, LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

    for (RawRecordValue rawValue : nextItem.e1) {
      inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
      int length = WritableUtils.readVInt(inputBuffer);
      keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
      inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
      length = WritableUtils.readVInt(inputBuffer);
      valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);
  */
}
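
/*
 * Usage sketch (not part of the original source): a driver job would wire this reducer up roughly
 * as below, using the old org.apache.hadoop.mapred API that the class implements. The paths and
 * job name are hypothetical placeholders; the real driver elsewhere in the commoncrawl codebase
 * may also configure a partitioner and comparator appropriate for CrawlDBKey ordering.
 *
 *   JobConf job = new JobConf(CrawlDBMergingReducer.class);
 *   job.setJobName("crawldb-merge");
 *   job.setReducerClass(CrawlDBMergingReducer.class);
 *   job.setOutputKeyClass(TextBytes.class);
 *   job.setOutputValueClass(TextBytes.class);
 *   FileInputFormat.addInputPath(job, new Path("/common-crawl/crawl-db/intermediate/..."));
 *   FileOutputFormat.setOutputPath(job, new Path("/common-crawl/crawl-db/mergedDB/..."));
 *   JobClient.runJob(job);
 */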