org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java Source code

Introduction

Here is the source code for org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkMergerJob.java, a Hadoop map/reduce job from the CommonCrawl codebase that merges newly imported link-collector segments (crawl-status and link records) into a consolidated, timestamped merged-link database.

Source

/**
 * Copyright 2012 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.ec2.postprocess.linkCollector;

import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkDataResharder.Counters;
import org.commoncrawl.mapred.ec2.postprocess.linkCollector.LinkKey.LinkKeyGroupingComparator;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergeInputFormat;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergePartitioner;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.Tuples.Pair;

import com.google.common.collect.ImmutableSet;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;

public class LinkMergerJob implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

    static final Log LOG = LogFactory.getLog(LinkMergerJob.class);
    static final Path internalMergedSegmentPath = new Path("crawl/ec2Import/mergedSegment");
    static final Path internalMergedDBPath = new Path("crawl/ec2Import/mergedDB");

    static final int MAX_TYPE_SAMPLES = 5;
    static final int MAX_EXT_SOURCE_SAMPLES = 100;

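    /**
     * Scan the merged-DB root for numerically named child directories and
     * return the largest (most recent) timestamp found, or -1 if none exist.
     */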
    static long findLatestMergeDBTimestamp(FileSystem fs, Configuration conf) throws IOException {
        long timestampOut = -1L;

        FileStatus files[] = fs.globStatus(new Path(internalMergedDBPath, "[0-9]*"));

        for (FileStatus candidate : files) {
            long timestamp = Long.parseLong(candidate.getPath().getName());
            timestampOut = Math.max(timestamp, timestampOut);
        }
        return timestampOut;
    }

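    /**
     * Collect the merged-segment directories whose timestamps are newer than
     * the given merge-DB timestamp; only these need to be folded into the DB.
     */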
    static List<Path> filterMergeCandidates(FileSystem fs, Configuration conf, long latestMergeDBTimestamp)
            throws IOException {
        ArrayList<Path> list = new ArrayList<Path>();
        FileStatus candidates[] = fs.globStatus(new Path(internalMergedSegmentPath, "[0-9]*"));

        for (FileStatus candidate : candidates) {
            long candidateTimestamp = Long.parseLong(candidate.getPath().getName());
            if (candidateTimestamp > latestMergeDBTimestamp) {
                list.add(candidate.getPath());
            }
        }
        return list;
    }

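    /**
     * Job driver: locate segments newer than the last merged DB, run the merge
     * job against them (plus the previous DB, if any, as the affinity path),
     * and promote the temp output to a new timestamped merged-DB directory.
     */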
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // establish merge timestamp
        long mergeTimestamp = System.currentTimeMillis();
        // get a temp directory ...
        Path outputPath = JobBuilder.tempDir(conf, Long.toString(mergeTimestamp));

        // find the latest merge timestamp ...
        long latestMergeDBTimestamp = findLatestMergeDBTimestamp(fs, conf);
        LOG.info("Latest MergeDB Timestamp is: " + latestMergeDBTimestamp);
        // find the list of merge candidates ...
        List<Path> candidateList = filterMergeCandidates(fs, conf, latestMergeDBTimestamp);
        LOG.info("Merge Candidate List is: " + candidateList);
        if (!candidateList.isEmpty()) {
            ArrayList<Path> inputPaths = new ArrayList<Path>();

            // add all input paths to list 
            inputPaths.addAll(candidateList);
            // establish an affinity path ... 
            Path affinityPath = candidateList.get(0);
            // add merge db path if it exists 
            if (latestMergeDBTimestamp != -1L) {
                affinityPath = new Path(internalMergedDBPath, Long.toString(latestMergeDBTimestamp));
                inputPaths.add(affinityPath);
            }

            JobConf jobConf = new JobBuilder("Final Merge Job", conf).inputs(inputPaths)
                    .inputFormat(MultiFileMergeInputFormat.class).mapperKeyValue(IntWritable.class, Text.class)
                    .outputKeyValue(TextBytes.class, TextBytes.class).outputFormat(SequenceFileOutputFormat.class)
                    .reducer(LinkMergerJob.class, false).partition(MultiFileMergePartitioner.class)
                    .numReducers(CrawlEnvironment.NUM_DB_SHARDS).speculativeExecution(false).output(outputPath)
                    .setAffinityNoBalancing(affinityPath,
                            ImmutableSet.of("ccd001.commoncrawl.org", "ccd006.commoncrawl.org"))
                    .compressMapOutput(false).compressor(CompressionType.BLOCK, SnappyCodec.class)
                    .build();

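            // "hack": restrict the job to a hand-picked set of reducer
            // partitions; configure() reads this list and skips all others.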
            JsonArray hack = new JsonArray();

            hack.add(new JsonPrimitive(11));
            hack.add(new JsonPrimitive(21));
            hack.add(new JsonPrimitive(82));
            hack.add(new JsonPrimitive(83));
            hack.add(new JsonPrimitive(90));

            jobConf.set("hack", hack.toString());

            LOG.info("Starting JOB");
            JobClient.runJob(jobConf);

            Path finalOutputPath = new Path(internalMergedDBPath, Long.toString(mergeTimestamp));
            LOG.info("Renaming temp output: " + outputPath + " to: " + finalOutputPath);
            fs.rename(outputPath, finalOutputPath);
        }

    }

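    /**
     * For a permanent (301) redirect, emit a synthetic SUCCESS crawl-status
     * record keyed by the redirect's source URL and pointing at the final URL.
     */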
    void emitRedirectRecord(JsonObject jsonObject, JsonObject redirectObj, Reporter reporter) throws IOException {

        int httpResult = redirectObj.get("http_result").getAsInt();
        if (httpResult == 301) {
            JsonObject redirectJSON = new JsonObject();

            redirectJSON.addProperty("disposition", "SUCCESS");
            redirectJSON.addProperty("http_result", 301);
            redirectJSON.addProperty("attempt_time", jsonObject.get("attempt_time").getAsLong());
            redirectJSON.addProperty("target_url", jsonObject.get("source_url").getAsString());

            reporter.incrCounter(Counters.EMITTED_REDIRECT_RECORD, 1);
            _redirectWriter.append(new TextBytes(redirectObj.get("source_url").getAsString()),
                    new TextBytes(redirectJSON.toString()));
        }
    }

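    /**
     * Copy cache-related HTTP headers into the crawl-stats record, converting
     * date-valued headers to timestamps.
     */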
    void populateDateHeadersFromJSONObject(JsonObject jsonObject, JsonObject crawlStatsJSON) {
        JsonObject headers = jsonObject.getAsJsonObject("http_headers");
        // getAsJsonObject returns null when the property is absent; bail out
        // rather than NPE on the header lookups below
        if (headers == null) {
            return;
        }
        JsonElement httpDate = headers.get("date");
        JsonElement age = headers.get("age");
        JsonElement lastModified = headers.get("last-modified");
        JsonElement expires = headers.get("expires");
        JsonElement cacheControl = headers.get("cache-control");
        JsonElement pragma = headers.get("pragma");
        JsonElement etag = headers.get("etag");

        if (httpDate != null) {
            crawlStatsJSON.addProperty("date", HttpHeaderInfoExtractor.getTime(httpDate.getAsString()));
        }
        if (age != null) {
            crawlStatsJSON.add("age", age);
        }
        if (lastModified != null) {
            crawlStatsJSON.addProperty("last-modified",
                    HttpHeaderInfoExtractor.getTime(lastModified.getAsString()));
        }
        if (expires != null) {
            crawlStatsJSON.addProperty("expires", HttpHeaderInfoExtractor.getTime(expires.getAsString()));
        }
        if (cacheControl != null) {
            crawlStatsJSON.add("cache-control", cacheControl);
        }
        if (pragma != null) {
            crawlStatsJSON.add("pragma", pragma);
        }
        if (etag != null) {
            crawlStatsJSON.add("etag", etag);
        }
    }

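    // The safe* helpers below treat a JsonObject as a bag of counters and
    // min/max values, tolerating absent properties.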
    static void safeIncrementJSONCounter(JsonObject jsonObj, String property) {
        if (jsonObj.has(property)) {
            jsonObj.addProperty(property, jsonObj.get(property).getAsInt() + 1);
        } else {
            jsonObj.addProperty(property, 1);
        }
    }

    static long safeGetLong(JsonObject jsonObj, String property) {
        JsonElement element = jsonObj.get(property);
        if (element != null) {
            return element.getAsLong();
        }
        return -1;
    }

    static long safeGetHttpDate(JsonObject jsonObj, String property) {
        JsonElement element = jsonObj.get(property);
        if (element != null) {
            return HttpHeaderInfoExtractor.getTime(element.getAsString());
        }
        return -1;
    }

    static long safeSetMaxLongValue(JsonObject jsonObj, String property, long newValue) {
        JsonElement element = jsonObj.get(property);
        if (element != null) {
            if (element.getAsLong() > newValue) {
                return element.getAsLong();
            }
        }
        jsonObj.addProperty(property, newValue);
        return newValue;
    }

    static long safeSetMinLongValue(JsonObject jsonObj, String property, long newValue) {
        JsonElement element = jsonObj.get(property);
        if (element != null) {
            if (newValue > element.getAsLong()) {
                return element.getAsLong();
            }
        }
        jsonObj.addProperty(property, newValue);
        return newValue;
    }

    static void stringCollectionToJsonArray(JsonObject jsonObject, String propertyName,
            Collection<String> stringSet) {
        JsonArray array = new JsonArray();
        for (String value : stringSet) {
            array.add(new JsonPrimitive(value));
        }
        jsonObject.add(propertyName, array);
    }

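    /**
     * Walk a feed's "items" array and record the earliest and latest
     * publish/update times plus the item count on the crawl-stats record.
     */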
    void addMinMaxFeedItemTimes(JsonObject contentObj, JsonObject crawlStatsJSON) {
        JsonArray items = contentObj.getAsJsonArray("items");

        if (items != null) {
            long minPubDate = -1L;
            long maxPubDate = -1L;
            int itemCount = 0;

            for (JsonElement item : items) {
                long pubDateValue = -1;
                JsonElement pubDate = item.getAsJsonObject().get("published");

                if (pubDate != null) {
                    pubDateValue = pubDate.getAsLong();
                }
                JsonElement updateDate = item.getAsJsonObject().get("updated");
                if (updateDate != null) {
                    if (updateDate.getAsLong() > pubDateValue) {
                        pubDateValue = updateDate.getAsLong();
                    }
                }

                if (minPubDate == -1L || pubDateValue < minPubDate) {
                    minPubDate = pubDateValue;
                }
                if (maxPubDate == -1L || pubDateValue > maxPubDate) {
                    maxPubDate = pubDateValue;
                }
                itemCount++;
            }
            crawlStatsJSON.addProperty("minPubDate", minPubDate);
            crawlStatsJSON.addProperty("maxPubDate", maxPubDate);
            crawlStatsJSON.addProperty("itemCount", itemCount);
        }
    }

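    /**
     * Fold one inbound-link record into the link summary: classify the source
     * as internal or external to the target's root domain, sample external
     * source URLs, track date bounds, and sample type/rel strings.
     */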
    void updateLinkStatsFromLinkJSONObject(JsonObject jsonObject, URLFPV2 keyFP, Reporter reporter) {
        JsonElement sourceElement = jsonObject.get("source_url");
        JsonElement hrefElement = jsonObject.get("href");

        if (sourceElement != null && hrefElement != null) {
            GoogleURL hrefSource = new GoogleURL(sourceElement.getAsString());

            if (hrefSource.isValid()) {

                if (outputKeyString == null || !outputKeyFromInternalLink) {
                    if (outputKeyString == null || outputKeyString.compareTo(hrefElement.getAsString()) != 0) {
                        outputKeyString = hrefElement.getAsString();
                        outputKeyURLObj = new GoogleURL(outputKeyString);
                        if (outputKeyURLObj.getHost().equals(hrefSource.getHost())) {
                            outputKeyFromInternalLink = true;
                        }
                    }
                }

                if (linkSummaryRecord == null) {
                    linkSummaryRecord = new JsonObject();
                }

                // quick check first ... 
                if (!outputKeyURLObj.getHost().equals(hrefSource.getHost())) {
                    // ok now deeper check ...
                    URLFPV2 sourceFP = URLUtils.getURLFPV2FromURLObject(hrefSource);
                    if (sourceFP != null) {
                        if (sourceFP.getRootDomainHash() != keyFP.getRootDomainHash()) {
                            reporter.incrCounter(Counters.GOT_EXTERNAL_DOMAIN_SOURCE, 1);
                            safeIncrementJSONCounter(linkSummaryRecord, "ext_sources");
                            if (linkSources == null || linkSources.size() < MAX_EXT_SOURCE_SAMPLES) {
                                if (linkSources == null)
                                    linkSources = new HashMap<Long, String>();
                                if (!linkSources.containsKey(sourceFP.getRootDomainHash())) {
                                    linkSources.put(sourceFP.getRootDomainHash(), sourceElement.getAsString());
                                }
                            }
                        } else {
                            safeIncrementJSONCounter(linkSummaryRecord, "int_sources");
                        }
                    }
                } else {
                    // internal for sure ... 
                    safeIncrementJSONCounter(linkSummaryRecord, "int_sources");
                }

                JsonObject sourceHeaders = jsonObject.getAsJsonObject("source_headers");
                if (sourceHeaders != null) {
                    long httpDate = safeGetHttpDate(sourceHeaders, "date");
                    long lastModified = safeGetHttpDate(sourceHeaders, "last-modified");
                    if (lastModified != -1 && lastModified < httpDate)
                        httpDate = lastModified;
                    if (httpDate != -1L) {
                        safeSetMinLongValue(linkSummaryRecord, "earlier_date", httpDate);
                        safeSetMaxLongValue(linkSummaryRecord, "latest_date", httpDate);
                    }
                }
                JsonElement typeElement = jsonObject.get("type");
                JsonElement relElement = jsonObject.get("rel");

                String sourceTypeAndRel = jsonObject.get("source_type").getAsString() + ":";

                if (typeElement != null) {
                    sourceTypeAndRel += typeElement.getAsString();
                }
                if (relElement != null) {
                    sourceTypeAndRel += ":" + relElement.getAsString();
                }

                if (types.size() < MAX_TYPE_SAMPLES)
                    types.add(sourceTypeAndRel);
            }

        }
    }

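    /**
     * Fold one crawl-status record into the crawl summary: track attempt and
     * crawl counts, latest attempt/crawl times, redirects, and (for http 200
     * responses) a detailed per-crawl stats object.
     */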
    void updateCrawlStatsFromJSONObject(JsonObject jsonObject, URLFPV2 fpSource, Reporter reporter)
            throws IOException {

        JsonElement sourceHREFElement = jsonObject.get("source_url");

        if (sourceHREFElement != null) {
            if (outputKeyString == null || !outputKeyFromInternalLink) {
                outputKeyString = sourceHREFElement.getAsString();
                outputKeyURLObj = new GoogleURL(sourceHREFElement.getAsString());
            }
            if (summaryRecord == null) {
                summaryRecord = new JsonObject();
            }
            String disposition = jsonObject.get("disposition").getAsString();
            long attemptTime = jsonObject.get("attempt_time").getAsLong();

            safeIncrementJSONCounter(summaryRecord, "attempt_count");

            long latestAttemptTime = safeSetMaxLongValue(summaryRecord, "latest_attempt", attemptTime);

            JsonElement redirectObject = jsonObject.get("redirect_from");
            if (redirectObject != null) {
                emitRedirectRecord(jsonObject, redirectObject.getAsJsonObject(), reporter);
            }

            if (latestAttemptTime == attemptTime) {
                summaryRecord.addProperty("failed", (disposition.equals("SUCCESS")) ? false : true);
            }

            if (disposition.equals("SUCCESS")) {

                int httpResult = jsonObject.get("http_result").getAsInt();

                if (httpResult == 200) {
                    outputKeyFromInternalLink = true;
                }
                if (latestAttemptTime == attemptTime) {
                    summaryRecord.addProperty("http_result", httpResult);
                }

                if (httpResult == 200) {
                    // inject all the details into a JSONObject 
                    JsonObject crawlStatsJSON = new JsonObject();

                    // basic stats ... starting with crawl time ...
                    crawlStatsJSON.addProperty("server_ip", jsonObject.get("server_ip").getAsString());
                    crawlStatsJSON.addProperty("attempt_time", attemptTime);
                    crawlStatsJSON.addProperty("content_len", jsonObject.get("content_len").getAsInt());
                    if (jsonObject.get("mime_type") != null) {
                        crawlStatsJSON.addProperty("mime_type", jsonObject.get("mime_type").getAsString());
                    }
                    if (jsonObject.get("md5") != null) {
                        crawlStatsJSON.addProperty("md5", jsonObject.get("md5").getAsString());
                    }
                    if (jsonObject.get("text_simhash") != null) {
                        crawlStatsJSON.addProperty("text_simhash", jsonObject.get("text_simhash").getAsLong());
                    }
                    //populate date headers ... 
                    populateDateHeadersFromJSONObject(jsonObject, crawlStatsJSON);

                    JsonElement parsedAs = jsonObject.get("parsed_as");

                    if (parsedAs != null) {
                        // populate some info based on type ... 
                        crawlStatsJSON.addProperty("parsed_as", parsedAs.getAsString());

                        String parsedAsString = parsedAs.getAsString();

                        // if html ... 
                        if (parsedAsString.equals("html")) {
                            JsonObject content = jsonObject.get("content").getAsJsonObject();
                            if (content != null) {
                                JsonElement titleElement = content.get("title");
                                JsonElement metaElement = content.get("meta_tags");
                                if (titleElement != null) {
                                    crawlStatsJSON.add("title", titleElement);
                                }
                                if (metaElement != null) {
                                    crawlStatsJSON.add("meta_tags", metaElement);
                                }
                                // collect link stats for json ... 
                                updateLinkStatsFromHTMLContent(crawlStatsJSON, content, fpSource, reporter);
                            }

                        }
                        // if feed ... 
                        else if (parsedAsString.equals("feed")) {
                            // get content ... (getAsJsonObject returns null when "content" is missing)
                            JsonObject content = jsonObject.getAsJsonObject("content");
                            if (content != null) {
                                JsonElement titleElement = content.get("title");
                                if (titleElement != null) {
                                    crawlStatsJSON.add("title", titleElement);
                                }
                                // set update time ...
                                long updateTime = safeGetLong(content, "updated");
                                if (updateTime != -1) {
                                    crawlStatsJSON.addProperty("updated", updateTime);
                                }

                                addMinMaxFeedItemTimes(content, crawlStatsJSON);
                            }
                        }
                    }

                    long latestCrawlTime = safeSetMaxLongValue(summaryRecord, "latest_crawl", attemptTime);

                    if (latestCrawlTime == attemptTime) {
                        // update latest http result 
                        // update parsed as info 
                        if (parsedAs != null) {
                            summaryRecord.addProperty("parsed_as", parsedAs.getAsString());
                        }
                        // update the timestamp 
                        summaryRecord.addProperty("latest_crawl", attemptTime);
                    }
                    // always increment successful crawl count ...
                    safeIncrementJSONCounter(summaryRecord, "crawl_count");

                    // construct crawl stats array if necessary 
                    JsonArray crawlStatsArray = summaryRecord.getAsJsonArray("crawl_stats");
                    if (crawlStatsArray == null) {
                        crawlStatsArray = new JsonArray();
                        summaryRecord.add("crawl_stats", crawlStatsArray);
                    }
                    // add crawl stats to it 
                    crawlStatsArray.add(crawlStatsJSON);
                }
            }
        } else {
            reporter.incrCounter(Counters.NO_SOURCE_URL_FOR_CRAWL_STATUS, 1);
        }
    }

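    /**
     * Count the outbound links in parsed HTML content, split into internal
     * vs. external by root-domain hash.
     */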
    void updateLinkStatsFromHTMLContent(JsonObject crawlStats, JsonObject content, URLFPV2 fpSource,
            Reporter reporter) {
        JsonArray links = content.getAsJsonArray("links");

        if (links == null) {
            reporter.incrCounter(Counters.NULL_LINKS_ARRAY, 1);
        } else {
            int internalLinkCount = 0;
            int externalLinkCount = 0;

            for (JsonElement link : links) {
                // getAsJsonObject() throws rather than returning null, so test the element type first
                if (link.isJsonObject() && link.getAsJsonObject().has("href")) {
                    JsonObject linkObj = link.getAsJsonObject();
                    String href = linkObj.get("href").getAsString();
                    URLFPV2 linkFP = URLUtils.getURLFPV2FromURL(href);
                    if (linkFP != null) {
                        if (linkFP.getRootDomainHash() == fpSource.getRootDomainHash()) {
                            internalLinkCount++;
                        } else {
                            externalLinkCount++;
                        }
                    }
                }
            }
            crawlStats.addProperty("internal_links", internalLinkCount);
            crawlStats.addProperty("external_links", externalLinkCount);
        }
    }

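    // Per-key merge state, reset at the top of each reduce iteration.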
    JsonParser _parser = new JsonParser();
    JsonObject summaryRecord = null;
    JsonObject linkSummaryRecord = null;
    HashSet<String> types = new HashSet<String>();
    HashMap<Long, String> linkSources = null;
    String outputKeyString = null;
    boolean outputKeyFromInternalLink = false;
    GoogleURL outputKeyURLObj = null;

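    /**
     * The incoming values are shard file paths, not records: this reducer
     * opens all of them through a MultiFileInputReader and merges the raw
     * crawl-status and link records for each URL key into a single composite
     * JSON output record.
     */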
    @Override
    public void reduce(IntWritable key, Iterator<Text> values, OutputCollector<TextBytes, TextBytes> output,
            Reporter reporter) throws IOException {

        if (_skipPartition)
            return;
        // collect all incoming paths first
        Vector<Path> incomingPaths = new Vector<Path>();

        while (values.hasNext()) {
            String path = values.next().toString();
            LOG.info("Found Incoming Path:" + path);
            incomingPaths.add(new Path(path));
        }

        FlexBuffer scanArray[] = LinkKey.allocateScanArray();

        // set up merge attributes
        Configuration localMergeConfig = new Configuration(_conf);

        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, LinkKeyGroupingComparator.class,
                RawComparator.class);
        localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, TextBytes.class,
                WritableComparable.class);

        // ok now spawn merger
        MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(_fs,
                incomingPaths, localMergeConfig);

        TextBytes keyBytes = new TextBytes();
        TextBytes valueBytes = new TextBytes();
        DataInputBuffer inputBuffer = new DataInputBuffer();

        int processedKeysCount = 0;

        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {

            summaryRecord = null;
            linkSummaryRecord = null;
            types.clear();
            linkSources = null;
            outputKeyString = null;
            outputKeyFromInternalLink = false;
            outputKeyURLObj = null;

            int statusCount = 0;
            int linkCount = 0;

            // scan key components 
            LinkKey.scanForComponents(nextItem.e0._keyObject, ':', scanArray);

            // pick up source fp from key ... 
            URLFPV2 fpSource = new URLFPV2();

            fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
            fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
            fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,
                    LinkKey.ComponentId.URL_HASH_COMPONENT_ID));

            for (RawRecordValue rawValue : nextItem.e1) {

                inputBuffer.reset(rawValue.key.getData(), 0, rawValue.key.getLength());
                int length = WritableUtils.readVInt(inputBuffer);
                keyBytes.set(rawValue.key.getData(), inputBuffer.getPosition(), length);
                inputBuffer.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
                length = WritableUtils.readVInt(inputBuffer);
                valueBytes.set(rawValue.data.getData(), inputBuffer.getPosition(), length);

                long linkType = LinkKey.getLongComponentFromKey(keyBytes, LinkKey.ComponentId.TYPE_COMPONENT_ID);

                if (linkType == LinkKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
                    statusCount++;

                    try {
                        JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                        if (object != null) {
                            updateCrawlStatsFromJSONObject(object, fpSource, reporter);
                        }
                    } catch (Exception e) {
                        LOG.error("Error Parsing JSON:" + valueBytes.toString());
                        throw new IOException(e);
                    }
                } else {
                    linkCount++;
                    JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
                    // ok this is a link ... 
                    updateLinkStatsFromLinkJSONObject(object, fpSource, reporter);
                }

                reporter.progress();
            }

            if (statusCount > 1) {
                reporter.incrCounter(Counters.TWO_REDUNDANT_STATUS_IN_REDUCER, 1);
            }

            if (statusCount == 0 && linkCount != 0) {
                reporter.incrCounter(Counters.DISCOVERED_NEW_LINK, 1);
            } else {
                if (statusCount >= 1 && linkCount >= 1) {
                    reporter.incrCounter(Counters.GOT_CRAWL_STATUS_WITH_LINK, 1);
                } else if (statusCount >= 1 && linkCount == 0) {
                    reporter.incrCounter(Counters.GOT_CRAWL_STATUS_NO_LINK, 1);
                }
            }

            if (summaryRecord != null || linkSummaryRecord != null) {
                JsonObject compositeObject = new JsonObject();
                if (summaryRecord != null) {
                    compositeObject.add("crawl_status", summaryRecord);
                }
                if (linkSummaryRecord != null) {
                    if (types != null && types.size() != 0) {
                        stringCollectionToJsonArray(linkSummaryRecord, "typeAndRels", types);
                        if (linkSources != null) {
                            stringCollectionToJsonArray(linkSummaryRecord, "sources", linkSources.values());
                        }
                    }
                    compositeObject.add("link_status", linkSummaryRecord);
                }

                if (outputKeyString != null && outputKeyURLObj != null && outputKeyURLObj.isValid()) {
                    if (outputKeyFromInternalLink) {
                        reporter.incrCounter(Counters.OUTPUT_KEY_FROM_INTERNAL_LINK, 1);
                    } else {
                        reporter.incrCounter(Counters.OUTPUT_KEY_FROM_EXTERNAL_LINK, 1);
                    }
                    output.collect(new TextBytes(outputKeyString), new TextBytes(compositeObject.toString()));
                } else {
                    reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
                }
            }
        }
    }

    JobConf _conf;
    FileSystem _fs;
    SequenceFile.Writer _redirectWriter = null;

    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
    static {
        NUMBER_FORMAT.setMinimumIntegerDigits(5);
        NUMBER_FORMAT.setGroupingUsed(false);
    }

    boolean _skipPartition = false;

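    /**
     * Reads the optional "hack" partition list; if this task's partition is
     * not listed, the reducer becomes a no-op. Otherwise opens the side-band
     * SequenceFile writer used for redirect records.
     */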
    @Override
    public void configure(JobConf job) {

        HashSet<Integer> onlyDoPartitions = null;
        String hack = job.get("hack");
        if (hack != null) {
            onlyDoPartitions = new HashSet<Integer>();
            JsonParser parser = new JsonParser();
            JsonArray hackArray = parser.parse(hack).getAsJsonArray();
            for (JsonElement element : hackArray) {
                onlyDoPartitions.add(element.getAsInt());
            }
        }
        _conf = job;
        try {
            _fs = FileSystem.get(_conf);
            int partitionId = _conf.getInt("mapred.task.partition", 0);
            if (onlyDoPartitions == null || onlyDoPartitions.contains(partitionId)) {
                Path redirectPath = new Path(FileOutputFormat.getWorkOutputPath(_conf),
                        "redirect-" + NUMBER_FORMAT.format(partitionId));
                _redirectWriter = SequenceFile.createWriter(_fs, _conf, redirectPath, TextBytes.class,
                        TextBytes.class, CompressionType.BLOCK);
            } else {
                _skipPartition = true;
            }
        } catch (IOException e) {
            LOG.error("Failed to initialize redirect writer", e);
        }
    }

    @Override
    public void close() throws IOException {
        if (_redirectWriter != null) {
            _redirectWriter.close();
        }

    }
}
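
The summary bookkeeping above leans on the safe* helper methods, which treat a Gson JsonObject as a bag of counters and min/max timestamps. The standalone sketch below (the class name and main method are ours, purely illustrative; only Gson is required, and safeSetMaxLongValue is lightly condensed from the original) shows how those helpers behave:

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;

public class SafeJsonHelpersDemo {

    // same logic as LinkMergerJob: create-or-increment an integer counter
    static void safeIncrementJSONCounter(JsonObject jsonObj, String property) {
        if (jsonObj.has(property)) {
            jsonObj.addProperty(property, jsonObj.get(property).getAsInt() + 1);
        } else {
            jsonObj.addProperty(property, 1);
        }
    }

    // same logic as LinkMergerJob: keep the larger of the stored and new value
    static long safeSetMaxLongValue(JsonObject jsonObj, String property, long newValue) {
        JsonElement element = jsonObj.get(property);
        if (element != null && element.getAsLong() > newValue) {
            return element.getAsLong();
        }
        jsonObj.addProperty(property, newValue);
        return newValue;
    }

    public static void main(String[] args) {
        JsonObject summary = new JsonObject();

        safeIncrementJSONCounter(summary, "attempt_count"); // attempt_count == 1
        safeIncrementJSONCounter(summary, "attempt_count"); // attempt_count == 2

        safeSetMaxLongValue(summary, "latest_attempt", 1000L); // stores 1000
        safeSetMaxLongValue(summary, "latest_attempt", 500L);  // keeps 1000

        // prints {"attempt_count":2,"latest_attempt":1000}
        System.out.println(summary);
    }
}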