org.commoncrawl.mapred.ec2.parser.ParserMapper.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.mapred.ec2.parser.ParserMapper.java

Source

/**
 * Copyright 2012 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.ec2.parser;

import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Counters.Counter;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.service.parser.Meta;
import org.commoncrawl.service.parser.ParseResult;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.ParseOutput;
import org.commoncrawl.protocol.shared.CrawlMetadata;
import org.commoncrawl.protocol.shared.FeedAuthor;
import org.commoncrawl.protocol.shared.FeedContent;
import org.commoncrawl.protocol.shared.FeedItem;
import org.commoncrawl.protocol.shared.FeedLink;
import org.commoncrawl.protocol.shared.HTMLContent;
import org.commoncrawl.protocol.shared.HTMLLink;
import org.commoncrawl.service.parser.server.ParseWorker;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CharsetUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GZIPUtils;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.MimeTypeFilter;
import org.commoncrawl.util.SimHash;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.GZIPUtils.UnzipResult;
import org.commoncrawl.util.MimeTypeFilter.MimeTypeDisposition;
import org.commoncrawl.util.Tuples.Pair;
import org.xml.sax.InputSource;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
import com.google.gson.stream.JsonReader;
import com.sun.syndication.feed.WireFeed;
import com.sun.syndication.feed.atom.Content;
import com.sun.syndication.feed.atom.Entry;
import com.sun.syndication.feed.atom.Feed;
import com.sun.syndication.feed.rss.Category;
import com.sun.syndication.feed.rss.Channel;
import com.sun.syndication.feed.rss.Description;
import com.sun.syndication.feed.rss.Item;
import com.sun.syndication.io.WireFeedInput;

/**
 * Initial version of a Mapper that takes a URL and CrawlURL 
 * (data structure produced by crawlers) and emits metadata and raw content 
 * via a custom OutputFormat to S3.
 * 
 * This version only handles HTML, RSS, and ATOM content, mainly because we were 
 * rushed for time to get this job running on EC2 and also because of the 
 * desire to have a very resilient, tightly controlled codebase to ensure smooth
 * and reliable EC2 performance. Needs to be refactored at some point. 
 * 
 * 
 * 
 * @author rana
 *
 */
public class ParserMapper implements Mapper<Text, CrawlURL, Text, ParseOutput> {

    /** Logger used by this mapper for warnings and parse-failure diagnostics. */
    public static final Log LOG = LogFactory.getLog(ParserMapper.class);

    /** JSON property name {@code "disposition"}. */
    public static final String JSON_DISPOSITION_PROPERTY = "disposition";
    /**
     * JSON property under which {@link #httpHeadersToJsonObject} stores the value of
     * the first header entry when that entry has no key.
     */
    public static final String ORIGINAL_RESPONSE_CODE_HTTP_HEADER = "response";

    /**
     * Counter identifiers passed to {@code Reporter.incrCounter} by this mapper.
     * Declaration order is kept stable so ordinals do not change.
     */
    enum Counters {
        BAD_REDIRECT_URL,
        FAILED_TO_PARSE_HTML,
        PARSED_HTML_DOC,
        FAILED_TO_PARSE_FEED_URL,
        PARSED_FEED_URL,
        GUNZIP_FAILED,
        GUNZIP_DATA_TRUNCATED,
        WROTE_METADATA_RECORD,
        WROTE_TEXT_CONTENT,
        WROTE_RAW_CONTENT,
        GOT_UNHANDLED_IO_EXCEPTION,
        GOT_UNHANDLED_RUNTIME_EXCEPTION,
        MALFORMED_FINAL_URL,
        GOT_RSS_FEED,
        GOT_ATOM_FEED,
        TRYING_RSS_FEED_PARSER,
        EXCEPTION_DURING_FEED_PARSE,
        FAILED_TO_ID_FEED,
        FAILED_TO_PARSE_XML_AS_FEED,
        EXCEPTION_PARSING_LINK_JSON
    }

    /**
     * Lower-case HTTP header names that {@link #httpHeadersToJsonObject} omits from
     * the JSON object it builds.
     */
    private static ImmutableSet<String> dontKeepHeaders = ImmutableSet.of("proxy-connection", "connection",
            "keep-alive", "transfer-encoding", "te", "trailer", "proxy-authorization", "proxy-authenticate",
            "upgrade", "set-cookie", "content-encoding");

    /**
     * Converts parsed HTTP headers into a JSON object keyed by lower-cased header name.
     *
     * <p>The first entry, when it has no key, is stored under
     * {@link #ORIGINAL_RESPONSE_CODE_HTTP_HEADER}. Entries with a null key or null
     * value are otherwise skipped, as are headers listed in {@code dontKeepHeaders}.
     * When a header name occurs more than once, the last value wins because
     * {@code addProperty} replaces an existing member.
     *
     * @param headers parsed header collection to convert
     * @return a new JSON object holding the retained headers
     * @throws IOException declared for callers; not thrown by the code in this method
     */
    public static JsonObject httpHeadersToJsonObject(NIOHttpHeaders headers) throws IOException {
        JsonObject httpHeaderObject = new JsonObject();

        // iterate entries in header object
        for (int i = 0; i < headers.getKeyCount(); ++i) {
            String key = headers.getKey(i);
            String value = headers.getValue(i);

            if (key == null && i == 0) {
                httpHeaderObject.addProperty(ORIGINAL_RESPONSE_CODE_HTTP_HEADER, value);
            } else if (key != null && value != null) {
                // Header names are ASCII: lower-case with a fixed locale so the result does
                // not depend on the JVM default (e.g. Turkish maps 'I' to dotless 'ı', which
                // would let "CONNECTION" slip past the filter below).
                String normalizedKey = key.toLowerCase(java.util.Locale.ROOT);
                if (!dontKeepHeaders.contains(normalizedKey)) {
                    // and send other ones through
                    httpHeaderObject.addProperty(normalizedKey, value);
                }
            }
        }
        return httpHeaderObject;
    }

    /**
     * Builds the JSON description of a redirect and mirrors the same values into
     * {@code metadata.getRedirectData()}.
     *
     * <p>The source URL is recorded (in both the JSON object and the metadata) before
     * the redirect target is resolved, so it is already set on {@code metadata} if
     * resolution fails.
     *
     * @param originalURL URL that was originally requested; also the base against which
     *                    {@code value.getRedirectURL()} is resolved
     * @param value       crawl record supplying the redirect target and the original
     *                    result code, server IP and headers
     * @param metadata    metadata record whose redirect data is populated
     * @param reporter    receives a {@link Counters#BAD_REDIRECT_URL} increment on failure
     * @return pair of (resolved redirect target URL, redirect JSON object)
     * @throws IOException if the redirect target cannot be resolved, or if header
     *                     parsing/conversion fails
     */
    private Pair<URL, JsonObject> buildRedirectObject(URL originalURL, CrawlURL value, CrawlMetadata metadata,
            Reporter reporter) throws IOException {

        JsonObject redirectObject = new JsonObject();

        redirectObject.addProperty("source_url", originalURL.toString());
        metadata.getRedirectData().setSourceURL(originalURL.toString());

        URL finalURLObj = null;

        try {
            finalURLObj = new URL(originalURL, value.getRedirectURL());
        } catch (Exception e) {
            reporter.incrCounter(Counters.BAD_REDIRECT_URL, 1);
            // Keep the original exception as the cause so the real reason is not lost.
            throw new IOException(
                    "Bad Redirect Source URL:" + originalURL + " RedirectURL:" + value.getRedirectURL(), e);
        }

        redirectObject.addProperty("http_result", (int) value.getOriginalResultCode());
        metadata.getRedirectData().setHttpResult(value.getOriginalResultCode());
        redirectObject.addProperty("server_ip",
                IPAddressUtils.IntegerToIPAddressString(value.getOriginalServerIP()));
        metadata.getRedirectData().setServerIP(value.getOriginalServerIP());
        redirectObject.add("http_headers",
                httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getOriginalHeaders())));
        metadata.getRedirectData().setHttpHeaders(value.getOriginalHeaders());

        return new Pair<URL, JsonObject>(finalURLObj, redirectObject);
    }

    /**
     * Renders a parse result as an {@code "html-doc"} JSON object and copies the
     * title, meta tags and links into {@code htmlMeta}.
     *
     * <p>A link whose attribute JSON cannot be parsed is logged, counted under
     * {@link Counters#EXCEPTION_PARSING_LINK_JSON} and left out of the output.
     *
     * @param baseURL  document URL, used only in error messages
     * @param result   parse result to convert
     * @param htmlMeta metadata record that receives title, meta tags and links
     * @param reporter counter sink
     * @return the JSON representation of {@code result}
     * @throws IOException declared for callers
     */
    private JsonObject parseResultToJsonObject(URL baseURL, ParseResult result, HTMLContent htmlMeta,
            Reporter reporter) throws IOException {

        JsonObject docJson = new JsonObject();
        docJson.addProperty("type", "html-doc");

        safeSetString(docJson, "title", result.getTitle());
        if (result.isFieldDirty(ParseResult.Field_TITLE)) {
            htmlMeta.setTitle(result.getTitle());
        }

        if (result.getMetaTags().size() > 0) {
            JsonArray metaTagsJson = new JsonArray();
            for (Meta tag : result.getMetaTags()) {
                JsonObject tagJson = new JsonObject();
                tagJson.addProperty("name", tag.getName());
                tagJson.addProperty("value", tag.getValue());
                metaTagsJson.add(tagJson);

                // metadata form is "<trimmed name>\t<value>"
                String flattened = tag.getName().trim() + "\t" + tag.getValue();
                htmlMeta.getMetaTags().add(new TextBytes(flattened));
            }
            docJson.add("meta_tags", metaTagsJson);
        }

        if (result.getExtractedLinks().size() > 0) {
            JsonParser attributeParser = new JsonParser();
            JsonArray linksJson = new JsonArray();
            for (org.commoncrawl.service.parser.Link extracted : result.getExtractedLinks()) {
                try {
                    // the link attributes arrive as a JSON object string
                    JsonReader attributeReader = new JsonReader(new StringReader(extracted.getAttributes()));
                    JsonObject linkJson = attributeParser.parse(attributeReader).getAsJsonObject();
                    linkJson.addProperty("href", extracted.getUrl());
                    linksJson.add(linkJson);

                    HTMLLink metaLink = new HTMLLink();
                    metaLink.setAttributes(extracted.getAttributes());
                    metaLink.setHref(extracted.getUrl());
                    htmlMeta.getLinks().add(metaLink);
                } catch (Exception e) {
                    LOG.error("Error Parsing JSON Link Attributes for Link: " + extracted.getUrl() + " in Doc:"
                            + baseURL + " Exception:\n" + CCStringUtils.stringifyException(e));
                    reporter.incrCounter(Counters.EXCEPTION_PARSING_LINK_JSON, 1);
                }
            }
            docJson.add("links", linksJson);
        }
        return docJson;
    }

    /**
     * Extracts text from a feed description-like value and removes anything that
     * matches the tag pattern {@code <[^>]*>}.
     *
     * @param d an RSS {@code Description}, an Atom {@code Content}, a {@code String},
     *          or anything else (treated as having no text)
     * @return the trimmed text with tag-like spans removed; {@code ""} when no text
     *         is available; never {@code null}
     */
    private static String cleanupDescription(Object d) {

        String raw;
        if (d instanceof Description) {
            raw = ((Description) d).getValue();
        } else if (d instanceof String) {
            raw = (String) d;
        } else if (d instanceof Content) {
            raw = ((Content) d).getValue();
        } else {
            raw = null;
        }

        if (raw == null) {
            return "";
        }

        // glue the pieces between tag-like spans back together
        StringBuilder stripped = new StringBuilder();
        for (String fragment : raw.split("<[^>]*>")) {
            stripped.append(fragment);
        }
        return stripped.toString().trim();
    }

    /**
     * Stores {@code date} as its epoch-millisecond value under {@code propertyName};
     * does nothing when {@code date} is {@code null}.
     */
    private static void safeSetDate(JsonObject jsonObj, String propertyName, Date date) {
        if (date == null) {
            return;
        }
        long epochMillis = date.getTime();
        jsonObj.addProperty(propertyName, epochMillis);
    }

    /**
     * Copies non-empty RSS category values into a {@code "categories"} JSON array,
     * the metadata category list and the accumulated text content.
     *
     * <p>Nothing is added when {@code categories} is empty. Otherwise the
     * {@code "categories"} array is always set, even if every value was skipped.
     *
     * @param jsonObj        JSON object that receives the {@code "categories"} array
     * @param metaCategories metadata list that receives each category value
     * @param contentOut     text buffer that receives each category value
     * @param categories     RSS {@code Category} objects
     */
    private static void setRSSCategories(JsonObject jsonObj, List<TextBytes> metaCategories,
            StringBuffer contentOut, List categories) {
        if (categories.size() == 0) {
            return;
        }

        JsonArray categoryNames = new JsonArray();
        for (Object element : categories) {
            String name = ((Category) element).getValue();
            if (name == null || name.length() == 0) {
                continue;
            }
            safeAppendContentFromString(contentOut, name);
            categoryNames.add(new JsonPrimitive(name));
            metaCategories.add(new TextBytes(name));
        }
        jsonObj.add("categories", categoryNames);
    }

    /**
     * Copies non-empty Atom category labels into a {@code "categories"} JSON array,
     * the metadata category list and the accumulated text content.
     *
     * <p>Nothing is added when {@code categories} is empty. Otherwise the
     * {@code "categories"} array is always set, even if every label was skipped.
     *
     * @param jsonObj          JSON object that receives the {@code "categories"} array
     * @param metaCategoryList metadata list that receives each label
     * @param contentOut       text buffer that receives each label
     * @param categories       Atom {@code Category} objects
     */
    private static void setAtomCategories(JsonObject jsonObj, List<TextBytes> metaCategoryList,
            StringBuffer contentOut, List categories) {
        if (categories.size() == 0) {
            return;
        }

        JsonArray labels = new JsonArray();
        for (Object element : categories) {
            String label = ((com.sun.syndication.feed.atom.Category) element).getLabel();
            if (label == null || label.length() == 0) {
                continue;
            }
            safeAppendContentFromString(contentOut, label);
            labels.add(new JsonPrimitive(label));
            metaCategoryList.add(new TextBytes(label));
        }
        jsonObj.add("categories", labels);
    }

    /**
     * Adds {@code propertyValue} under {@code propertyName} unless the value is
     * {@code null} or empty.
     */
    private static void safeSetString(JsonObject jsonObj, String propertyName, String propertyValue) {
        if (propertyValue == null || propertyValue.length() == 0) {
            return;
        }
        jsonObj.addProperty(propertyName, propertyValue);
    }

    /**
     * Adds {@code propertyValue} under {@code propertyName} unless it equals
     * {@code -1}, which this method treats as "not set".
     */
    private static void safeSetInteger(JsonObject jsonObj, String propertyName, int propertyValue) {
        if (propertyValue == -1) {
            return;
        }
        jsonObj.addProperty(propertyName, propertyValue);
    }

    /**
     * Parses a complete HTML document with a fresh {@code ParseWorker}.
     *
     * @param baseURL        URL of the document
     * @param rawHeaders     raw HTTP headers handed to the parser
     * @param data           document bytes
     * @param contentMetaOut metadata record populated from the parse result
     * @param reporter       counter sink
     * @return pair of (JSON form of the parse result, extracted text), or
     *         {@code null} when the parser reports failure
     * @throws IOException propagated from building the JSON object
     */
    private Pair<JsonObject, String> parseHTMLDocument(URL baseURL, String rawHeaders, FlexBuffer data,
            HTMLContent contentMetaOut, Reporter reporter) throws IOException {
        ParseResult parsed = new ParseResult();
        new ParseWorker().parseDocument(parsed, 0, 0, baseURL, rawHeaders, data);

        if (!parsed.getParseSuccessful()) {
            return null;
        }

        JsonObject parsedJson = parseResultToJsonObject(baseURL, parsed, contentMetaOut, reporter);
        return new Pair<JsonObject, String>(parsedJson, parsed.getText());
    }

    /**
     * Parses an HTML fragment (e.g. feed item content) with a fresh {@code ParseWorker}.
     *
     * @param baseURL        URL the fragment belongs to
     * @param htmlSnippet    fragment markup
     * @param contentMetaOut metadata record populated from the parse result
     * @param reporter       counter sink
     * @return pair of (JSON form of the parse result, extracted text), or
     *         {@code null} when the parser reports failure
     * @throws IOException propagated from building the JSON object
     */
    private Pair<JsonObject, String> parseHTMLSnippet(URL baseURL, String htmlSnippet, HTMLContent contentMetaOut,
            Reporter reporter) throws IOException {
        ParseResult parsed = new ParseResult();
        new ParseWorker().parsePartialHTMLDocument(parsed, baseURL, htmlSnippet);

        if (!parsed.getParseSuccessful()) {
            return null;
        }

        JsonObject parsedJson = parseResultToJsonObject(baseURL, parsed, contentMetaOut, reporter);
        return new Pair<JsonObject, String>(parsedJson, parsed.getText());
    }

    /**
     * Appends the trimmed form of {@code content} to {@code buffer}, inserting a
     * single space first when the buffer already holds text.
     *
     * @param buffer  accumulator for text content
     * @param content text to append; may be {@code null}
     * @return the trimmed text (possibly empty, in which case nothing was appended),
     *         or {@code null} when {@code content} is {@code null}
     */
    private static String safeAppendContentFromString(StringBuffer buffer, String content) {
        if (content == null) {
            return null;
        }

        String trimmed = content.trim();
        if (trimmed.length() > 0) {
            if (buffer.length() > 0) {
                buffer.append(" ");
            }
            buffer.append(trimmed);
        }
        return trimmed;
    }

    /**
     * Appends the trimmed value of an Atom {@code Content} object to {@code buffer},
     * inserting a single space first when the buffer already holds text.
     *
     * @param buffer  accumulator for text content
     * @param content content object; may be {@code null} or carry a {@code null} value
     * @return the trimmed value (possibly empty, in which case nothing was appended),
     *         or {@code null} when there is no value
     */
    private static String safeAppendContentFromContentObj(StringBuffer buffer, Content content) {
        if (content == null || content.getValue() == null) {
            return null;
        }

        String trimmed = content.getValue().trim();
        if (trimmed.length() > 0) {
            if (buffer.length() > 0) {
                buffer.append(" ");
            }
            buffer.append(trimmed);
        }
        return trimmed;
    }

    /**
     * Copies Atom links whose {@code rel} is a key of {@code validLinkTypes} into
     * both the JSON object and the metadata link list.
     *
     * <p>The JSON property name for a link is the value {@code validLinkTypes} maps
     * its {@code rel} to. The first link under a name is stored as a plain object;
     * when a second one arrives the property is replaced by an array holding both,
     * and later links are appended to that array. Links without an {@code href} or
     * {@code rel} are ignored.
     *
     * @param feedOrItemObj  JSON object (feed or entry) that receives the links
     * @param validLinkTypes map from accepted {@code rel} value to JSON property name
     * @param feedMetaLinks  metadata list that receives one {@code FeedLink} per accepted link
     * @param links          Atom {@code Link} objects
     * @throws IOException declared for callers
     */
    private static void safeAppendLinksFromFeed(JsonObject feedOrItemObj,
            ImmutableMap<String, String> validLinkTypes, List<FeedLink> feedMetaLinks, List links)
            throws IOException {
        for (Object element : links) {
            com.sun.syndication.feed.atom.Link atomLink = (com.sun.syndication.feed.atom.Link) element;
            String href = atomLink.getHref();
            String rel = atomLink.getRel();

            if (href == null || rel == null || !validLinkTypes.containsKey(rel)) {
                continue;
            }

            String type = atomLink.getType();
            String title = atomLink.getTitle();

            JsonObject jsonLink = new JsonObject();
            safeSetString(jsonLink, "type", type);
            safeSetString(jsonLink, "href", href);
            safeSetString(jsonLink, "rel", rel);
            safeSetString(jsonLink, "title", title);

            FeedLink metaLink = new FeedLink();
            if (type != null) {
                metaLink.setType(type);
            }
            metaLink.setHref(href);
            metaLink.setRel(rel);
            if (title != null) {
                metaLink.setTitle(title);
            }
            feedMetaLinks.add(metaLink);

            String propertyName = validLinkTypes.get(rel);
            JsonElement current = feedOrItemObj.get(propertyName);

            if (current == null) {
                feedOrItemObj.add(propertyName, jsonLink);
            } else if (current.isJsonArray()) {
                current.getAsJsonArray().add(jsonLink);
            } else {
                // second link under this name: promote the single object to an array
                JsonArray promoted = new JsonArray();
                promoted.add(current);
                feedOrItemObj.remove(propertyName);
                feedOrItemObj.add(propertyName, promoted);
                promoted.add(jsonLink);
            }
        }
    }

    /**
     * Copies named Atom authors into an {@code "authors"} JSON array and the
     * metadata author list.
     *
     * <p>Nothing is added when {@code authors} is empty. Otherwise the
     * {@code "authors"} array is always set, even if every author lacked a name.
     *
     * @param feedOrItemObj  JSON object (feed or entry) that receives the array
     * @param metaAuthorList metadata list that receives one {@code FeedAuthor} per named author
     * @param authors        Atom {@code Person} objects
     * @throws IOException declared for callers
     */
    private static void safeAppendAuthorsFromFeed(JsonObject feedOrItemObj, List<FeedAuthor> metaAuthorList,
            List authors) throws IOException {
        if (authors.size() == 0) {
            return;
        }

        JsonArray authorsJson = new JsonArray();
        for (Object element : authors) {
            com.sun.syndication.feed.atom.Person person = (com.sun.syndication.feed.atom.Person) element;
            String name = person.getName();
            if (name == null) {
                continue;
            }
            String personUrl = person.getUrl();

            JsonObject authorJson = new JsonObject();
            safeSetString(authorJson, "name", name);
            safeSetString(authorJson, "url", personUrl);

            FeedAuthor metaAuthor = new FeedAuthor();
            metaAuthor.setName(name);
            if (personUrl != null) {
                metaAuthor.setUrl(personUrl);
            }

            authorsJson.add(authorJson);
            metaAuthorList.add(metaAuthor);
        }
        feedOrItemObj.add("authors", authorsJson);
    }

    /**
     * Records a plain link URL as a {@code {"href": ...}} JSON object under
     * {@code propertyName} and as a {@code FeedLink} in {@code metaLinks}.
     * Does nothing when {@code linkValue} is {@code null} or empty.
     */
    private static void safeAppendLinkFromString(JsonObject jsonObj, List<FeedLink> metaLinks, String propertyName,
            String linkValue) {
        if (linkValue == null || linkValue.length() == 0) {
            return;
        }

        JsonObject hrefJson = new JsonObject();
        hrefJson.addProperty("href", linkValue);
        jsonObj.add(propertyName, hrefJson);

        FeedLink hrefMeta = new FeedLink();
        hrefMeta.setHref(linkValue);
        metaLinks.add(hrefMeta);
    }

    /**
     * Converts an RSS channel into an {@code "rss-feed"} JSON object, populating
     * {@code feedMeta} along the way and collecting the feed's text.
     *
     * <p>Titles, descriptions, categories and the text of HTML item content are
     * appended to one space-separated text buffer, which is returned as the second
     * element of the result.
     *
     * <p>An item whose HTML content fails to parse is still emitted, just without a
     * {@code "content"} member.
     *
     * @param url           feed URL, used as the base URL for embedded HTML content
     * @param channelObject parsed RSS channel
     * @param feedMeta      metadata record populated with feed-level fields
     * @param reporter      counter sink passed on to the HTML snippet parser
     * @return pair of (feed JSON, accumulated text content)
     * @throws IOException propagated from HTML snippet conversion
     */
    private Pair<JsonObject, String> rssFeedToJson(URL url, Channel channelObject, FeedContent feedMeta,
            Reporter reporter) throws IOException {

        JsonObject rssObject = new JsonObject();

        StringBuffer contentOut = new StringBuffer();

        rssObject.addProperty("type", "rss-feed");
        feedMeta.setType(FeedContent.Type.RSS);

        String feedTitle = cleanupDescription(channelObject.getTitle());
        rssObject.addProperty("title", safeAppendContentFromString(contentOut, feedTitle));
        if (feedTitle != null)
            feedMeta.setTitle(feedTitle);

        safeAppendLinkFromString(rssObject, feedMeta.getLinks(), "link", channelObject.getLink());

        String feedDesc = cleanupDescription(channelObject.getDescription());
        rssObject.addProperty("description", safeAppendContentFromString(contentOut, feedDesc));
        if (feedDesc != null)
            feedMeta.setDescription(feedDesc);

        // prefer the last build date; fall back to the publication date
        if (channelObject.getLastBuildDate() != null) {
            safeSetDate(rssObject, "updated", channelObject.getLastBuildDate());
            feedMeta.setUpdated(channelObject.getLastBuildDate().getTime());
        } else if (channelObject.getPubDate() != null) {
            safeSetDate(rssObject, "updated", channelObject.getPubDate());
            feedMeta.setUpdated(channelObject.getPubDate().getTime());
        }

        setRSSCategories(rssObject, feedMeta.getCategories(), contentOut, channelObject.getCategories());

        safeSetString(rssObject, "generator", channelObject.getGenerator());
        if (channelObject.getGenerator() != null)
            feedMeta.setGenerator(channelObject.getGenerator());

        safeSetInteger(rssObject, "ttl", channelObject.getTtl());
        if (channelObject.getTtl() != -1)
            feedMeta.setTtl(channelObject.getTtl());

        JsonArray itemArray = new JsonArray();
        for (Object itemObj : channelObject.getItems()) {
            Item item = (Item) itemObj;
            JsonObject itemObject = new JsonObject();
            // NOTE(review): metaItem is filled in below but never attached to feedMeta
            // in this method -- confirm whether it should be added to the feed metadata.
            FeedItem metaItem = new FeedItem();

            String itemTitle = cleanupDescription(item.getTitle());
            itemObject.addProperty("title", safeAppendContentFromString(contentOut, itemTitle));
            if (itemTitle != null)
                metaItem.setTitle(itemTitle);

            String itemDesc = cleanupDescription(item.getDescription());
            itemObject.addProperty("description", safeAppendContentFromString(contentOut, itemDesc));
            if (itemDesc != null)
                metaItem.setDescription(itemDesc);

            safeAppendLinkFromString(itemObject, metaItem.getLinks(), "link", item.getLink());

            safeSetString(itemObject, "author", item.getAuthor());
            if (item.getAuthor() != null) {
                FeedAuthor metaAuthor = new FeedAuthor();
                metaAuthor.setName(item.getAuthor());
                metaItem.getAuthors().add(metaAuthor);
            }

            setRSSCategories(itemObject, metaItem.getCategories(), contentOut, item.getCategories());

            safeSetString(itemObject, "comments", item.getComments());

            safeSetDate(itemObject, "published", item.getPubDate());
            if (item.getPubDate() != null)
                metaItem.setPublished(item.getPubDate().getTime());

            if (item.getGuid() != null) {
                safeSetString(itemObject, "guid", item.getGuid().getValue());
                if (item.getGuid().getValue() != null)
                    metaItem.setGuid(item.getGuid().getValue());
            }
            if (item.getContent() != null && item.getContent().getValue() != null) {
                // content with no declared type is treated as HTML
                if (item.getContent().getType() == null || item.getContent().getType().contains("html")) {
                    HTMLContent metaContent = new HTMLContent();
                    Pair<JsonObject, String> contentTuple = parseHTMLSnippet(url, item.getContent().getValue(),
                            metaContent, reporter);
                    metaItem.getEmbeddedLinks().addAll(metaContent.getLinks());
                    // parseHTMLSnippet returns null when the snippet fails to parse; skip the
                    // content for this item instead of failing the whole feed with an NPE.
                    if (contentTuple != null) {
                        if (contentTuple.e0 != null) {
                            itemObject.add("content", contentTuple.e0);
                        }
                        if (contentTuple.e1 != null && contentTuple.e1.length() != 0) {
                            safeAppendContentFromString(contentOut, contentTuple.e1);
                        }
                    }
                }
            }
            itemArray.add(itemObject);
        }
        rssObject.add("items", itemArray);

        return new Pair<JsonObject, String>(rssObject, contentOut.toString());
    }

    /** Accepted {@code rel} values for feed-level Atom links, mapped to their JSON property name. */
    static ImmutableMap<String, String> validFeedLinks = ImmutableMap.of("alternate", "link");

    /** Accepted {@code rel} values for Atom entry links, mapped to their JSON property name. */
    static ImmutableMap<String, String> feedEntryLinks = ImmutableMap.of(
            "alternate", "link",
            "self", "self",
            "replies", "replies");

    /**
     * Converts an Atom feed into an {@code "atom-feed"} JSON object, populating
     * {@code feedMeta} along the way and collecting the feed's text.
     *
     * <p>Titles, summaries, category labels and the text of HTML entry content are
     * appended to one space-separated text buffer, which is returned as the second
     * element of the result.
     *
     * <p>Each entry's {@code "published"} and {@code "updated"} timestamps are
     * written to that entry's JSON object. An entry's first HTML content is stored
     * as a plain object under {@code "content"}; further contents turn that member
     * into an array. Content that fails to parse is skipped.
     *
     * @param url        feed URL, used as the base URL for embedded HTML content
     * @param feedObject parsed Atom feed
     * @param feedMeta   metadata record populated with feed-level fields
     * @param reporter   counter sink passed on to the HTML snippet parser
     * @return pair of (feed JSON, accumulated text content)
     * @throws IOException propagated from link/author/HTML conversion
     */
    private Pair<JsonObject, String> atomFeedToJson(URL url, Feed feedObject, FeedContent feedMeta,
            Reporter reporter) throws IOException {
        JsonObject jsonFeed = new JsonObject();
        StringBuffer contentOut = new StringBuffer();

        jsonFeed.addProperty("type", "atom-feed");
        feedMeta.setType(FeedContent.Type.ATOM);
        String title = cleanupDescription(feedObject.getTitle());
        jsonFeed.addProperty("title", safeAppendContentFromString(contentOut, title));
        if (title != null)
            feedMeta.setTitle(title);

        safeAppendLinksFromFeed(jsonFeed, validFeedLinks, feedMeta.getLinks(), feedObject.getAlternateLinks());
        safeAppendAuthorsFromFeed(jsonFeed, feedMeta.getAuthors(), feedObject.getAuthors());
        if (feedObject.getGenerator() != null) {
            safeSetString(jsonFeed, "generator", feedObject.getGenerator().getValue());
            if (feedObject.getGenerator().getValue() != null) {
                feedMeta.setGenerator(feedObject.getGenerator().getValue());
            }
        }

        safeSetDate(jsonFeed, "updated", feedObject.getUpdated());
        if (feedObject.getUpdated() != null) {
            feedMeta.setUpdated(feedObject.getUpdated().getTime());
        }

        setAtomCategories(jsonFeed, feedMeta.getCategories(), contentOut, feedObject.getCategories());
        JsonArray itemArray = new JsonArray();
        for (Object entry : feedObject.getEntries()) {
            Entry entryObj = (Entry) entry;
            JsonObject jsonEntry = new JsonObject();
            // NOTE(review): metaItem is filled in below but never attached to feedMeta
            // in this method -- confirm whether it should be added to the feed metadata.
            FeedItem metaItem = new FeedItem();

            String itemTitle = cleanupDescription(entryObj.getTitle());
            jsonEntry.addProperty("title", safeAppendContentFromString(contentOut, itemTitle));
            if (itemTitle != null)
                metaItem.setTitle(itemTitle);

            String itemDesc = cleanupDescription(entryObj.getSummary());
            jsonEntry.addProperty("description", safeAppendContentFromString(contentOut, itemDesc));
            if (itemDesc != null)
                metaItem.setDescription(itemDesc);

            // Entry timestamps belong on the entry object. Writing them to jsonFeed would
            // overwrite the feed-level "updated" value and leave the entry without dates.
            safeSetDate(jsonEntry, "published", entryObj.getPublished());
            if (entryObj.getPublished() != null)
                metaItem.setPublished(entryObj.getPublished().getTime());

            safeSetDate(jsonEntry, "updated", entryObj.getUpdated());
            if (entryObj.getUpdated() != null)
                metaItem.setUpdated(entryObj.getUpdated().getTime());

            safeAppendLinksFromFeed(jsonEntry, feedEntryLinks, metaItem.getLinks(), entryObj.getAlternateLinks());
            safeAppendLinksFromFeed(jsonEntry, feedEntryLinks, metaItem.getLinks(), entryObj.getOtherLinks());
            safeAppendAuthorsFromFeed(jsonEntry, metaItem.getAuthors(), entryObj.getAuthors());
            setAtomCategories(jsonEntry, metaItem.getCategories(), contentOut, entryObj.getCategories());

            for (Object content : entryObj.getContents()) {
                com.sun.syndication.feed.atom.Content contentObj = (com.sun.syndication.feed.atom.Content) content;
                if (contentObj.getValue() != null && contentObj.getValue().length() != 0) {
                    // content with no declared type is treated as HTML
                    if (contentObj.getType() == null || contentObj.getType().contains("html")) {
                        HTMLContent metaContent = new HTMLContent();
                        Pair<JsonObject, String> contentTuple = parseHTMLSnippet(url, contentObj.getValue(),
                                metaContent, reporter);
                        metaItem.getEmbeddedLinks().addAll(metaContent.getLinks());
                        // parseHTMLSnippet returns null when the snippet fails to parse; skip
                        // this content instead of failing the whole feed with an NPE.
                        if (contentTuple != null && contentTuple.e0 != null) {
                            if (jsonEntry.has("content")) {
                                JsonArray array = null;
                                JsonElement existing = jsonEntry.get("content");
                                if (!existing.isJsonArray()) {
                                    // second content element: promote the single object to an array
                                    array = new JsonArray();
                                    array.add(existing);
                                    jsonEntry.remove("content");
                                    jsonEntry.add("content", array);
                                } else {
                                    array = existing.getAsJsonArray();
                                }
                                array.add(contentTuple.e0);
                            } else {
                                jsonEntry.add("content", contentTuple.e0);
                            }
                            if (contentTuple.e1 != null && contentTuple.e1.length() != 0) {
                                safeAppendContentFromString(contentOut, contentTuple.e1);
                            }
                        }
                    }
                }
            }
            itemArray.add(jsonEntry);
        }

        jsonFeed.add("items", itemArray);

        return new Pair<JsonObject, String>(jsonFeed, contentOut.toString());
    }

    /** Closing tag of an Atom entry; searched for when repairing a truncated feed document. */
    private static final String feedEntryEnd = "</entry>";
    /** Closing tag of an RSS item; searched for when repairing a truncated feed document. */
    private static final String feedItemEnd = "</item>";

    /**
     * Parses feed markup as RSS or Atom and converts it to JSON.
     *
     * <p>When {@code truncatedDocument} is set, the markup is first cut back to its
     * last complete {@code </entry>} (Atom) or, failing that, its last complete
     * {@code </item>} (RSS), and the matching closing tags are appended so the
     * parser sees a well-formed document. If neither tag is found the markup is
     * parsed unchanged.
     *
     * <p>Any exception raised while parsing or converting is logged and counted
     * under {@link Counters#EXCEPTION_DURING_FEED_PARSE}; it is not rethrown.
     *
     * @param baseURL           URL of the feed document
     * @param rawHeaders        raw HTTP headers of the response (not used by this method)
     * @param feedContent       feed markup
     * @param feedMeta          metadata record populated by the conversion
     * @param truncatedDocument whether the download was cut short
     * @param reporter          counter sink
     * @return pair of (feed JSON, extracted text), or {@code null} when the document
     *         could not be parsed or identified as RSS/Atom
     * @throws IOException declared for callers; failures during parsing are caught here
     */
    private Pair<JsonObject, String> parseFeedDocument(URL baseURL, String rawHeaders, String feedContent,
            FeedContent feedMeta, boolean truncatedDocument, Reporter reporter) throws IOException {

        if (truncatedDocument) {
            LOG.warn("Fixing Up Trancated Doc:" + baseURL);
            int indexOfEntryEnd = feedContent.lastIndexOf(feedEntryEnd);
            if (indexOfEntryEnd != -1) {
                feedContent = feedContent.substring(0, indexOfEntryEnd + feedEntryEnd.length());
                feedContent += "</feed>";
            } else {
                int indexOfItemEnd = feedContent.lastIndexOf(feedItemEnd);
                if (indexOfItemEnd != -1) {
                    // Cut at the end of the last complete </item>. indexOfEntryEnd is -1 in
                    // this branch, so using it here would keep only the first few characters.
                    feedContent = feedContent.substring(0, indexOfItemEnd + feedItemEnd.length());
                    feedContent += "</channel></rss>";

                }
            }
        }

        // NOTE(review): the markup comes from crawled pages; confirm that the XML parser
        // behind WireFeedInput is configured to reject DTDs/external entities.
        InputSource source = new InputSource(new StringReader(feedContent));
        WireFeedInput input = new WireFeedInput();

        Pair<JsonObject, String> resultTuple = null;

        try {
            WireFeed feed = input.build(source);
            if (feed != null) {

                if (feed instanceof Channel) {
                    reporter.incrCounter(Counters.TRYING_RSS_FEED_PARSER, 1);
                    resultTuple = rssFeedToJson(baseURL, (Channel) feed, feedMeta, reporter);
                    reporter.incrCounter(Counters.GOT_RSS_FEED, 1);
                } else if (feed instanceof Feed) {
                    resultTuple = atomFeedToJson(baseURL, (Feed) feed, feedMeta, reporter);
                    reporter.incrCounter(Counters.GOT_ATOM_FEED, 1);
                } else {
                    reporter.incrCounter(Counters.FAILED_TO_ID_FEED, 1);
                    LOG.error("Failed to ID Feed:" + baseURL);
                }

            }
        } catch (Exception e) {
            // best effort: a feed that cannot be parsed yields a null result, not a job failure
            reporter.incrCounter(Counters.EXCEPTION_DURING_FEED_PARSE, 1);
            LOG.error("Failed to parse Feed:" + baseURL + "\n ContentLen:" + feedContent.length()
                    + "\n with Exception:" + CCStringUtils.stringifyException(e));
        }
        return resultTuple;
    }

    /**
     * Populates the content-related metadata for a fetched document and returns its
     * extracted text, headers and (possibly gunzipped) content.
     *
     * <p>Every value recorded here is written twice: once as a property on the JSON
     * {@code metadata} object and once on the {@code crawlMeta} record. In order, the
     * method records the mime type, the download size and the truncation flag; then,
     * when the download is non-empty, it optionally gunzips the body, computes an MD5
     * over the resulting bytes and, for text mime types, decodes the bytes, computes a
     * simhash and tries to parse the document as HTML or as a feed.
     *
     * @param finalURL  URL of the document; used in log messages and passed to the parsers
     * @param value     crawl record supplying the headers, flags and raw content
     * @param reporter  receives the counter increments made here and by the parsers
     * @param metadata  JSON object that is mutated with the extracted properties
     * @param crawlMeta crawl metadata record that is mutated in parallel with {@code metadata}
     * @return a pair whose first element is the text produced by a successful parse (or
     *         {@code null}), and whose second element pairs the headers
     *         ({@code value.getHeadersAsTextBytes()}) with the content buffer; the content
     *         buffer is {@code null} when the download size is zero or gunzip failed
     * @throws IOException declared by the signature; NOTE(review): presumably raised by the
     *         header / parse helpers called below - confirm against their declarations
     */
    private Pair<String, Pair<TextBytes, FlexBuffer>> populateContentMetadata(URL finalURL, CrawlURL value,
            Reporter reporter, JsonObject metadata, CrawlMetadata crawlMeta) throws IOException {

        // both outputs stay null unless the corresponding stage below succeeds
        FlexBuffer contentOut = null;
        String textOut = null;

        NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(value.getHeaders());

        // scratch record used only to receive what the header extractor finds
        CrawlURLMetadata urlMetadata = new CrawlURLMetadata();

        // extract information from http headers ...
        HttpHeaderInfoExtractor.parseHeaders(finalHeaders, urlMetadata);
        // get the mime type ... falls back to "text/html" when the extractor did not
        // mark the content type field dirty
        String normalizedMimeType = urlMetadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)
                ? urlMetadata.getContentType()
                : "text/html";

        metadata.addProperty("mime_type", normalizedMimeType);
        crawlMeta.setMimeType(normalizedMimeType);

        // get download size ...
        int downloadSize = value.getContentRaw().getCount();

        // set original content len ...
        metadata.addProperty("download_size", downloadSize);
        crawlMeta.setDownloadSize(downloadSize);

        // set truncation flag
        if ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
            metadata.addProperty("download_truncated", true);
            crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.Download_Truncated);
        }

        if (downloadSize > 0) {
            // get content type, charset and encoding
            String encoding = finalHeaders.findValue("Content-Encoding");
            boolean isGZIP = false;
            if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
                isGZIP = true;
            }

            // contentBytes / contentLen track the bytes all later stages operate on;
            // they are replaced by the gunzipped data (or cleared) in the gzip branch
            byte[] contentBytes = value.getContentRaw().getReadOnlyBytes();
            int contentLen = value.getContentRaw().getCount();

            // assume we are going to output original data ...
            contentOut = new FlexBuffer(contentBytes, 0, contentLen);

            if (isGZIP) {
                metadata.addProperty("content_is_gzip", isGZIP);
                crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.ContentWas_GZIP);

                UnzipResult unzipResult = null;
                try {
                    // LOG.info("BEFORE GUNZIP");
                    unzipResult = GZIPUtils.unzipBestEffort(contentBytes, 0, contentLen,
                            CrawlEnvironment.GUNZIP_SIZE_LIMIT);
                } catch (Exception e) {
                    // an exception leaves unzipResult null, which is handled as a
                    // gunzip failure by the else branch below
                    LOG.error(CCStringUtils.stringifyException(e));
                }

                if (unzipResult != null && unzipResult.data != null) {

                    // a truncated result is still used; it is only logged and counted
                    if (unzipResult.wasTruncated) {
                        LOG.warn("Truncated Document During GZIP:" + finalURL);
                        reporter.incrCounter(Counters.GUNZIP_DATA_TRUNCATED, 1);
                    }

                    contentBytes = unzipResult.data.get();
                    contentLen = unzipResult.data.getCount();

                    metadata.addProperty("gunzip_content_len", unzipResult.data.getCount());
                    crawlMeta.setGunzipSize(unzipResult.data.getCount());

                    // update content out ... (callers receive the decompressed bytes)
                    contentOut = new FlexBuffer(contentBytes, 0, contentLen);
                } else {

                    metadata.addProperty("gunzip_failed", true);
                    crawlMeta.setFlags(crawlMeta.getFlags() | CrawlMetadata.Flags.GUNZIP_Failed);

                    reporter.incrCounter(Counters.GUNZIP_FAILED, 1);

                    // clearing these skips the md5 / decode / parse stages below and
                    // makes the returned content buffer null
                    contentBytes = null;
                    contentLen = 0;

                    contentOut = null;
                }
                // LOG.info("AFTER GUNZIP");
            }

            if (contentBytes != null) {

                // ok compute an md5 hash (over the gunzipped bytes when gunzip succeeded,
                // otherwise over the raw downloaded bytes)
                MD5Hash md5Hash = MD5Hash.digest(contentBytes, 0, contentLen);

                metadata.addProperty("md5", md5Hash.toString());
                crawlMeta.setMd5(new FlexBuffer(md5Hash.getDigest(), 0, md5Hash.getDigest().length));
                // get normalized mime type; non-text types stop here with only the md5 recorded
                if (MimeTypeFilter.isTextType(normalizedMimeType)) {
                    // ok time to decode the data into ucs2 ...
                    // result layout as used below: e0.e0 = detector id, e0.e1 = charset, e1 = decoded text
                    Pair<Pair<Integer, Charset>, String> decodeResult = CharsetUtils
                            .bestEffortDecodeBytes(value.getHeaders(), contentBytes, 0, contentLen);
                    // ok write out decode metadata
                    metadata.addProperty("charset_detected", decodeResult.e0.e1.toString());
                    crawlMeta.setCharsetDetected(decodeResult.e0.e1.toString());
                    metadata.addProperty("charset_detector", decodeResult.e0.e0);
                    crawlMeta.setCharsetDetector(decodeResult.e0.e0);
                    // get the content
                    String textContent = decodeResult.e1;
                    // compute simhash (done for every text-type document, before any parse attempt)
                    long simhash = SimHash.computeOptimizedSimHashForString(textContent);
                    metadata.addProperty("text_simhash", simhash);
                    crawlMeta.setTextSimHash(simhash);

                    // figure out simplified mime type ...
                    MimeTypeDisposition mimeTypeDisposition = MimeTypeFilter
                            .checkMimeTypeDisposition(normalizedMimeType);

                    // set once a parser has produced a result, so later fallbacks are skipped
                    boolean parseComplete = false;

                    // parser result as used below: e0 = JSON content (may be null), e1 = extracted text
                    Pair<JsonObject, String> tupleOut = null;

                    // write it out
                    if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_HTML) {
                        // ok parse as html
                        tupleOut = parseHTMLDocument(finalURL, value.getHeaders(),
                                new FlexBuffer(contentBytes, 0, contentLen), crawlMeta.getHtmlContent(), reporter);

                        if (tupleOut == null) {
                            reporter.incrCounter(Counters.FAILED_TO_PARSE_HTML, 1);
                            LOG.error("Unable to Parse as HTML:" + finalURL.toString());
                            // downgrade to plain text; this also means the feed / xml
                            // attempts below are not tried for this document
                            mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                        } else {
                            reporter.incrCounter(Counters.PARSED_HTML_DOC, 1);
                            metadata.addProperty("parsed_as", "html");
                            crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                            parseComplete = true;
                        }
                    }

                    if (!parseComplete && (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED
                            || mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML)) {

                        // ok try parse this document as a feed ...
                        tupleOut = parseFeedDocument(finalURL, value.getHeaders(), textContent,
                                crawlMeta.getFeedContent(),
                                ((value.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0), reporter);

                        if (tupleOut == null) {
                            // only the FEED disposition is counted here; a failed XML
                            // document is counted by the ACCEPT_XML check further down
                            if (mimeTypeDisposition == MimeTypeDisposition.ACCEPT_FEED) {
                                reporter.incrCounter(Counters.FAILED_TO_PARSE_FEED_URL, 1);
                                //TODO:HACK
                                //LOG.info("Failed to Parse:" + finalURL + " RawContentLen:" + value.getContentRaw().getCount() + " ContentLen:" + contentLen + " Metadata:" + metadata.toString());
                                mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                            }
                        } else {
                            reporter.incrCounter(Counters.PARSED_FEED_URL, 1);
                            metadata.addProperty("parsed_as", "feed");
                            // NOTE(review): the JSON says "feed" but the record is marked
                            // ParsedAs.HTML - looks like a copy/paste slip; confirm whether
                            // CrawlMetadata.ParsedAs has a feed constant that belongs here
                            crawlMeta.setParsedAs(CrawlMetadata.ParsedAs.HTML);
                            parseComplete = true;
                        }
                    }

                    // an XML document that did not parse as a feed is counted and downgraded to text
                    if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_XML) {
                        reporter.incrCounter(Counters.FAILED_TO_PARSE_XML_AS_FEED, 1);
                        mimeTypeDisposition = MimeTypeDisposition.ACCEPT_TEXT;
                    }
                    // plain-text handling is currently a no-op: no text is emitted for
                    // documents that end up with the ACCEPT_TEXT disposition
                    if (!parseComplete && mimeTypeDisposition == MimeTypeDisposition.ACCEPT_TEXT) {
                        // LOG.info("Identified URL" + finalURL + " w/ mimetype:" + normalizedMimeType + " as text");
                        // TODO: FIX THIS BUT PUNT FOR NOW :-(
                        //tupleOut = new Pair<JsonObject,String>(null,textContent);
                    }

                    // publish whatever the successful parser produced
                    if (tupleOut != null) {
                        if (tupleOut.e0 != null) {
                            metadata.add("content", tupleOut.e0);
                        }
                        textOut = tupleOut.e1;
                    }
                }
            }
        }
        // headers are always returned; text and content may each be null (see above)
        return new Pair<String, Pair<TextBytes, FlexBuffer>>(textOut,
                new Pair<TextBytes, FlexBuffer>(value.getHeadersAsTextBytes(), contentOut));
    }

    /**
     * Copies a single member from one JSON object to another, doing nothing when the
     * member cannot be obtained.
     *
     * <p>The copy is skipped when {@code sourceObj} is {@code null}, when it is not a
     * JSON object, or when looking up {@code sourceProperty} on it yields {@code null}.
     *
     * @param destinationObj      object that receives the member
     * @param destinationProperty name under which the member is stored on the destination
     * @param sourceObj           element expected to be a JSON object; may be {@code null}
     * @param sourceProperty      name of the member to read from the source
     * @throws IOException declared by the signature; nothing in this method throws it directly
     */
    static void safeSetJsonPropertyFromJsonProperty(JsonObject destinationObj, String destinationProperty,
            JsonElement sourceObj, String sourceProperty) throws IOException {
        // nothing to read from unless the source is an actual JSON object
        if (sourceObj == null || !sourceObj.isJsonObject()) {
            return;
        }

        JsonElement candidate = sourceObj.getAsJsonObject().get(sourceProperty);
        if (candidate == null) {
            return;
        }

        destinationObj.add(destinationProperty, candidate);
    }

    /**
     * Converts one crawl record into a {@link ParseOutput} and emits it keyed by the
     * document's final URL.
     *
     * <p>The record's metadata is built twice in parallel: as a JSON object (serialized
     * into the output's metadata string) and as the output's {@code CrawlMetadata}
     * record. Records with an empty or malformed URL are logged and dropped. For
     * successful 2xx fetches with a non-empty body, content metadata, extracted text and
     * raw content are added via {@code populateContentMetadata}.
     *
     * <p>Exceptions raised inside the main try block are logged and counted but not
     * rethrown, so a bad record does not fail the task.
     *
     * @param url      the crawled URL (the key of the input record)
     * @param value    the crawl record for that URL
     * @param output   collector that receives the (final URL, parse output) pair
     * @param reporter receives counter increments
     * @throws IOException declared by the signature; exceptions thrown inside the try
     *         block are caught and swallowed below
     */
    @Override
    public void map(Text url, CrawlURL value, OutputCollector<Text, ParseOutput> output, Reporter reporter)
            throws IOException {

        // an empty key cannot be turned into a URL; log it and drop the record
        if (url.getLength() == 0) {
            LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
            return;
        }

        try {
            // allocate parse output
            ParseOutput parseOutput = new ParseOutput();
            // json object out ...
            JsonObject jsonObj = new JsonObject();
            // and create a crawl metadata
            CrawlMetadata metadata = parseOutput.getCrawlMetadata();

            // and content (if available) ...
            // layout: e0 = extracted text, e1.e0 = headers, e1.e1 = raw content
            Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;

            URL originalURL = null;

            try {
                originalURL = new URL(url.toString());
            } catch (MalformedURLException e) {
                // unparseable key: count it and drop the record without emitting anything
                LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
                reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
                return;
            }

            // replaced by the redirect target below when the record was redirected
            URL finalURL = originalURL;

            jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
            metadata.setAttemptTime(value.getLastAttemptTime());

            // first step write status
            // anything other than SUCCESS is reported as "FAILURE" / disposition byte 1
            jsonObj.addProperty("disposition",
                    (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
            metadata.setCrawlDisposition(
                    (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

            // deal with redirects ...
            // the helper returns the final URL (e0) and a JSON description stored under "redirect_from" (e1)
            if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
                Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
                jsonObj.add("redirect_from", redirect.e1);
                finalURL = redirect.e0;
            }

            // NOTE(review): this tests == FAILURE while the disposition above tests
            // != SUCCESS; if CrawlResult has further values they take the else branch -
            // confirm that is intended
            if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
                jsonObj.addProperty("failure_reason",
                        CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
                metadata.setFailureReason(value.getLastAttemptFailureReason());
                jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
                metadata.setFailureDetail(value.getLastAttemptFailureDetail());
            } else {
                jsonObj.addProperty("server_ip", IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
                metadata.setServerIP(value.getServerIP());
                jsonObj.addProperty("http_result", value.getResultCode());
                metadata.setHttpResult(value.getResultCode());
                jsonObj.add("http_headers",
                        httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
                metadata.setHttpHeaders(value.getHeaders());
                jsonObj.addProperty("content_len", value.getContentRaw().getCount());
                metadata.setContentLength(value.getContentRaw().getCount());
                // content is only inspected for 2xx responses that carry a body
                if (value.getResultCode() >= 200 && value.getResultCode() <= 299
                        && value.getContentRaw().getCount() > 0) {
                    contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
                }
            }

            // ok ... write stuff out ...
            reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);
            //////////////////////////////////////////////////////////////
            // echo some stuff to parseOutput ...
            // the JSON is serialized here, after populateContentMetadata has added its
            // properties; the fields below are then read back out of that same JSON object
            parseOutput.setMetadata(jsonObj.toString());
            JsonElement mimeType = jsonObj.get("mime_type");
            if (mimeType != null) {
                parseOutput.setNormalizedMimeType(mimeType.getAsString());
            }
            JsonElement md5 = jsonObj.get("md5");
            if (md5 != null) {
                // rebuild the digest bytes from the string form stored in the JSON
                MD5Hash hash = new MD5Hash(md5.getAsString());
                byte[] bytes = hash.getDigest();
                parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
            }
            JsonElement simHash = jsonObj.get("text_simhash");
            if (simHash != null) {
                parseOutput.setSimHash(simHash.getAsLong());
            }
            // host ip and fetch time are set for failed fetches as well
            parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
            parseOutput.setFetchTime(value.getLastAttemptTime());
            ////////////////////////////////////////////////////////////

            if (contentOut != null) {
                if (contentOut.e0 != null) {
                    parseOutput.setTextContent(contentOut.e0);
                    reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
                }
                if (contentOut.e1 != null) {

                    // directly set the text bytes ...
                    parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
                    // mark it dirty !!!
                    // (the buffer was mutated in place rather than through a setter)
                    parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
                    // if content available ...
                    if (contentOut.e1.e1 != null) {
                        parseOutput.setRawContent(contentOut.e1.e1);
                    }
                    // NOTE(review): this counter is incremented even when the raw content
                    // buffer above was null (e.g. after a gunzip failure) - confirm intent
                    reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
                }
            }

            //buildCompactMetadata(parseOutput,jsonObj,urlMap);

            // the record is keyed by the final (post-redirect) URL, not the input key
            output.collect(new Text(finalURL.toString()), parseOutput);
        } catch (IOException e) {
            // deliberately swallowed: log, count and move on to the next record
            LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
            reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
            //TODO:HACK
            //throw e;
        } catch (Exception e) {
            // deliberately swallowed as well, under a separate counter
            LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
            reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
            //TODO: HACK
            //throw new IOException(e);
        }
    }

    /**
     * Job configuration hook; the only thing it does is log the value of the
     * {@code LD_LIBRARY_PATH} environment variable. The job configuration itself
     * is not read.
     *
     * @param job the job configuration (unused)
     */
    @Override
    public void configure(JobConf job) {
        // a missing variable is logged as the string "null"
        final String libraryPath = System.getenv().get("LD_LIBRARY_PATH");
        LOG.info("LIBRARY PATH:" + libraryPath);
    }

    /**
     * Intentionally a no-op: the body is empty, so nothing is released or flushed here.
     *
     * @throws IOException declared by the signature; never thrown by this implementation
     */
    @Override
    public void close() throws IOException {

    }

    /**
     * Do-nothing {@link Reporter} used by the test driver in {@code main}.
     *
     * <p>All notification methods are empty and all lookup methods return
     * {@code null}, so callers must not dereference what the lookups return.
     */
    private static class MockReporter implements Reporter {

        // ---- notifications: silently ignored ----

        @Override
        public void progress() {
            // no-op
        }

        @Override
        public void setStatus(String status) {
            // no-op
        }

        @Override
        public void incrCounter(Enum<?> key, long amount) {
            // no-op
        }

        @Override
        public void incrCounter(String group, String counter, long amount) {
            // no-op
        }

        // ---- lookups: nothing is tracked, so there is nothing to return ----

        @Override
        public Counter getCounter(Enum<?> name) {
            return null;
        }

        @Override
        public Counter getCounter(String group, String name) {
            return null;
        }

        @Override
        public InputSplit getInputSplit() throws UnsupportedOperationException {
            return null;
        }
    }

    /**
     * Ad-hoc test driver: reads every (URL, crawl record) pair from a crawl-log
     * sequence file, runs it through {@link #map} with a {@link MockReporter}, and
     * logs the records whose metadata reports {@code parsed_as == "feed"}.
     *
     * <p>When no path argument is supplied, a usage line is printed to standard
     * error and the JVM exits with status 1.
     *
     * @param args {@code args[0]} is the path of the crawl-log sequence file to read
     * @throws IOException if the file system or the sequence file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        // fail with a readable message instead of an ArrayIndexOutOfBoundsException
        if (args.length < 1) {
            System.err.println("Usage: ParserMapper <path-to-crawl-log-sequence-file>");
            System.exit(1);
        }

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path pathToCrawlLog = new Path(args[0]);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, pathToCrawlLog, conf);

        try {
            // key / value instances are reused for every record read
            Text url = new Text();
            CrawlURL urlData = new CrawlURL();

            ParserMapper mapper = new ParserMapper();
            MockReporter reporter = new MockReporter();
            final JsonParser parser = new JsonParser();

            // the collector holds no per-record state, so a single instance serves the whole run
            OutputCollector<Text, ParseOutput> collector = new OutputCollector<Text, ParseOutput>() {

                @Override
                public void collect(Text key, ParseOutput value) throws IOException {

                    // time how long it takes to re-parse the emitted metadata JSON
                    long timeStart = System.currentTimeMillis();
                    JsonObject metadata = parser.parse(new JsonReader(new StringReader(value.getMetadata())))
                            .getAsJsonObject();
                    long timeEnd = System.currentTimeMillis();

                    if (metadata.has("parsed_as")) {
                        if (metadata.get("parsed_as").getAsString().equalsIgnoreCase("feed")) {
                            // map() stores redirect information under "redirect_from"
                            LOG.info("Got FEED for URL:" + key.toString() + " Parse Took:" + (timeEnd - timeStart)
                                    + " Redirect:" + metadata.get("redirect_from"));
                            //LOG.info("FEED METADATA:" + metadata.toString());
                        }
                    }

                    //LOG.info("Key:" + key.toString() + " Metadata Size:" + value.getMetadataAsTextBytes().getLength());
                    //LOG.info("Key:" + key.toString() + " Text-Size" + value.getTextContentAsTextBytes().getLength());
                    //LOG.info("Key:" + key.toString() + " RAW-Size" + value.getRawContent().getCount());
                }
            };

            while (reader.next(url, urlData)) {
                mapper.map(url, urlData, collector, reporter);
            }
        } finally {
            // release the reader even when reading or mapping throws
            reader.close();
        }
    }
}