org.commoncrawl.util.HttpHeaderInfoExtractor.java Source code

Introduction

Here is the source code for org.commoncrawl.util.HttpHeaderInfoExtractor.java
Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.text.SimpleDateFormat;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.util.DateUtils.DateParser;
import org.commoncrawl.util.Tuples.Pair;
import org.junit.Test;

import com.google.common.collect.ImmutableSet;

/** 
 * uses http header information to populate http cache related information into the CrawlURLMetadata data structure 
 * 
 * @author rana
 */

public class HttpHeaderInfoExtractor {

    private static final Log LOG = LogFactory.getLog(HttpHeaderInfoExtractor.class);

    public static void parseHeaders(NIOHttpHeaders headers, CrawlURLMetadata metadataInOut) throws IOException {
        parseStatusLine(headers, metadataInOut);
        parseContentType(headers, metadataInOut);
        parseContentLength(headers, metadataInOut);
        populateETag(headers, metadataInOut);
        populateAgeValue(headers, metadataInOut);
        populateDateValue(headers, metadataInOut);
        populateLastModifiedValue(headers, metadataInOut);
        populateExpiresValue(headers, metadataInOut);
        populateCacheControlFlags(headers, metadataInOut);
    }

    public static void parseStatusLine(NIOHttpHeaders headers, CrawlURLMetadata metadata) {

        String responseLine = headers.getValue(0);

        parseStatusLine(responseLine, metadata);

    }

    public static void parseStatusLine(String responseLine, CrawlURLMetadata metadata) {
        Pair<Integer, Integer> result = parseStatusLine(responseLine);
        if (result.e1 != 0)
            metadata.setHttpResponseFlags((byte) result.e1.byteValue());
        metadata.setHttpResultCode(result.e0.intValue());
    }

    public static Pair<Integer, Integer> parseStatusLine(String responseLine) {

        Pair<Integer, Integer> resultOut = new Pair<Integer, Integer>(200, 0);
        if (responseLine == null || responseLine.length() < 4) {
            resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.HEADER_MISSING;
        } else {
            responseLine = responseLine.toLowerCase();
            if (!responseLine.startsWith("http")) {
                resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.HEADER_MISSING;
            } else {
                boolean versionValid = false;

                if (responseLine.length() > 4 || responseLine.charAt(4) == '/') {
                    int indexOfDot = responseLine.indexOf(".", 5);
                    if (indexOfDot != -1 && indexOfDot != 5 || indexOfDot + 1 < responseLine.length()) {
                        char majorVersionChar = responseLine.charAt(5);
                        char minorVersionChar = responseLine.charAt(indexOfDot + 1);
                        if (majorVersionChar >= '0' && majorVersionChar <= '9' && minorVersionChar >= '0'
                                && minorVersionChar <= '9') {
                            int majorVersion = majorVersionChar - '0';
                            int minorVersion = minorVersionChar - '0';
                            if (majorVersion == 1 && minorVersion == 0) {
                                resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.VERSION_1_0;
                            } else if (majorVersion == 1 && minorVersion == 1) {
                                resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.VERSION_1_1;
                            } else {
                                resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.VERSION_0_9;
                            }
                            versionValid = true;

                            // now skip past
                            int spaceIndex = responseLine.indexOf(' ', indexOfDot + 1);
                            if (spaceIndex + 1 < responseLine.length()) {
                                int digitStart = spaceIndex + 1;
                                int digitEnd = digitStart;
                                while (digitEnd < responseLine.length()) {
                                    char c = responseLine.charAt(digitEnd);
                                    if (c >= '0' && c <= '9')
                                        ++digitEnd;
                                    else
                                        break;
                                }
                                if (digitEnd - digitStart != 0) {
                                    try {
                                        resultOut.e0 = Integer
                                                .parseInt(responseLine.substring(digitStart, digitEnd));
                                    } catch (NumberFormatException e) {

                                    }
                                }
                            }
                        }
                    }
                }
                if (!versionValid) {
                    resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.VERSION_MISSING;
                }
            }
        }
        return resultOut;
    }

    static void populateETag(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
        String etagValue = headers.findValue("Etag");
        if (etagValue != null) {
            metadata.setETag(etagValue);
        }
    }

    static void populateAgeValue(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
        String ageValue = headers.findValue("Age");
        if (ageValue != null) {
            try {
                long ageInSeconds = Long.parseLong(ageValue);
                metadata.setAge(ageInSeconds);
            } catch (NumberFormatException e) {

            }
        }
    }

    static void populateDateValue(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
        long timeValue = getTimeHeaderValue("Date", headers);
        if (timeValue != -1) {
            metadata.setHttpDate(timeValue);
        }
    }

    static void populateLastModifiedValue(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
        long timeValue = getTimeHeaderValue("Last-Modified", headers);
        if (timeValue != -1) {
            metadata.setLastModifiedTime(timeValue);
        }
    }

    static void populateExpiresValue(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
        long timeValue = getTimeHeaderValue("Expires", headers);
        if (timeValue != -1) {
            metadata.setExpires(timeValue);
        }
    }

    static final String kMaxAgePrefix = "max-age=";

    static void populateCacheControlFlags(NIOHttpHeaders headers, CrawlURLMetadata metadataInOut)
            throws IOException {
        Iterator<String> i = headers.multiValueIterator("cache-control");
        while (i.hasNext()) {
            String ccValue = i.next();

            StringTokenizer tokenizer = new StringTokenizer(ccValue, ",");

            while (tokenizer.hasMoreElements()) {

                String value = tokenizer.nextToken();

                if (value.equals("no-cache")) {
                    metadataInOut.setCacheControlFlags((byte) (metadataInOut.getCacheControlFlags()
                            | CrawlURLMetadata.CacheControlFlags.NO_CACHE));
                } else if (value.equals("no-store")) {
                    metadataInOut.setCacheControlFlags((byte) (metadataInOut.getCacheControlFlags()
                            | CrawlURLMetadata.CacheControlFlags.NO_STORE));
                } else if (value.equals("must-revalidate")) {
                    metadataInOut.setCacheControlFlags((byte) (metadataInOut.getCacheControlFlags()
                            | CrawlURLMetadata.CacheControlFlags.NO_STORE));
                } else if (value.equals("private")) {
                    metadataInOut.setCacheControlFlags((byte) (metadataInOut.getCacheControlFlags()
                            | CrawlURLMetadata.CacheControlFlags.PRIVATE));
                }

                else {
                    if (value.length() > kMaxAgePrefix.length()) {
                        String valueLowerCase = value.toLowerCase();
                        if (valueLowerCase.startsWith(kMaxAgePrefix)) {
                            try {
                                long maxAgeInSeconds = Long.parseLong(value.substring(kMaxAgePrefix.length()));
                                metadataInOut.setMaxAge(maxAgeInSeconds);
                            } catch (NumberFormatException e) {

                            }
                        }
                    }
                }
            }
        }
        Iterator<String> j = headers.multiValueIterator("pragma");
        while (j.hasNext()) {
            String value = j.next();
            if (value.equals("no-cache")) {
                metadataInOut.setCacheControlFlags((byte) (metadataInOut.getCacheControlFlags()
                        | CrawlURLMetadata.CacheControlFlags.NO_CACHE));
            }
        }

        String varyValue = headers.findValue("vary");
        if (varyValue != null && varyValue.equals("*")) {
            metadataInOut.setCacheControlFlags(
                    (byte) (metadataInOut.getCacheControlFlags() | CrawlURLMetadata.CacheControlFlags.VARY));
        }
    }

    static void parseContentLength(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
        String contentLenValue = headers.findValue("Content-Length");
        if (contentLenValue != null) {
            try {
                metadata.setHttpContentLength(Integer.parseInt(contentLenValue));
            } catch (Exception e) {
            }
        }
    }

    static void parseContentType(NIOHttpHeaders headers, CrawlURLMetadata metadata) {

        Iterator<String> j = headers.multiValueIterator("content-type");

        while (j.hasNext()) {

            String contentType = j.next();

            if (contentType != null) {
                parseContentType(metadata, contentType);
            }
        }
    }

    public static final void parseContentType(CrawlURLMetadata metadataOut, String contentType) {
        //  Trim leading and trailing whitespace from type.  We include '(' in
        //  the trailing trim set to catch media-type comments, which are not at all
        //  standard, but may occur in rare cases.
        int type_val = HttpHeaderUtils.skipPastLWS(contentType, 0);
        type_val = Math.min(type_val, contentType.length());
        int type_end = HttpHeaderUtils.skipToLWSAndExtra(contentType, type_val);
        if (type_end == -1)
            type_end = contentType.length();

        int charset_val = 0;
        int charset_end = 0;

        //  Iterate over parameters
        boolean type_has_charset = false;
        int param_start = contentType.indexOf(';', type_end);
        if (param_start != -1) {
            //    We have parameters.  Iterate over them.
            int cur_param_start = param_start + 1;
            do {
                int cur_param_end = contentType.indexOf(';', cur_param_start);
                if (cur_param_end == -1)
                    cur_param_end = contentType.length();
                int param_name_start = HttpHeaderUtils.skipPastLWS(contentType, cur_param_start);
                param_name_start = Math.min(param_name_start, cur_param_end);
                int charset_end_offset = Math.min(param_name_start + HttpHeaderUtils.kCharset.length(),
                        cur_param_end);
                if (contentType.substring(param_name_start, charset_end_offset)
                        .equalsIgnoreCase(HttpHeaderUtils.kCharset)) {
                    charset_val = param_name_start + HttpHeaderUtils.kCharset.length();
                    charset_end = cur_param_end;
                    type_has_charset = true;
                }
                cur_param_start = cur_param_end + 1;
            } while (cur_param_start < contentType.length());
        }

        if (type_has_charset) {
            try {
                //    Trim leading and trailing whitespace from charset_val.  We include
                //    '(' in the trailing trim set to catch media-type comments, which are
                //    not at all standard, but may occur in rare cases.
                charset_val = HttpHeaderUtils.skipPastLWS(contentType, charset_val);
                charset_val = Math.min(charset_val, charset_end);
                if (charset_val == contentType.length()) {
                    type_has_charset = false;
                } else {
                    char first_char = contentType.charAt(charset_val);
                    if (first_char == '"' || first_char == '\'') {
                        ++charset_val;
                        charset_end = contentType.indexOf(first_char, charset_val);
                        if (charset_end == -1)
                            charset_end = HttpHeaderUtils.skipToLWSAndExtra(contentType, charset_val);
                    } else {
                        charset_end = Math.min(HttpHeaderUtils.skipToLWSAndExtra(contentType, charset_val),
                                charset_end);
                    }
                }
            } catch (IndexOutOfBoundsException e) {
                type_has_charset = false;
            }
        }

        //  if the server sent "*/*", it is meaningless, so do not store it.
        //  also, if type_val is the same as mime_type, then just update the
        //  charset.  however, if charset is empty and mime_type hasn't
        //  changed, then don't wipe-out an existing charset.  We
        //  also want to reject a mime-type if it does not include a slash.
        //  some servers give junk after the charset parameter, which may
        //  include a comma, so this check makes us a bit more tolerant.
        if (contentType.length() != 0 && !contentType.equals("*/*") && contentType.indexOf('/') != -1) {
            String originalContentType = metadataOut.getContentType();
            metadataOut.setContentType(contentType.substring(type_val, type_end).toLowerCase());

            if (type_has_charset) {
                metadataOut.setCharset(contentType.substring(charset_val, charset_end).toLowerCase());
            } else {
                if (metadataOut.getCharset().length() != 0
                        && !originalContentType.equals(metadataOut.getContentType())) {
                    metadataOut.setCharset("");
                    metadataOut.setFieldClean(CrawlURLMetadata.Field_CHARSET);
                }
            }
        }

    }

    private static long getTimeHeaderValue(String keyName, NIOHttpHeaders headers) {
        String value = headers.findValue(keyName);
        if (value != null) {
            return getTime(value);
        }
        return -1;
    }

    static String _datePatterns[] = new String[] { "EEE, dd-MMM-yyyy HH:mm:ss zzz", "EEE MMM dd HH:mm:ss yyyy",
            "EEE MMM dd HH:mm:ss yyyy zzz", "EEE, MMM dd HH:mm:ss yyyy zzz", "EEE, dd MMM yyyy HH:mm:ss zzz",
            "EEE,dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
            "EEE, dd-MMM-yy HH:mm:ss zzz", "EEE, dd-MMM-yy zzz", "EEE, dd MMM yyyy zzz", "EEE MMM dd yyyy zzz",
            "EEE, dd MMM yyyy HH:mm zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz", "yyyy/MM/dd HH:mm:ss.SSS",
            "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", "yyyy.MM.dd HH:mm:ss", "yyyy.MM.dd", "yyyy-MM-dd HH:mm",
            "yyyy-MM-dd HH:mm:ss", "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
            "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss",
            "dd.MM.yyyy zzz", "dd.MM.yyyy", "dd/MM/yyyy hh:mm:ss aa zzz", "dd/MM/yyyy hh:mm:ss aa",
            "dd/MM/yyyy HH:mm:ss zzz", "dd/MM/yyyy HH:mm:ss", "dd.MM.yyyy zzz" };

    static ThreadLocal<DateParser> _dateParser = new ThreadLocal<DateParser>() {
        protected DateParser initialValue() {
            return new DateParser(_datePatterns);
        };
    };

    static ThreadLocal<SimpleDateFormat> _httpDateParser = new ThreadLocal<SimpleDateFormat>() {

        @Override
        protected SimpleDateFormat initialValue() {
            return new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
        }
    };

    static ImmutableSet<String> badDatePatterns = new ImmutableSet.Builder<String>()

            .add("-1").add("0").add("GMT").add("now").add("Now()").build();

    static Pattern onlyDigits = Pattern.compile("[0-9]*");

    static Pattern specialTSMatcher = Pattern
            .compile("\\{\\s*ts\\s*'([0-9]{4})-([0-9]{2})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})'\\s*\\}");

    @SuppressWarnings("deprecation")
    public static long getTime(String date) {
        long time = -1;

        if (date != null) {
            date = date.trim();
            if (date.length() != 0 && !badDatePatterns.contains(date)) {

                try {

                    try {
                        if (onlyDigits.matcher(date).matches()) {
                            time = Long.parseLong(date);
                        }
                        Matcher specialTS = specialTSMatcher.matcher(date);
                        if (specialTS.matches()) {
                            time = new Date(Integer.parseInt(specialTS.group(1)), // year
                                    Integer.parseInt(specialTS.group(2)), // month
                                    Integer.parseInt(specialTS.group(3)), // day    
                                    Integer.parseInt(specialTS.group(4)), // hr
                                    Integer.parseInt(specialTS.group(5)), // min
                                    Integer.parseInt(specialTS.group(6))).getTime(); // ss
                        }
                    } catch (Exception e) {

                    }
                    if (time == -1) {
                        long timeStartForSimpleDateFormatParse = System.currentTimeMillis();
                        time = _httpDateParser.get().parse(date).getTime();
                        long timeEndForSimpleDateFormatParse = System.currentTimeMillis();
                        // LOG.info("#### Date Parse (MostCommon) Took:" + (timeEndForSimpleDateFormatParse - timeStartForSimpleDateFormatParse));
                    }
                } catch (Exception e) {

                    time = DateUtils.parseHttpDate(date);
                    if (time == -1) {
                        // try to parse it as date in alternative format
                        try {

                            long timeStartForNewParser = System.currentTimeMillis();
                            Date parsedDate = _dateParser.get().parseDate(date);
                            long timeEndForNewPaser = System.currentTimeMillis();
                            // LOG.info("#### Date Parse (New) Took:" + (timeEndForNewPaser - timeStartForNewParser));

                            time = parsedDate.getTime();
                            // if (LOG.isWarnEnabled()) {
                            //   LOG.warn(url + ": parsed date: " + date +" to:"+time);
                            // }
                        } catch (Exception e2) {
                            LOG.error("can't parse erroneous date: " + date);
                        }
                    }
                }
            }
        }
        return time;
    }

    @Test
    public void validateParser() throws Exception {
        validateCacheControlParser();
        validateContentTypeParser();
    }

    private void validateContentTypeParser() throws Exception {
        String sampleHeaders[] = { "HTTP/1.1 200 OK\n" + "Content-type: text/html\n", "text/html", "",
                // Multiple content-type headers should give us the last one.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html\n" + "Content-type: text/html\n", "text/html", "",
                "HTTP/1.1 200 OK\n" + "Content-type: text/plain\n" + "Content-type: text/html\n"
                        + "Content-type: text/plain\n" + "Content-type: text/html\n",
                "text/html", "",
                // Test charset parsing.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html\n" + "Content-type: text/html; charset=ISO-8859-1\n",
                "text/html", "iso-8859-1",
                // Test charset in double quotes.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html\n"
                        + "Content-type: text/html; charset=\"ISO-8859-1\"\n",
                "text/html", "iso-8859-1",
                // If there are multiple matching content-type headers, we carry
                // over the charset value.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html;charset=utf-8\n" + "Content-type: text/html\n",
                "text/html", "utf-8",
                // Test single quotes.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html;charset='utf-8'\n" + "Content-type: text/html\n",
                "text/html", "utf-8",
                // Last charset wins if matching content-type.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html;charset=utf-8\n"
                        + "Content-type: text/html;charset=iso-8859-1\n",
                "text/html", "iso-8859-1",
                // Charset is ignored if the content types change.
                "HTTP/1.1 200 OK\n" + "Content-type: text/plain;charset=utf-8\n" + "Content-type: text/html\n",
                "text/html", "",
                // Empty content-type
                "HTTP/1.1 200 OK\n" + "Content-type: \n", "", "",
                // Emtpy charset
                "HTTP/1.1 200 OK\n" + "Content-type: text/html;charset=\n", "text/html", "",
                // Multiple charsets, last one wins.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html;charset=utf-8; charset=iso-8859-1\n", "text/html",
                "iso-8859-1",
                // Multiple params.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html; foo=utf-8; charset=iso-8859-1\n", "text/html",
                "iso-8859-1", "HTTP/1.1 200 OK\n" + "Content-type: text/html ; charset=utf-8 ; bar=iso-8859-1\n",
                "text/html", "utf-8",
                // Comma embeded in quotes.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html ; charset='utf-8,text/plain' ;\n", "text/html",
                "utf-8,text/plain",
                // Charset with leading spaces.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html ; charset= 'utf-8' ;\n", "text/html", "utf-8",
                // Media type comments in mime-type.
                "HTTP/1.1 200 OK\n" + "Content-type: text/html (html)\n", "text/html", "",
                // Incomplete charset= param
                "HTTP/1.1 200 OK\n" + "Content-type: text/html; char=\n", "text/html", "",
                // Invalid media type: no slash
                "HTTP/1.1 200 OK\n" + "Content-type: texthtml\n", "", "",
                // Invalid media type: */*
                "HTTP/1.1 200 OK\n" + "Content-type: */*\n", "", ""

        };

        int testCount = sampleHeaders.length / 3;

        for (int i = 0; i < testCount; ++i) {
            String header = sampleHeaders[i * 3];
            String expectedContentType = sampleHeaders[(i * 3) + 1];
            String expectedCharsetType = sampleHeaders[(i * 3) + 2];

            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(header);
            CrawlURLMetadata metadata = new CrawlURLMetadata();
            System.out.println("****Original Header:" + header);
            System.out.println("Exepcted ContentType:" + expectedContentType);
            System.out.println("Exepcted Charset:" + expectedCharsetType);
            System.out.println("****Parsed Results:");
            parseContentType(headers, metadata);
            if (metadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)) {
                System.out.println("ContentType:" + metadata.getContentType());
                Assert.assertTrue(expectedContentType.length() == metadata.getContentType().length());
                if (expectedContentType.length() != 0) {
                    Assert.assertTrue(expectedContentType.equals(metadata.getContentType()));
                }

            } else {
                Assert.assertTrue(expectedContentType.length() == 0);
            }

            if (metadata.isFieldDirty(CrawlURLMetadata.Field_CHARSET)) {
                System.out.println("Charset:" + metadata.getCharset());
                Assert.assertTrue(expectedCharsetType.length() == metadata.getCharset().length());
                if (expectedCharsetType.length() != 0) {
                    Assert.assertTrue(expectedCharsetType.equals(metadata.getCharset()));
                }
            } else {
                Assert.assertTrue(expectedCharsetType.length() == 0);
            }
        }
    }

    private void validateCacheControlParser() throws Exception {
        String sampleHeaders[] = {

                "HTTP/1.1 200 OK\n" + "Etag: \"34534-d3 134q\"\n" + "\n",
                // valid for a little while
                "HTTP/1.1 200 OK\n" + "cache-control: max-age=10000\n" + "\n",
                // expires in the future
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 28 Nov 2007 01:00:00 GMT\n" + "\n",
                // expired already
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" + "\n",
                // max-age trumps expires
                "HTTP/1.1 200 OK\n" + "HTTP/1.1 200 OK\n" + "\n",
                // valid for a little while
                "HTTP/1.1 200 OK\n" + "cache-control: max-age=10000\n" + "\n",
                // expires in the future
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 28 Nov 2007 01:00:00 GMT\n" + "\n",
                // expired already
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" + "\n",
                // max-age trumps expires
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" + "cache-control: max-age=10000\n" + "\n",
                // last-modified heuristic: modified a while ago
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" + "\n",
                // last-modified heuristic: modified recently
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "last-modified: Wed, 28 Nov 2007 00:40:10 GMT\n" + "\n",
                // cached permanent redirect
                "HTTP/1.1 301 Moved Permanently\n" + "\n",
                // cached redirect: not reusable even though by default it would be
                "HTTP/1.1 300 Multiple Choices\n" + "Cache-Control: no-cache\n" + "\n",
                // cached forever by default
                "HTTP/1.1 410 Gone\n" + "\n",
                // cached temporary redirect: not reusable
                "HTTP/1.1 302 Found\n" + "\n",
                // cached temporary redirect: reusable
                "HTTP/1.1 302 Found\n" + "cache-control: max-age=10000\n" + "\n",
                // cache-control: max-age=N overrides expires: date in the past
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 28 Nov 2007 00:20:11 GMT\n" + "cache-control: max-age=10000\n" + "\n",
                // cache-control: no-store overrides expires: in the future
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 29 Nov 2007 00:40:11 GMT\n"
                        + "cache-control: no-store,private,no-cache=\"foo\"\n" + "\n",
                // pragma: no-cache overrides last-modified heuristic
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" + "pragma: no-cache\n" + "\n",
                "date: Wed, 28 Nov 2007 00:40:11 GMT\n" + "expires: Wed, 28 Nov 2007 00:00:00 GMT\n"
                        + "cache-control: max-age=10000\n" + "\n",
                // last-modified heuristic: modified a while ago
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" + "\n",
                // last-modified heuristic: modified recently
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "last-modified: Wed, 28 Nov 2007 00:40:10 GMT\n" + "\n",
                // cached permanent redirect
                "HTTP/1.1 301 Moved Permanently\n" + "\n",
                // cached redirect: not reusable even though by default it would be
                "HTTP/1.1 300 Multiple Choices\n" + "Cache-Control: no-cache\n" + "\n",
                // cached forever by default
                "HTTP/1.1 410 Gone\n" + "\n",
                // cached temporary redirect: not reusable
                "HTTP/1.1 302 Found\n" + "\n",
                // cached temporary redirect: reusable
                "HTTP/1.1 302 Found\n" + "cache-control: max-age=10000\n" + "\n",
                // cache-control: max-age=N overrides expires: date in the past
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 28 Nov 2007 00:20:11 GMT\n" + "cache-control: max-age=10000\n" + "\n",
                // cache-control: no-store overrides expires: in the future
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "expires: Wed, 29 Nov 2007 00:40:11 GMT\n"
                        + "cache-control: no-store,private,no-cache=\"foo\"\n" + "\n",
                // pragma: no-cache overrides last-modified heuristic
                "HTTP/1.1 200 OK\n" + "date: Wed, 28 Nov 2007 00:40:11 GMT\n"
                        + "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" + "pragma: no-cache\n" + "\n" };

        for (String header : sampleHeaders) {
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(header);
            CrawlURLMetadata metadata = new CrawlURLMetadata();
            System.out.println("****Original Header:" + header);
            System.out.println("****Parsed Results:");
            try {
                parseHeaders(headers, metadata);
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPRESPONSEFLAGS)) {
                    StringBuffer buffer = new StringBuffer();
                    buffer.append("ResponseFlags:");
                    if ((metadata.getHttpResponseFlags()
                            & CrawlURLMetadata.HTTPResponseFlags.HEADER_MISSING) != 0) {
                        buffer.append(",HeaderMissing");
                    }
                    if ((metadata.getHttpResponseFlags()
                            & CrawlURLMetadata.HTTPResponseFlags.VERSION_MISSING) != 0) {
                        buffer.append(",VersionMissing");
                    }
                    if ((metadata.getHttpResponseFlags() & CrawlURLMetadata.HTTPResponseFlags.VERSION_0_9) != 0) {
                        buffer.append(",Version0.9");
                    }
                    if ((metadata.getHttpResponseFlags() & CrawlURLMetadata.HTTPResponseFlags.VERSION_1_0) != 0) {
                        buffer.append(",Version1.0");
                    }
                    if ((metadata.getHttpResponseFlags() & CrawlURLMetadata.HTTPResponseFlags.VERSION_1_1) != 0) {
                        buffer.append(",Version1.1");
                    }
                    System.out.println(buffer.toString());
                }
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPRESULTCODE)) {
                    System.out.println("HttpResultCode:" + metadata.getHttpResultCode());
                }
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_ETAG)) {
                    System.out.println("ETag:" + metadata.getETag());
                }
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_AGE)) {
                    System.out.println("Age:" + metadata.getAge());
                }
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPDATE)) {
                    System.out.println("Date:" + metadata.getHttpDate());
                }
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_LASTMODIFIEDTIME)) {
                    System.out.println("Last-Modified:" + metadata.getLastModifiedTime());
                }
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_EXPIRES)) {
                    System.out.println("Expires:" + metadata.getExpires());
                }
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_MAXAGE)) {
                    System.out.println("MaxAge:" + metadata.getMaxAge());
                }
                if (metadata.isFieldDirty(CrawlURLMetadata.Field_CACHECONTROLFLAGS)) {
                    StringBuffer buffer = new StringBuffer();
                    buffer.append("CacheControl:");
                    if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.NO_CACHE) != 0)
                        buffer.append("no-cache,");
                    if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.NO_STORE) != 0)
                        buffer.append("no-store,");
                    if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.MUST_REVALIDATE) != 0)
                        buffer.append("must-revalidate,");
                    if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.VARY) != 0)
                        buffer.append("vary,");
                    if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.PRIVATE) != 0)
                        buffer.append("private,");

                    System.out.println(buffer.toString());
                }
            } catch (IOException e) {
                System.out.println(CCStringUtils.stringifyException(e));
            }
        }
    }

}