com.cyberway.issue.io.warc.WARCRecord.java Source code

Introduction

Here is the source code for com.cyberway.issue.io.warc.WARCRecord.java
Source

/* $Id: WARCRecord.java 4566 2006-08-31 16:51:41Z stack-sf $
 *
 * Created on August 25th, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.cyberway.issue.io.warc;

import it.unimi.dsi.fastutil.io.RepositionableStream;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import com.cyberway.issue.io.ArchiveRecord;
import com.cyberway.issue.io.ArchiveRecordHeader;

/**
 * A WARC file Record.
 *
 * <p>On construction (unless a pre-parsed header is supplied) the WARC magic
 * line and the named header fields are read directly off the underlying
 * stream and exposed through an {@link ArchiveRecordHeader}.
 *
 * @author stack
 */
public class WARCRecord extends ArchiveRecord implements WARCConstants {
    /**
     * Matches a single whitespace character.  Compiled once and shared by all
     * instances (a compiled <code>Pattern</code> is immutable and safe to
     * share) rather than being recompiled for every record.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /**
     * Constructor.  Delegates to the five-argument constructor with
     * digesting enabled and strict parsing disabled.
     *
     * @param in Stream cue'd up to be at the start of the record this instance
     * is to represent.
     * @param identifier Identifier for the hosting Reader.
     * @param offset Current offset into <code>in</code>.
     * @throws IOException If the WARC header cannot be read.
     */
    public WARCRecord(InputStream in, final String identifier, final long offset) throws IOException {
        this(in, identifier, offset, true, false);
    }

    /**
     * Constructor.  Uses the supplied, already parsed header; nothing is
     * parsed off <code>in</code> by this class.  The superclass is invoked
     * with digesting enabled and strict parsing disabled.
     *
     * @param in Stream cue'd up just past Header Line and Named Fields.
     * @param headers Header Line and ANVL Named fields.
     * @throws IOException
     */
    public WARCRecord(InputStream in, ArchiveRecordHeader headers) throws IOException {
        super(in, headers, 0, true, false);
    }

    /**
     * Constructor.  Parses the header off <code>in</code> and installs it on
     * this record.
     *
     * @param in Stream cue'd up to be at the start of the record this instance
     * is to represent or, if <code>headers</code> is not null, just past the
     * Header Line and Named Fields.
     * @param identifier Identifier for this the hosting Reader.
     * @param offset Current offset into <code>in</code> (Used to keep
     * <code>position</code> properly aligned).  Usually 0.
     * @param digest True if we're to calculate digest for this record.  Not
     * digesting saves about ~15% of cpu during parse.
     * @param strict Be strict parsing (Parsing stops if file inproperly
     * formatted).
     * @throws IOException If the WARC header cannot be read.
     */
    public WARCRecord(final InputStream in, final String identifier, final long offset, boolean digest,
            boolean strict) throws IOException {
        super(in, null, 0, digest, strict);
        setHeader(parseHeaders(in, identifier, offset, strict));
    }

    /**
     * Parse WARC Header Line and Named Fields.
     *
     * <p>The returned header's map holds the reader identifier and absolute
     * offset passed in, plus one entry per named field read from the stream.
     * When <code>in</code> is a <code>RepositionableStream</code> the number
     * of bytes consumed by the header is measured and reported as the content
     * begin offset; otherwise that value is -1.
     *
     * @param in Stream to read.
     * @param identifier Identifier for the hosting Reader.
     * @param offset Absolute offset into Reader.
     * @param strict Whether to be loose parsing or not (currently not
     * consulted by this method).
     * @return An ArchiveRecordHeader.
     * @throws IOException If the stream ends before a first line is read, or
     * the first line does not start with the WARC magic.
     */
    protected ArchiveRecordHeader parseHeaders(final InputStream in, final String identifier, final long offset,
            final boolean strict) throws IOException {
        final Map<Object, Object> m = new HashMap<Object, Object>();
        m.put(ABSOLUTE_OFFSET_KEY, Long.valueOf(offset));
        m.put(READER_IDENTIFIER_FIELD_KEY, identifier);

        long startPosition = -1;
        if (in instanceof RepositionableStream) {
            startPosition = ((RepositionableStream) in).position();
        }
        // readLine returns null at end of stream.  Do not wrap the result in
        // new String(...): that would throw a NullPointerException before the
        // check below had a chance to report the problem as an IOException.
        final String firstLine = HttpParser.readLine(in, WARC_HEADER_ENCODING);
        if (firstLine == null || firstLine.length() <= 0) {
            throw new IOException("Failed to read WARC_MAGIC");
        }
        if (!firstLine.startsWith(WARC_MAGIC)) {
            throw new IOException("Failed to find WARC MAGIC: " + firstLine);
        }
        // Here we start reading off the inputstream but we're reading the
        // stream direct rather than going via WARCRecord#read.  The latter will
        // keep count of bytes read, digest and fail properly if EOR too soon...
        // We don't want digesting while reading Headers.
        //
        final Header[] parsed = HttpParser.parseHeaders(in, WARC_HEADER_ENCODING);
        for (Header field : parsed) {
            // A repeated field name overwrites the earlier value.
            m.put(field.getName(), field.getValue());
        }
        // Stays -1 when the stream cannot report its position.
        int headerLength = -1;
        if (in instanceof RepositionableStream) {
            headerLength = (int) (((RepositionableStream) in).position() - startPosition);
        }
        final int contentOffset = headerLength;
        // NOTE(review): for a non-repositionable stream this passes -1 on to
        // incrementPosition (and getLength below adds the same -1); kept as
        // is -- confirm against ArchiveRecord's position bookkeeping.
        incrementPosition(contentOffset);

        return new ArchiveRecordHeader() {
            private Map<Object, Object> headers = m;
            private int contentBegin = contentOffset;

            public String getDate() {
                return (String) this.headers.get(HEADER_KEY_DATE);
            }

            public String getDigest() {
                return null;
                // TODO: perhaps return block-digest? 
                // superclass def implies this is calculated ("only after
                // read in totality"), not pulled from header
                //            return (String)this.headers.get(HEADER_KEY_CHECKSUM);
            }

            public String getReaderIdentifier() {
                return (String) this.headers.get(READER_IDENTIFIER_FIELD_KEY);
            }

            public Set getHeaderFieldKeys() {
                return this.headers.keySet();
            }

            public Map getHeaderFields() {
                return this.headers;
            }

            public Object getHeaderValue(String key) {
                return this.headers.get(key);
            }

            /**
             * @return Declared content length plus the measured header
             * length, or -1 if no content length field is present.
             */
            public long getLength() {
                Object o = this.headers.get(CONTENT_LENGTH);
                if (o == null) {
                    return -1;
                }
                long contentLength = (o instanceof Long) ? ((Long) o).longValue() : Long.parseLong((String) o);
                return contentLength + contentOffset;
            }

            public String getMimetype() {
                return (String) this.headers.get(CONTENT_TYPE);
            }

            /**
             * @return The absolute offset recorded at parse time, or -1 if
             * the entry is missing from the header map.
             */
            public long getOffset() {
                Object o = this.headers.get(ABSOLUTE_OFFSET_KEY);
                if (o == null) {
                    return -1;
                }
                return (o instanceof Long) ? ((Long) o).longValue() : Long.parseLong((String) o);
            }

            public String getRecordIdentifier() {
                return (String) this.headers.get(RECORD_IDENTIFIER_FIELD_KEY);
            }

            public String getUrl() {
                return (String) this.headers.get(HEADER_KEY_URI);
            }

            public String getVersion() {
                return (String) this.headers.get(VERSION_FIELD_KEY);
            }

            public int getContentBegin() {
                return this.contentBegin;
            }

            @Override
            public String toString() {
                return this.headers.toString();
            }
        };
    }

    /**
     * Returns the superclass's CDX mimetype with every whitespace character
     * removed.
     *
     * @param h Header to take the mimetype from.
     * @return The squashed mimetype, or <code>null</code> if the superclass
     * supplied none.
     */
    @Override
    protected String getMimetype4Cdx(ArchiveRecordHeader h) {
        final String m = super.getMimetype4Cdx(h);
        if (m == null) {
            // No mimetype available: hand the null back rather than letting
            // Pattern#matcher fail with a NullPointerException.
            return null;
        }
        // Mimetypes can have spaces in WARCs.  Emitting for CDX, just
        // squash them for now.  Later, quote them since squashing spaces won't
        // work for params that have quoted-string values.
        Matcher matcher = WHITESPACE.matcher(m);
        return matcher.replaceAll("");
    }
}