org.archive.nutchwax.tools.ArcReader.java Source code

Introduction

Here is the source code for org.archive.nutchwax.tools.ArcReader.java
Source

/*
 * Copyright (C) 2008 Internet Archive.
 * 
 * This file is part of the archive-access tools project
 * (http://sourceforge.net/projects/archive-access).
 * 
 * The archive-access tools are free software; you can redistribute them and/or
 * modify them under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or any
 * later version.
 * 
 * The archive-access tools are distributed in the hope that they will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License along with
 * the archive-access tools; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.nutchwax.tools;

import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import java.io.IOException;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;

import org.archive.io.arc.ARCConstants;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCRecord;

import org.apache.commons.httpclient.Header;

/**
 * <p>
 *   Reader of both ARC and WARC format archive files.  This is not a
 *   general-purpose archive file reader, but is written specifically
 *   for NutchWAX.  It's possible that this could become a
 *   general-purpose archive file reader, but for now, consider it
 *   custom-tailored to the needs of NutchWAX.
 * </p>
 * <p>
 *   <code>ArcReader</code> is a wrapper around the underlying
 *   <code>ArchiveReader</code> implementation
 *   (<code>ARCReader</code>/<code>WARCReader</code>) which converts
 *   <code>WARCRecord</code>s to <code>ARCRecord</code>s on the fly.
 * </p>
 * <p>
 *   If an <code>ARCReader</code> is being wrapped, then the
 *   underlying <code>ARCRecord</code>s are read and passed-through
 *   unmolested.
 * </p>
 * <p>
 *   If a <code>WARCReader</code> is being wrapped, then the
 *   <code>WARCRecord</code>s are converted to <code>ARCRecord</code>s
 *   on the fly.
 * </p>
 * <p>
 *   <strong>WARNING:</strong> We only convert WARC
 *   <code>response</code> records.  All other WARC record types are
 *   returned as <code>null</code> by the iterator's
 *   <code>next()</code> method.  So, when using the iterator, don't
 *   forget to check for a <code>null</code> value returned by
 *   <code>next()</code>.
 * </p>
 */
public class ArcReader implements Iterable<ARCRecord> {
    /** Length of a WARC date of the form <code>YYYY-MM-DDTHH:MM:SS</code> (any trailing "Z" is ignored). */
    private static final int WARC_DATE_MIN_LENGTH = 19;

    /** The wrapped reader whose records are exposed as <code>ARCRecord</code>s. */
    private final ArchiveReader reader;

    /**
     * Construct an <code>ArcReader</code> wrapping an
     * <code>ArchiveReader</code> instance.
     *
     * @param reader the ArchiveReader instance to wrap
     */
    public ArcReader(ArchiveReader reader) {
        this.reader = reader;
    }

    /**
     * Returns an iterator over <code>ARCRecord</code>s in the wrapped
     * <code>ArchiveReader</code>, converting <code>WARCRecords</code>
     * to <code>ARCRecords</code> on-the-fly.
     *
     * @return an iterator; its <code>next()</code> may return <code>null</code>
     */
    public Iterator<ARCRecord> iterator() {
        return new ArcIterator();
    }

    /**
     * Convert a WARC date to the 14-digit ARC form, that is, from
     * <code>YYYY-MM-DDTHH:MM:SSZ</code> to <code>YYYYMMDDHHMMSS</code>.
     * Only the character positions are used; the separators themselves
     * are not validated.
     *
     * @param warcDate the date string taken from the WARC record header
     * @return the 14-character date string
     * @throws IOException if the date is missing or too short to slice
     */
    private static String toArcDate(String warcDate) throws IOException {
        if (warcDate == null || warcDate.length() < WARC_DATE_MIN_LENGTH) {
            // Fail with the offending value rather than an opaque
            // IndexOutOfBoundsException from the substring appends.
            throw new IOException("Malformed WARC date: " + warcDate);
        }

        return new StringBuilder(14)
                .append(warcDate, 0, 4)    // YYYY
                .append(warcDate, 5, 7)    // MM
                .append(warcDate, 8, 10)   // DD
                .append(warcDate, 11, 13)  // HH
                .append(warcDate, 14, 16)  // MM
                .append(warcDate, 17, 19)  // SS
                .toString();
    }

    /**
     * Iterator over the wrapped reader's records which passes
     * <code>ARCRecord</code>s through untouched and converts
     * <code>WARCRecord</code>s to <code>ARCRecord</code>s.
     */
    private class ArcIterator implements Iterator<ARCRecord> {
        /** Iterator of the wrapped <code>ArchiveReader</code>. */
        private final Iterator<ArchiveRecord> delegate;

        /**
         * Construct an <code>ArcIterator</code>, skipping the header
         * record if the wrapped reader is an <code>ARCReader</code>.
         */
        public ArcIterator() {
            this.delegate = ArcReader.this.reader.iterator();

            // The first record of an ARC file is a "filedesc://"
            // record describing the ARC file itself; skip it.
            if (ArcReader.this.reader instanceof ARCReader && this.delegate.hasNext()) {
                this.delegate.next();
            }
        }

        /**
         * Returns <code>true</code> if the iteration has more elements.
         * Will return <code>true</code> even if the value returned by the
         * next call to <code>next()</code> returns <code>null</code>.
         *
         * @return <code>true</code> if the iterator has more elements.
         */
        public boolean hasNext() {
            return this.delegate.hasNext();
        }

        /**
         * Returns the next element in the iteration. Calling this method
         * repeatedly until the <code>hasNext()</code> method returns
         * <code>false</code> will return each element in the underlying
         * collection exactly once.
         *
         * @return the next element in the iteration, which can be <code>null</code>
         * @throws RuntimeException if the record is neither an ARC nor a
         *         WARC record, or if converting a WARC record fails with
         *         an <code>IOException</code> (set as the cause)
         */
        public ARCRecord next() {
            ArchiveRecord record = this.delegate.next();

            if (record instanceof ARCRecord) {
                // Just return the ARCRecord as-is.
                return (ARCRecord) record;
            }

            if (record instanceof WARCRecord) {
                try {
                    return convert((WARCRecord) record);
                } catch (IOException ioe) {
                    // Iterator.next() cannot throw checked exceptions.
                    throw new RuntimeException(ioe);
                }
            }

            // If we get here then the record we read in was neither an ARC
            // nor a WARC record.  Guard against a null record so that the
            // message can be built without a NullPointerException.
            throw new RuntimeException(
                    "Record neither ARC nor WARC: " + (record == null ? null : record.getClass()));
        }

        /**
         * Unsupported optional operation.
         *
         * @throws UnsupportedOperationException always
         */
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /**
         * Convert a WARCRecord to an ARCRecord.  Only "response"
         * WARCRecords are converted to meaningful ARCRecords.  All other
         * WARCRecord types are converted to <code>null</code>.
         *
         * @param warc the WARCRecord to convert
         * @return the corresponding ARCRecord, <code>null</code> if WARCRecord not a "response" record
         * @throws IOException if the WARC date is malformed or the
         *         <code>ARCRecord</code> cannot be constructed
         */
        private ARCRecord convert(WARCRecord warc) throws IOException {
            ArchiveRecordHeader header = warc.getHeader();

            // We only care about "response" WARC records.
            if (!WARCConstants.RESPONSE.equals(header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE))) {
                return null;
            }

            // Construct an ARCRecordMetadata object based on the info in
            // the ArchiveRecordHeader.
            Map<String, Object> arcMetadataFields = new HashMap<String, Object>();
            arcMetadataFields.put(ARCConstants.URL_FIELD_KEY,
                    header.getHeaderValue(WARCConstants.HEADER_KEY_URI));
            arcMetadataFields.put(ARCConstants.IP_HEADER_FIELD_KEY,
                    header.getHeaderValue(WARCConstants.HEADER_KEY_IP));
            // We don't know the MIME type of the *payload* in a WARC (yet);
            // it is filled in from the HTTP headers further below.
            // NOTE(review): the lookup with a null key presumably yields null -- confirm.
            arcMetadataFields.put(ARCConstants.MIMETYPE_FIELD_KEY, header.getHeaderValue(null));
            arcMetadataFields.put(ARCConstants.LENGTH_FIELD_KEY,
                    header.getHeaderValue(WARCConstants.CONTENT_LENGTH));
            // FIXME: Do we need actual values for these?
            arcMetadataFields.put(ARCConstants.VERSION_FIELD_KEY, header.getHeaderValue(null));
            arcMetadataFields.put(ARCConstants.ABSOLUTE_OFFSET_KEY, header.getOffset());

            // Dates must be converted from WARC format to 14-digit format.
            String warcDate = (String) header.getHeaderValue(WARCConstants.HEADER_KEY_DATE);
            arcMetadataFields.put(ARCConstants.DATE_FIELD_KEY, toArcDate(warcDate));

            ARCRecordMetaData metadata = new ARCRecordMetaData(header.getReaderIdentifier(), arcMetadataFields);

            metadata.setDigest((String) header.getHeaderValue(WARCConstants.HEADER_KEY_PAYLOAD_DIGEST));

            // Then, create an ARCRecord using the WARCRecord and the
            // ARCRecordMetaData object we just created.
            ARCRecord arc = new ARCRecord(warc, metadata, 0, // offset
                    ArcReader.this.reader.isDigest(), ArcReader.this.reader.isStrict(), true // parse HTTP headers
            );

            // Now that we've created the ARCRecord, we get the HTTP headers
            // from it.  From these HTTP headers, we obtain the Content-Type
            // of the ARCRecord's payload, then set value as the MIME-type
            // of the ARCRecord itself.

            // If the response is something other than HTTP
            // (like DNS) there are no HTTP headers.
            if (arc.getHttpHeaders() != null) {
                for (Header h : arc.getHttpHeaders()) {
                    // HTTP header names are case-insensitive, so
                    // "content-type" must match as well.
                    if ("Content-Type".equalsIgnoreCase(h.getName())) {
                        arc.getMetaData().getHeaderFields().put(ARCConstants.MIMETYPE_FIELD_KEY, h.getValue());
                    }
                }
            }

            return arc;
        }

    }

    /**
     * Simple test/debug driver to read an archive file and print out
     * the header for each record.  Records for which the iterator
     * returns <code>null</code> are skipped.
     *
     * @param args a single element: the path or URL of the (w)arc file
     * @throws Exception if the archive cannot be opened or read
     */
    public static void main(String args[]) throws Exception {
        if (args.length != 1) {
            System.out.println("ArcReader <(w)arc file>");
            System.exit(1);
        }

        String arcName = args[0];

        ArchiveReader r = ArchiveReaderFactory.get(arcName);
        try {
            r.setDigest(true);

            ArcReader reader = new ArcReader(r);

            for (ARCRecord rec : reader) {
                // The iterator yields null for records that are not
                // converted, so check before touching the record.
                if (rec == null) {
                    continue;
                }

                // Must call close() for digest calculation to be finished.
                rec.close();

                System.out.println(rec.getHeader());
            }
        } finally {
            // Release the underlying archive file even if iteration fails.
            r.close();
        }
    }
}