org.archive.jbs.arc.ArcReader.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.jbs.arc.ArcReader.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.archive.jbs.arc;

import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import java.io.IOException;
import java.io.InputStream;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;

import org.archive.io.arc.ARCConstants;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCRecord;

import org.apache.commons.httpclient.Header;

/**
 * Convenience wrapper around the (W)ARC readers which allows for
 * simple iteration through an (W)ARC file, returning a series of
 * ArchiveRecordProxy objects.
 *
 * This is not a general purpose (W)ARC reading class.  It is tailored
 * to the needs of jbs.Parse.
 */
public class ArcReader implements Iterable<ArchiveRecordProxy> {
    /**
     * Largest size limit this class will accept.  It is deliberately
     * 1024 bytes below <code>Integer.MAX_VALUE</code>; see
     * {@link #setSizeLimit(int)} for the reasoning.
     */
    private static final int MAX_SIZE_LIMIT = Integer.MAX_VALUE - 1024;

    /** The wrapped (W)ARC reader that supplies the raw records. */
    private final ArchiveReader reader;

    // NOTE: See the setSizeLimit() method for details.
    private int sizeLimit = MAX_SIZE_LIMIT;

    /**
     * Construct an <code>ArcReader</code> with the
     * given path and <code>InputStream</code>.  The path
     * is used to indicate ARC vs. WARC.
     *
     * @param path name of the archive, used by the factory to pick the format
     * @param is   stream the archive content is read from
     * @throws IOException if the underlying factory fails to open the archive
     */
    public ArcReader(String path, InputStream is) throws IOException {
        this.reader = ArchiveReaderFactory.get(path, is, true);
        this.reader.setDigest(true);

        // If we are reading arc files, then we have to explicitly enable
        // the parsing of the HTTP headers.  If we don't, then the call to
        // arc.skipHttpHeader() in the ArchiveRecordProxy will explode.
        //
        // 'instanceof' matches ARCReader and every subtype of it, so we do
        // not need to know which concrete ARCReader the factory returned.
        if (this.reader instanceof ARCReader) {
            ((ARCReader) this.reader).setParseHttpHeaders(true);
        }
    }

    /**
     * Construct an <code>ArcReader</code> wrapping an
     * <code>ArchiveReader</code> instance.  The reader is used as-is:
     * this constructor does not change any of its settings.
     *
     * @param reader the ArchiveReader instance to wrap
     */
    public ArcReader(ArchiveReader reader) {
        this.reader = reader;
    }

    /**
     * Limit the amount of bytes read from the archive record to prevent
     * exceeding the available heap size.  By default, the limit is
     * (just under) 2GB, which is the maximum size allowed.  The limit
     * is 2GB because we load the content into a byte[].
     *
     * Giving a negative value, or a value above the maximum, means the
     * same thing as "the maximum allowed value".
     *
     * FIXME: There seems to be a bug in the OpenJDK 7.0 (7.0_02-b13 at
     * least) where computing a SHA-1 digest over a byte buffer close to
     * Integer.MAX_VALUE in size triggers a core-dump in the JVM.
     * Experiments show that (Integer.MAX_VALUE-10) is the edge where
     * core dumps happen..anything larger than that value.
     *
     * So, even though MAX_VALUE-11 seems safe, we'll back it off a full
     * 1024 bytes, just in case.
     *
     * @param sizeLimit the requested limit in bytes
     */
    public void setSizeLimit(int sizeLimit) {
        if (sizeLimit < 0 || sizeLimit > MAX_SIZE_LIMIT) {
            this.sizeLimit = MAX_SIZE_LIMIT;
        } else {
            this.sizeLimit = sizeLimit;
        }
    }

    /**
     * Returns the size limit currently in effect.
     *
     * @return the limit in bytes, never negative and never above
     *         <code>Integer.MAX_VALUE - 1024</code>
     */
    public int getSizeLimit() {
        return this.sizeLimit;
    }

    /**
     * Returns an iterator over <code>ArchiveRecordProxy</code> objects,
     * which wrap the <code>WARCRecord</code>/<code>ARCRecord</code>
     * objects from the inner <code>ArchiveReader</code>.
     *
     * @return an iterator
     */
    @Override
    public Iterator<ArchiveRecordProxy> iterator() {
        return new ArchiveRecordProxyIterator();
    }

    /**
     * Iterator over ArchiveRecordProxy objects.
     */
    private class ArchiveRecordProxyIterator implements Iterator<ArchiveRecordProxy> {
        /** Iterator of the wrapped reader; every call is delegated to it. */
        private final Iterator<ArchiveRecord> i;

        /**
         * Construct a <code>ArchiveRecordProxyIterator</code> over the
         * records of the enclosing <code>ArcReader</code>'s reader.  No
         * records are skipped here.
         */
        public ArchiveRecordProxyIterator() {
            this.i = ArcReader.this.reader.iterator();
        }

        /**
         * Returns <code>true</code> if the iteration has more elements,
         * as reported by the wrapped reader's iterator.
         *
         * @return <code>true</code> if the iterator has more elements.
         */
        @Override
        public boolean hasNext() {
            return this.i.hasNext();
        }

        /**
         * Returns the next element in the iteration, wrapped in an
         * <code>ArchiveRecordProxy</code>.  The size limit of the
         * enclosing <code>ArcReader</code> is read at the time of the
         * call, so a later <code>setSizeLimit()</code> affects records
         * not yet returned.
         *
         * @return the next record, wrapped in a proxy
         * @throws RuntimeException if the record is neither an ARC nor a
         *         WARC record (including <code>null</code>), or if
         *         building the proxy fails with an <code>IOException</code>
         *         (which is then set as the cause)
         */
        @Override
        public ArchiveRecordProxy next() {
            ArchiveRecord record = this.i.next();

            try {
                if (record instanceof ARCRecord) {
                    return new ArchiveRecordProxy((ARCRecord) record, sizeLimit);
                }

                if (record instanceof WARCRecord) {
                    return new ArchiveRecordProxy((WARCRecord) record, sizeLimit);
                }
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }

            // If we get here then the record we read in was neither an
            // ARC or WARC record.  Guard against null so the caller gets
            // this message rather than a bare NullPointerException.
            String type = (record == null) ? "null" : record.getClass().toString();
            throw new RuntimeException("Record neither ARC nor WARC: " + type);
        }

        /**
         * Unsupported optional operation.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

    }

    /**
     * Simple test/debug driver to read an archive file and print out
     * the header for each record, one line per record with the fields
     * separated by single spaces.
     *
     * @param args exactly one argument: the path of the (w)arc file
     * @throws Exception if the archive cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.out.println("ArcReader <(w)arc file>");
            System.exit(1);
        }

        String arcName = args[0];

        ArchiveReader r = ArchiveReaderFactory.get(arcName);
        try {
            r.setDigest(true);

            ArcReader reader = new ArcReader(r);

            for (ArchiveRecordProxy rec : reader) {
                if (rec != null) {
                    System.out.print(rec.getWARCRecordType() + " ");
                    System.out.print(rec.getWARCContentType() + " ");
                    System.out.print(rec.getUrl() + " ");
                    System.out.print(rec.getDigest() + " ");
                    System.out.print(rec.getDate() + " ");
                    System.out.print(rec.getLength() + " ");
                    // Trailing space keeps the status code from running
                    // into the body length printed right after it.
                    System.out.print(rec.getHttpStatusCode() + " ");
                    System.out.print(rec.getHttpResponseBody() != null ? rec.getHttpResponseBody().length : 0);
                    System.out.println();
                }
            }
        } finally {
            // Release the underlying file/stream even if reading fails.
            r.close();
        }
    }

}