org.archive.io.arc.ARCReader.java Source code

Introduction

Here is the source code for org.archive.io.arc.ARCReader.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.io.arc;

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;

/**
 * Get an iterator on an ARC file or get a record by absolute position.
 *
 * ARC files are described here:
 * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">Arc
 * File Format</a>.
 *
 * <p>This class knows how to parse an ARC file.  Pass it a file path
 * or an URL to an ARC. It can parse ARC Version 1 and 2.
 *
 * <p>Iterator returns <code>ARCRecord</code>
 * though {@link Iterator#next()} is returning
 * java.lang.Object.  Cast the return.
 *
 * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
 * latter slightly slower -- but not by much.  TODO: Test more.  Just
 * change {@link #getInputStream(File, long)}.
 *
 * @author stack
 * @version $Date$ $Revision$
 */
public abstract class ARCReader extends ArchiveReader implements ARCConstants, Closeable {
    private final Logger logger = Logger.getLogger(ARCReader.class.getName());

    /**
     * Set to true if we are aligned on first record of Archive file.
     * We used depend on offset. If offset was zero, then we were
     * aligned on first record.  This is no longer necessarily the case when
     * Reader is created at an offset into an Archive file: The offset is zero
     * but its relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;

    private boolean parseHttpHeaders = true;

    protected ARCReader() {
        super();
    }

    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.
     * @param record
     * @throws IOException
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }

        // Remove any trailing LINE_SEPARATOR
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // We've overread.  We're probably in next record.  There is
                    // no way of telling for sure. It may be dross at end of
                    // current record. Backup.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null) ? record.getHeader() : null;
                throw new IOException("Read " + (char) c + " when only " + LINE_SEPARATOR + " expected. "
                        + getReaderIdentifier() + ((h != null) ? h.getHeaderFields().toString() : ""));
            }
        }
    }

    /**
     * Create new arc record.
     *
     * Encapsulate housekeeping that has to do w/ creating a new record.
     *
     * <p>Call this method at end of constructor to read in the
     * arcfile header.  Will be problems reading subsequent arc records
     * if you don't since arcfile header has the list of metadata fields for
     * all records that follow.
     * 
     * <p>When parsing through ARCs writing out CDX info, we spend about
     * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
     * -- of which 16% is reading.
     *
     * @param is InputStream to use.
     * @param offset Absolute offset into arc file.
     * @return An arc record.
     * @throws IOException
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset) throws IOException {
        try {
            String version = super.getVersion();
            ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset, isDigest(), isStrict(),
                    isParseHttpHeaders(), isAlignedOnFirstRecord(), version);
            if (version != null && super.getVersion() == null)
                super.setVersion(version);
            currentRecord(record);
        } catch (IOException e) {
            if (e instanceof RecoverableIOException) {
                // Don't mess with RecoverableIOExceptions.  Let them out.
                throw e;
            }
            IOException newE = new IOException(e.getMessage() + " (Offset " + offset + ").");
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord) getCurrentRecord();
    }

    /**
     * Returns version of this ARC file.  Usually read from first record of ARC.
     * If we're reading without having first read the first record -- e.g.
     * random access into middle of an ARC -- then version will not have been
     * set.  For now, we return a default, version 1.1.  Later, if more than
     * just one version of ARC, we could look at such as the meta line to see
     * what version of ARC this is.
     * @return Version of this ARC file.
     */
    public String getVersion() {
        return (super.getVersion() == null) ? "1.1" : super.getVersion();
    }

    protected boolean isAlignedOnFirstRecord() {
        return alignedOnFirstRecord;
    }

    protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
        this.alignedOnFirstRecord = alignedOnFirstRecord;
    }

    /**
     * @return Returns the parseHttpHeaders.
     */
    public boolean isParseHttpHeaders() {
        return this.parseHttpHeaders;
    }

    /**
     * @param parse The parseHttpHeaders to set.
     */
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }

    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }

    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }

    protected boolean output(final String format) throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
            throw new IOException(format + " format only supported for single Records");
        }
        return result;
    }

    public boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (result) {
            return result;
        }
        if (format.equals(NOHEAD)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        } else if (format.equals(HEADER)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.dumpHttpHeader();
            result = true;
        }

        return result;
    }

    public void dump(final boolean compress) throws IOException, java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord) ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.
                ByteArrayOutputStream baos = new ByteArrayOutputStream(r.available());
                // This is slow but done only once at top of ARC.
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
                // Assume getArc returns full path to file.  ARCWriter
                // or new File will complain if it is otherwise.
                List<File> outDirs = new ArrayList<File>();
                WriterPoolSettingsData settings = new WriterPoolSettingsData("", "", -1L, compress, outDirs,
                        listOfMetadata);
                writer = new ARCWriter(new AtomicInteger(), System.out, new File(meta.getArc()), settings);
                continue;
            }

            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                    ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(), (int) meta.getLength(), r);
        }
        // System.out.println(System.currentTimeMillis() - start);
    }

    /**
     * @return an ArchiveReader that will delete a local file on close.  Used
     * when we bring Archive files local and need to clean up afterward.
     */
    public ARCReader getDeleteFileOnCloseReader(final File f) {
        final ARCReader d = this;
        return new ARCReader() {
            private final ARCReader delegate = d;
            private File archiveFile = f;

            public void close() throws IOException {
                this.delegate.close();
                if (this.archiveFile != null) {
                    if (archiveFile.exists()) {
                        archiveFile.delete();
                    }
                    this.archiveFile = null;
                }
            }

            public ArchiveRecord get(long o) throws IOException {
                return this.delegate.get(o);
            }

            public boolean isDigest() {
                return this.delegate.isDigest();
            }

            public boolean isStrict() {
                return this.delegate.isStrict();
            }

            public Iterator<ArchiveRecord> iterator() {
                return this.delegate.iterator();
            }

            public void setDigest(boolean d) {
                this.delegate.setDigest(d);
            }

            public void setStrict(boolean s) {
                this.delegate.setStrict(s);
            }

            public List<ArchiveRecordHeader> validate() throws IOException {
                return this.delegate.validate();
            }

            @Override
            public ArchiveRecord get() throws IOException {
                return this.delegate.get();
            }

            @Override
            public String getVersion() {
                return this.delegate.getVersion();
            }

            @Override
            public List<ArchiveRecordHeader> validate(int noRecords) throws IOException {
                return this.delegate.validate(noRecords);
            }

            @Override
            protected ARCRecord createArchiveRecord(InputStream is, long offset) throws IOException {
                return this.delegate.createArchiveRecord(is, offset);
            }

            @Override
            protected void gotoEOR(ArchiveRecord record) throws IOException {
                this.delegate.gotoEOR(record);
            }

            @Override
            public void dump(boolean compress) throws IOException, java.text.ParseException {
                this.delegate.dump(compress);
            }

            @Override
            public String getDotFileExtension() {
                return this.delegate.getDotFileExtension();
            }

            @Override
            public String getFileExtension() {
                return this.delegate.getFileExtension();
            }
        };
    }

    // Static methods follow.

    /**
     *
     * @param formatter Help formatter instance.
     * @param options Usage options.
     * @param exitCode Exit code.
     */
    private static void usage(HelpFormatter formatter, Options options, int exitCode) {
        formatter.printHelp("java org.archive.io.arc.ARCReader" + " [--digest=true|false] \\\n"
                + " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]"
                + " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL", options);
        System.exit(exitCode);
    }

    /**
     * Write out the arcfile.
     * 
     * @param reader
     * @param format Format to use outputting.
     * @throws IOException
     * @throws java.text.ParseException
     */
    protected static void output(ARCReader reader, String format) throws IOException, java.text.ParseException {
        if (!reader.output(format)) {
            throw new IOException("Unsupported format: " + format);
        }
    }

    /**
     * Generate a CDX index file for an ARC file.
     *
     * @param urlOrPath The ARC file to generate a CDX index for
     * @throws IOException
     * @throws java.text.ParseException
     */
    public static void createCDXIndexFile(String urlOrPath) throws IOException, java.text.ParseException {
        ARCReader r = ARCReaderFactory.get(urlOrPath);
        r.setStrict(false);
        r.setParseHttpHeaders(true);
        r.setDigest(true);
        output(r, CDX_FILE);
    }

    /**
     * Command-line interface to ARCReader.
     *
     * Here is the command-line interface:
     * <pre>
     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
     *  -h,--help      Prints this message and exits.
     *  -o,--offset    Outputs record at this offset into arc file.</pre>
     *
     * <p>See in <code>$HERITRIX_HOME/bin/arcreader</code> for a script that'll
     * take care of classpaths and the calling of ARCReader.
     *
     * <p>Outputs using a pseudo-CDX format as described here:
     * <a href="http://www.archive.org/web/researcher/cdx_legend.php">CDX
     * Legent</a> and here
     * <a href="http://www.archive.org/web/researcher/example_cdx.php">Example</a>.
     * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
     * Hash is hard-coded straight SHA-1 hash of content.
     *
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException
     * @throws java.text.ParseException
     */
    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws ParseException, IOException, java.text.ParseException {
        Options options = getOptions();
        options.addOption(new Option("p", "parse", false, "Parse headers."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List<String> cmdlineArgs = cmdline.getArgList();
        Option[] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch (cmdlineOptions[i].getId()) {
            case 'h':
                usage(formatter, options, 0);
                break;

            case 'o':
                offset = Long.parseLong(cmdlineOptions[i].getValue());
                break;

            case 's':
                strict = true;
                break;

            case 'p':
                parse = true;
                break;

            case 'd':
                digest = getTrueOrFalse(cmdlineOptions[i].getValue());
                break;

            case 'f':
                format = cmdlineOptions[i].getValue().toLowerCase();
                boolean match = false;
                // List of supported formats.
                final String[] supportedFormats = { CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE };
                for (int ii = 0; ii < supportedFormats.length; ii++) {
                    if (supportedFormats[ii].equals(format)) {
                        match = true;
                        break;
                    }
                }
                if (!match) {
                    usage(formatter, options, 1);
                }
                break;

            default:
                throw new RuntimeException("Unexpected option: " + +cmdlineOptions[i].getId());
            }
        }

        if (offset >= 0) {
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc = ARCReaderFactory.get((String) cmdlineArgs.get(0), offset);
            arc.setStrict(strict);
            // We must parse headers if we need to skip them.
            if (format.equals(NOHEAD) || format.equals(HEADER)) {
                parse = true;
            }
            arc.setParseHttpHeaders(parse);
            outputRecord(arc, format);
        } else {
            for (String urlOrPath : cmdlineArgs) {
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    r.setStrict(strict);
                    r.setParseHttpHeaders(parse);
                    r.setDigest(digest);
                    output(r, format);
                } catch (RuntimeException e) {
                    // Write out name of file we failed on to help with
                    // debugging.  Then print stack trace and try to keep
                    // going.  We do this for case where we're being fed
                    // a bunch of ARCs; just note the bad one and move
                    // on to the next.
                    System.err.println("Exception processing " + urlOrPath + ": " + e.getMessage());
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        }
    }
}