org.commoncrawl.util.StreamingArcFileReader.java Source code

Introduction

Here is the source code for org.commoncrawl.util.StreamingArcFileReader.java
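
For context, here is a minimal usage sketch (a hypothetical driver, not part of the CommonCrawl sources). The reader is a push/poll decoder: a producer pushes compressed ARC bytes in with available(ByteBuffer) and signals end-of-file with finished(), while a consumer polls hasMoreItems()/getNextItem(). The sketch is placed in the org.commoncrawl.util package because the TriStateResult enum is package-private; the testReader() method at the bottom of the listing shows the same flow with a separate consumer thread.

package org.commoncrawl.util;

import java.io.FileInputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;

import org.commoncrawl.protocol.shared.ArcFileItem;

public class ArcReaderSketch {

    public static void main(String[] args) throws Exception {
        // true => the stream begins with the leading ARC file header record
        StreamingArcFileReader reader = new StreamingArcFileReader(true);

        ReadableByteChannel channel = Channels.newChannel(new FileInputStream(args[0]));
        try {
            for (;;) {
                ByteBuffer buffer = ByteBuffer.allocate(32 * 1024);
                if (channel.read(buffer) == -1) {
                    // tell the decoder that no more compressed bytes are coming
                    reader.finished();
                    break;
                }
                buffer.flip();
                // hand the compressed bytes to the decoder (non-blocking)
                reader.available(buffer);
                // drain whatever items are now fully decodable
                drain(reader);
            }
            // drain anything that became decodable once EOS was signaled
            drain(reader);
        } finally {
            channel.close();
        }
    }

    private static void drain(StreamingArcFileReader reader) throws Exception {
        while (reader.hasMoreItems() == StreamingArcFileReader.TriStateResult.MoreItems) {
            ArcFileItem item = reader.getNextItem();
            if (item == null) {
                break; // the decoder needs more data to finish the current item
            }
            System.out.println(item.getUri() + " @" + item.getArcFilePos());
        }
    }
}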

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.BufferedReader;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.LinkedList;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.zip.CRC32;
import java.util.zip.CheckedInputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
import java.util.zip.ZipException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.record.Buffer;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.shared.Constants;
import org.commoncrawl.io.NIODataSink;
import org.commoncrawl.protocol.shared.ArcFileHeaderItem;
import org.commoncrawl.protocol.shared.ArcFileItem;

/**
 * 
 * StreamingArcFileReader - decoder capable of extracting ArcFileItem(s) from an ARC file in a non-blocking, streaming 
 * manner.
 * 
 * @author rana
 *
 */
public final class StreamingArcFileReader implements NIODataSink {

    //////////////////////////////////////////////////////////////////////////////////
    // data members 
    //////////////////////////////////////////////////////////////////////////////////

    /** logging **/
    private static final Log LOG = LogFactory.getLog(StreamingArcFileReader.class);

    /** max expected arc header size **/
    private static final int MAX_ARCHEADER_SIZE = 4096;

    /** block size used for various operations **/
    private static final int BLOCK_SIZE = 32 * 1024;

    /** internal ByteBuffer wrapper for queuing byte buffers **/
    private static final class BufferItem {

        public BufferItem(ByteBuffer bufferItem) {
            _buffer = bufferItem;
        }

        public ByteBuffer _buffer;
    }

    /** blocking consumer queue **/
    private LinkedBlockingQueue<BufferItem> _consumerQueue = new LinkedBlockingQueue<BufferItem>();
    /** current data available */
    private int _bytesAvailable = 0;

    /** 32 bit crc  **/
    private CRC32 _crc = new CRC32();
    /** End Of Stream Indicator **/
    private boolean _eosReached = false;
    /** arc file header accumulator **/
    private byte[] _arcFileHeader = new byte[MAX_ARCHEADER_SIZE];
    /** arc file header size **/
    private int _arcFileHeaderSize = 0;
    /** input streams **/
    private InputStream _rawInput = null;
    private CheckedInputStream _checkedInput = null;
    /** content bytes read counter **/
    private int _contentBytesRead = 0;
    /** inflater object **/
    private Inflater _inflater = new Inflater(true);
    /** the active input buffer **/
    private ByteBuffer _activeInputBuffer = null;
    /** flag indicating that this arc file has a header item **/
    private boolean _hasHeaderItem = true;

    long _streamPos = 0;
    long _arcFileStartOffset;

    private final static int FixedHeaderBytes = 2 + 1 + 1 + 6;
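    // = 10 bytes: 2 (GZIP magic) + 1 (compression method) + 1 (flags) + 6 (MTIME, XFL, OS), per RFC 1952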

    enum ReadState {
        ReadingArcHeader, ReadingArcHeaderData, ReadingArcHeaderTrailer, ReadingEntryHeader, ReadingEntryData, ReadingEntryTrailer, Done
    }
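
    // The states above are traversed as follows: ReadingArcHeader -> ReadingArcHeaderData ->
    // ReadingArcHeaderTrailer consume the leading ARC header record (when present); each ARC
    // entry then cycles ReadingEntryHeader -> ReadingEntryData -> ReadingEntryTrailer (one
    // GZIP member per entry) until finished() has been called and buffered data runs out (Done).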

    enum HeaderReadState {
        ReadingFixedHeader, ReadingFlagValues
    }

    ReadState _readState = ReadState.ReadingArcHeader;
    ArcFileBuilder _builder = null;
    HeaderReadState _headerReadState = HeaderReadState.ReadingFixedHeader;
    int _headerFlags = 0;
    int _headerExtraBytes = -1;

    //////////////////////////////////////////////////////////////////////////////////
    // public API
    //////////////////////////////////////////////////////////////////////////////////

    /** 
     * Constructs a new StreamingArcFileReader object
     * 
     * @param hasArcFileHeader true if the stream begins with the leading ARC file header record
     */

    public StreamingArcFileReader(boolean hasArcFileHeader) {

        // setup the proper stream...
        _rawInput = new InputStream() {

            byte oneByteArray[] = new byte[1];

            @Override
            public int available() throws IOException {
                // _bytesAvailable is guarded by the outer reader's monitor, not this stream's
                synchronized (StreamingArcFileReader.this) {
                    return _bytesAvailable;
                }
            }

            @Override
            public int read() throws IOException {
                // delegate to the bulk read, which already advances _streamPos;
                // anything other than exactly one byte read is reported as EOF
                // (callers are expected to check available() first)
                if (read(oneByteArray, 0, 1) == 1) {
                    return oneByteArray[0] & 0xff;
                }
                return -1;
            }

            @Override
            public int read(byte b[], int off, int len) throws IOException {
                if (_activeInputBuffer == null || _activeInputBuffer.remaining() == 0) {

                    _activeInputBuffer = null;

                    BufferItem nextItem = null;

                    try {
                        if (_consumerQueue.size() != 0) {
                            nextItem = _consumerQueue.take();
                        }
                    } catch (InterruptedException e) {
                        // preserve the interrupt status instead of silently swallowing it
                        Thread.currentThread().interrupt();
                    }

                    if (nextItem != null) {
                        if (nextItem._buffer == null) {
                            return -1;
                        } else {
                            _activeInputBuffer = nextItem._buffer;
                        }
                    }
                }

                if (_activeInputBuffer != null && _activeInputBuffer.remaining() != 0) {

                    final int sizeAvailable = _activeInputBuffer.remaining();
                    final int sizeToRead = Math.min(sizeAvailable, len);

                    _activeInputBuffer.get(b, off, sizeToRead);

                    _streamPos += sizeToRead;

                    synchronized (StreamingArcFileReader.this) {
                        _bytesAvailable -= sizeToRead;
                    }

                    return sizeToRead;
                } else {
                    return 0;
                }
            }
        };

        _checkedInput = new CheckedInputStream(_rawInput, _crc);
        if (!hasArcFileHeader) {
            _readState = ReadState.ReadingEntryHeader;
        }
    }

    /**
     * Reset all internal variables and get the Reader ready to process a new ArcFile
     */
    public void resetState() {
        _readState = ReadState.ReadingArcHeader;
        _builder = null;
        _headerReadState = HeaderReadState.ReadingFixedHeader;
        _headerFlags = 0;
        _headerExtraBytes = -1;
        _activeInputBuffer = null;
        _consumerQueue.clear();
        _crc.reset();
        _eosReached = false;
        _bytesAvailable = 0;
        resetInflater();
    }

    /** indicate whether this arc file has a header item **/
    public void setArcFileHasHeaderItemFlag(boolean value) {
        _hasHeaderItem = value;
    }

    enum TriStateResult {
        NeedsMoreData, MoreItems, NoMoreItems,
    }
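
    // Polling contract: on NeedsMoreData the caller is expected to push more compressed
    // bytes via available(ByteBuffer), or to call finished() at end-of-file, and then poll again.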

    /**
     * Checks to see if additional ArcFileItems can be extracted from the current ARC File Stream
     * NON-BLOCKING version. 
     * @return TriStateResult.MoreItems if more items can be decoded from the stream,
     *              TriStateResult.NoMoreItems if we have reached the end of this stream,
     *              TriStateResult.NeedsMoreData if the decoder needs more data to determine next valid state 
     * @throws IOException if an error occurs processing ARC file data
     */
    public synchronized TriStateResult hasMoreItems() throws IOException {
        // if data is still queued in the buffer ... 
        if (_bytesAvailable != 0) {
            // then this means we have more items to process ... 
            return TriStateResult.MoreItems;
        }
        // otherwise, if the end-of-stream indicator is set ... 
        if (_eosReached) {
            // set the appropriate state 
            _readState = ReadState.Done;
            // and return no more items 
            return TriStateResult.NoMoreItems;
        }
        return TriStateResult.NeedsMoreData;
    }

    /**
     * Attempts to inflate and read the next ArcFileItem from the bytes available - NON-BLOCKING version
     * 
     * @return Fully constructed ArcFileItem or NULL if not enough data is available to service the request
     * @throws EOFException if end of stream is reached decoding item, or generic IOException if a corrupt stream is detected
     */
    public ArcFileItem getNextItem() throws IOException {

        // check state ... 
        if (_readState.ordinal() <= ReadState.ReadingArcHeaderTrailer.ordinal()) {
            if (_hasHeaderItem) {
                if (readARCHeader()) {
                    _crc.reset();
                    _readState = ReadState.ReadingEntryHeader;
                }
            } else {
                // skip arc header
                _readState = ReadState.ReadingEntryHeader;
            }
        }

        // if reading header for entry 
        if (_readState == ReadState.ReadingEntryHeader) {
            if (readHeader()) {
                _readState = ReadState.ReadingEntryData;
                // reset crc accumulator 
                _crc.reset();
                // and allocate a fresh builder object ..
                _builder = new ArcFileBuilder();
            }
        }

        // if reading data for entry ... 
        if (_readState == ReadState.ReadingEntryData) {

            // read header line buffer
            for (;;) {

                byte scanBuffer[] = new byte[BLOCK_SIZE];
                ByteBuffer byteBuffer = ByteBuffer.wrap(scanBuffer);

                // read up to scan buffer size of data ... 
                int readAmount = readInflatedBytes(scanBuffer, 0, scanBuffer.length);

                // if we did not read any bytes ... return immediately ... 
                if (readAmount == 0) {
                    return null;
                } else if (readAmount != -1) {
                    // update crc value ... 
                    _crc.update(scanBuffer, 0, readAmount);
                    // update content bytes read 
                    _contentBytesRead += readAmount;
                    // and setup buffer pointers ... 
                    byteBuffer.position(0);
                    byteBuffer.limit(readAmount);
                    // and input data into builder ... 
                    _builder.inputData(byteBuffer);
                }
                // -1 indicates eos 
                else {
                    // reset inflater ... 
                    resetInflater();
                    // and transition to reading trailing bytes 
                    _readState = ReadState.ReadingEntryTrailer;
                    break;
                }
            }
        }

        if (_readState == ReadState.ReadingEntryTrailer) {
            // validate crc and header length ... 
            if (readTrailer()) {

                // transition to assumed state ... 
                _readState = ReadState.ReadingEntryHeader;

                // get the arc file item 
                ArcFileItem itemOut = _builder.finish();

                itemOut.setArcFilePos((int) _arcFileStartOffset);

                // reset builder 
                _builder = null;
                //reset crc
                _crc.reset();

                // if no more data coming down the pipe...
                if (_rawInput.available() == 0 && _eosReached) {
                    // transition to done state ...
                    _readState = ReadState.Done;
                }
                return itemOut;
            }
        }
        return null;
    }

    /**
     * NIODataSink method - called by the data source when all ARC File data has been exhausted
     * 
     */
    public void finished() {
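        // a BufferItem wrapping a null ByteBuffer serves as the end-of-stream sentinel
        // that both the raw input stream and readInflatedBytes() test for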
        _consumerQueue.add(new BufferItem(null));
        _eosReached = true;
    }

    /**
     * NIODataSink method - called by the data source to queue up compressed ARC File data for processing 
     */
    public void available(ByteBuffer availableReadBuffer) {
        _consumerQueue.add(new BufferItem(availableReadBuffer));

        synchronized (this) {
            _bytesAvailable += availableReadBuffer.remaining();
        }
    }

    //////////////////////////////////////////////////////////////////////////////////
    // internal helpers 
    //////////////////////////////////////////////////////////////////////////////////

    private void resetInflater() {
        _inflater.reset();
    }

    private int readInflatedBytes(byte[] b, int off, int len) throws IOException {
        if (b == null) {
            throw new NullPointerException();
        } else if (off < 0 || len < 0 || len > b.length - off) {
            throw new IndexOutOfBoundsException();
        } else if (len == 0) {
            return 0;
        }
        try {
            //try to output some bytes from the inflater 
            int n;
            while ((n = _inflater.inflate(b, off, len)) == 0) {
                if (_inflater.finished() || _inflater.needsDictionary()) {
                    // these are EOS conditions 

                    //first reclaim any remaining data from the inflater ... 
                    if (_inflater.getRemaining() != 0) {
                        if (_activeInputBuffer == null) {
                            throw new RuntimeException("Bad State");
                        } else {
                            // increment bytes available ... 
                            synchronized (this) {
                                _bytesAvailable += _inflater.getRemaining();
                                _streamPos -= _inflater.getRemaining();
                            }
                            // and reposition cursor ...
                            _activeInputBuffer.position(_activeInputBuffer.position() - _inflater.getRemaining());
                        }
                    }
                    // signal end-of-stream for this GZIP member
                    return -1;
                }
                // we produced no output .. check to see if have more input to add 
                if (_inflater.needsInput()) {
                    if (_activeInputBuffer == null || _activeInputBuffer.remaining() == 0) {

                        _activeInputBuffer = null;

                        if (_consumerQueue.size() != 0) {
                            BufferItem nextItem = null;
                            try {
                                nextItem = _consumerQueue.take();
                            } catch (InterruptedException e) {
                                LOG.error(StringUtils.stringifyException(e));
                                Thread.currentThread().interrupt();
                            }
                            // guard against a null item if take() was interrupted
                            if (nextItem != null) {
                                if (nextItem._buffer == null) {
                                    throw new EOFException();
                                } else {
                                    _activeInputBuffer = nextItem._buffer;
                                }
                            }
                        }
                    }
                    if (_activeInputBuffer == null) {
                        return 0;
                    } else {
                        // feed the buffer to the inflater ...
                        _inflater.setInput(_activeInputBuffer.array(), _activeInputBuffer.position(),
                                _activeInputBuffer.remaining());
                        // decrement bytes available ... 
                        synchronized (this) {
                            _bytesAvailable -= _activeInputBuffer.remaining();
                            _streamPos += _activeInputBuffer.remaining();
                        }
                        // and advance its position so 
                        _activeInputBuffer.position(_activeInputBuffer.position() + _activeInputBuffer.remaining());
                    }
                }
            }

            return n;
        } catch (DataFormatException e) {
            String s = e.getMessage();
            throw new ZipException(s != null ? s : "Invalid ZLIB data format");
        }
    }

    private boolean readARCHeader() throws IOException {

        if (_readState == ReadState.ReadingArcHeader) {

            if (readHeader()) {
                LOG.info("*** Found Fixed Header. Reading Metadata");
                // reset crc here... 
                _crc.reset();
                // and transition to reading data state ..
                _readState = ReadState.ReadingArcHeaderData;
            }
        }

        if (_readState == ReadState.ReadingArcHeaderData) {

            int readAmount = 0;

            while ((readAmount = readInflatedBytes(_arcFileHeader, _arcFileHeaderSize,
                    _arcFileHeader.length - _arcFileHeaderSize)) > 0) {
                LOG.info("*** Read:" + readAmount + " Metadata Bytes");
                // update crc ... 
                _crc.update(_arcFileHeader, _arcFileHeaderSize, readAmount);
                // increment content bytes read ... 
                _contentBytesRead += readAmount;
                // and update length ...
                _arcFileHeaderSize += readAmount;
            }

            if (_arcFileHeaderSize == MAX_ARCHEADER_SIZE) {
                throw new IOException(
                        "Invalid ARC File Header. Exceeded Arc File Header Size:" + _arcFileHeaderSize);
            } else if (readAmount == -1) {

                LOG.info("*** ARC File Header Size is:" + _arcFileHeaderSize);
                // reached eos ...
                // reset inflater
                resetInflater();
                // go to next state
                _readState = ReadState.ReadingArcHeaderTrailer;
            }
        }

        if (_readState == ReadState.ReadingArcHeaderTrailer) {
            // read trailing bytes in gzip stream ...   
            if (readTrailer()) {
                return true;
            }
        }
        return false;
    }

    /**
     * GZIP Code derived from GZIPInputStream code
     */

    // GZIP header magic number.
    public final static int GZIP_MAGIC = 0x8b1f;

    /*
     * File header flags.
     */
    private final static int FHCRC = 2; // Header CRC
    private final static int FEXTRA = 4; // Extra field
    private final static int FNAME = 8; // File name
    private final static int FCOMMENT = 16; // File comment

    /*
     * Reads GZIP member header.
     */
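    //
    // Per RFC 1952, a member header is: 2-byte magic, 1-byte compression method (8 = deflate),
    // 1-byte flags, 4-byte MTIME, 1-byte XFL and 1-byte OS, followed by the optional EXTRA,
    // NAME, COMMENT and header-CRC fields selected by the flag bits defined above. This method
    // is resumable: it returns false until enough bytes are buffered to finish the header.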
    private boolean readHeader() throws IOException {

        if (_rawInput.available() == 0 && _eosReached) {
            throw new EOFException();
        }

        switch (_headerReadState) {

        case ReadingFixedHeader: {

            if (_rawInput.available() >= FixedHeaderBytes) {

                _arcFileStartOffset = _streamPos;

                // reset crc accumulator first ...
                _crc.reset();
                // reset content bytes read counter ..
                _contentBytesRead = 0;

                // Check header magic
                if (readUShort(_checkedInput) != GZIP_MAGIC) {
                    throw new IOException("Not in GZIP format");
                }
                // Check compression method
                if (readUByte(_checkedInput) != 8) {
                    throw new IOException("Unsupported compression method");
                }
                // Read flags
                _headerFlags = readUByte(_checkedInput);
                // Skip MTIME, XFL, and OS fields
                skipBytes(_checkedInput, 6);

                _headerReadState = HeaderReadState.ReadingFlagValues;
            } else {
                break;
            }
        }

        case ReadingFlagValues: {

            boolean advanceToNext = true;

            // Skip optional extra field
            if ((_headerFlags & FEXTRA) == FEXTRA) {
                advanceToNext = false;
                if (_headerExtraBytes == -1) {
                    if (_checkedInput.available() >= 2) {
                        _headerExtraBytes = readUShort(_checkedInput);
                    }
                }
                if (_headerExtraBytes != -1) {
                    if (_checkedInput.available() >= _headerExtraBytes) {
                        // skip the requisite bytes 
                        skipBytes(_checkedInput, _headerExtraBytes);
                        // mask out current flag value ... 
                        _headerFlags &= ~FEXTRA;
                        // set advanceToNext flag
                        advanceToNext = true;
                    }
                }
            }

            while (advanceToNext && (_headerFlags & (FNAME | FCOMMENT)) != 0) {

                int activeFlag = FCOMMENT;

                if ((_headerFlags & FNAME) == FNAME)
                    activeFlag = FNAME;

                advanceToNext = false;

                while (_checkedInput.available() != 0) {
                    // keep scanning for the null terminator; stop once it is found so we do
                    // not consume bytes that belong to the next field or the deflate data
                    if (readUByte(_checkedInput) == 0) {
                        _headerFlags &= ~activeFlag;
                        advanceToNext = true;
                        break;
                    }
                }
            }

            if (advanceToNext && (_headerFlags & FHCRC) == FHCRC) {
                if (_checkedInput.available() >= 2) {
                    int v = (int) _crc.getValue() & 0xffff;
                    if (readUShort(_checkedInput) != v) {
                        throw new IOException("Corrupt GZIP header");
                    }
                    _headerFlags &= ~FHCRC;
                }
            }

            if (_headerFlags == 0 && _headerReadState == HeaderReadState.ReadingFlagValues) {
                //reset header state variables... 
                _headerReadState = HeaderReadState.ReadingFixedHeader;
                _headerFlags = 0;
                _headerExtraBytes = -1;
                return true;
            }
        }
            break;
        }
        return false;
    }

    private final static int ZIPTrailerBytes = 8;

    /*
     * Reads GZIP member trailer.
     */
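    //
    // Per RFC 1952, the trailer is the CRC32 of the uncompressed data followed by ISIZE,
    // the uncompressed length modulo 2^32, each stored as a 4-byte little-endian value.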
    private boolean readTrailer() throws IOException {

        if (_rawInput.available() >= ZIPTrailerBytes) {
            // Uses left-to-right evaluation order
            if ((readUInt(_rawInput) != _crc.getValue()) ||
            // rfc1952; ISIZE is the input size modulo 2^32
                    (readUInt(_rawInput) != _contentBytesRead))
                throw new IOException("Corrupt GZIP trailer");

            return true;

        }

        return false;
    }

    /*
     * Reads unsigned integer in Intel byte order.
     */
    private static long readUInt(InputStream in) throws IOException {
        long s = readUShort(in);
        return ((long) readUShort(in) << 16) | s;
    }

    /*
     * Reads unsigned short in Intel byte order.
     */
    private static int readUShort(InputStream in) throws IOException {
        int b = readUByte(in);
        return ((int) readUByte(in) << 8) | b;
    }

    /*
     * Reads unsigned byte.
     */
    private static int readUByte(InputStream in) throws IOException {
        int b = in.read();
        if (b == -1) {
            throw new EOFException();
        }
        if (b < -1 || b > 255) {
            // defensive range check inherited from the JDK GZIPInputStream implementation
            throw new IOException("read() returned value out of range -1..255: " + b);
        }
        return b;
    }

    private byte[] tmpbuf = new byte[128];

    /*
     * Skips bytes of input data blocking until all bytes are skipped.
     * Does not assume that the input stream is capable of seeking.
     */
    private void skipBytes(InputStream in, int n) throws IOException {
        while (n > 0) {
            int len = in.read(tmpbuf, 0, n < tmpbuf.length ? n : tmpbuf.length);
            if (len == -1) {
                throw new EOFException();
            }
            n -= len;
        }
    }

    /**
     * 
     * ArcFileBuilder helper class - used to construct ArcFileItem objects from an ARC File Entry in a stateful manner
     *
     */
    private static class ArcFileBuilder {
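
        // Once inflated, an ARC entry wrapping an HTTP response looks roughly like this
        // (hypothetical example):
        //
        //   http://example.com/ 192.0.2.1 20080101120000 text/html 1234
        //   HTTP/1.1 200 OK
        //   Content-Type: text/html
        //
        //   <html>...</html>
        //
        // i.e. an LF-terminated metadata line, CRLF/CRLF-terminated HTTP headers, and then the
        // raw content bytes; the builder states below track exactly those three parts.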

        //various states of processing an ARC FILE
        private enum State {
            LookingForMetadata, LookingForHeaderTerminator, ReadingContent
        }

        // ARC FILE HEADER TIMESTAMP FORMAT 
        SimpleDateFormat TIMESTAMP14 = new SimpleDateFormat("yyyyMMddHHmmss");
        // ArcFileItem this builder returns
        ArcFileItem _item = new ArcFileItem();
        // underlying content buffer 
        Buffer _content = new Buffer();
        // Builder State
        State _state = State.LookingForMetadata;
        // Queued Input State
        LinkedList<ByteBuffer> _buffers = new LinkedList<ByteBuffer>();
        // Active Input Buffer 
        ByteBuffer _activeBuffer = null;
        // Pattern Buffer - for capturing termination patterns
        byte patternBuffer[] = new byte[4];
        // Captured Pattern Length
        int patternSize = 0;
        // End Of Stream Indicator
        boolean eos = false;
        // Charsets used during decoding process
        static Charset UTF8_Charset = Charset.forName("UTF8");
        static Charset ASCII_Charset = Charset.forName("ASCII");

        /** check for terminator pattern **/
        private final boolean checkForTerminator() {

            boolean terminatorFound = false;

            switch (_state) {
            // metadata line is terminated by a single line-feed
            case LookingForMetadata: {
                if (patternBuffer[0] == '\n') {
                    terminatorFound = true;
                }
            }
                break;

            // http headers are terminated by the standard crlf-crlf pattern
            case LookingForHeaderTerminator: {
                if (patternSize == 4 && patternBuffer[0] == '\r' && patternBuffer[1] == '\n'
                        && patternBuffer[2] == '\r' && patternBuffer[3] == '\n') {
                    terminatorFound = true;
                }
            }
                break;
            }

            if (terminatorFound) {

                // if active buffer contains no more characters... 
                if (_activeBuffer.remaining() == 0) {
                    // add entire active buffer to input state 
                    _activeBuffer.rewind();
                    _buffers.addLast(_activeBuffer);
                    _activeBuffer = null;
                } else {
                    // otherwise, slice buffer at current position, and
                    // add one buffer to input state, and make the other current
                    ByteBuffer oldBuffer = _activeBuffer;
                    _activeBuffer = _activeBuffer.slice();
                    oldBuffer.limit(oldBuffer.position());
                    oldBuffer.rewind();
                    _buffers.addLast(oldBuffer);
                }
            }
            return terminatorFound;
        }

        /**
         * newInputStream
         * 
         * @param buf - ByteBuffer to wrap as an InputStream
         * @return InputStream - wrapped InputStream object 
         */
        private static InputStream newInputStream(final ByteBuffer buf) {
            return new InputStream() {
                public synchronized int read() throws IOException {
                    if (!buf.hasRemaining()) {
                        return -1;
                    }
                    return buf.get() & 0xff; // mask: a raw byte >= 0x80 must not be misread as EOF
                }

                public synchronized int read(byte[] bytes, int off, int len) throws IOException {
                    if (!buf.hasRemaining()) {
                        return -1;
                    }
                    // read only what's left
                    len = Math.min(len, buf.remaining());
                    buf.get(bytes, off, len);
                    return len;
                }
            };
        }

        /** construct a reader given a list of ByteBuffers **/
        private static InputStreamReader readerFromScanBufferList(LinkedList<ByteBuffer> buffers, Charset charset)
                throws IOException {
            Vector<InputStream> inputStreams = new Vector<InputStream>();

            for (ByteBuffer buffer : buffers) {
                inputStreams.add(newInputStream(buffer));
            }
            buffers.clear();

            SequenceInputStream seqInputStream = new SequenceInputStream(inputStreams.elements());

            return new InputStreamReader(seqInputStream, charset);
        }

        /** construct a single line from the current input state **/
        private final String readLine(Charset charset) throws IOException {

            BufferedReader reader = new BufferedReader(readerFromScanBufferList(_buffers, charset));

            return reader.readLine();
        }

        /** process the metadata line of an ARC File Entry **/
        private final void processMetadataLine(String metadata) throws IOException {
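            // the five space-delimited fields are: URI, host IP, 14-digit timestamp
            // (yyyyMMddHHmmss), mime type and record length, e.g. (hypothetical):
            //   http://example.com/ 192.0.2.1 20080101120000 text/html 1234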

            //LOG.info("Metadata line is:" + metadata);
            StringTokenizer tokenizer = new StringTokenizer(metadata, " ");
            int tokenCount = 0;
            while (tokenizer.hasMoreElements() && tokenCount <= 5) {
                switch (++tokenCount) {

                // URI
                case 1: {
                    _item.setUri(tokenizer.nextToken());
                }
                    break;

                // Host IP Address
                case 2: {
                    _item.setHostIP(tokenizer.nextToken());
                }
                    break;

                // Timestamp
                case 3: {
                    String timestamp = tokenizer.nextToken();
                    try {
                        _item.setTimestamp(TIMESTAMP14.parse(timestamp).getTime());
                    } catch (ParseException e) {
                        LOG.error(StringUtils.stringifyException(e));
                        throw new IOException("Invalid Timestamp in Metadata");
                    } catch (NumberFormatException e) {
                        LOG.error("Number Format Exception Parsing Metadata Line:" + metadata + " TimeStamp:"
                                + timestamp);
                        throw e;
                    }
                }
                    break;

                // MimeType
                case 4: {
                    _item.setMimeType(tokenizer.nextToken());
                }
                    break;

                // and Record Length
                case 5: {
                    _item.setRecordLength(Integer.parseInt(tokenizer.nextToken()));
                }
                    break;
                }
            }
        }

        /** extract http headers from the current input state **/
        private final void processHeaders() throws IOException {

            BufferedReader reader = new BufferedReader(
                    readerFromScanBufferList(_buffers, ArcFileBuilder.UTF8_Charset));

            String line = null;

            _item.setFieldDirty(ArcFileItem.Field_HEADERITEMS);

            while ((line = reader.readLine()) != null) {
                if (line.length() != 0) {
                    int colonPos = line.indexOf(':');

                    ArcFileHeaderItem item = new ArcFileHeaderItem();

                    if (colonPos != -1 && colonPos != line.length() - 1) {

                        item.setItemKey(line.substring(0, colonPos));
                        item.setItemValue(line.substring(colonPos + 1));

                        // if this is our special truncation flag ... 
                        if (item.getItemKey().equals(Constants.ARCFileHeader_ContentTruncated)) {
                            String parts[] = item.getItemValue().split(",");
                            for (String part : parts) {
                                if (part.equals(ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInDownload))) {
                                    _item.setFlags(_item.getFlags() | ArcFileItem.Flags.TruncatedInDownload);
                                } else if (part
                                        .equals(ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInInflate))) {
                                    _item.setFlags(_item.getFlags() | ArcFileItem.Flags.TruncatedInInflate);
                                }
                            }
                        }
                    } else {
                        item.setItemValue(line);
                    }
                    _item.getHeaderItems().add(item);
                }
            }
        }

        /** transition from the current input state to the next  input state **/
        private final void transitionState() throws IOException {

            switch (_state) {

            case LookingForMetadata: {
                String metadataline = readLine(ASCII_Charset);
                try {
                    // parse the space-delimited ASCII metadata line
                    processMetadataLine(metadataline);
                } catch (NumberFormatException e) {
                    LOG.error("Error Parsing Metadata Line:" + metadataline + " Length:" + metadataline.length());
                    throw e;
                }
                // and advance to next state ... 
                _state = ArcFileBuilder.State.LookingForHeaderTerminator;
            }
                break;
            case LookingForHeaderTerminator: {
                // found header terminator
                processHeaders();
                // and advance to next state ...
                _state = ArcFileBuilder.State.ReadingContent;
                // and set up arc file item for read ... 
                _content.setCapacity(BLOCK_SIZE);
            }
                break;
            }
        }

        /** inform the builder that input for the current item has been exhausted
         * 
         * @return ArcFileItem - the fully constructed ArcFileItem object if construction was successful
         * @throws IOException - if building fails 
         */
        public final ArcFileItem finish() throws IOException {
            if (_state == State.ReadingContent && _content.getCount() != 0) {
                _item.setContent(_content);
                _content = new Buffer();
                return _item;
            } else {
                throw new IOException("Incomplete ARC File Data Stream");
            }
        }

        /**
         * Input Data into the builder 
         * 
         * @param buffer - a piece of uncompressed content 
         * @throws IOException - throws exception if building fails 
         */
        public final void inputData(ByteBuffer buffer) throws IOException {

            // set the buffer as the active buffer ... 
            _activeBuffer = buffer;

            // scan looking for terminator 
            while (_activeBuffer != null && _activeBuffer.remaining() != 0) {

                // if not reading content then 
                if (_state != ArcFileBuilder.State.ReadingContent) {

                    // read a byte at a time ...
                    byte b = _activeBuffer.get();

                    // and if the byte is a delimiter ... 
                    if (b == '\r' || b == '\n') {

                        // add it to our pattern buffer, sliding the window so a run of
                        // delimiters longer than the buffer cannot overflow it
                        if (patternSize == patternBuffer.length) {
                            System.arraycopy(patternBuffer, 1, patternBuffer, 0, patternBuffer.length - 1);
                            patternSize--;
                        }
                        patternBuffer[patternSize++] = b;

                        // and check for pattern match (terminator match)
                        if (checkForTerminator()) {
                            transitionState();
                        }
                    }
                    // otherwise reset pattern buffer 
                    else {
                        patternSize = 0;
                    }
                } else {
                    // calculate available storage in buffer ... 
                    int available = _content.getCapacity() - _content.getCount();
                    // if we need more room ... 
                    if (available < _activeBuffer.remaining()) {
                        // figure out how much to grow buffer by ... 
                        int growByAmount = Math.max(_activeBuffer.remaining() - available, BLOCK_SIZE * 2);
                        // and grow the buffer ... 
                        _content.setCapacity(_content.getCapacity() + growByAmount);
                    }
                    // copy the buffer data in one go ...
                    _content.append(_activeBuffer.array(), _activeBuffer.position() + _activeBuffer.arrayOffset(),
                            _activeBuffer.remaining());
                    _activeBuffer = null;
                }
            }
            // now if we reached the end of the buffer while scanning for a token ... 
            if (_activeBuffer != null) {
                // add entire buffer to buffer list ... 
                _activeBuffer.rewind();
                _buffers.add(_activeBuffer);
                _activeBuffer = null;
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////////////
    // test routines
    //////////////////////////////////////////////////////////////////////////////////

    public void testReader(File arcFileItem) throws Exception {

        resetState();

        Thread thread = new Thread(new Runnable() {

            public void run() {
                try {

                    TriStateResult result;

                    while ((result = hasMoreItems()) != TriStateResult.NoMoreItems) {

                        if (result == TriStateResult.MoreItems) {

                            ArcFileItem item = null;

                            while ((item = getNextItem()) == null) {
                                LOG.info("Waiting to Read Next Item...");
                                try {
                                    Thread.sleep(1000);
                                } catch (InterruptedException e) {
                                }
                            }

                            LOG.info("GOT Item URL:" + item.getUri() + " OFFSET:" + item.getArcFilePos()
                                    + " ContentSize:" + item.getContent().getCount());
                            for (ArcFileHeaderItem headerItem : item.getHeaderItems()) {
                                if (headerItem.isFieldDirty(ArcFileHeaderItem.Field_ITEMKEY)) {
                                    //LOG.info("Header Item:" + headerItem.getItemKey() + " :" + headerItem.getItemValue());
                                } else {
                                    //LOG.info("Header Item:" + headerItem.getItemValue());
                                }
                            }
                            //LOG.info("Content Length:" + item.getContent().getCount());
                        } else {
                            // LOG.info("Has More Items Returned Need More Data. Sleeping");
                            try {
                                Thread.sleep(1000);
                            } catch (InterruptedException e) {
                            }
                        }
                    }
                    LOG.info("NO MORE ITEMS... BYE");
                } catch (IOException e) {
                    LOG.error(StringUtils.stringifyException(e));
                }
            }

        });

        // run the thread ... 
        thread.start();

        ReadableByteChannel channel = Channels.newChannel(new FileInputStream(arcFileItem));

        try {

            for (;;) {

                ByteBuffer buffer = ByteBuffer.allocate(BLOCK_SIZE);

                int bytesRead = channel.read(buffer);
                LOG.info("Read " + bytesRead + " From File");

                if (bytesRead == -1) {
                    finished();
                    break;
                } else {
                    buffer.flip();
                    available(buffer);
                }
            }
        } finally {
            channel.close();
        }

        // now wait for thread to die ...
        LOG.info("Done Reading File.... Waiting for ArcFileThread to DIE");
        thread.join();
        LOG.info("Done Reading File.... ArcFileThread to DIED");
    }

    public static void main(String[] args) {
        File file = new File(args[0]);

        StreamingArcFileReader reader = new StreamingArcFileReader(true);
        try {
            reader.testReader(file);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}