XmlReader.java Source code

Introduction

Here is the source code for XmlReader.java
Source

/*
 * $Id: XmlReader.java,v 1.1 2004/08/19 05:30:22 aslom Exp $
 *
 * The Apache Software License, Version 1.1
 *
 *
 * Copyright (c) 2000 The Apache Software Foundation.  All rights 
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:  
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Crimson" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written 
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * 
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation and was
 * originally based on software copyright (c) 1999, Sun Microsystems, Inc., 
 * http://www.sun.com.  For more information on the Apache Software 
 * Foundation, please see <http://www.apache.org/>.
 */

import java.io.*;
import java.util.Hashtable;

/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 *
 * <P> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 *
 * <P> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @version $Revision: 1.1 $
 */

final public class XmlReader extends Reader {
    /**
     * Maximum number of bytes that are peeked at (and pushed back) while
     * looking for an XML/text declaration.
     */
    private static final int MAXPUSHBACK = 512;

    /** Reader all character I/O is delegated to; null once closed. */
    private Reader in;
    /** Name of the encoding that was detected or declared. */
    private String assignedEncoding;
    /** Set once the delegate has been closed (explicitly or on EOF). */
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It has to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering. Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @return a reader decoding the stream with the detected encoding
     * @exception IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones that are understood by
     * Java where necessary.
     *
     * @param in the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *  if null, autodetection is used.
     * @return a reader decoding the stream with the requested encoding
     * @exception IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding) throws IOException {
        if (encoding == null) {
            return new XmlReader(in);
        }
        // Fast paths: hand-written unsynchronized readers for the most
        // common encodings.
        if ("UTF-8".equalsIgnoreCase(encoding) || "UTF8".equalsIgnoreCase(encoding)) {
            return new Utf8Reader(in);
        }
        if ("US-ASCII".equalsIgnoreCase(encoding) || "ASCII".equalsIgnoreCase(encoding)) {
            return new AsciiReader(in);
        }
        if ("ISO-8859-1".equalsIgnoreCase(encoding)
        // plus numerous aliases ...
        ) {
            return new Iso8859_1Reader(in);
        }

        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding and a set
        // of readers in that (sub)package... defaulting to this call
        // only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }

    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).  Keys are upper-case IANA names,
    // values are the names handed to InputStreamReader.
    static private final Hashtable charsets = new Hashtable(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //  EBCDIC-CP-GR    --> CP423
        //  EBCDIC-CP-TR    --> CP905
    }

    /**
     * Maps a standard encoding name to the name stored in the
     * {@link #charsets} table, or returns the name unchanged when the
     * table has no entry for it.
     */
    private static String std2java(String encoding) {
        // Use a fixed locale: with a Turkish default locale the plain
        // toUpperCase() turns 'i' into a dotted capital I and the
        // table lookup silently fails.
        String mapped = (String) charsets.get(encoding.toUpperCase(java.util.Locale.ENGLISH));
        return (mapped != null) ? mapped : encoding;
    }

    /** Returns the standard name of the encoding in use */
    public String getEncoding() {
        return assignedEncoding;
    }

    /**
     * Wraps the stream in a pushback stream, peeks at the first four
     * bytes and picks a delegate reader from what they look like.
     *
     * @param stream the raw byte stream holding the XML text
     * @exception IOException on read error or unsupported encoding
     */
    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        byte buf[];
        int len;

        /*
         * The stream is always wrapped, even when it already is a
         * PushbackInputStream: URL connections hand out a pushback
         * stream of size 7, and unreading up to MAXPUSHBACK bytes into
         * that would fail.
         */
        pb = new PushbackInputStream(stream, MAXPUSHBACK);

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.  A single
        // read() may legally return fewer bytes than requested, so
        // keep reading until we have four bytes or hit end of stream.
        //
        buf = new byte[4];
        len = 0;
        while (len < buf.length) {
            int n = pb.read(buf, len, buf.length - len);
            if (n <= 0)
                break;
            len += n;
        }
        if (len > 0)
            pb.unread(buf, 0, len);

        if (len == 4)
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<': // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "<!-- ...",
                // and so on.
                default:
                    break;

                // 3c 00 3f 00 == illegal UTF-16 little endian
                case 0x00:
                    if (buf[2] == 0x3f && buf[3] == 0x00) {
                        setEncoding(pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                // 3c 3f 78 6d == ASCII and supersets '<?xm'
                case '?':
                    if (buf[2] != 'x' || buf[3] != 'm')
                        break;
                    //
                    // One of several encodings could be used:
                    // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                    //
                    useEncodingDecl(pb, "UTF8");
                    return;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page
            case 0x4c:
                if (buf[1] == 0x6f && (0x0ff & buf[2]) == 0x0a7 && (0x0ff & buf[3]) == 0x094) {
                    useEncodingDecl(pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian
            case 0xfe:
                if ((buf[1] & 0x0ff) != 0xff)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // UTF-16 little-endian
            case 0xff:
                if ((buf[1] & 0x0ff) != 0xfe)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // default ... no XML declaration
            default:
                break;
            }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }

    /**
     * Checks a name against production [81] of XML 1.0:
     * <code>EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*</code>.
     * The empty string is not a legal name.
     */
    private static boolean isLegalEncName(String name) {
        if (name.length() == 0)
            return false;
        for (int k = 0; k < name.length(); k++) {
            char ch = name.charAt(k);
            if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
                continue;
            // the first character must be a letter
            if (k == 0)
                return false;
            if (ch == '-' || (ch >= '0' && ch <= '9') || ch == '.' || ch == '_')
                continue;
            return false;
        }
        return true;
    }

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     *
     * Only what a single read() of up to MAXPUSHBACK bytes returns is
     * examined; everything read is pushed back before the real reader
     * is created.  Missing or illegal declarations select UTF-8.
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding) throws IOException {
        byte buffer[] = new byte[MAXPUSHBACK];
        int len;
        Reader r;
        int c;

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        len = pb.read(buffer, 0, buffer.length);
        if (len > 0)
            pb.unread(buffer, 0, len);
        // The third constructor argument is a LENGTH, not an end index:
        // only len - 4 bytes follow the four that are skipped.
        r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, Math.max(0, len - 4)), encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if ((c = r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //  S version="..."   [or single quotes]
        // bit and get any subsequent
        //  S encoding="..."  [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuffer buf = new StringBuffer();
        StringBuffer keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        XmlDecl: for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1)
                break;

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                continue;

            // ... but require at least a little!
            if (i == 0)
                break;

            // terminate the loop ASAP
            if (c == '?')
                sawQuestion = true;
            else if (sawQuestion) {
                if (c == '>')
                    break;
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c))
                        continue;
                    // first character of a new key
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null)
                        key = keyBuf.toString();
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else
                    keyBuf.append((char) c);
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c))
                continue;
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    // opening quote: start collecting the value
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    // closing quote: the value is complete
                    if ("encoding".equals(key)) {
                        assignedEncoding = buf.toString();

                        // map illegal (including empty) names to the
                        // UTF-8 default
                        if (!isLegalEncName(assignedEncoding))
                            break XmlDecl;

                        setEncoding(pb, assignedEncoding);
                        return;

                    } else {
                        // some other pseudo-attribute; look for the next
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }

    /**
     * Records the encoding name and creates the delegate reader for it.
     */
    private void setEncoding(InputStream stream, String encoding) throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Reads characters into part of the buffer.
     *
     * @return the number of characters read, or -1 on EOF or when this
     *  reader has been closed
     */
    public int read(char buf[], int off, int len) throws IOException {
        int val;

        if (closed)
            return -1; // throw new IOException ("closed");
        val = in.read(buf, off, len);
        if (val == -1)
            close();
        return val;
    }

    /**
     * Reads a single character.
     *
     * @return the character read, or -1 on EOF or when this reader has
     *  been closed
     */
    public int read() throws IOException {
        int val;

        // Reaching EOF closes this reader, so a closed reader has to
        // keep answering -1 (as the bulk read does) instead of throwing
        // on the call that follows end of stream.
        if (closed) {
            return -1;
        }
        val = in.read();
        if (val == -1) {
            close();
        }
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     * @param value how many characters may be "peeked".
     */
    public void mark(int value) throws IOException {
        if (in != null)
            in.mark(value);
    }

    /**
     * Resets the current position to the last marked position.
     */
    public void reset() throws IOException {
        if (in != null)
            in.reset();
    }

    /**
     * Skips a specified number of characters.
     *
     * @return the number of characters skipped; 0 once closed
     */
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader.  Closing an already closed reader is a no-op.
     */
    public void close() throws IOException {
        if (closed)
            return;
        in.close();
        in = null;
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    public static abstract class BaseReader extends Reader {
        /** Source of bytes; null once this reader is closed. */
        protected InputStream instream;
        /** Byte buffer; valid data lives in [start, finish). */
        protected byte buffer[];
        protected int start, finish;

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];

        }

        public abstract String getEncoding();

        public boolean ready() throws IOException {
            return instream == null || (finish - start) > 0 || instream.available() != 0;
        }

        // caller shouldn't read again
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }

        /**
         * Makes sure the byte buffer holds data, doing one read on the
         * underlying stream only when the buffer is empty.  Must not be
         * called after close.
         *
         * @return number of buffered bytes available, or -1 if the
         *  stream is exhausted (in which case this reader was closed)
         */
        final int fillIfEmpty() throws IOException {
            int avail = finish - start;
            if (avail < 1) {
                start = 0;
                finish = instream.read(buffer, 0, buffer.length);
                if (finish <= 0) {
                    this.close();
                    return -1;
                }
                avail = finish;
            }
            return avail;
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // 2nd half of a surrogate pair that did not fit into the
        // caller's buffer; 0 when nothing is pending
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        public String getEncoding() {
            return "UTF-8";
        }

        /**
         * Decodes UTF-8 bytes into UTF-16 chars.  Blocks only while no
         * character has been produced yet; once the byte buffer runs
         * dry it returns what it has, like the ASCII and ISO-8859-1
         * readers do.
         *
         * @return number of chars stored, 0 if len is not positive, or
         *  -1 at end of stream
         * @exception CharConversionException on malformed, truncated or
         *  out-of-range UTF-8 sequences
         */
        public int read(char buf[], int offset, int len) throws IOException {
            int i = 0, c = 0;

            if (len <= 0)
                return 0;

            // avoid many runtime bounds checks ... a good optimizer
            // (static or JIT) will now remove checks from the loop.
            // (written so that offset + len cannot overflow)
            if (offset < 0 || len > buf.length - offset)
                throw new ArrayIndexOutOfBoundsException();

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;
                        break;
                    }
                    // never block for more input once there is
                    // something to hand back
                    if (i > 0)
                        break;
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        c = -1;
                        break;
                    }
                }

                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.

                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer[start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars -- the lead byte tells how many
                // bytes the sequence needs.
                //
                int need;
                if ((c & 0x0E0) == 0x0C0)
                    need = 2;
                else if ((c & 0x0F0) == 0x0E0)
                    need = 3;
                else if ((c & 0x0F8) == 0x0F0)
                    need = 4;
                else
                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    throw new CharConversionException("Unconvertible UTF-8 character" + " beginning with 0x"
                            + Integer.toHexString(c));

                //
                // if the buffer holds only a partial character,
                // compact it and try to read the rest of the
                // character BEFORE decoding: bytes past 'finish' are
                // stale leftovers and must never be folded into 'c'.
                //
                if (finish - start < need) {
                    if (i > 0)
                        break;
                    System.arraycopy(buffer, start, buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    int more = instream.read(buffer, finish, buffer.length - finish);
                    if (more < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += more;
                    continue;
                }

                //
                // check the "10xx xxxx" format of the non-initial bytes
                //
                for (int k = start + 1; k < start + need; k++) {
                    if ((buffer[k] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException(
                                "Malformed UTF-8 char -- " + "is an XML encoding declaration missing?");
                    }
                }

                if (need == 2) {
                    // 0x0080 <= c <= 0x07ff
                    c = ((c & 0x1f) << 6) + (buffer[start + 1] & 0x3f);
                } else if (need == 3) {
                    // 0x0800 <= c <= 0xffff
                    c = ((c & 0x0f) << 12) + ((buffer[start + 1] & 0x3f) << 6) + (buffer[start + 2] & 0x3f);
                } else {
                    // 0x0001 0000  <= c  <= 0x001f ffff
                    c = ((c & 0x07) << 18) + ((buffer[start + 1] & 0x3f) << 12)
                            + ((buffer[start + 2] & 0x3f) << 6) + (buffer[start + 3] & 0x3f);

                    // Unicode supports c <= 0x0010 ffff ...
                    if (c > 0x0010ffff)
                        throw new CharConversionException("UTF-8 encoding of character 0x00"
                                + Integer.toHexString(c) + " can't be converted to Unicode.");

                    if (c > 0xffff) {
                        // Convert UCS-4 char to surrogate pair (UTF-16)
                        c -= 0x10000;
                        nextChar = (char) (0xDC00 + (c & 0x03ff));
                        c = 0xD800 + (c >> 10);
                    }
                }
                start += need;

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0)
                return i;
            return (c == -1) ? -1 : 0;
        }
    }

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        public String getEncoding() {
            return "US-ASCII";
        }

        /**
         * Copies buffered 7-bit bytes to the char buffer.
         *
         * @return number of chars stored (at most what is buffered),
         *  0 if len is not positive, or -1 at end of stream
         * @exception CharConversionException if a byte has its high
         *  bit set
         */
        public int read(char buf[], int offset, int len) throws IOException {
            if (instream == null) {
                return -1;
            }

            // avoid many runtime bounds checks ... a good optimizer
            // (static or JIT) will now remove checks from the loop.
            if (offset < 0 || len > buf.length - offset)
                throw new ArrayIndexOutOfBoundsException();
            if (len <= 0)
                return 0;

            /* 07-Mar-2006, TSa: Actually, it's bad idea to try to fill the
             *   whole buffer -- if this is a blocking source (network socket
             *   for example), we may be blocking too early.
             */
            int avail = fillIfEmpty();
            if (avail < 0) {
                return -1;
            }
            if (len > avail) {
                len = avail;
            }

            for (int i = 0; i < len; i++) {
                int c = buffer[start++];
                // bytes are signed: a negative value means bit 7 is set
                if (c < 0) {
                    throw new CharConversionException(
                            "Illegal ASCII character, 0x" + Integer.toHexString(c & 0xff));
                }
                buf[offset + i] = (char) c;
            }
            return len;
        }
    }

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        public String getEncoding() {
            return "ISO-8859-1";
        }

        /**
         * Copies buffered bytes to the char buffer, mapping each byte
         * to the char with the same unsigned value.
         *
         * @return number of chars stored (at most what is buffered),
         *  0 if len is not positive, or -1 at end of stream
         */
        public int read(char buf[], int offset, int len) throws IOException {
            if (instream == null)
                return -1;

            // avoid many runtime bounds checks ... a good optimizer
            // (static or JIT) will now remove checks from the loop.
            if (offset < 0 || len > buf.length - offset)
                throw new ArrayIndexOutOfBoundsException();
            if (len <= 0)
                return 0;

            /* 07-Mar-2006, TSa: Actually, it's bad idea to try to fill the
             *   whole buffer -- if this is a blocking source (network socket
             *   for example), we may be blocking too early.
             */
            int avail = fillIfEmpty();
            if (avail < 0) {
                return -1;
            }
            if (len > avail) {
                len = avail;
            }

            for (int i = 0; i < len; i++) {
                buf[offset + i] = (char) (buffer[start++] & 0xFF);
            }
            return len;
        }
    }
}