UCSReader.java Source code

Introduction

Here is the source code for UCSReader.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

/**
 * Reader for UCS-2 and UCS-4 encodings. (i.e., encodings from
 * ISO-10646-UCS-(2|4)).
 * <p>
 * Bytes are pulled from the wrapped {@link InputStream} and assembled into
 * characters of 2 bytes (UCS-2) or 4 bytes (UCS-4), in the byte order selected
 * by the encoding constant passed to the constructor. The encoding is never
 * detected from the data; the caller must supply it.
 * <p>
 * Limitation: {@link #read(char[], int, int)} narrows every UCS-4 value to a
 * single <code>char</code>, so only the low 16 bits of a value above
 * <code>0xFFFF</code> are kept; surrogate pairs are not produced.
 * 
 * @xerces.internal
 * 
 * @author Neil Graham, IBM
 * 
 * @version $Id: UCSReader.java 449317 2006-09-23 22:12:30Z mrglavas $
 */
public class UCSReader extends Reader {

    //
    // Constants
    //

    /**
     * Default byte buffer size (8192, larger than that of ASCIIReader since it's
     * reasonable to surmise that the average UCS-4-encoded file should be 4 times
     * as large as the average ASCII-encoded file).
     */
    public static final int DEFAULT_BUFFER_SIZE = 8192;

    /** Encoding constant: UCS-2, little-endian (2 bytes per character). */
    public static final short UCS2LE = 1;

    /** Encoding constant: UCS-2, big-endian (2 bytes per character). */
    public static final short UCS2BE = 2;

    /** Encoding constant: UCS-4, little-endian (4 bytes per character). */
    public static final short UCS4LE = 4;

    /** Encoding constant: UCS-4, big-endian (4 bytes per character). */
    public static final short UCS4BE = 8;

    //
    // Data
    //

    /** Input stream. */
    protected final InputStream fInputStream;

    /** Byte buffer. */
    protected final byte[] fBuffer;

    /**
     * What kind of data we're dealing with: one of {@link #UCS2LE},
     * {@link #UCS2BE}, {@link #UCS4LE} or {@link #UCS4BE}. Any value
     * <code>&gt;= 4</code> is treated as UCS-4, anything else as UCS-2.
     */
    protected final short fEncoding;

    //
    // Constructors
    //

    /**
     * Constructs a UCS reader from the specified input stream using the default
     * buffer size. The Endian-ness and whether this is UCS-2 or UCS-4 needs also
     * to be known in advance.
     * 
     * @param inputStream
     *          The input stream.
     * @param encoding
     *          One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, short encoding) {
        this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
    } // <init>(InputStream, short)

    /**
     * Constructs a UCS reader from the specified input stream and buffer size.
     * The Endian-ness and whether this is UCS-2 or UCS-4 needs also to be known
     * in advance.
     * 
     * @param inputStream
     *          The input stream.
     * @param size
     *          The size, in bytes, of the buffer to allocate.
     * @param encoding
     *          One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, int size, short encoding) {
        this(inputStream, new byte[size], encoding);
    } // <init>(InputStream,int,short)

    /**
     * Constructs a UCS reader from the specified input stream and buffer. The
     * Endian-ness and whether this is UCS-2 or UCS-4 needs also to be known in
     * advance.
     * 
     * @param inputStream
     *          The input stream.
     * @param buffer
     *          The byte buffer. It is used as-is (not copied) as scratch space
     *          by {@link #read(char[], int, int)}.
     * @param encoding
     *          One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
     */
    public UCSReader(InputStream inputStream, byte[] buffer, short encoding) {
        fInputStream = inputStream;
        fBuffer = buffer;
        fEncoding = encoding;
    } // <init>(InputStream,byte[],short)

    //
    // Reader methods
    //

    /**
     * Read a single character. This method will block until a character is
     * available, an I/O error occurs, or the end of the stream is reached.
     * <p>
     * The bytes of one character are read one at a time directly from the
     * underlying stream. If the stream ends before all bytes of the character
     * have been read, the partial character is dropped and -1 is returned.
     * 
     * @return The character read: for UCS-2 an integer in the range
     *         <tt>0x0000-0xffff</tt>, for UCS-4 the 32-bit value assembled from
     *         the four bytes; or -1 if the end of the stream has been reached
     * 
     * @exception IOException
     *              If an I/O error occurs
     */
    public int read() throws IOException {
        // End of stream must be tested on the raw return value: after masking
        // with 0xff a genuine 0xFF data byte is indistinguishable from -1.
        int b0 = fInputStream.read();
        if (b0 == -1) {
            return -1;
        }
        int b1 = fInputStream.read();
        if (b1 == -1) {
            return -1;
        }
        b0 &= 0xff;
        b1 &= 0xff;
        // UCS-4
        if (fEncoding >= 4) {
            int b2 = fInputStream.read();
            if (b2 == -1) {
                return -1;
            }
            int b3 = fInputStream.read();
            if (b3 == -1) {
                return -1;
            }
            b2 &= 0xff;
            b3 &= 0xff;
            if (fEncoding == UCS4BE) {
                return (b0 << 24) + (b1 << 16) + (b2 << 8) + b3;
            }
            return (b3 << 24) + (b2 << 16) + (b1 << 8) + b0;
        }
        // UCS-2
        if (fEncoding == UCS2BE) {
            return (b0 << 8) + b1;
        }
        return (b1 << 8) + b0;
    } // read():int

    /**
     * Read characters into a portion of an array. This method will block until
     * some input is available, an I/O error occurs, or the end of the stream is
     * reached.
     * <p>
     * At most as many characters as fit into the internal byte buffer are
     * decoded per call, so fewer than <code>length</code> characters may be
     * returned even when more input is available. If the underlying stream
     * delivers only part of a character, the missing bytes are read one by one;
     * if the stream ends in the middle of a character the missing bytes are
     * taken to be zero.
     * 
     * @param ch
     *          Destination buffer
     * @param offset
     *          Offset at which to start storing characters
     * @param length
     *          Maximum number of characters to read
     * 
     * @return The number of characters read, or -1 if the end of the stream has
     *         been reached
     * 
     * @exception IOException
     *              If an I/O error occurs
     */
    public int read(char ch[], int offset, int length) throws IOException {
        final int shift = charShift();
        final int width = 1 << shift;

        // Work in whole characters: this keeps the byte request a multiple of
        // the character width (so padding below can never run past the end of
        // fBuffer) and avoids overflowing length << shift for huge lengths.
        int charsWanted = fBuffer.length >> shift;
        if (length < charsWanted) {
            charsWanted = length;
        }
        if (charsWanted == 0 && length > 0) {
            // The byte buffer cannot hold even one character; decode a single
            // character straight from the stream instead.
            int c = read();
            if (c == -1) {
                return -1;
            }
            ch[offset] = (char) c;
            return 1;
        }

        int count = fInputStream.read(fBuffer, 0, charsWanted << shift);
        if (count == -1) {
            return -1;
        }

        // make count a multiple of the character width
        int partial = count & (width - 1);
        if (partial != 0) {
            // count is below the (aligned) request size here, so there is
            // room in fBuffer for the bytes that complete the character.
            int missing = width - partial;
            for (int i = 0; i < missing; i++) {
                int byteRead = fInputStream.read();
                if (byteRead == -1) {
                    // end of input; something likely went wrong!
                    // Pad buffer with nulls.
                    for (int j = i; j < missing; j++) {
                        fBuffer[count + j] = 0;
                    }
                    break;
                }
                fBuffer[count + i] = (byte) byteRead;
            }
            count += missing;
        }

        // now count is a multiple of the right number of bytes
        int numChars = count >> shift;
        int curPos = 0;
        for (int i = 0; i < numChars; i++) {
            int b0 = fBuffer[curPos++] & 0xff;
            int b1 = fBuffer[curPos++] & 0xff;
            // UCS-4
            if (fEncoding >= 4) {
                int b2 = fBuffer[curPos++] & 0xff;
                int b3 = fBuffer[curPos++] & 0xff;
                // NOTE: the cast keeps only the low 16 bits of the value.
                if (fEncoding == UCS4BE) {
                    ch[offset + i] = (char) ((b0 << 24) + (b1 << 16) + (b2 << 8) + b3);
                } else {
                    ch[offset + i] = (char) ((b3 << 24) + (b2 << 16) + (b1 << 8) + b0);
                }
            } else { // UCS-2
                if (fEncoding == UCS2BE) {
                    ch[offset + i] = (char) ((b0 << 8) + b1);
                } else {
                    ch[offset + i] = (char) ((b1 << 8) + b0);
                }
            }
        }
        return numChars;
    } // read(char[],int,int)

    /**
     * Skip characters. This method will block until some characters are
     * available, an I/O error occurs, or the end of the stream is reached.
     * <p>
     * The request is translated into a byte count and delegated to the
     * underlying stream. If that stream stops in the middle of a character,
     * the rest of that character is consumed as well so that subsequent reads
     * start on a character boundary; the partially skipped character is
     * included in the returned count.
     * 
     * @param n
     *          The number of characters to skip
     * 
     * @return The number of characters actually skipped
     * 
     * @exception IOException
     *              If an I/O error occurs
     */
    public long skip(long n) throws IOException {
        // shift is the number of bits to move n leftward to get the number of
        // bytes to skip, and to move the result rightward to get the number
        // of characters effectively skipped. Shifting and masking is used,
        // as elsewhere in this code, instead of multiplication and division.
        final int shift = charShift();
        final int mask = (1 << shift) - 1;

        // clamp so that the conversion to bytes cannot overflow
        long charsRequested = n;
        long maxChars = Long.MAX_VALUE >> shift;
        if (charsRequested > maxChars) {
            charsRequested = maxChars;
        }

        long bytesSkipped = fInputStream.skip(charsRequested << shift);
        int partial = (int) (bytesSkipped & mask);
        if (partial == 0) {
            return bytesSkipped >> shift;
        }
        if (bytesSkipped > 0) {
            // The stream stopped inside a character: swallow the remaining
            // bytes of that character to restore alignment.
            for (int i = partial; i <= mask; i++) {
                if (fInputStream.read() == -1) {
                    break;
                }
            }
        }
        return (bytesSkipped >> shift) + 1;
    } // skip(long):long

    /**
     * Tell whether this stream is ready to be read.
     * 
     * @return This implementation always returns false. Note that returning
     *         false does not guarantee that the next read will block.
     * 
     * @exception IOException
     *              If an I/O error occurs
     */
    public boolean ready() throws IOException {
        return false;
    } // ready()

    /**
     * Tell whether this stream supports the mark() operation.
     * 
     * @return Whatever the underlying input stream reports for
     *         <code>markSupported()</code>.
     */
    public boolean markSupported() {
        return fInputStream.markSupported();
    } // markSupported()

    /**
     * Mark the present position in the stream. Subsequent calls to reset() will
     * attempt to reposition the stream to this point. Not all character-input
     * streams support the mark() operation.
     * <p>
     * The limit is given in characters but the underlying stream counts bytes,
     * so it is scaled by the character width (saturating at
     * <code>Integer.MAX_VALUE</code>) before being passed on.
     * 
     * @param readAheadLimit
     *          Limit on the number of characters that may be read while still
     *          preserving the mark. After reading this many characters,
     *          attempting to reset the stream may fail.
     * 
     * @exception IOException
     *              If the stream does not support mark(), or if some other I/O
     *              error occurs
     */
    public void mark(int readAheadLimit) throws IOException {
        int byteLimit = readAheadLimit;
        if (readAheadLimit > 0) {
            int shift = charShift();
            if (readAheadLimit > (Integer.MAX_VALUE >> shift)) {
                byteLimit = Integer.MAX_VALUE;
            } else {
                byteLimit = readAheadLimit << shift;
            }
        }
        fInputStream.mark(byteLimit);
    } // mark(int)

    /**
     * Reset the stream. If the stream has been marked, then attempt to reposition
     * it at the mark. If the stream has not been marked, then attempt to reset it
     * in some way appropriate to the particular stream, for example by
     * repositioning it to its starting point. Not all character-input streams
     * support the reset() operation, and some support reset() without supporting
     * mark().
     * <p>
     * This implementation simply delegates to the underlying input stream.
     * 
     * @exception IOException
     *              If the stream has not been marked, or if the mark has been
     *              invalidated, or if the stream does not support reset(), or if
     *              some other I/O error occurs
     */
    public void reset() throws IOException {
        fInputStream.reset();
    } // reset()

    /**
     * Close the stream by closing the underlying input stream.
     * 
     * @exception IOException
     *              If an I/O error occurs
     */
    public void close() throws IOException {
        fInputStream.close();
    } // close()

    //
    // Private methods
    //

    /**
     * Returns log2 of the number of bytes per character: 2 for UCS-4 (4
     * bytes), 1 for UCS-2 (2 bytes).
     */
    private int charShift() {
        return (fEncoding >= 4) ? 2 : 1;
    } // charShift():int

} // class UCSReader