UnicodeInputStream.java Source code

Introduction

Here is the source code for UnicodeInputStream.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * This is an input stream that is unicode BOM aware. This allows you to e.g.
 * read Windows Notepad Unicode files as Velocity templates.
 * 
 * It allows you to check the actual encoding of a file by calling
 * {@link #getEncodingFromStream()} on this input stream.
 * 
 * This class is not thread safe! When more than one thread wants to use an
 * instance of UnicodeInputStream, the caller must provide synchronization.
 * 
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
 * @since 1.5
 */
public class UnicodeInputStream extends InputStream {

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8",
            new byte[] { (byte) 0xef, (byte) 0xbb, (byte) 0xbf });

    /**
     * BOM Marker for UTF 16, little endian. See
     * http://www.unicode.org/unicode/faq/utf_bom.html
     */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE",
            new byte[] { (byte) 0xff, (byte) 0xfe });

    /**
     * BOM Marker for UTF 16, big endian. See
     * http://www.unicode.org/unicode/faq/utf_bom.html
     */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE",
            new byte[] { (byte) 0xfe, (byte) 0xff });

    /**
     * BOM Marker for UTF 32, little endian. See
     * http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this? (Presumably callers should verify
     * that the reported charset name is available before decoding with it.)
     */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE",
            new byte[] { (byte) 0xff, (byte) 0xfe, (byte) 0x00, (byte) 0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See
     * http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this? (Presumably callers should verify
     * that the reported charset name is available before decoding with it.)
     */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE",
            new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xfe, (byte) 0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer holding the bytes consumed from the stream while probing for a BOM. */
    private final byte[] buf = new byte[MAX_BOM_SIZE];

    /** Number of valid bytes currently held in {@link #buf}. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** Wrapped stream; its pushback buffer is sized to hold the probed bytes. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file
     * encoding.
     *
     * @param inputStream
     *          The input stream to use for reading.
     * @throws IllegalStateException
     *           When the BOM could not be read from the stream; the underlying
     *           {@link IOException} is available as the cause.
     * @throws IOException
     *           Declared for backward compatibility with existing callers.
     */
    public UnicodeInputStream(final InputStream inputStream) throws IllegalStateException, IOException {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object. The first bytes of the stream are
     * probed for a BOM immediately, during construction.
     *
     * @param inputStream
     *          The input stream to use for reading.
     * @param skipBOM
     *          If this is set to true, a BOM read from the stream is discarded.
     *          This parameter should normally be true.
     * @throws IllegalStateException
     *           When the BOM could not be read from the stream; the underlying
     *           {@link IOException} is available as the cause.
     * @throws IOException
     *           Declared for backward compatibility with existing callers.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
            throws IllegalStateException, IOException {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try {
            this.encoding = readEncoding();
        } catch (IOException ioe) {
            // Keep the original failure attached so callers can diagnose it.
            throw new IllegalStateException("Could not read BOM from Stream", ioe);
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return True if the input stream discards the BOM.
     */
    public boolean isSkipBOM() {
        return skipBOM;
    }

    /**
     * Returns the encoding that was detected from the BOM when this stream was
     * constructed.
     *
     * @return The encoding based on the BOM, or null if no BOM was found.
     */
    public String getEncodingFromStream() {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If
     * no BOM exists, the encoding is undefined.
     *
     * Bytes consumed while probing are pushed back onto the stream afterwards;
     * when a BOM matched and {@link #isSkipBOM()} is true, the BOM bytes
     * themselves are not pushed back.
     *
     * Note: this method is invoked from the constructor, so an overriding
     * implementation runs before the subclass constructor body has executed.
     *
     * @return The encoding of this streams contents as decided by the BOM or null
     *         if no BOM was found.
     * @throws IOException
     *           When reading from or pushing back to the stream fails.
     */
    protected String readEncoding() throws IOException {
        pos = 0;

        UnicodeBOM bom = null;

        // read first byte.
        if (readByte()) {
            // Dispatch on the first byte to the candidate BOM(s):
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF --> UTF 8
            // FE FF --> UTF 16 BE
            // FF FE --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE

            switch (buf[0]) {
            case (byte) 0x00: // UTF32 BE
                bom = match(UTF32BE_BOM, null);
                break;
            case (byte) 0xef: // UTF8
                bom = match(UTF8_BOM, null);
                break;
            case (byte) 0xfe: // UTF16 BE
                bom = match(UTF16BE_BOM, null);
                break;
            case (byte) 0xff: // UTF16/32 LE
                bom = match(UTF16LE_BOM, null);

                if (bom != null) {
                    // The UTF-32LE BOM starts with the UTF-16LE BOM; try the
                    // longer one and fall back to UTF-16LE if it does not match.
                    bom = match(UTF32LE_BOM, bom);
                }
                break;

            default:
                break;
            }
        }

        pushback(bom);

        return (bom != null) ? bom.getEncoding() : null;
    }

    /**
     * Compares the start of the stream against the given BOM, reading further
     * bytes into {@link #buf} only as needed.
     *
     * @param matchEncoding
     *          The BOM to test for.
     * @param noMatchEncoding
     *          The value to return when the stream ends early or a byte differs.
     * @return matchEncoding if every BOM byte matched, otherwise noMatchEncoding.
     * @throws IOException
     *           When reading from the stream fails.
     */
    private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
            throws IOException {
        final byte[] bom = matchEncoding.getBytes();

        for (int i = 0; i < bom.length; i++) {
            // Byte i has not been buffered yet: fetch it, giving up at end of stream.
            if (pos <= i && !readByte()) {
                return noMatchEncoding;
            }

            if (bom[i] != buf[i]) {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the stream and appends it to {@link #buf}.
     *
     * @return true if a byte was buffered, false if the end of the stream was
     *         reached.
     * @throws IOException
     *           When the buffer is already full or the underlying read fails.
     */
    private boolean readByte() throws IOException {
        // Check capacity before consuming a byte so that nothing is lost from
        // the stream if the buffer is already full.
        if (pos >= buf.length) {
            throw new IOException("BOM read error");
        }

        final int res = inputStream.read();
        if (res == -1) {
            return false;
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns the probed bytes to the stream. If a BOM matched and it is to be
     * skipped, only the bytes following the BOM are pushed back.
     *
     * @param matchBOM
     *          The BOM that matched, or null if none did.
     * @throws IOException
     *           When the underlying pushback fails.
     */
    private void pushback(final UnicodeBOM matchBOM) throws IOException {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM) {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.getBytes().length;
            count = (pos - start);

            if (count < 0) {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * @see java.io.InputStream#close()
     */
    @Override
    public void close() throws IOException {
        inputStream.close();
    }

    /**
     * @see java.io.InputStream#available()
     */
    @Override
    public int available() throws IOException {
        return inputStream.available();
    }

    /**
     * Delegates to the wrapped {@link PushbackInputStream}.
     *
     * @see java.io.InputStream#mark(int)
     */
    @Override
    public void mark(final int readlimit) {
        inputStream.mark(readlimit);
    }

    /**
     * Delegates to the wrapped {@link PushbackInputStream}.
     *
     * @see java.io.InputStream#markSupported()
     */
    @Override
    public boolean markSupported() {
        return inputStream.markSupported();
    }

    /**
     * @see java.io.InputStream#read()
     */
    @Override
    public int read() throws IOException {
        return inputStream.read();
    }

    /**
     * @see java.io.InputStream#read(byte[])
     */
    @Override
    public int read(final byte[] b) throws IOException {
        return inputStream.read(b);
    }

    /**
     * @see java.io.InputStream#read(byte[], int, int)
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        return inputStream.read(b, off, len);
    }

    /**
     * Delegates to the wrapped {@link PushbackInputStream}.
     *
     * @see java.io.InputStream#reset()
     */
    @Override
    public void reset() throws IOException {
        inputStream.reset();
    }

    /**
     * @see java.io.InputStream#skip(long)
     */
    @Override
    public long skip(final long n) throws IOException {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
     */
    static final class UnicodeBOM {
        /** Name of the encoding announced by this BOM. */
        private final String encoding;

        /** The byte sequence that makes up the BOM. */
        private final byte[] bytes;

        private UnicodeBOM(final String encoding, final byte[] bytes) {
            this.encoding = encoding;
            this.bytes = bytes;
        }

        /**
         * @return The name of the encoding announced by this BOM.
         */
        String getEncoding() {
            return encoding;
        }

        /**
         * @return The BOM byte sequence. This is the internal array, not a copy;
         *         callers must not modify it.
         */
        byte[] getBytes() {
            return bytes;
        }
    }
}