Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * This is an input stream that is unicode BOM aware. This allows you to e.g.
 * read Windows Notepad Unicode files as Velocity templates.
 *
 * It allows you to check the actual encoding of a file by calling
 * {@link #getEncodingFromStream()} on the input stream reader.
 *
 * The BOM is sniffed once, in the constructor; afterwards every call is
 * delegated to an internal {@link PushbackInputStream} that wraps the caller's
 * stream.
 *
 * This class is not thread safe! When more than one thread wants to use an
 * instance of UnicodeInputStream, the caller must provide synchronization.
 *
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
 * @since 1.5
 */
public class UnicodeInputStream extends InputStream {

    /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
    public static final UnicodeBOM UTF8_BOM =
            new UnicodeBOM("UTF-8", new byte[] { (byte) 0xef, (byte) 0xbb, (byte) 0xbf });

    /**
     * BOM Marker for UTF 16, little endian. See
     * http://www.unicode.org/unicode/faq/utf_bom.html
     */
    public static final UnicodeBOM UTF16LE_BOM =
            new UnicodeBOM("UTF-16LE", new byte[] { (byte) 0xff, (byte) 0xfe });

    /**
     * BOM Marker for UTF 16, big endian. See
     * http://www.unicode.org/unicode/faq/utf_bom.html
     */
    public static final UnicodeBOM UTF16BE_BOM =
            new UnicodeBOM("UTF-16BE", new byte[] { (byte) 0xfe, (byte) 0xff });

    /**
     * BOM Marker for UTF 32, little endian. See
     * http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32LE_BOM =
            new UnicodeBOM("UTF-32LE", new byte[] { (byte) 0xff, (byte) 0xfe, (byte) 0x00, (byte) 0x00 });

    /**
     * BOM Marker for UTF 32, big endian. See
     * http://www.unicode.org/unicode/faq/utf_bom.html
     *
     * TODO: Does Java actually support this?
     */
    public static final UnicodeBOM UTF32BE_BOM =
            new UnicodeBOM("UTF-32BE", new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xfe, (byte) 0xff });

    /** The maximum amount of bytes to read for a BOM */
    private static final int MAX_BOM_SIZE = 4;

    /** Buffer for BOM reading */
    private final byte[] buf = new byte[MAX_BOM_SIZE];

    /** Buffer pointer: number of bytes currently held in {@link #buf}. */
    private int pos = 0;

    /** The stream encoding as read from the BOM or null. */
    private final String encoding;

    /** True if the BOM itself should be skipped and not read. */
    private final boolean skipBOM;

    /** Wrapper around the caller's stream; lets sniffed bytes be "unread" again. */
    private final PushbackInputStream inputStream;

    /**
     * Creates a new UnicodeInputStream object. Skips a BOM which defines the file
     * encoding.
     *
     * @param inputStream
     *          The input stream to use for reading.
     * @throws IllegalStateException
     *           When the BOM could not be read from the stream; the underlying
     *           {@link IOException} is available as the cause.
     * @throws IOException
     *           Declared for backward compatibility with existing callers.
     */
    public UnicodeInputStream(final InputStream inputStream) throws IllegalStateException, IOException {
        this(inputStream, true);
    }

    /**
     * Creates a new UnicodeInputStream object.
     *
     * Note that the constructor calls the protected {@link #readEncoding()}, so
     * a subclass overriding that method runs before its own fields are
     * initialized.
     *
     * @param inputStream
     *          The input stream to use for reading.
     * @param skipBOM
     *          If this is set to true, a BOM read from the stream is discarded.
     *          This parameter should normally be true.
     * @throws IllegalStateException
     *           When the BOM could not be read from the stream; the underlying
     *           {@link IOException} is available as the cause.
     * @throws IOException
     *           Declared for backward compatibility with existing callers.
     */
    public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
            throws IllegalStateException, IOException {
        super();

        this.skipBOM = skipBOM;
        this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

        try {
            this.encoding = readEncoding();
        } catch (IOException ioe) {
            // Keep the original failure as the cause so the root I/O problem
            // is not lost when diagnosing the error.
            throw new IllegalStateException("Could not read BOM from Stream", ioe);
        }
    }

    /**
     * Returns true if the input stream discards the BOM.
     *
     * @return True if the input stream discards the BOM.
     */
    public boolean isSkipBOM() {
        return skipBOM;
    }

    /**
     * Returns the encoding that was detected from the BOM when this stream was
     * constructed. No I/O happens in this method.
     *
     * @return The encoding based on the BOM, or null if no BOM was found.
     */
    public String getEncodingFromStream() {
        return encoding;
    }

    /**
     * This method gets the encoding from the stream contents if a BOM exists. If
     * no BOM exists, the encoding is undefined.
     *
     * All bytes consumed while sniffing are pushed back onto the stream, except
     * for the BOM itself when a BOM matched and BOM skipping is enabled.
     *
     * A UTF-16LE stream whose first character is U+0000 starts with the same
     * four bytes as a UTF-32LE BOM and is therefore reported as UTF-32LE.
     *
     * @return The encoding of this streams contents as decided by the BOM or null
     *         if no BOM was found.
     * @throws IOException
     *           When a problem reading the BOM occurred.
     */
    protected String readEncoding() throws IOException {
        pos = 0;

        UnicodeBOM bom = null;

        // read first byte.
        if (readByte()) {
            // Build a list of matches
            //
            // 00 00 FE FF --> UTF 32 BE
            // EF BB BF    --> UTF 8
            // FE FF       --> UTF 16 BE
            // FF FE       --> UTF 16 LE
            // FF FE 00 00 --> UTF 32 LE

            switch (buf[0]) {
            case (byte) 0x00: // UTF32 BE
                bom = match(UTF32BE_BOM, null);
                break;
            case (byte) 0xef: // UTF8
                bom = match(UTF8_BOM, null);
                break;
            case (byte) 0xfe: // UTF16 BE
                bom = match(UTF16BE_BOM, null);
                break;
            case (byte) 0xff: // UTF16/32 LE
                bom = match(UTF16LE_BOM, null);

                if (bom != null) {
                    // The UTF-32LE BOM extends the UTF-16LE one; fall back
                    // to UTF-16LE when the longer marker does not match.
                    bom = match(UTF32LE_BOM, bom);
                }
                break;
            default:
                bom = null;
                break;
            }
        }

        pushback(bom);

        return (bom != null) ? bom.getEncoding() : null;
    }

    /**
     * Compares the stream prefix against a BOM, reading further bytes into the
     * buffer only as far as needed.
     *
     * @param matchEncoding
     *          The BOM to test for.
     * @param noMatchEncoding
     *          The value to return when the BOM does not match or the stream
     *          ends first.
     * @return matchEncoding on a full match, noMatchEncoding otherwise.
     */
    private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
            throws IOException {
        final byte[] bom = matchEncoding.getBytes();

        for (int i = 0; i < bom.length; i++) {
            if (pos <= i) { // Byte has not yet been read
                if (!readByte()) {
                    return noMatchEncoding;
                }
            }

            if (bom[i] != buf[i]) {
                return noMatchEncoding;
            }
        }

        return matchEncoding;
    }

    /**
     * Reads one byte from the stream into the BOM buffer.
     *
     * @return false when the end of the stream has been reached, true otherwise.
     */
    private boolean readByte() throws IOException {
        final int res = inputStream.read();
        if (res == -1) {
            return false;
        }

        if (pos >= buf.length) {
            throw new IOException("BOM read error");
        }

        buf[pos++] = (byte) res;
        return true;
    }

    /**
     * Returns the sniffed bytes to the stream. When a BOM matched and should be
     * skipped, only the bytes following the BOM are pushed back.
     *
     * @param matchBOM
     *          The BOM that matched, or null if none did.
     */
    private void pushback(final UnicodeBOM matchBOM) throws IOException {
        int count = pos; // By default, all bytes are pushed back.
        int start = 0;

        if (matchBOM != null && skipBOM) {
            // We have a match (some bytes are part of the BOM)
            // and we want to skip the BOM. Push back only the bytes
            // after the BOM.
            start = matchBOM.getBytes().length;
            count = (pos - start);

            if (count < 0) {
                throw new IllegalStateException("Match has more bytes than available!");
            }
        }

        inputStream.unread(buf, start, count);
    }

    /**
     * @see java.io.InputStream#close()
     */
    @Override
    public void close() throws IOException {
        inputStream.close();
    }

    /**
     * @see java.io.InputStream#available()
     */
    @Override
    public int available() throws IOException {
        return inputStream.available();
    }

    /**
     * Delegates to the internal {@link PushbackInputStream}, which does not
     * support marking; the call has no effect.
     *
     * @see java.io.InputStream#mark(int)
     */
    @Override
    public void mark(final int readlimit) {
        inputStream.mark(readlimit);
    }

    /**
     * Delegates to the internal {@link PushbackInputStream}, which reports that
     * mark/reset is not supported.
     *
     * @see java.io.InputStream#markSupported()
     */
    @Override
    public boolean markSupported() {
        return inputStream.markSupported();
    }

    /**
     * @see java.io.InputStream#read()
     */
    @Override
    public int read() throws IOException {
        return inputStream.read();
    }

    /**
     * @see java.io.InputStream#read(byte[])
     */
    @Override
    public int read(final byte[] b) throws IOException {
        return inputStream.read(b);
    }

    /**
     * @see java.io.InputStream#read(byte[], int, int)
     */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
        return inputStream.read(b, off, len);
    }

    /**
     * Delegates to the internal {@link PushbackInputStream}, which does not
     * support mark/reset and throws an {@link IOException}.
     *
     * @see java.io.InputStream#reset()
     */
    @Override
    public void reset() throws IOException {
        inputStream.reset();
    }

    /**
     * @see java.io.InputStream#skip(long)
     */
    @Override
    public long skip(final long n) throws IOException {
        return inputStream.skip(n);
    }

    /**
     * Helper class to bundle encoding and BOM marker.
     *
     * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
     * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
     */
    static final class UnicodeBOM {
        /** Charset name announced by this BOM. */
        private final String encoding;

        /** The marker bytes; never handed out without copying. */
        private final byte[] bytes;

        private UnicodeBOM(final String encoding, final byte[] bytes) {
            this.encoding = encoding;
            // Copy so the shared constants cannot be altered through the
            // array passed in.
            this.bytes = bytes.clone();
        }

        String getEncoding() {
            return encoding;
        }

        /**
         * @return A fresh copy of the marker bytes.
         */
        byte[] getBytes() {
            return bytes.clone();
        }
    }
}