Generic Unicode text reader, which uses a BOM (Byte Order Mark) to identify the encoding to be used.

   
/* Copyright (c) 2008 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

//package com.google.gdata.util.io.base;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;

/**
 * Generic Unicode text reader, which uses a BOM (Byte Order Mark) to identify
 * the encoding to be used. This also has the side effect of removing the BOM
 * from the input stream (when present).
 *
 * <p>Recognized BOMs are UTF-8, UTF-16BE, UTF-16LE, UTF-32BE and UTF-32LE.
 * When no BOM is found, every byte that was read ahead is pushed back and the
 * caller-supplied default encoding is used (or the platform default if that
 * is {@code null}).
 *
 * @see <a href="http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4508058">
 *      JDK Bug 4508058</a>
 */
public class UnicodeReader extends Reader {

  private final InputStreamReader internalInputStreamReader;
  private final String defaultEnc;

  /** Length in bytes of the longest BOM recognized (UTF-32). */
  private static final int BOM_SIZE = 4;

  /**
   * Creates a reader over {@code in}, choosing the encoding from a leading BOM
   * when one is present.
   *
   * @param in input stream
   * @param defaultEnc default encoding (used only if BOM is not found) or
   *        <code>null</code> to use system default
   * @throws IOException if an I/O error occurs
   */
  public UnicodeReader(InputStream in, String defaultEnc) throws IOException {
    this.defaultEnc = defaultEnc;

    PushbackInputStream pushbackStream = new PushbackInputStream(in, BOM_SIZE);

    // Read ahead up to BOM_SIZE bytes. A single read() may return fewer bytes
    // than requested even though more will follow, so keep reading until the
    // buffer is full or the stream ends.
    byte[] bom = new byte[BOM_SIZE];
    int n = 0;
    while (n < BOM_SIZE) {
      int count = pushbackStream.read(bom, n, BOM_SIZE - n);
      if (count <= 0) {
        break;
      }
      n += count;
    }

    // The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32LE BOM
    // (FF FE 00 00) begins with the UTF-16LE BOM (FF FE) and would otherwise
    // never be recognized.
    String encoding;
    int bomLength;
    if (hasPrefix(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
      encoding = "UTF-32BE";
      bomLength = 4;
    } else if (hasPrefix(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
      encoding = "UTF-32LE";
      bomLength = 4;
    } else if (hasPrefix(bom, n, 0xEF, 0xBB, 0xBF)) {
      encoding = "UTF-8";
      bomLength = 3;
    } else if (hasPrefix(bom, n, 0xFE, 0xFF)) {
      encoding = "UTF-16BE";
      bomLength = 2;
    } else if (hasPrefix(bom, n, 0xFF, 0xFE)) {
      encoding = "UTF-16LE";
      bomLength = 2;
    } else {
      // Unicode BOM not found: keep every byte and fall back to the default.
      encoding = defaultEnc;
      bomLength = 0;
    }

    // Push back everything read ahead that is not part of the BOM.
    int unread = n - bomLength;
    if (unread > 0) {
      pushbackStream.unread(bom, bomLength, unread);
    }

    // Use the detected encoding, or the platform default when none is known.
    if (encoding == null) {
      internalInputStreamReader = new InputStreamReader(pushbackStream);
    } else {
      internalInputStreamReader = new InputStreamReader(pushbackStream,
          encoding);
    }
  }

  /**
   * Tells whether the first {@code available} bytes of {@code buf} begin with
   * the given byte values.
   *
   * @param buf buffer holding the bytes read ahead
   * @param available number of valid bytes in {@code buf}
   * @param prefix expected leading byte values, each given as 0x00-0xFF
   * @return {@code true} if enough bytes are available and they all match
   */
  private static boolean hasPrefix(byte[] buf, int available, int... prefix) {
    if (available < prefix.length) {
      return false;
    }
    for (int i = 0; i < prefix.length; i++) {
      if (buf[i] != (byte) prefix[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns the default encoding passed to the constructor.
   *
   * @return the default encoding, possibly {@code null}
   */
  public String getDefaultEncoding() {
    return defaultEnc;
  }

  /**
   * Returns the name of the encoding actually used, as reported by the
   * underlying {@link InputStreamReader#getEncoding()}.
   *
   * @return the encoding name reported by the underlying reader
   */
  public String getEncoding() {
    return internalInputStreamReader.getEncoding();
  }

  /**
   * Closes the underlying reader.
   *
   * @throws IOException if an I/O error occurs
   */
  @Override public void close() throws IOException {
    internalInputStreamReader.close();
  }

  /**
   * Reads decoded characters into a portion of an array by delegating to the
   * underlying reader.
   *
   * @param cbuf destination buffer
   * @param off offset at which to start storing characters
   * @param len maximum number of characters to read
   * @return the value returned by the underlying reader's {@code read}
   * @throws IOException if an I/O error occurs
   */
  @Override public int read(char[] cbuf, int off, int len) throws IOException {
    return internalInputStreamReader.read(cbuf, off, len);
  }
}

Related examples in the same category

1.	Unicode Display
2.	Character Sets and Unicode: Code Set Conversion
3.	Display "special character" using Unicode
4.	International friendly string comparison with case-order
5.	Generic Unicode text reader that uses the BOM (byte order mark) to identify the encoding. If no BOM is found, a given default or the system encoding is used.
6.	Convert into Hexadecimal notation of Unicode
7.	Generic unicode text reader.
8.	processing SGML into unicode characters.
9.	Write a 16 bit short as LITTLE_ENDIAN
10.	Write a 32 bit int as LITTLE_ENDIAN.
11.	Arabic Reshaper

Generic Unicode text reader, which uses a BOM (Byte Order Mark) to identify the encoding to be used. : Unicode « I18N « Java

Related examples in the same category