Unicode reader : Code Unicode « Development Class « Java

Unicode reader

     


// revised from runcc 0.7

import java.io.*;

/**
 * Character reader that consumes a leading UNICODE Byte Order Mark (BOM), if
 * present, and decodes the rest of the stream with the encoding the BOM
 * announces. See http://www.unicode.org/unicode/faq/utf_bom.html
 *
 * <pre>
 *  00 00 FE FF    = UTF-32, big-endian
 *  FF FE 00 00    = UTF-32, little-endian
 *  FE FF          = UTF-16, big-endian
 *  FF FE          = UTF-16, little-endian
 *  EF BB BF       = UTF-8
 * </pre>
 *
 * The four-byte marks are tested before the two-byte marks because the
 * UTF-32LE mark starts with the UTF-16LE mark. As a consequence a UTF-16LE
 * stream whose first character after the BOM is U+0000 is treated as
 * UTF-32LE; the two cases cannot be told apart from the bytes alone.
 */
public class UnicodeReader extends Reader
{
  private static final int BOM_MAX_SIZE = 4;

  private static final byte[] BOM_UTF_32BE = { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF };
  private static final byte[] BOM_UTF_32LE = { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 };
  private static final byte[] BOM_UTF_8 = { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
  private static final byte[] BOM_UTF_16BE = { (byte) 0xFE, (byte) 0xFF };
  private static final byte[] BOM_UTF_16LE = { (byte) 0xFF, (byte) 0xFE };

  private InputStreamReader delegate;

  /**
   * Creates a reader that uses the BOM encoding, or the platform default
   * encoding when the stream carries no BOM.
   *
   * @param in the byte stream to decode
   * @throws IOException if reading the first bytes of the stream fails
   */
  public UnicodeReader(InputStream in) throws IOException {
    init(in, null);
  }

  /**
   * Creates a reader that uses the BOM encoding, or <code>defaultEnc</code>
   * when the stream carries no BOM.
   *
   * @param in the byte stream to decode
   * @param defaultEnc charset name used when no BOM is found; <code>null</code>
   *          selects the platform default encoding
   * @throws IOException if reading the first bytes of the stream fails or the
   *           chosen encoding is not supported
   */
  public UnicodeReader(InputStream in, String defaultEnc) throws IOException {
    init(in, defaultEnc);
  }

  /**
   * Returns the encoding name reported by the underlying
   * {@link InputStreamReader#getEncoding()}: the encoding announced by the
   * BOM if there was one, otherwise the default encoding in use. The JDK may
   * report a historical name for the charset.
   */
  public String getEncoding() {
    return delegate.getEncoding();
  }

  /**
   * Reads ahead up to four bytes and checks them for a BOM. Only the BOM bytes
   * are skipped; every other byte that was read ahead is pushed back so the
   * decoder sees it.
   */
  private void init(InputStream in, String defaultEnc) throws IOException {
    byte[] bom = new byte[BOM_MAX_SIZE];
    PushbackInputStream internalIn = new PushbackInputStream(in, BOM_MAX_SIZE);
    int n = readAhead(internalIn, bom);

    String encoding;
    int bomLength;
    // Four-byte marks first: FF FE 00 00 would otherwise match the UTF-16LE test.
    if (startsWith(bom, n, BOM_UTF_32BE)) {
      encoding = "UTF-32BE";
      bomLength = BOM_UTF_32BE.length;
    }
    else if (startsWith(bom, n, BOM_UTF_32LE)) {
      encoding = "UTF-32LE";
      bomLength = BOM_UTF_32LE.length;
    }
    else if (startsWith(bom, n, BOM_UTF_8)) {
      encoding = "UTF-8";
      bomLength = BOM_UTF_8.length;
    }
    else if (startsWith(bom, n, BOM_UTF_16BE)) {
      encoding = "UTF-16BE";
      bomLength = BOM_UTF_16BE.length;
    }
    else if (startsWith(bom, n, BOM_UTF_16LE)) {
      encoding = "UTF-16LE";
      bomLength = BOM_UTF_16LE.length;
    }
    else {
      // Unicode BOM mark not found, unread all bytes
      encoding = defaultEnc;
      bomLength = 0;
    }

    // Give back everything that was read ahead but is not part of the BOM.
    if (n > bomLength) {
      internalIn.unread(bom, bomLength, n - bomLength);
    }

    // Use BOM or default encoding
    if (encoding == null) {
      delegate = new InputStreamReader(internalIn);
    }
    else {
      delegate = new InputStreamReader(internalIn, encoding);
    }
  }

  /**
   * Fills <code>buf</code> from <code>in</code>, retrying after short reads,
   * until the buffer is full or end of stream is reached.
   *
   * @return the number of bytes stored in <code>buf</code> (0 for an empty stream)
   */
  private static int readAhead(InputStream in, byte[] buf) throws IOException {
    int total = 0;
    while (total < buf.length) {
      int count = in.read(buf, total, buf.length - total);
      if (count < 0) {
        break;
      }
      total += count;
    }
    return total;
  }

  /**
   * Tells whether the first <code>available</code> bytes of <code>buf</code>
   * begin with <code>mark</code>. Bytes beyond <code>available</code> were
   * never read from the stream and are not considered.
   */
  private static boolean startsWith(byte[] buf, int available, byte[] mark) {
    if (available < mark.length) {
      return false;
    }
    for (int i = 0; i < mark.length; i++) {
      if (buf[i] != mark[i]) {
        return false;
      }
    }
    return true;
  }

  /** Overridden to use delegate reader. */
  @Override
  public void close() throws IOException {
    delegate.close();
  }

  /** Overridden to use delegate reader. */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    return delegate.read(cbuf, off, len);
  }

}

Related examples in the same category

1.	Unicode sorting
2.	Unicode: Fonts and Text Rendering
3.	Unicode: TrueType Font Test
4.	Unicode: test layout
5.	Conversion between Unicode characters and Strings
6.	Convert among Unicode, ASCII and byte/int
7.	Unicode - show a page of Unicode characters
8.	Return the Unicode char which is coded in the bytes at the given position.
9.	Vis - make special characters visible
10.	Demonstrate creating readers and writers with a specific encoding
11.	TextArea with Unicode
12.	Unicode display
13.	Check if the current character is a 7-bit ASCII CHAR (between 0 and 127). <char> ::= <alpha> \| <digit> \| '-'
14.	Internationalized Graphical User Interfaces: unicode cut and paste
15.	Unicode to ascii
16.	Return the Unicode char which is coded in the bytes at position 0.
17.	Count the number of bytes needed to return a Unicode char. This can be from 1 to 6.
18.	Return the number of bytes that hold a Unicode char.
19.	An optimized reader for reading byte streams that only contain 7-bit ASCII characters.
20.	Stream Converter Unicode
21.	String Converter Unicode
22.	Checks if the String contains only unicode digits or space
23.	Checks if the String contains only unicode digits. A decimal point is not a unicode digit and returns false.
24.	Checks if the String contains only unicode letters and space (' ').
25.	Checks if the String contains only unicode letters or digits.
26.	Checks if the String contains only unicode letters, digits or space (' ').
27.	Checks if the String contains only unicode letters.
28.	Unicode Util
29.	Get UTF String Size
30.	ASCII 2 NATIVE