Read Windows Notepad Unicode files

  

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;

/**
 * This is an input stream that is unicode BOM aware. This allows you to e.g.
 * read Windows Notepad Unicode files as Velocity templates.
 * 
 * It allows you to check the actual encoding of a file by calling
 * {@link #getEncodingFromStream()} on the input stream reader.
 * 
 * This class is not thread safe! When more than one thread wants to use an
 * instance of UnicodeInputStream, the caller must provide synchronization.
 * 
 * @author <a href="mailto:mailmur@yahoo.com">Aki Nieminen</a>
 * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
 * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
 * @since 1.5
 */
public class UnicodeInputStream extends InputStream {

  /** BOM Marker for UTF 8. See http://www.unicode.org/unicode/faq/utf_bom.html */
  public static final UnicodeBOM UTF8_BOM = new UnicodeBOM("UTF-8", new byte[] { (byte) 0xef,
      (byte) 0xbb, (byte) 0xbf });

  /**
   * BOM Marker for UTF 16, little endian. See
   * http://www.unicode.org/unicode/faq/utf_bom.html
   */
  public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM("UTF-16LE", new byte[] { (byte) 0xff,
      (byte) 0xfe });

  /**
   * BOM Marker for UTF 16, big endian. See
   * http://www.unicode.org/unicode/faq/utf_bom.html
   */
  public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM("UTF-16BE", new byte[] { (byte) 0xfe,
      (byte) 0xff });

  /**
   * BOM Marker for UTF 32, little endian. See
   * http://www.unicode.org/unicode/faq/utf_bom.html
   * 
   * TODO: Does Java actually support this?
   */
  public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM("UTF-32LE", new byte[] { (byte) 0xff,
      (byte) 0xfe, (byte) 0x00, (byte) 0x00 });

  /**
   * BOM Marker for UTF 32, big endian. See
   * http://www.unicode.org/unicode/faq/utf_bom.html
   * 
   * TODO: Does Java actually support this?
   */
  public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM("UTF-32BE", new byte[] { (byte) 0x00,
      (byte) 0x00, (byte) 0xfe, (byte) 0xff });

  /** The maximum amount of bytes to read for a BOM */
  private static final int MAX_BOM_SIZE = 4;

  /** Buffer holding the bytes read from the stream while probing for a BOM. */
  private final byte[] buf = new byte[MAX_BOM_SIZE];

  /** Number of valid bytes currently held in {@link #buf}. */
  private int pos = 0;

  /** The stream encoding as read from the BOM or null. */
  private final String encoding;

  /** True if the BOM itself should be skipped and not read. */
  private final boolean skipBOM;

  /**
   * The wrapped stream. A pushback stream is used so that the bytes consumed
   * while probing for a BOM can be handed back to the reader.
   */
  private final PushbackInputStream inputStream;

  /**
   * Creates a new UnicodeInputStream object. Skips a BOM which defines the file
   * encoding.
   * 
   * @param inputStream
   *          The input stream to use for reading.
   * @throws IllegalStateException
   *           When the BOM could not be read from the stream. The underlying
   *           {@link IOException} is available as the cause.
   * @throws IOException
   *           Declared for compatibility with existing callers; this
   *           implementation reports read failures as
   *           {@link IllegalStateException}.
   */
  public UnicodeInputStream(final InputStream inputStream) throws IllegalStateException,
      IOException {
    this(inputStream, true);
  }

  /**
   * Creates a new UnicodeInputStream object.
   * 
   * @param inputStream
   *          The input stream to use for reading.
   * @param skipBOM
   *          If this is set to true, a BOM read from the stream is discarded.
   *          This parameter should normally be true.
   * @throws IllegalStateException
   *           When the BOM could not be read from the stream. The underlying
   *           {@link IOException} is available as the cause.
   * @throws IOException
   *           Declared for compatibility with existing callers; this
   *           implementation reports read failures as
   *           {@link IllegalStateException}.
   */
  public UnicodeInputStream(final InputStream inputStream, boolean skipBOM)
      throws IllegalStateException, IOException {
    super();

    this.skipBOM = skipBOM;
    this.inputStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE);

    try {
      this.encoding = readEncoding();
    } catch (IOException ioe) {
      IllegalStateException ex = new IllegalStateException("Could not read BOM from Stream");
      // Keep the original I/O failure attached so callers can diagnose it.
      ex.initCause(ioe);
      throw ex;
    }
  }

  /**
   * Returns true if the input stream discards the BOM.
   * 
   * @return True if the input stream discards the BOM.
   */
  public boolean isSkipBOM() {
    return skipBOM;
  }

  /**
   * Returns the encoding that was detected from the BOM when this stream was
   * constructed.
   * 
   * @return The encoding of this stream's contents as decided by the BOM or
   *         null if no BOM was found.
   */
  public String getEncodingFromStream() {
    return encoding;
  }

  /**
   * Reads the leading bytes of the stream and determines the encoding from a
   * BOM, if one is present. All bytes that are not part of a skipped BOM are
   * pushed back onto the stream afterwards.
   * 
   * @return The encoding based on the BOM or null if no BOM was found.
   * @throws IOException
   *           When a problem reading the BOM occurred.
   */
  protected String readEncoding() throws IOException {
    pos = 0;

    UnicodeBOM encoding = null;

    // read first byte.
    if (readByte()) {
      // Build a list of matches
      //
      // 00 00 FE FF --> UTF 32 BE
      // EF BB BF --> UTF 8
      // FE FF --> UTF 16 BE
      // FF FE --> UTF 16 LE
      // FF FE 00 00 --> UTF 32 LE

      switch (buf[0]) {
      case (byte) 0x00: // UTF32 BE
        encoding = match(UTF32BE_BOM, null);
        break;
      case (byte) 0xef: // UTF8
        encoding = match(UTF8_BOM, null);
        break;
      case (byte) 0xfe: // UTF16 BE
        encoding = match(UTF16BE_BOM, null);
        break;
      case (byte) 0xff: // UTF16/32 LE
        encoding = match(UTF16LE_BOM, null);

        if (encoding != null) {
          // The UTF-32 LE BOM starts with the UTF-16 LE BOM; try the longer
          // marker and fall back to UTF-16 LE when it does not match.
          encoding = match(UTF32LE_BOM, encoding);
        }
        break;

      default:
        encoding = null;
        break;
      }
    }

    pushback(encoding);

    return (encoding != null) ? encoding.getEncoding() : null;
  }

  /**
   * Compares the bytes of the given BOM against the start of the stream,
   * reading further bytes into the buffer as needed.
   * 
   * @param matchEncoding
   *          The BOM to test for.
   * @param noMatchEncoding
   *          The value to return when the stream does not start with the BOM
   *          or ends before the BOM is complete.
   * @return matchEncoding if every BOM byte matched, noMatchEncoding
   *         otherwise.
   */
  private UnicodeBOM match(final UnicodeBOM matchEncoding, final UnicodeBOM noMatchEncoding)
      throws IOException {
    byte[] bom = matchEncoding.getBytes();

    for (int i = 0; i < bom.length; i++) {
      if (pos <= i) // Byte has not yet been read
      {
        if (!readByte()) {
          return noMatchEncoding;
        }
      }

      if (bom[i] != buf[i]) {
        return noMatchEncoding;
      }
    }

    return matchEncoding;
  }

  /**
   * Reads one byte from the stream and appends it to the BOM buffer.
   * 
   * @return true if a byte was stored, false if the end of the stream was
   *         reached.
   * @throws IOException
   *           When the buffer is already full or the underlying read fails.
   */
  private boolean readByte() throws IOException {
    // Check capacity before touching the stream so that a failure here never
    // consumes a byte that could not be pushed back.
    if (pos >= buf.length) {
      throw new IOException("BOM read error");
    }

    int res = inputStream.read();
    if (res == -1) {
      return false;
    }

    buf[pos++] = (byte) res;
    return true;
  }

  /**
   * Returns the buffered bytes to the stream. When a BOM was matched and
   * skipping is enabled, the BOM bytes themselves are left out.
   * 
   * @param matchBOM
   *          The BOM that was matched or null if none was found.
   */
  private void pushback(final UnicodeBOM matchBOM) throws IOException {
    int count = pos; // By default, all bytes are pushed back.
    int start = 0;

    if (matchBOM != null && skipBOM) {
      // We have a match (some bytes are part of the BOM)
      // and we want to skip the BOM. Push back only the bytes
      // after the BOM.
      start = matchBOM.getBytes().length;
      count = (pos - start);

      if (count < 0) {
        throw new IllegalStateException("Match has more bytes than available!");
      }
    }

    inputStream.unread(buf, start, count);
  }

  /**
   * @see java.io.InputStream#close()
   */
  public void close() throws IOException {
    inputStream.close();
  }

  /**
   * @see java.io.InputStream#available()
   */
  public int available() throws IOException {
    return inputStream.available();
  }

  /**
   * Delegates to the wrapped pushback stream.
   * 
   * @see java.io.InputStream#mark(int)
   */
  public void mark(final int readlimit) {
    inputStream.mark(readlimit);
  }

  /**
   * Delegates to the wrapped pushback stream.
   * 
   * @see java.io.InputStream#markSupported()
   */
  public boolean markSupported() {
    return inputStream.markSupported();
  }

  /**
   * @see java.io.InputStream#read()
   */
  public int read() throws IOException {
    return inputStream.read();
  }

  /**
   * @see java.io.InputStream#read(byte[])
   */
  public int read(final byte[] b) throws IOException {
    return inputStream.read(b);
  }

  /**
   * @see java.io.InputStream#read(byte[], int, int)
   */
  public int read(final byte[] b, final int off, final int len) throws IOException {
    return inputStream.read(b, off, len);
  }

  /**
   * Delegates to the wrapped pushback stream.
   * 
   * @see java.io.InputStream#reset()
   */
  public void reset() throws IOException {
    inputStream.reset();
  }

  /**
   * @see java.io.InputStream#skip(long)
   */
  public long skip(final long n) throws IOException {
    return inputStream.skip(n);
  }

  /**
   * Helper class to bundle encoding and BOM marker.
   * 
   * @author <a href="mailto:henning@apache.org">Henning P. Schmiedehausen</a>
   * @version $Id: UnicodeInputStream.java 685685 2008-08-13 21:43:27Z nbubna $
   */
  static final class UnicodeBOM {
    /** Name of the encoding announced by this BOM. */
    private final String encoding;

    /** The byte sequence that makes up this BOM. */
    private final byte[] bytes;

    private UnicodeBOM(final String encoding, final byte[] bytes) {
      this.encoding = encoding;
      this.bytes = bytes;
    }

    /**
     * @return The name of the encoding announced by this BOM.
     */
    String getEncoding() {
      return encoding;
    }

    /**
     * @return The BOM byte sequence. This is the internal array, not a copy;
     *         callers must not modify it.
     */
    byte[] getBytes() {
      return bytes;
    }
  }
}
1.	Convert file in SJIS to UTF8
2.	Return a UTF-8 encoded String
3.	Return a UTF-8 encoded String by length
4.	UTF8 String utilities
5.	Return UTF-8 encoded byte[] representation of a String
6.	Encodes octets (using utf-8) into Hex data
7.	Decodes values of attributes in the DN encoded in hex into a UTF-8 String.
8.	converting between byte arrays and hex encoded strings
9.	Convert bytes To Hex
10.	Convert hex To Bytes
11.	Unicode 2 ASCII
12.	Make bytes
13.	String converter
14.	Show unicode string
15.	Normalizer
16.	Convert from UTF-8 to Unicode
17.	Convert from Unicode to UTF-8
18.	Utility methods for handling UTF-8 encoded byte streams.
19.	UTF Util
20.	To UTF8 InputStream
21.	Returns {@code true} if the specified character sequence is a valid sequence of UTF-16 char values.
22.	URL UTF8 Encoder
Read Windows Notepad Unicode files : UTF8 Byte Hex « Development Class « Java

Related examples in the same category