Charset Toolkit : Charset « I18N « Java

Charset Toolkit
   
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collection;

/**
 * <p>
 * Utility class to guess the encoding of a given byte array. The guess is
 * unfortunately not 100% sure. Especially for 8-bit charsets. It's not possible
 * to know which 8-bit charset is used. Except through statistical analysis. We
 * will then infer that the charset encountered is the same as the default
 * standard charset.
 * </p>
 * <p>
 * On the other hand, unicode files encoded in UTF-16 (low or big endian) or
 * UTF-8 files with a Byte Order Marker are easy to find. For UTF-8 files with
 * no BOM, if the buffer is wide enough, it's easy to guess.
 * </p>
 * <p>
 * Tested against a complicated UTF-8 file, Sun's implementation does not render
 * bad UTF-8 constructs as expected by the specification. But with a buffer wide
 * enough, the method guessEncoding() did behave correctly and recognized the
 * UTF-8 charset.
 * </p>
 * <p>
 * A byte buffer of 4KB or 8KB is sufficient to be able to guess the encoding.
 * </p>
 * <p>
 * Usage:
 * </p>
 * 
 * <pre>
 * // guess the encoding
 * Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);
 * 
 * // create a reader with the charset we've just discovered
 * FileInputStream fis = new FileInputStream(file);
 * InputStreamReader isr = new InputStreamReader(fis, guessedCharset);
 * BufferedReader br = new BufferedReader(isr);
 * 
 * // read the file content
 * String line;
 * while ((line = br.readLine()) != null) {
 *   System.out.println(line);
 * }
 * </pre>
 * <p>
 * Date: 18 juil. 2002
 * </p>
 * 
 * @author Guillaume LAFORGE
 */
public class CharsetToolkit {
  private final byte[] buffer;
  private Charset defaultCharset;
  private boolean enforce8Bit = false;

  /**
   * Constructor of the <code>CharsetToolkit</code> utility class.
   * 
   * @param buffer
   *            the byte buffer of which we want to know the encoding.
   */
  public CharsetToolkit(final byte[] buffer) {
    this.buffer = buffer;
    this.defaultCharset = getDefaultSystemCharset();
  }

  /**
   * Constructor of the <code>CharsetToolkit</code> utility class.
   * 
   * @param buffer
   *            the byte buffer of which we want to know the encoding.
   * @param defaultCharset
   *            the default Charset to use in case an 8-bit charset is
   *            recognized.
   */
  public CharsetToolkit(final byte[] buffer, final Charset defaultCharset) {
    this.buffer = buffer;
    setDefaultCharset(defaultCharset);
  }

  /**
   * Defines the default <code>Charset</code> used in case the buffer
   * represents an 8-bit <code>Charset</code>.
   * 
   * @param defaultCharset
   *            the default <code>Charset</code> to be returned by
   *            <code>guessEncoding()</code> if an 8-bit <code>Charset</code>
   *            is encountered.
   */
  public void setDefaultCharset(final Charset defaultCharset) {
    if (defaultCharset != null)
      this.defaultCharset = defaultCharset;
    else
      this.defaultCharset = getDefaultSystemCharset();
  }

  /**
   * If US-ASCII is recognized, enforce to return the default encoding, rather
   * than US-ASCII. It might be a file without any special character in the
   * range 128-255, but that may be or become a file encoded with the default
   * <code>charset</code> rather than US-ASCII.
   * 
   * @param enforce
   *            a boolean specifying the use or not of US-ASCII.
   */
  public void setEnforce8Bit(final boolean enforce) {
    this.enforce8Bit = enforce;
  }

  /**
   * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII
   * encoding.
   * 
   * @return a boolean representing the flag of use of US-ASCII.
   */
  public boolean getEnforce8Bit() {
    return this.enforce8Bit;
  }

  /**
   * Retrieves the default Charset
   * 
   * @return
   */
  public Charset getDefaultCharset() {
    return defaultCharset;
  }

  /**
   * <p>
   * Guess the encoding of the provided buffer.
   * </p>
   * If Byte Order Markers are encountered at the beginning of the buffer, we
   * immidiately return the charset implied by this BOM. Otherwise, the file
   * would not be a human readable text file.</p>
   * <p>
   * If there is no BOM, this method tries to discern whether the file is
   * UTF-8 or not. If it is not UTF-8, we assume the encoding is the default
   * system encoding (of course, it might be any 8-bit charset, but usually,
   * an 8-bit charset is the default one).
   * </p>
   * <p>
   * It is possible to discern UTF-8 thanks to the pattern of characters with
   * a multi-byte sequence.
   * </p>
   * 
   * <pre>
   * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
   * 0000 0000-0000 007F       0xxxxxxx
   * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
   * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
   * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * </pre>
   * <p>
   * With UTF-8, 0xFE and 0xFF never appear.
   * </p>
   * 
   * @return the Charset recognized.
   */
  public Charset guessEncoding() {
    // if the file has a Byte Order Marker, we can assume the file is in
    // UTF-xx
    // otherwise, the file would not be human readable
    if (hasUTF8Bom(buffer))
      return Charset.forName("UTF-8");
    if (hasUTF16LEBom(buffer))
      return Charset.forName("UTF-16LE");
    if (hasUTF16BEBom(buffer))
      return Charset.forName("UTF-16BE");

    // if a byte has its most significant bit set, the file is in UTF-8 or
    // in the default encoding
    // otherwise, the file is in US-ASCII
    boolean highOrderBit = false;

    // if the file is in UTF-8, high order bytes must have a certain value,
    // in order to be valid
    // if it's not the case, we can assume the encoding is the default
    // encoding of the system
    boolean validU8Char = true;

    // TODO the buffer is not read up to the end, but up to length - 6

    final int length = buffer.length;
    int i = 0;
    while (i < length - 6) {
      final byte b0 = buffer[i];
      final byte b1 = buffer[i + 1];
      final byte b2 = buffer[i + 2];
      final byte b3 = buffer[i + 3];
      final byte b4 = buffer[i + 4];
      final byte b5 = buffer[i + 5];
      if (b0 < 0) {
        // a high order bit was encountered, thus the encoding is not
        // US-ASCII
        // it may be either an 8-bit encoding or UTF-8
        highOrderBit = true;
        // a two-bytes sequence was encoutered
        if (isTwoBytesSequence(b0)) {
          // there must be one continuation byte of the form 10xxxxxx,
          // otherwise the following characteris is not a valid UTF-8
          // construct
          if (!isContinuationChar(b1))
            validU8Char = false;
          else
            i++;
        }
        // a three-bytes sequence was encoutered
        else if (isThreeBytesSequence(b0)) {
          // there must be two continuation bytes of the form
          // 10xxxxxx,
          // otherwise the following characteris is not a valid UTF-8
          // construct
          if (!(isContinuationChar(b1) && isContinuationChar(b2)))
            validU8Char = false;
          else
            i += 2;
        }
        // a four-bytes sequence was encoutered
        else if (isFourBytesSequence(b0)) {
          // there must be three continuation bytes of the form
          // 10xxxxxx,
          // otherwise the following characteris is not a valid UTF-8
          // construct
          if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
            validU8Char = false;
          else
            i += 3;
        }
        // a five-bytes sequence was encoutered
        else if (isFiveBytesSequence(b0)) {
          // there must be four continuation bytes of the form
          // 10xxxxxx,
          // otherwise the following characteris is not a valid UTF-8
          // construct
          if (!(isContinuationChar(b1) && isContinuationChar(b2)
              && isContinuationChar(b3) && isContinuationChar(b4)))
            validU8Char = false;
          else
            i += 4;
        }
        // a six-bytes sequence was encoutered
        else if (isSixBytesSequence(b0)) {
          // there must be five continuation bytes of the form
          // 10xxxxxx,
          // otherwise the following characteris is not a valid UTF-8
          // construct
          if (!(isContinuationChar(b1) && isContinuationChar(b2)
              && isContinuationChar(b3) && isContinuationChar(b4) && isContinuationChar(b5)))
            validU8Char = false;
          else
            i += 5;
        } else
          validU8Char = false;
      }
      if (!validU8Char)
        break;
      i++;
    }
    // if no byte with an high order bit set, the encoding is US-ASCII
    // (it might have been UTF-7, but this encoding is usually internally
    // used only by mail systems)
    if (!highOrderBit) {
      // returns the default charset rather than US-ASCII if the
      // enforce8Bit flag is set.
      if (this.enforce8Bit)
        return this.defaultCharset;
      else
        return Charset.forName("US-ASCII");
    }
    // if no invalid UTF-8 were encountered, we can assume the encoding is
    // UTF-8,
    // otherwise the file would not be human readable
    if (validU8Char)
      return Charset.forName("UTF-8");
    // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is
    // the default encoding
    return this.defaultCharset;
  }

  public static Charset guessEncoding(final File f, final int bufferLength)
      throws FileNotFoundException, IOException {
    final FileInputStream fis = new FileInputStream(f);
    final byte[] buffer = new byte[bufferLength];
    fis.read(buffer);
    fis.close();
    final CharsetToolkit toolkit = new CharsetToolkit(buffer);
    toolkit.setDefaultCharset(getDefaultSystemCharset());
    return toolkit.guessEncoding();
  }

  public static Charset guessEncoding(final File f, final int bufferLength,
      final Charset defaultCharset) throws FileNotFoundException,
      IOException {
    final FileInputStream fis = new FileInputStream(f);
    final byte[] buffer = new byte[bufferLength];
    fis.read(buffer);
    fis.close();
    final CharsetToolkit toolkit = new CharsetToolkit(buffer);
    toolkit.setDefaultCharset(defaultCharset);
    return toolkit.guessEncoding();
  }

  /**
   * If the byte has the form 10xxxxx, then it's a continuation byte of a
   * multiple byte character;
   * 
   * @param b
   *            a byte.
   * @return true if it's a continuation char.
   */
  private static boolean isContinuationChar(final byte b) {
    return -128 <= b && b <= -65;
  }

  /**
   * If the byte has the form 110xxxx, then it's the first byte of a two-bytes
   * sequence character.
   * 
   * @param b
   *            a byte.
   * @return true if it's the first byte of a two-bytes sequence.
   */
  private static boolean isTwoBytesSequence(final byte b) {
    return -64 <= b && b <= -33;
  }

  /**
   * If the byte has the form 1110xxx, then it's the first byte of a
   * three-bytes sequence character.
   * 
   * @param b
   *            a byte.
   * @return true if it's the first byte of a three-bytes sequence.
   */
  private static boolean isThreeBytesSequence(final byte b) {
    return -32 <= b && b <= -17;
  }

  /**
   * If the byte has the form 11110xx, then it's the first byte of a
   * four-bytes sequence character.
   * 
   * @param b
   *            a byte.
   * @return true if it's the first byte of a four-bytes sequence.
   */
  private static boolean isFourBytesSequence(final byte b) {
    return -16 <= b && b <= -9;
  }

  /**
   * If the byte has the form 11110xx, then it's the first byte of a
   * five-bytes sequence character.
   * 
   * @param b
   *            a byte.
   * @return true if it's the first byte of a five-bytes sequence.
   */
  private static boolean isFiveBytesSequence(final byte b) {
    return -8 <= b && b <= -5;
  }

  /**
   * If the byte has the form 1110xxx, then it's the first byte of a six-bytes
   * sequence character.
   * 
   * @param b
   *            a byte.
   * @return true if it's the first byte of a six-bytes sequence.
   */
  private static boolean isSixBytesSequence(final byte b) {
    return -4 <= b && b <= -3;
  }

  /**
   * Retrieve the default charset of the system.
   * 
   * @return the default <code>Charset</code>.
   */
  public static Charset getDefaultSystemCharset() {
    return Charset.forName(System.getProperty("file.encoding"));
  }

  /**
   * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other
   * editors).
   * 
   * @param bom
   *            a buffer.
   * @return true if the buffer has a BOM for UTF8.
   */
  private static boolean hasUTF8Bom(final byte[] bom) {
    return (bom[0] == -17 && bom[1] == -69 && bom[2] == -65);
  }

  /**
   * Has a Byte Order Marker for UTF-16 Low Endian (ucs-2le, ucs-4le, and
   * ucs-16le).
   * 
   * @param bom
   *            a buffer.
   * @return true if the buffer has a BOM for UTF-16 Low Endian.
   */
  private static boolean hasUTF16LEBom(final byte[] bom) {
    return (bom[0] == -1 && bom[1] == -2);
  }

  /**
   * Has a Byte Order Marker for UTF-16 Big Endian (utf-16 and ucs-2).
   * 
   * @param bom
   *            a buffer.
   * @return true if the buffer has a BOM for UTF-16 Big Endian.
   */
  private static boolean hasUTF16BEBom(final byte[] bom) {
    return (bom[0] == -2 && bom[1] == -1);
  }

  /**
   * Retrieves all the available <code>Charset</code>s on the platform, among
   * which the default <code>charset</code>.
   * 
   * @return an array of <code>Charset</code>s.
   */
  public static Charset[] getAvailableCharsets() {
    final Collection collection = Charset.availableCharsets().values();
    return (Charset[]) collection.toArray(new Charset[collection.size()]);
  }
}
Related examples in the same category

1.	List the Charset in your system
2.	Converting Between Strings (Unicode) and Other Character Set Encodings
3.	Charset encoding test
4.	encoder and decoder use a supplied ByteBuffer
5.	Translate Charset
6.	Detect non-ASCII characters in string
7.	Displays Charsets and aliases
8.	Encode and Decode
9.	extends CharsetDecoder to create Base64 Decoder
10.	extends Charset to create Hex Charset
11.	Get the default charset
12.	Defines common charsets supported in all Java platforms.
13.	How to auto-detect a file's encoding