Get file encoding : Encoding « I18N « Java






Get file encoding

     
/*
 * Copyright (c) 2006, Chuck Mortimore - xmldap.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the names xmldap, xmldap.org, xmldap.com nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


/*
 * 
 * Functions to read XML files considering byte order marks
 * 
 */
//package org.xmldap.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/**
 * Helpers for reading XML data while taking a leading byte order mark (BOM)
 * into account.
 *
 * Only the UTF-8 ({@code EF BB BF}), UTF-16 big-endian ({@code FE FF}) and
 * UTF-16 little-endian ({@code FF FE}) byte order marks are recognised.
 */
public class XmlFileUtil {

  private static final String UTF_8 = "UTF-8";
  private static final String UTF_16BE = "UTF-16BE";
  private static final String UTF_16LE = "UTF-16LE";

  /** Size of the character buffer used by {@link #doRead(InputStream)}. */
  private static final int READ_BUFFER_CHARS = 2048;

  /**
   * Returns the length in bytes of the byte order mark at the start of the
   * given data.
   *
   * @param b the raw bytes to inspect; may be null or shorter than a BOM
   * @return 2 for a UTF-16 BOM, 3 for a UTF-8 BOM, 0 if there is no
   *         recognised BOM
   */
  public static int getBomLength(byte[] b) {
    final String encoding = getEncoding(b);
    if (encoding == null) {
      return 0;
    }
    return UTF_8.equals(encoding) ? 3 : 2;
  }

  /**
   * Determines the encoding announced by the byte order mark at the start of
   * the given data.
   *
   * @param b the raw bytes to inspect; may be null or shorter than a BOM
   * @return "UTF-16BE", "UTF-16LE" or "UTF-8", or null if the data does not
   *         start with one of these byte order marks
   */
  public static String getEncoding(byte[] b) {
    // Two bytes are enough for a UTF-16 BOM, so an array of exactly two
    // bytes must still be inspected.
    if (b == null || b.length < 2) {
      return null;
    }
    final int b0 = b[0] & 0xFF;
    final int b1 = b[1] & 0xFF;

    final String utf16 = detectUtf16(b0, b1);
    if (utf16 != null) {
      return utf16;
    }
    // Likewise an array of exactly three bytes can hold a UTF-8 BOM.
    if (b.length >= 3 && isUtf8Bom(b0, b1, b[2] & 0xFF)) {
      return UTF_8;
    }
    return null;
  }

  /**
   * Removes the byte order mark from the stream, if it exists, and returns
   * the encoding name.
   *
   * Adapted code from org/apache/xerces/xinclude/XIncludeTextReader.java
   *
   * If a BOM is found the stream is left positioned directly behind it. If
   * null is returned, up to three bytes were read that are not a BOM. When
   * the stream supports mark/reset it is reset to its original position
   * before returning; otherwise the bytes stay consumed and the caller has to
   * re-open the stream, because mark/reset is not implemented by every
   * stream.
   *
   * @param stream the stream to inspect
   * @return "UTF-16BE", "UTF-16LE" or "UTF-8", or null if the stream does not
   *         start with one of these byte order marks
   * @throws IOException if reading from or resetting the stream fails
   */
  public static String getEncoding(InputStream stream) throws IOException {
    final boolean canReset = stream.markSupported();
    if (canReset) {
      // At most three bytes are read before a reset.
      stream.mark(4);
    }

    final byte[] head = new byte[3];
    if (readUpTo(stream, head, 0, 2) == 2) {
      final int b0 = head[0] & 0xFF;
      final int b1 = head[1] & 0xFF;

      final String utf16 = detectUtf16(b0, b1);
      if (utf16 != null) {
        return utf16;
      }
      // The third byte is only consumed when the first two are not a
      // UTF-16 BOM, so that a UTF-16 stream stays positioned behind its BOM.
      if (readUpTo(stream, head, 2, 1) == 1
          && isUtf8Bom(b0, b1, head[2] & 0xFF)) {
        return UTF_8;
      }
    }

    // No BOM: give the consumed bytes back on every path where that is
    // possible, not only after a full three-byte read.
    if (canReset) {
      stream.reset();
    }
    return null;
  }

  /**
   * Reads the whole stream into a string and closes the stream.
   *
   * The bytes are decoded with the platform default charset; no byte order
   * mark handling is done here.
   *
   * @param in the stream to read; it is closed before this method returns
   * @return the decoded content of the stream
   * @throws IOException if reading from the stream fails
   */
  public static String doRead(InputStream in) throws IOException {
    final BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    final StringBuilder sb = new StringBuilder();
    try {
      final char[] charBuf = new char[READ_BUFFER_CHARS];
      int count;
      while ((count = reader.read(charBuf, 0, charBuf.length)) != -1) {
        sb.append(charBuf, 0, count);
      }
    } finally {
      try {
        // Closing the reader also closes the wrapped input stream.
        reader.close();
      } catch (IOException ignored) {
        // Best effort: a failure to close must not mask the read result
        // or an exception thrown while reading.
      }
    }
    return sb.toString();
  }

  /** Returns the UTF-16 encoding name for a two-byte BOM, or null if none. */
  private static String detectUtf16(int b0, int b1) {
    if (b0 == 0xFE && b1 == 0xFF) {
      return UTF_16BE;
    }
    if (b0 == 0xFF && b1 == 0xFE) {
      return UTF_16LE;
    }
    return null;
  }

  /** Tells whether the three bytes form the UTF-8 byte order mark. */
  private static boolean isUtf8Bom(int b0, int b1, int b2) {
    return b0 == 0xEF && b1 == 0xBB && b2 == 0xBF;
  }

  /**
   * Reads until len bytes are stored or the stream ends, because a single
   * read call may return fewer bytes than requested.
   */
  private static int readUpTo(InputStream stream, byte[] buf, int off, int len)
      throws IOException {
    int total = 0;
    while (total < len) {
      final int n = stream.read(buf, off + total, len - total);
      if (n <= 0) {
        break;
      }
      total += n;
    }
    return total;
  }

}

   
    
    
    
    
  








Related examples in the same category

1.Convert Encoding
2.Utility class for working with character sets
3.Utility methods for ASCII character checking.
4.Reader for UCS-2 and UCS-4 encodings. (i.e., encodings from ISO-10646-UCS-(2|4)).
5.Conversions between IANA encoding names and Java encoding names, and vice versa.
6.ASCII character handling functions
7.This class represents an encoding.
8.Provides information about encodings.
9.Codec for the Quoted-Printable section of http://www.ietf.org/rfc/rfc1521.txt (RFC 1521)
10.ISO 8859-8, ASCII plus Hebrew
11.TIS-620 does not have the non-breaking space or the C1 controls.
12.ISO-8859-1; a.k.a. Latin-1
13.ISO 8859-2, a.k.a. Latin-2
14.ISO 8859-3
15.ISO 8859-4, Latin plus the characters needed for Greenlandic, Icelandic, and Lappish.
16.ISO 8859-9 for Turkish.
17.ISO-8859-10, for Lithuanian, Estonian, Greenlandic, Icelandic, Inuit, Lappish, and other Northern European languages.
18.ISO-8859-13, for Latvian and other Baltic languages.
19.ISO-8859-14, for Gaelic, Welsh, and other Celtic languages.
20.ISO 8859-9 for Western Europe. Includes the Euro sign and several uncommon French letters
21.ISO 8859-16, Romanian
22.ASCII Writer
23.UCS Writer
24.Unicode Writer
25.Whether a character is or is not available in a particular encoding
26.ISO 8859-6, ASCII plus Arabic
27.ISO 8859-5, ASCII plus Cyrillic (Russian, Byelorussian, etc.)
28.ISO 8859-7, ASCII plus Greek
29.IANA to Java Mapping
30.Java to IANA Mapping
31.EncodingMap is a convenience class which handles conversions between IANA encoding names and Java encoding names, and vice versa.