// Java examples for Internationalization: Charset
// Guess encoding
// from www.java2s.com
/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 * 
 * Copyright 2011 OpenConcerto, by ILM Informatique. All rights reserved.
 * 
 * The contents of this file are subject to the terms of the GNU General Public License Version 3
 * only ("GPL"). You may not use this file except in compliance with the License. You can obtain a
 * copy of the License at http://www.gnu.org/licenses/gpl-3.0.html See the License for the specific
 * language governing permissions and limitations under the License.
 * 
 * When distributing the software, include this License Header Notice in each file.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collection;

/**
 * Guesses the character encoding of a byte sample.
 * <p>
 * The guess is made in this order: a UTF-8 / UTF-16LE / UTF-16BE byte order mark wins; otherwise
 * the bytes are scanned for UTF-8 multi-byte sequences. A sample with no byte above 0x7F is
 * reported as US-ASCII (or the default charset when 8-bit enforcement is on); a sample whose
 * high bytes all form well-shaped UTF-8 sequences is reported as UTF-8; anything else is reported
 * as the default charset.
 * <p>
 * Lead bytes for sequences of two to six bytes are accepted, and only the shape of a sequence
 * (lead byte followed by continuation bytes) is checked, not the code point it encodes.
 */
public class Main {

    /** Number of leading bytes sampled from each file by {@link #main(String[])}. */
    private static final int DEFAULT_BUFFER_LENGTH = 4096;

    /** The bytes being analysed; the array is stored as-is, not copied. */
    private final byte[] buffer;

    /**
     * Whether {@link #buffer} may stop in the middle of a multi-byte sequence because it is only
     * the beginning of a longer input.
     */
    private final boolean mayBeTruncated;

    /** Charset returned when the bytes are neither pure ASCII nor well-shaped UTF-8. */
    private Charset defaultCharset;

    /** When true, pure 7-bit input is reported as the default charset instead of US-ASCII. */
    private boolean enforce8Bit;

    /**
     * Prints the guessed encoding of every file named on the command line.
     *
     * @param argv paths of the files to inspect
     * @throws Exception if a file cannot be opened or read
     */
    public static void main(String[] argv) throws Exception {
        if (argv.length == 0) {
            System.err.println("usage: java Main <file>...");
            return;
        }
        for (String path : argv) {
            System.out.println(path + ": " + guessEncoding(new File(path), DEFAULT_BUFFER_LENGTH));
        }
    }

    /** Creates a helper over an empty sample. */
    public Main() {
        this(new byte[0]);
    }

    /**
     * Creates a helper over the given sample, using the system charset as the default charset.
     * The sample is treated as a possible prefix of a longer input, so a multi-byte sequence cut
     * off by the end of the array does not count against UTF-8.
     *
     * @param buffer the bytes to analyse, must not be null
     * @throws NullPointerException if {@code buffer} is null
     */
    public Main(byte[] buffer) {
        this(buffer, true);
    }

    private Main(byte[] buffer, boolean mayBeTruncated) {
        if (buffer == null) {
            throw new NullPointerException("buffer");
        }
        this.buffer = buffer;
        this.mayBeTruncated = mayBeTruncated;
        this.defaultCharset = getDefaultSystemCharset();
    }

    /**
     * Guesses the encoding of the sample held by this instance.
     *
     * @return the UTF-8 or UTF-16 charset matching a leading byte order mark; otherwise US-ASCII
     *         (or the default charset if 8-bit enforcement is on) when no byte has its high bit
     *         set; otherwise UTF-8 when every high byte belongs to a well-shaped sequence;
     *         otherwise the default charset
     */
    public Charset guessEncoding() {
        if (hasUTF8Bom(buffer)) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom(buffer)) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom(buffer)) {
            return Charset.forName("UTF-16BE");
        }

        boolean highOrderBit = false;
        boolean validU8Char = true;
        final int length = buffer.length;
        int i = 0;
        while (validU8Char && i < length) {
            final byte lead = buffer[i];
            if (lead >= 0) {
                // 7-bit byte, valid in every candidate encoding
                i++;
                continue;
            }
            highOrderBit = true;
            final int sequenceLength = sequenceLength(lead);
            final int remaining = length - i;
            if (sequenceLength == 0 || (sequenceLength > remaining && !mayBeTruncated)) {
                // Not a lead byte, or the sequence runs past the end of a complete input.
                validU8Char = false;
            } else {
                // Check the continuation bytes that are actually present; when the sample is
                // only a prefix, the missing ones are given the benefit of the doubt.
                final int limit = i + Math.min(sequenceLength, remaining);
                for (int j = i + 1; j < limit; j++) {
                    if (!isContinuationChar(buffer[j])) {
                        validU8Char = false;
                        break;
                    }
                }
                i = limit;
            }
        }

        if (!highOrderBit) {
            return enforce8Bit ? defaultCharset : Charset.forName("US-ASCII");
        }
        return validU8Char ? Charset.forName("UTF-8") : defaultCharset;
    }

    /**
     * Guesses the encoding of a file from its first {@code bufferLength} bytes, falling back to
     * the system charset.
     *
     * @param f the file to inspect
     * @param bufferLength maximum number of bytes to read from the start of the file
     * @return the guessed charset
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    public static Charset guessEncoding(File f, int bufferLength) throws FileNotFoundException, IOException {
        return guessEncoding(f, bufferLength, null);
    }

    /**
     * Guesses the encoding of a file from its first {@code bufferLength} bytes.
     *
     * @param f the file to inspect
     * @param bufferLength maximum number of bytes to read from the start of the file
     * @param defaultCharset charset to fall back to, or null for the system charset
     * @return the guessed charset
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    public static Charset guessEncoding(File f, int bufferLength, Charset defaultCharset)
            throws FileNotFoundException, IOException {
        byte[] sample = new byte[bufferLength];
        int filled = 0;
        boolean endOfFile = false;
        FileInputStream fis = new FileInputStream(f);
        try {
            // A single read() may return fewer bytes than requested, so loop until the sample
            // is full or the file ends.
            while (filled < bufferLength) {
                int count = fis.read(sample, filled, bufferLength - filled);
                if (count < 0) {
                    endOfFile = true;
                    break;
                }
                filled += count;
            }
            if (!endOfFile) {
                // Sample is full: peek one byte to learn whether the file continues.
                endOfFile = fis.read() < 0;
            }
        } finally {
            fis.close();
        }
        if (filled < bufferLength) {
            byte[] trimmed = new byte[filled];
            System.arraycopy(sample, 0, trimmed, 0, filled);
            sample = trimmed;
        }

        Main toolkit = new Main(sample, !endOfFile);
        if (defaultCharset != null) {
            toolkit.setDefaultCharset(defaultCharset);
        }
        return toolkit.guessEncoding();
    }

    /** Returns the UTF-8 sequence length announced by a lead byte, or 0 if it is not a lead byte. */
    private static int sequenceLength(byte lead) {
        if (isTwoBytesSequence(lead)) {
            return 2;
        }
        if (isThreeBytesSequence(lead)) {
            return 3;
        }
        if (isFourBytesSequence(lead)) {
            return 4;
        }
        if (isFiveBytesSequence(lead)) {
            return 5;
        }
        if (isSixBytesSequence(lead)) {
            return 6;
        }
        return 0;
    }

    /** True when the array starts with EF BB BF. */
    private static boolean hasUTF8Bom(byte bom[]) {
        return bom.length >= 3 && bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
    }

    /** True when the array starts with FF FE. */
    private static boolean hasUTF16LEBom(byte bom[]) {
        return bom.length >= 2 && bom[0] == -1 && bom[1] == -2;
    }

    /** True when the array starts with FE FF. */
    private static boolean hasUTF16BEBom(byte bom[]) {
        return bom.length >= 2 && bom[0] == -2 && bom[1] == -1;
    }

    /** True for bytes C0..DF. */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /** True for bytes 80..BF. */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /** True for bytes E0..EF. */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /** True for bytes F0..F7. */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /** True for bytes F8..FB. */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /** True for bytes FC..FD. */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Sets the charset returned when the sample is neither pure ASCII nor well-shaped UTF-8.
     *
     * @param defaultCharset the fallback charset, or null to use the system charset
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Chooses what is reported for a sample with no byte above 0x7F.
     *
     * @param enforce8Bit true to report the default charset, false to report US-ASCII
     */
    public void setEnforce8Bit(boolean enforce8Bit) {
        this.enforce8Bit = enforce8Bit;
    }

    /**
     * Returns the charset named by the {@code file.encoding} system property, or the JVM default
     * charset when that property is missing or does not name a supported charset.
     *
     * @return the system charset
     */
    public static Charset getDefaultSystemCharset() {
        String name = System.getProperty("file.encoding");
        if (name != null) {
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException ignored) {
                // illegal or unsupported charset name: fall back to the JVM default below
            }
        }
        return Charset.defaultCharset();
    }
}