Java examples for Internationalization:Charset
Get the Charset for a byte array
import java.text.*; import java.util.zip.*; import java.io.*; import java.nio.*; import java.nio.charset.*; public class Main{ public static CharsetInfo getCharsetFor(byte[] byteArray) { //copy the first 100 bytes to a new testing array int amt = byteArray.length; if (amt > 100) { amt = 100;//from w w w . j a v a2s .c o m } byte[] testingArray = new byte[amt]; System.arraycopy(byteArray, 0, testingArray, 0, amt); String stringForBytesUTF8 = convertToString(testingArray, Charset.forName("UTF-8")); String stringForBytesUTF16BE = convertToString(testingArray, Charset.forName("UTF-16BE")); String stringForBytesUTF16LE = convertToString(testingArray, Charset.forName("UTF-16LE")); //if no length, give up if (byteArray.length < 1) { //we'll just assume UTF-8 and give up return new CharsetInfo(Charset.forName("UTF-8"), false, false); } //first, deal with files that have BOM if (testingArray.length >= 3) { if ((testingArray[0] == 0xEF - 256) && (testingArray[1] == 0xBB - 256) && (testingArray[2] == 0xBF - 256)) { boolean hasErrors = false; //UTF-8 MIGHT have BOM (Notepad adds it for instance), but it's not a good idea if (!encodingIsCorrect(byteArray, "UTF-8")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-8"), true, hasErrors); } else if ((testingArray[0] == 0xFE - 256) && (testingArray[1] == 0xFF - 256)) { boolean hasErrors = false; if (!encodingIsCorrect(byteArray, "UTF-16BE")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-16BE"), true, hasErrors); } else if ((testingArray[0] == 0xFF - 256) && (testingArray[1] == 0xFE - 256)) { boolean hasErrors = false; if (!encodingIsCorrect(byteArray, "UTF-16LE")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-16LE"), true, hasErrors); } } //now, deal with files with XML encoding specified String encodingTest = ""; if (stringForBytesUTF8.startsWith("<?xml ")) { encodingTest = stringForBytesUTF8; } else if (stringForBytesUTF16BE.startsWith("<?xml ")) { encodingTest = 
stringForBytesUTF16BE; } else if (stringForBytesUTF16LE.startsWith("<?xml ")) { encodingTest = stringForBytesUTF16LE; } if (!(encodingTest.equals(""))) { encodingTest = encodingTest.toLowerCase(); if (encodingTest.indexOf("encoding") > 0) { int startPos = encodingTest.indexOf("encoding"); startPos = encodingTest.indexOf("\"", startPos) + 1; int endPos = encodingTest.indexOf(("\""), startPos); if ((endPos > startPos) && (endPos > 0) && (startPos > 0)) { encodingTest = encodingTest.substring(startPos, endPos); } else { encodingTest = ""; } } else { encodingTest = ""; } } boolean encodingTypeErr = false; if (!encodingTest.equals("")) { try { Charset theCharset = Charset.forName(encodingTest); boolean hasErrors = false; if (!encodingIsCorrect(byteArray, theCharset.name())) { hasErrors = true; } return new CharsetInfo(theCharset, false, hasErrors); } catch (Exception err) { //not a valid encoding type; just keep going and mark it as error encodingTypeErr = true; } } //check for XHTML with no header (assumed UTF-8) if (stringForBytesUTF8.startsWith("<!DOCTYPE HTML")) { boolean hasErrors = encodingTypeErr; if (!encodingIsCorrect(byteArray, "UTF-8")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-8"), false, hasErrors); } //here we check for XMP declarations //look for '<x:xmpmeta ' or '<?xpacket ' in a variety of encodings if (stringForBytesUTF8.startsWith("<?xpacket ")) { boolean hasErrors = encodingTypeErr; if (!encodingIsCorrect(byteArray, "UTF-8")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-8"), false, hasErrors); } if (stringForBytesUTF8.startsWith("<x:xmpmeta ")) { boolean hasErrors = encodingTypeErr; if (!encodingIsCorrect(byteArray, "UTF-8")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-8"), false, hasErrors); } if (stringForBytesUTF16BE.startsWith("<?xpacket ")) { boolean hasErrors = encodingTypeErr; if (!encodingIsCorrect(byteArray, "UTF-16BE")) { hasErrors = true; } return new 
CharsetInfo(Charset.forName("UTF-16BE"), false, hasErrors); } if (stringForBytesUTF16BE.startsWith("<x:xmpmeta ")) { boolean hasErrors = encodingTypeErr; if (!encodingIsCorrect(byteArray, "UTF-16BE")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-16BE"), false, hasErrors); } if (stringForBytesUTF16LE.startsWith("<?xpacket ")) { boolean hasErrors = encodingTypeErr; if (!encodingIsCorrect(byteArray, "UTF-16LE")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-16LE"), false, hasErrors); } if (stringForBytesUTF16LE.startsWith("<x:xmpmeta ")) { boolean hasErrors = encodingTypeErr; if (!encodingIsCorrect(byteArray, "UTF-16LE")) { hasErrors = true; } return new CharsetInfo(Charset.forName("UTF-16LE"), false, hasErrors); } //if we reached here, file is just plain text, not XMP or XML, so just check standard ones; if any of them encode properly, use that //if neither, give up if (encodingIsCorrect(byteArray, "UTF-8")) { return new CharsetInfo(Charset.forName("UTF-8"), false, encodingTypeErr); } if (encodingIsCorrect(byteArray, "ISO-8859-1")) { return new CharsetInfo(Charset.forName("ISO-8859-1"), false, encodingTypeErr); } if (encodingIsCorrect(byteArray, "UTF-16LE")) { return new CharsetInfo(Charset.forName("UTF-16LE"), false, encodingTypeErr); } if (encodingIsCorrect(byteArray, "UTF-16BE")) { return new CharsetInfo(Charset.forName("UTF-16BE"), false, encodingTypeErr); } return new CharsetInfo(null, false, true); } public static String convertToString(byte[] bytes, Charset charset) { String ret = new String(bytes, charset); if ((bytes[0] == 0xEF - 256) && (bytes[1] == 0xBB - 256) && (bytes[2] == 0xBF - 256)) { ret = ret.substring(1); } else if ((bytes[0] == 0xFE - 256) && (bytes[1] == 0xFF - 256)) { ret = ret.substring(1); } else if ((bytes[0] == 0xFF - 256) && (bytes[1] == 0xFE - 256)) { ret = ret.substring(1); } return ret; } /** * Used to verify the encoding of a byte array. 
This method does not actually * convert to string; it merely checks to see if it is possible without errors. * @param bytes the bytes to convert to string. * @param charset the charset to convert to * @return true is the bytes can be encoded into this charset, and false otherwise */ public static boolean encodingIsCorrect(byte[] bytes, String charset) { try { CharsetDecoder decoder = Charset.forName(charset).newDecoder(); decoder.decode(ByteBuffer.wrap(bytes)); } catch (Exception e) { return false; } return true; } }