Get file encoding
/*
* Copyright (c) 2006, Chuck Mortimore - xmldap.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the names xmldap, xmldap.org, xmldap.com nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
*
* Functions to read XML files considering byte order marks
*
*/
//package org.xmldap.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
 * Helpers for reading XML content while taking a leading byte order mark
 * (BOM) into account.
 *
 * Only three BOMs are recognised: UTF-16 big-endian (FE FF), UTF-16
 * little-endian (FF FE) and UTF-8 (EF BB BF). The UTF-16 check is done on the
 * first two bytes only, so no attempt is made to tell UTF-32 apart.
 */
public class XmlFileUtil {

    private static final String ENC_UTF_8 = "UTF-8";
    private static final String ENC_UTF_16BE = "UTF-16BE";
    private static final String ENC_UTF_16LE = "UTF-16LE";

    /** Size, in chars, of the chunk buffer used by {@link #doRead(InputStream)}. */
    private static final int READ_BUFFER_CHARS = 2048;

    /**
     * Returns the number of bytes occupied by a BOM at the start of the array.
     *
     * @param b the leading bytes of the content; may be null or shorter than a BOM
     * @return 2 for a UTF-16 BOM, 3 for a UTF-8 BOM, 0 if no BOM is present
     */
    public static int getBomLength(byte[] b) {
        final String encoding = getEncoding(b);
        if (encoding == null) {
            return 0;
        }
        return ENC_UTF_8.equals(encoding) ? 3 : 2;
    }

    /**
     * Returns the encoding announced by a BOM at the start of the array.
     *
     * An array that consists of nothing but the BOM (exactly two bytes for
     * UTF-16, exactly three for UTF-8) is recognised as well.
     *
     * @param b the leading bytes of the content; may be null or shorter than a BOM
     * @return "UTF-16BE", "UTF-16LE" or "UTF-8", or null if no BOM is present
     */
    public static String getEncoding(byte[] b) {
        if (b == null || b.length < 2) {
            return null;
        }
        final int b0 = b[0] & 0xFF;
        final int b1 = b[1] & 0xFF;
        final String utf16 = utf16Encoding(b0, b1);
        if (utf16 != null) {
            return utf16;
        }
        if (b.length >= 3 && isUtf8Bom(b0, b1, b[2] & 0xFF)) {
            return ENC_UTF_8;
        }
        return null;
    }

    /**
     * Removes the byte order mark from the stream, if it exists, and returns
     * the encoding name.
     *
     * Adapted code from org/apache/xerces/xinclude/XIncludeTextReader.java
     *
     * When a BOM is found the stream is left positioned on the first byte
     * after it. When no BOM is found, null is returned and the stream is
     * rewound to where it started:
     * <ul>
     * <li>if three non-BOM bytes were read, {@code reset()} is called
     * unconditionally, so a stream without mark/reset support fails with the
     * IOException thrown by its {@code reset()};</li>
     * <li>if the stream ended before three bytes could be read, it is rewound
     * only when {@code markSupported()} is true; otherwise the bytes that were
     * read stay consumed.</li>
     * </ul>
     *
     * @param stream the stream to inspect; wrap it in a BufferedInputStream if
     *               it does not support mark/reset itself
     * @return "UTF-16BE", "UTF-16LE" or "UTF-8", or null if no BOM is present
     * @throws IOException if reading fails, or if the stream cannot be reset
     *                     after three non-BOM bytes were read
     */
    public static String getEncoding(InputStream stream) throws IOException {
        stream.mark(4);
        final byte[] head = new byte[3];

        int count = readUpTo(stream, head, 0, 2);
        if (count == 2) {
            final int b0 = head[0] & 0xFF;
            final int b1 = head[1] & 0xFF;
            final String utf16 = utf16Encoding(b0, b1);
            if (utf16 != null) {
                return utf16;
            }
            count += readUpTo(stream, head, 2, 1);
            if (count == 3) {
                if (isUtf8Bom(b0, b1, head[2] & 0xFF)) {
                    return ENC_UTF_8;
                }
                // First three bytes are not a BOM, so rewind.
                stream.reset();
                return null;
            }
        }

        // The stream ended before a BOM could be ruled in or out. Give the
        // bytes back when possible, but do not fail on streams that cannot
        // rewind: this path has always returned null without throwing.
        if (stream.markSupported()) {
            stream.reset();
        }
        return null;
    }

    /**
     * Reads the whole stream into a string and closes the stream.
     *
     * The bytes are decoded with the platform default charset, because the
     * reader is created without an explicit encoding. A BOM is neither
     * detected nor removed here.
     *
     * @param in the stream to read; it is closed before this method returns
     * @return everything that could be read from the stream, possibly empty
     * @throws IOException if reading fails
     */
    public static String doRead(InputStream in) throws IOException {
        final BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        final StringBuilder text = new StringBuilder();
        try {
            // A fixed-size chunk is used on purpose: available() is only an
            // estimate and is no basis for sizing a buffer.
            final char[] chunk = new char[READ_BUFFER_CHARS];
            int n;
            while ((n = reader.read(chunk, 0, chunk.length)) != -1) {
                text.append(chunk, 0, n);
            }
        } finally {
            try {
                // Closing the reader closes the wrapped stream as well.
                reader.close();
            } catch (IOException ignored) {
                // Best effort: a failure to close must not mask the result or
                // an exception already thrown while reading.
            }
        }
        return text.toString();
    }

    /** Maps the first two bytes to a UTF-16 encoding name, or null if they are no UTF-16 BOM. */
    private static String utf16Encoding(int b0, int b1) {
        if (b0 == 0xFE && b1 == 0xFF) {
            return ENC_UTF_16BE;
        }
        if (b0 == 0xFF && b1 == 0xFE) {
            return ENC_UTF_16LE;
        }
        return null;
    }

    /** Tells whether the three bytes form the UTF-8 BOM (EF BB BF). */
    private static boolean isUtf8Bom(int b0, int b1, int b2) {
        return b0 == 0xEF && b1 == 0xBB && b2 == 0xBF;
    }

    /**
     * Reads until len bytes were stored or the stream ended, so that a short
     * read is not mistaken for the end of the stream.
     *
     * @return the number of bytes actually stored, between 0 and len
     */
    private static int readUpTo(InputStream stream, byte[] buf, int off, int len) throws IOException {
        int total = 0;
        while (total < len) {
            final int n = stream.read(buf, off + total, len - total);
            if (n <= 0) {
                break;
            }
            total += n;
        }
        return total;
    }
}
Related examples in the same category
1. | Convert Encoding | | |
2. | Utility class for working with character sets | | |
3. | Utility methods for ASCII character checking. | | |
4. | Reader for UCS-2 and UCS-4 encodings. (i.e., encodings from ISO-10646-UCS-(2|4)). | | |
5. | Conversions between IANA encoding names and Java encoding names, and vice versa. | | |
6. | ASCII character handling functions | | |
7. | This class represents an encoding. | | |
8. | Provides information about encodings. | | |
9. | Codec for the Quoted-Printable section of http://www.ietf.org/rfc/rfc1521.txt (RFC 1521) | | |
10. | ISO 8859-8, ASCII plus Hebrew | | |
11. | TIS-620 does not have the non-breaking space or the C1 controls. | | |
12. | ISO-8859-1; a.k.a. Latin-1 | | |
13. | ISO 8859-2, a.k.a. Latin-2 | | |
14. | ISO 8859-3 | | |
15. | ISO 8859-4, Latin plus the characters needed for Greenlandic, Icelandic, and Lappish. | | |
16. | ISO 8859-9 for Turkish. | | |
17. | ISO-8859-10, for Lithuanian, Estonian, Greenlandic, Icelandic, Inuit, Lappish, and other Northern European languages. | | |
18. | ISO-8859-13, for Latvian and other Baltic languages. | | |
19. | ISO-8859-14, for Gaelic, Welsh, and other Celtic languages. | | |
20. | ISO 8859-15 for Western Europe. Includes the Euro sign and several uncommon French letters | | |
21. | ISO 8859-16, Romanian | | |
22. | ASCII Writer | | |
23. | UCS Writer | | |
24. | Unicode Writer | | |
25. | Whether a character is or is not available in a particular encoding | | |
26. | ISO 8859-6, ASCII plus Arabic | | |
27. | ISO 8859-5, ASCII plus Cyrillic (Russian, Byelorussian, etc.) | | |
28. | ISO 8859-7, ASCII plus Greek | | |
29. | IANA to Java Mapping | | |
30. | Java to IANA Mapping | | |
31. | EncodingMap is a convenience class which handles conversions between IANA encoding names and Java encoding names, and vice versa. | | |