// XML Encoding Sniffer
/* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Revised from Apache XMLBeans.
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.Charset;
import com.sun.org.apache.xerces.internal.util.EncodingMap;
/**
 * Determines the character encoding of an XML document supplied either as a
 * byte stream or as a character reader, and hands the (wrapped) source back
 * to the caller as whichever of the two forms is requested.
 *
 * Only one source is held at a time; {@link #getStream()} and
 * {@link #getReader()} each hand the source over once and then forget it.
 */
public class XmlEncodingSniffer
{
    private String _xmlencoding;
    private String _javaencoding;
    private InputStream _stream;
    private Reader _reader;

    /**
     * Builds a sniffer over a byte stream.
     *
     * When an override is supplied it is translated with
     * EncodingMap.getJava2IANAMapping (falling back to the override itself
     * when no mapping exists) and the stream is left untouched. Without an
     * override the stream is wrapped in a SniffedXmlInputStream and the
     * encoding that wrapper detected is used.
     *
     * @param stream The stream to sniff
     * @param encodingOverride The name of the overriding encoding, or null to sniff
     * @throws IOException
     * @throws UnsupportedEncodingException
     */
    public XmlEncodingSniffer(InputStream stream, String encodingOverride)
        throws IOException, UnsupportedEncodingException
    {
        InputStream source = stream;
        String xmlName;

        if (encodingOverride != null)
        {
            String iana = EncodingMap.getJava2IANAMapping(encodingOverride);
            xmlName = (iana != null) ? iana : encodingOverride;
        }
        else
        {
            SniffedXmlInputStream sniffed = new SniffedXmlInputStream(stream);
            xmlName = sniffed.getXmlEncoding();
            assert(xmlName != null);
            source = sniffed;
        }

        // Java encoding names are tolerated in XML even though they are not
        // supposed to appear there, so an unmapped name is used as-is.
        String javaName = EncodingMap.getIANA2JavaMapping(xmlName);

        _stream = source;
        _xmlencoding = xmlName;
        _javaencoding = (javaName != null) ? javaName : xmlName;
    }

    /**
     * Builds a sniffer over a character reader.
     *
     * The reader is wrapped in a SniffedXmlReader. If that wrapper found an
     * encoding declaration, its value is used; otherwise the default is
     * translated with EncodingMap.getJava2IANAMapping (falling back to the
     * default itself when no mapping exists).
     *
     * @param reader The reader to sniff
     * @param encodingDefault The Java name for the default encoding to apply, UTF-8 if null.
     * @throws IOException
     * @throws UnsupportedEncodingException
     */
    public XmlEncodingSniffer(Reader reader, String encodingDefault)
        throws IOException, UnsupportedEncodingException
    {
        String fallback = (encodingDefault != null) ? encodingDefault : "UTF-8";

        SniffedXmlReader sniffed = new SniffedXmlReader(reader);
        _reader = sniffed;

        String xmlName = sniffed.getXmlEncoding();
        if (xmlName == null)
        {
            String iana = EncodingMap.getJava2IANAMapping(fallback);
            xmlName = (iana != null) ? iana : fallback;
        }

        // Java encoding names are tolerated in XML even though they are not
        // supposed to appear there, so an unmapped name is used as-is.
        String javaName = EncodingMap.getIANA2JavaMapping(xmlName);

        _xmlencoding = xmlName;
        _javaencoding = (javaName != null) ? javaName : xmlName;
    }

    /** Returns the encoding name determined for the XML document. */
    public String getXmlEncoding()
    {
        return _xmlencoding;
    }

    /** Returns the encoding name used when constructing Java readers/writers. */
    public String getJavaEncoding()
    {
        return _javaencoding;
    }

    /**
     * Hands over the source as a byte stream and releases it from this sniffer.
     *
     * A held stream is returned directly; a held reader is wrapped in a
     * ReaderInputStream using the Java encoding name.
     *
     * @return the stream, or null when the source has already been handed over
     * @throws UnsupportedEncodingException if the Java encoding name is not supported
     */
    public InputStream getStream()
        throws UnsupportedEncodingException
    {
        InputStream result = null;
        if (_stream != null)
        {
            result = _stream;
            _stream = null;
        }
        else if (_reader != null)
        {
            // The field is cleared only after the wrapper was built successfully.
            result = new ReaderInputStream(_reader, _javaencoding);
            _reader = null;
        }
        return result;
    }

    /**
     * Hands over the source as a character reader and releases it from this sniffer.
     *
     * A held reader is returned directly; a held stream is wrapped in an
     * InputStreamReader using the Java encoding name.
     *
     * @return the reader, or null when the source has already been handed over
     * @throws UnsupportedEncodingException if the Java encoding name is not supported
     */
    public Reader getReader()
        throws UnsupportedEncodingException
    {
        Reader result = null;
        if (_reader != null)
        {
            result = _reader;
            _reader = null;
        }
        else if (_stream != null)
        {
            // The field is cleared only after the wrapper was built successfully.
            result = new InputStreamReader(_stream, _javaencoding);
            _stream = null;
        }
        return result;
    }
}
/* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Presents a character Reader as a byte InputStream by encoding the
 * characters it yields with a named encoding.
 *
 * Characters are pulled from the reader one chunk at a time and pushed,
 * through an OutputStreamWriter, into the output side of the inherited
 * PushedInputStream.
 */
class ReaderInputStream extends PushedInputStream
{
    public static int defaultBufferSize = 2048;

    private Reader reader;
    private Writer writer;
    private char[] chunk;

    /**
     * Creates a stream with a character chunk of {@link #defaultBufferSize}.
     *
     * @param reader the source of characters
     * @param encoding the Java name of the encoding used to produce bytes
     * @throws UnsupportedEncodingException if the encoding is not supported
     */
    public ReaderInputStream(Reader reader, String encoding) throws UnsupportedEncodingException
    {
        this(reader, encoding, defaultBufferSize);
    }

    /**
     * Creates a stream with an explicit character chunk size.
     *
     * @param reader the source of characters
     * @param encoding the Java name of the encoding used to produce bytes
     * @param bufferSize number of characters read from the reader per attempt
     * @throws IllegalArgumentException if bufferSize is not positive
     * @throws UnsupportedEncodingException if the encoding is not supported
     */
    public ReaderInputStream(Reader reader, String encoding, int bufferSize) throws UnsupportedEncodingException
    {
        if (bufferSize <= 0)
            throw new IllegalArgumentException("Buffer size <= 0");

        this.reader = reader;
        this.writer = new OutputStreamWriter(getOutputStream(), encoding);
        this.chunk = new char[bufferSize];
    }

    /**
     * Reads characters and encodes them until at least one byte is available
     * or the reader is exhausted.
     *
     * @param requestedBytes ignored; a whole chunk is always attempted
     * @throws IOException if reading or encoding fails
     */
    public void fill(int requestedBytes) throws IOException
    {
        for (;;)
        {
            int count = reader.read(chunk);
            if (count < 0)
                return;

            writer.write(chunk, 0, count);
            writer.flush();

            // Keep going only if the encoder has not produced any bytes yet.
            if (available() > 0)
                return;
        }
    }
}
/* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * An InputStream whose content is supplied by writing to a linked
 * OutputStream. Whenever a read needs more data than is buffered, the
 * subclass's {@link #fill(int)} is called and is expected to write bytes to
 * {@link #getOutputStream()}.
 *
 * Buffer layout: bytes in [readpos, writepos) are unread; when a mark is
 * set (markpos >= 0) bytes from markpos onwards are retained so that
 * reset() can rewind to them. markpos == -1 means "no mark".
 */
abstract class PushedInputStream extends InputStream
{
    private static int defaultBufferSize = 2048;

    protected byte buf[];
    protected int writepos;
    protected int readpos;
    protected int markpos = -1;
    protected int marklimit;
    protected OutputStream outputStream = new InternalOutputStream();

    /**
     * Called when more bytes need to be written into this stream
     * (as an OutputStream).
     *
     * This method must write at least one byte if the stream is
     * not ended, and it must not write any bytes if the stream has
     * already ended.
     *
     * @param requestedBytes number of additional bytes the reader would like
     * @throws IOException if producing the bytes fails
     */
    protected abstract void fill(int requestedBytes) throws IOException;

    /**
     * Returns the linked output stream.
     *
     * This is the output stream that must be written to whenever
     * the fill method is called.
     */
    public final OutputStream getOutputStream()
    {
        return outputStream;
    }

    /** Creates a stream with the default initial buffer size. */
    public PushedInputStream()
    {
        this(defaultBufferSize);
    }

    /**
     * Creates a stream with the given initial buffer size.
     *
     * @param size initial buffer capacity in bytes
     * @throws IllegalArgumentException if size is negative
     */
    public PushedInputStream(int size)
    {
        if (size < 0)
        {
            throw new IllegalArgumentException("Negative initial buffer size");
        }
        buf = new byte[size];
    }

    /**
     * Makes room for cb more bytes of data, either by compacting the buffer
     * in place or by allocating a larger one. Bytes before the retained
     * position (the mark if one is still valid, otherwise readpos) are dropped.
     */
    private void shift(int cb)
    {
        int savepos = readpos;
        // A mark at offset 0 is a valid mark; only -1 means "no mark".
        // Testing "> 0" here would discard the marked bytes and leave
        // reset() rewinding onto unrelated data.
        if (markpos >= 0)
        {
            if (readpos - markpos > marklimit)
                markpos = -1;       // read limit exceeded: the mark is invalidated
            else
                savepos = markpos;  // keep everything from the mark onwards
        }

        int size = writepos - savepos;
        if (savepos > 0 && buf.length - size >= cb && size <= cb)
        {
            // Enough slack once the consumed prefix is dropped: compact in place.
            System.arraycopy(buf, savepos, buf, 0, size);
        }
        else
        {
            int newcount = size + cb;
            byte newbuf[] = new byte[Math.max(buf.length << 1, newcount)];
            System.arraycopy(buf, savepos, newbuf, 0, size);
            buf = newbuf;
        }

        if (savepos > 0)
        {
            // Everything moved down by savepos; rebase the positions.
            readpos -= savepos;
            if (markpos >= 0)
                markpos -= savepos;
            writepos -= savepos;
        }
    }

    /**
     * Reads one byte, asking for more data first if none is buffered.
     *
     * @return the byte as 0..255, or -1 if no more data was produced
     */
    public synchronized int read() throws IOException
    {
        if (readpos >= writepos)
        {
            fill(1);
            if (readpos >= writepos)
                return -1;
        }
        return buf[readpos++] & 0xff;
    }

    /**
     * Reads bytes into a portion of an array, calling fill at most once
     * if fewer than len bytes are buffered.
     *
     * @return the number of bytes copied, or -1 if nothing is available
     *         even after filling
     */
    public synchronized int read(byte[] b, int off, int len) throws IOException
    {
        int avail = writepos - readpos;
        if (avail < len)
        {
            fill(len - avail);
            avail = writepos - readpos;
            if (avail <= 0) return -1;
        }
        int cnt = (avail < len) ? avail : len;
        System.arraycopy(buf, readpos, b, off, cnt);
        readpos += cnt;
        return cnt;
    }

    /**
     * Skips up to n bytes, calling fill at most once if fewer are buffered.
     *
     * @return the number of bytes actually skipped
     */
    public synchronized long skip(long n) throws IOException
    {
        if (n <= 0)
            return 0;
        long avail = writepos - readpos;
        if (avail < n)
        {
            // Fill in buffer to save bytes for reset
            long req = n - avail;
            if (req > Integer.MAX_VALUE)
                req = Integer.MAX_VALUE;
            fill((int)req);
            avail = writepos - readpos;
            if (avail <= 0)
                return 0;
        }
        long skipped = (avail < n) ? avail : n;
        readpos += skipped;
        return skipped;
    }

    /** Returns the number of bytes currently buffered and unread. */
    public synchronized int available()
    {
        return writepos - readpos;
    }

    /** Remembers the current read position so reset() can return to it. */
    public synchronized void mark(int readlimit)
    {
        marklimit = readlimit;
        markpos = readpos;
    }

    /**
     * Rewinds to the last mark.
     *
     * @throws IOException if no mark is set or the mark has been invalidated
     */
    public synchronized void reset() throws IOException
    {
        if (markpos < 0)
            throw new IOException("Resetting to invalid mark");
        readpos = markpos;
    }

    public boolean markSupported()
    {
        return true;
    }

    /** The write side: appends bytes to the enclosing stream's buffer. */
    private class InternalOutputStream extends OutputStream
    {
        public synchronized void write(int b) throws IOException
        {
            if (writepos + 1 > buf.length)
            {
                shift(1);
            }
            buf[writepos] = (byte)b;
            writepos += 1;
        }

        public synchronized void write(byte b[], int off, int len)
        {
            if ((off < 0) || (off > b.length) || (len < 0) ||
                ((off + len) > b.length) || ((off + len) < 0))
                throw new IndexOutOfBoundsException();
            else if (len == 0)
                return;
            if (writepos + len > buf.length)
                shift(len);
            System.arraycopy(b, off, buf, writepos, len);
            writepos += len;
        }
    }
}
/**
 * A BufferedInputStream that, on construction, peeks at the start of the
 * wrapped stream to work out the XML character encoding, then rewinds so
 * the caller still reads the document from its first byte.
 *
 * Detection order: the first four bytes (byte order marks and well-known
 * encodings of "<?xm"), then the encoding pseudo-attribute of the XML
 * declaration, and finally a fallback of UTF-8.
 */
class SniffedXmlInputStream extends BufferedInputStream
{
    // We don't sniff more than 192 bytes.
    public static int MAX_SNIFFED_BYTES = 192;

    // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
    // with the common charsets.
    private static Charset dummy1 = Charset.forName("UTF-8");
    private static Charset dummy2 = Charset.forName("UTF-16");
    private static Charset dummy3 = Charset.forName("UTF-16BE");
    private static Charset dummy4 = Charset.forName("UTF-16LE");
    private static Charset dummy5 = Charset.forName("ISO-8859-1");
    private static Charset dummy6 = Charset.forName("US-ASCII");
    private static Charset dummy7 = Charset.forName("Cp1252");

    private static final char[] WHITESPACE = new char[] { ' ', '\r', '\t', '\n' };
    private static final char[] NOTNAME = new char[] { '=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"' };

    private String _encoding;

    /**
     * Wraps the stream and sniffs its encoding.
     *
     * @param stream the XML byte stream to wrap
     * @throws IOException if reading the leading bytes fails
     */
    public SniffedXmlInputStream(InputStream stream) throws IOException
    {
        super(stream);

        // read byte order marks and detect EBCDIC etc
        _encoding = sniffFourBytes();

        if (_encoding != null && _encoding.equals("IBM037"))
        {
            // First four bytes suggest EBCDIC with <?xm at start
            String encoding = sniffForXmlDecl(_encoding);
            if (encoding != null)
                _encoding = encoding;
        }

        if (_encoding == null)
        {
            // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
            // assuming we can read it as UTF-8.
            _encoding = sniffForXmlDecl("UTF-8");
        }

        if (_encoding == null)
        {
            // The XML spec says these two things:
            // (1) "In the absence of external character encoding information
            // (such as MIME headers), parsed entities which are stored in an
            // encoding other than UTF-8 or UTF-16 must begin with a text
            // declaration (see 4.3.1 The Text Declaration) containing an
            // encoding declaration:"
            // (2) "In the absence of information provided by an external
            // transport protocol (e.g. HTTP or MIME), it is an error
            // for an entity including an encoding declaration to be
            // presented to the XML processor in an encoding other than
            // that named in the declaration, or for an entity which begins
            // with neither a Byte Order Mark nor an encoding declaration
            // to use an encoding other than UTF-8."
            // Since we're using a sniffed stream, we do not have external
            // character encoding information.
            // Since we're here, we also don't have a recognized byte order
            // mark or an explicit encoding declaration that can be read in
            // either ASCII or EBCDIC style.
            // Therefore, we must use UTF-8.
            _encoding = "UTF-8";
        }
    }

    /** Returns the encoding name determined when the stream was constructed. */
    public String getXmlEncoding()
    {
        return _encoding;
    }

    /**
     * Reads until len bytes have been stored or the stream ends.
     *
     * @return the number of bytes actually read (possibly fewer than len)
     */
    private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException
    {
        int total = 0;
        while (total < len)
        {
            int count = read(buf, startAt + total, len - total);
            if (count < 0)
                break;
            total += count;
        }
        return total;
    }

    /**
     * Inspects the first four bytes for a byte order mark or a recognizable
     * encoding of the characters that start an XML declaration, then rewinds.
     *
     * @return an encoding name, or null if the bytes are inconclusive or
     *         fewer than four bytes are available
     */
    private String sniffFourBytes() throws IOException
    {
        mark(4);
        try
        {
            byte[] buf = new byte[4];
            if (readAsMuchAsPossible(buf, 0, 4) < 4)
                return null;

            // Pack the four bytes big-endian into one int for comparison.
            int result = ((buf[0] & 0xFF) << 24)
                       | ((buf[1] & 0xFF) << 16)
                       | ((buf[2] & 0xFF) << 8)
                       |  (buf[3] & 0xFF);

            if (result == 0x0000FEFF)
                return "UCS-4";
            else if (result == 0xFFFE0000)
                return "UCS-4";
            else if (result == 0x0000003C)
                return "UCS-4BE";
            else if (result == 0x3C000000)
                return "UCS-4LE";
            else if (result == 0x003C003F)
                return "UTF-16BE";
            else if (result == 0x3C003F00)
                return "UTF-16LE";
            else if (result == 0x3C3F786D)
                return null; // looks like US-ASCII with <?xml: sniff
            else if (result == 0x4C6FA794)
                return "IBM037"; // Sniff for EBCDIC codepage
            else if ((result & 0xFFFF0000) == 0xFEFF0000)
                return "UTF-16";
            else if ((result & 0xFFFF0000) == 0xFFFE0000)
                return "UTF-16";
            else if ((result & 0xFFFFFF00) == 0xEFBBBF00)
                return "UTF-8";
            else
                return null;
        }
        finally
        {
            reset();
        }
    }

    /**
     * Decodes up to MAX_SNIFFED_BYTES leading bytes with the given charset
     * and looks for an encoding pseudo-attribute in an XML declaration,
     * then rewinds.
     *
     * @param encoding charset name used to decode the leading bytes
     * @return the declared encoding, or null if none was found
     */
    private String sniffForXmlDecl(String encoding) throws IOException
    {
        mark(MAX_SNIFFED_BYTES);
        try
        {
            byte[] bytebuf = new byte[MAX_SNIFFED_BYTES];
            int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES);

            // BUGBUG in JDK: Charset.forName is not threadsafe.
            Charset charset = Charset.forName(encoding);
            Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);
            char[] buf = new char[bytelimit];
            int limit = 0;
            while (limit < bytelimit)
            {
                int count = reader.read(buf, limit, bytelimit - limit);
                if (count < 0)
                    break;
                limit += count;
            }

            return extractXmlDeclEncoding(buf, 0, limit);
        }
        finally
        {
            reset();
        }
    }

    /**
     * Finds the first "<?xml" in buf[offset, offset+size) and returns the
     * value of its encoding pseudo-attribute.
     *
     * @return the attribute value, or null if there is no declaration, no
     *         encoding attribute, or the declaration is malformed/truncated
     */
    /* package */ static String extractXmlDeclEncoding(char[] buf, int offset, int size)
    {
        int limit = offset + size;
        int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
        if (xmlpi >= 0)
        {
            int i = xmlpi + 5;
            ScannedAttribute attr = new ScannedAttribute();
            while (i < limit)
            {
                i = scanAttribute(buf, i, limit, attr);
                if (i < 0)
                    return null;
                if (attr.name.equals("encoding"))
                    return attr.value;
            }
        }
        return null;
    }

    /** Returns the index of the first occurrence of s in buf[startAt, limit), or -1. */
    private static int firstIndexOf(String s, char[] buf, int startAt, int limit)
    {
        assert(s.length() > 0);
        char[] lookFor = s.toCharArray();
        char firstchar = lookFor[0];
        // Last index at which a full match still fits before limit (inclusive).
        int lastStart = limit - lookFor.length;
        searching: for (; startAt <= lastStart; startAt++)
        {
            if (buf[startAt] == firstchar)
            {
                for (int i = 1; i < lookFor.length; i++)
                {
                    if (buf[startAt + i] != lookFor[i])
                    {
                        continue searching;
                    }
                }
                return startAt;
            }
        }
        return -1;
    }

    /** Returns the index of the first char in buf[startAt, limit) that is NOT in lookFor, or -1. */
    private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
    {
        searching: for (; startAt < limit; startAt++)
        {
            int thischar = buf[startAt];
            for (int i = 0; i < lookFor.length; i++)
                if (thischar == lookFor[i])
                    continue searching;
            return startAt;
        }
        return -1;
    }

    /** Returns the index of the first char in buf[startAt, limit) that IS in lookFor, or -1. */
    private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
    {
        for (; startAt < limit; startAt++)
        {
            int thischar = buf[startAt];
            for (int i = 0; i < lookFor.length; i++)
                if (thischar == lookFor[i])
                    return startAt;
        }
        return -1;
    }

    /** Returns the index of the first occurrence of lookFor in buf[startAt, limit), or -1. */
    private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit)
    {
        for (; startAt < limit; startAt++)
        {
            if (buf[startAt] == lookFor)
                return startAt;
        }
        return -1;
    }

    /** Holder for one name="value" pair produced by scanAttribute. */
    private static class ScannedAttribute
    {
        public String name;
        public String value;
    }

    /**
     * Scans one name = "value" (or 'value') pair starting at startAt.
     *
     * @param attr receives the name and value on success
     * @return the index just past the closing quote, or -1 if no complete
     *         attribute could be scanned before limit
     */
    private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr)
    {
        int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
        if (nameStart < 0)
            return -1;
        int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
        if (nameEnd < 0)
            return -1;
        int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
        if (equals < 0)
            return -1;
        if (buf[equals] != '=')
            return -1;
        int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
        // The data may end right after '=' (truncated declaration); without
        // this check buf[-1] would be indexed below.
        if (valQuote < 0)
            return -1;
        if (buf[valQuote] != '\'' && buf[valQuote] != '\"')
            return -1;
        int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
        if (valEndquote < 0)
            return -1;
        attr.name = new String(buf, nameStart, nameEnd - nameStart);
        attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
        return valEndquote + 1;
    }
}
/**
 * A BufferedReader that, on construction, peeks at the leading characters
 * of the wrapped reader for an XML declaration's encoding pseudo-attribute,
 * then rewinds so the caller still reads the document from the start.
 */
class SniffedXmlReader extends BufferedReader {
    // We don't sniff more than 192 characters.
    public static int MAX_SNIFFED_CHARS = 192;

    // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
    // with the common charsets.
    private static Charset dummy1 = Charset.forName("UTF-8");
    private static Charset dummy2 = Charset.forName("UTF-16");
    private static Charset dummy3 = Charset.forName("UTF-16BE");
    private static Charset dummy4 = Charset.forName("UTF-16LE");
    private static Charset dummy5 = Charset.forName("ISO-8859-1");
    private static Charset dummy6 = Charset.forName("US-ASCII");
    private static Charset dummy7 = Charset.forName("Cp1252");

    private String _encoding;

    /**
     * Wraps the reader and sniffs its declared encoding.
     *
     * @param reader the XML character source to wrap
     * @throws IOException if reading the leading characters fails
     */
    public SniffedXmlReader(Reader reader) throws IOException {
        super(reader);
        _encoding = sniffForXmlDecl();
    }

    /** Returns the encoding named in the XML declaration, or null if none was found. */
    public String getXmlEncoding() {
        return _encoding;
    }

    /**
     * Reads until len characters have been stored or the reader ends.
     *
     * @return the number of characters actually read (possibly fewer than len)
     */
    private int readAsMuchAsPossible(char[] buf, int startAt, int len) throws IOException {
        int filled = 0;
        int got;
        while (filled < len && (got = read(buf, startAt + filled, len - filled)) >= 0) {
            filled += got;
        }
        return filled;
    }

    /**
     * Reads up to MAX_SNIFFED_CHARS characters, extracts the declared
     * encoding from them, and rewinds to where reading started.
     */
    private String sniffForXmlDecl() throws IOException {
        mark(MAX_SNIFFED_CHARS);
        try {
            char[] head = new char[MAX_SNIFFED_CHARS];
            int count = readAsMuchAsPossible(head, 0, MAX_SNIFFED_CHARS);
            return SniffedXmlInputStream.extractXmlDeclEncoding(head, 0, count);
        } finally {
            reset();
        }
    }
}
// Related examples in the same category