Java tutorial
/* Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // Revised from xml beans import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.nio.charset.Charset; import com.sun.org.apache.xerces.internal.util.EncodingMap; public class XmlEncodingSniffer { private String _xmlencoding; private String _javaencoding; private InputStream _stream; private Reader _reader; /** * Sniffs the given XML stream for encoding information. * * After a sniffer is constructed, it can return either a stream * (which is a buffered stream wrapper of the original) or a reader * (which applies the proper encoding). * * @param stream The stream to sniff * @param encodingOverride The XML (IANA) name for the overriding encoding * @throws IOException * @throws UnsupportedEncodingException */ public XmlEncodingSniffer(InputStream stream, String encodingOverride) throws IOException, UnsupportedEncodingException { _stream = stream; if (encodingOverride != null) _xmlencoding = EncodingMap.getJava2IANAMapping(encodingOverride); if (_xmlencoding == null) _xmlencoding = encodingOverride; if (_xmlencoding == null) { SniffedXmlInputStream sniffed = new SniffedXmlInputStream(_stream); _xmlencoding = sniffed.getXmlEncoding(); assert (_xmlencoding != null); _stream = sniffed; } _javaencoding = EncodingMap.getIANA2JavaMapping(_xmlencoding); // we allow you to use Java's encoding names in XML even though you're // not supposed to. if (_javaencoding == null) _javaencoding = _xmlencoding; } /** * Sniffs the given XML stream for encoding information. * * After a sniffer is constructed, it can return either a reader * (which is a buffered stream wrapper of the original) or a stream * (which applies the proper encoding). * * @param reader The reader to sniff * @param encodingDefault The Java name for the default encoding to apply, UTF-8 if null. * @throws IOException * @throws UnsupportedEncodingException */ public XmlEncodingSniffer(Reader reader, String encodingDefault) throws IOException, UnsupportedEncodingException { if (encodingDefault == null) encodingDefault = "UTF-8"; SniffedXmlReader sniffedReader = new SniffedXmlReader(reader); _reader = sniffedReader; _xmlencoding = sniffedReader.getXmlEncoding(); if (_xmlencoding == null) { _xmlencoding = EncodingMap.getJava2IANAMapping(encodingDefault); if (_xmlencoding != null) _javaencoding = encodingDefault; else _xmlencoding = encodingDefault; } if (_xmlencoding == null) _xmlencoding = "UTF-8"; // we allow you to use Java's encoding names in XML even though you're // not supposed to. _javaencoding = EncodingMap.getIANA2JavaMapping(_xmlencoding); if (_javaencoding == null) _javaencoding = _xmlencoding; } public String getXmlEncoding() { return _xmlencoding; } public String getJavaEncoding() { return _javaencoding; } public InputStream getStream() throws UnsupportedEncodingException { if (_stream != null) { InputStream is = _stream; _stream = null; return is; } if (_reader != null) { InputStream is = new ReaderInputStream(_reader, _javaencoding); _reader = null; return is; } return null; } public Reader getReader() throws UnsupportedEncodingException { if (_reader != null) { Reader reader = _reader; _reader = null; return reader; } if (_stream != null) { Reader reader = new InputStreamReader(_stream, _javaencoding); _stream = null; return reader; } return null; } } /* Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ class ReaderInputStream extends PushedInputStream { private Reader reader; private Writer writer; private char[] buf; public static int defaultBufferSize = 2048; public ReaderInputStream(Reader reader, String encoding) throws UnsupportedEncodingException { this(reader, encoding, defaultBufferSize); } public ReaderInputStream(Reader reader, String encoding, int bufferSize) throws UnsupportedEncodingException { if (bufferSize <= 0) throw new IllegalArgumentException("Buffer size <= 0"); this.reader = reader; this.writer = new OutputStreamWriter(getOutputStream(), encoding); buf = new char[bufferSize]; } public void fill(int requestedBytes) throws IOException { do { int chars = reader.read(buf); if (chars < 0) return; writer.write(buf, 0, chars); writer.flush(); } while (available() <= 0); // loop for safety, in case encoding didn't produce any bytes yet } } /* Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ abstract class PushedInputStream extends InputStream { private static int defaultBufferSize = 2048; protected byte buf[]; protected int writepos; protected int readpos; protected int markpos = -1; protected int marklimit; protected OutputStream outputStream = new InternalOutputStream(); /** * Called when more bytes need to be written into this stream * (as an OutputStream). * * This method must write at least one byte if the stream is * not ended, and it must not write any bytes if the stream has * already ended. */ protected abstract void fill(int requestedBytes) throws IOException; /** * Returns the linked output stream. * * This is the output stream that must be written to whenever * the fill method is called. */ public final OutputStream getOutputStream() { return outputStream; } public PushedInputStream() { this(defaultBufferSize); } public PushedInputStream(int size) { if (size < 0) { throw new IllegalArgumentException("Negative initial buffer size"); } buf = new byte[size]; } /** * Makes room for cb more bytes of data */ private void shift(int cb) { int savepos = readpos; if (markpos > 0) { if (readpos - markpos > marklimit) markpos = -1; else savepos = markpos; } int size = writepos - savepos; if (savepos > 0 && buf.length - size >= cb && size <= cb) { System.arraycopy(buf, savepos, buf, 0, size); } else { int newcount = size + cb; byte newbuf[] = new byte[Math.max(buf.length << 1, newcount)]; System.arraycopy(buf, savepos, newbuf, 0, size); buf = newbuf; } if (savepos > 0) { readpos -= savepos; if (markpos > 0) markpos -= savepos; writepos -= savepos; } } public synchronized int read() throws IOException { if (readpos >= writepos) { fill(1); if (readpos >= writepos) return -1; } return buf[readpos++] & 0xff; } /** * Read characters into a portion of an array, reading from the underlying * stream at most once if necessary. */ public synchronized int read(byte[] b, int off, int len) throws IOException { int avail = writepos - readpos; if (avail < len) { fill(len - avail); avail = writepos - readpos; if (avail <= 0) return -1; } int cnt = (avail < len) ? avail : len; System.arraycopy(buf, readpos, b, off, cnt); readpos += cnt; return cnt; } public synchronized long skip(long n) throws IOException { if (n <= 0) return 0; long avail = writepos - readpos; if (avail < n) { // Fill in buffer to save bytes for reset long req = n - avail; if (req > Integer.MAX_VALUE) req = Integer.MAX_VALUE; fill((int) req); avail = writepos - readpos; if (avail <= 0) return 0; } long skipped = (avail < n) ? avail : n; readpos += skipped; return skipped; } public synchronized int available() { return writepos - readpos; } public synchronized void mark(int readlimit) { marklimit = readlimit; markpos = readpos; } public synchronized void reset() throws IOException { if (markpos < 0) throw new IOException("Resetting to invalid mark"); readpos = markpos; } public boolean markSupported() { return true; } private class InternalOutputStream extends OutputStream { public synchronized void write(int b) throws IOException { if (writepos + 1 > buf.length) { shift(1); } buf[writepos] = (byte) b; writepos += 1; } public synchronized void write(byte b[], int off, int len) { if ((off < 0) || (off > b.length) || (len < 0) || ((off + len) > b.length) || ((off + len) < 0)) throw new IndexOutOfBoundsException(); else if (len == 0) return; if (writepos + len > buf.length) shift(len); System.arraycopy(b, off, buf, writepos, len); writepos += len; } } } class SniffedXmlInputStream extends BufferedInputStream { // We don't sniff more than 192 bytes. public static int MAX_SNIFFED_BYTES = 192; public SniffedXmlInputStream(InputStream stream) throws IOException { super(stream); // read byte order marks and detect EBCDIC etc _encoding = sniffFourBytes(); if (_encoding != null && _encoding.equals("IBM037")) { // First four bytes suggest EBCDIC with <?xm at start String encoding = sniffForXmlDecl(_encoding); if (encoding != null) _encoding = encoding; } if (_encoding == null) { // Haven't yet determined encoding: sniff for <?xml encoding="..."?> // assuming we can read it as UTF-8. _encoding = sniffForXmlDecl("UTF-8"); } if (_encoding == null) { // The XML spec says these two things: // (1) "In the absence of external character encoding information // (such as MIME headers), parsed entities which are stored in an // encoding other than UTF-8 or UTF-16 must begin with a text // declaration (see 4.3.1 The Text Declaration) containing an // encoding declaration:" // (2) "In the absence of information provided by an external // transport protocol (e.g. HTTP or MIME), it is an error // for an entity including an encoding declaration to be // presented to the XML processor in an encoding other than // that named in the declaration, or for an entity which begins // with neither a Byte Order Mark nor an encoding declaration // to use an encoding other than UTF-8." // Since we're using a sniffed stream, we do not have external // character encoding information. // Since we're here, we also don't have a recognized byte order // mark or an explicit encoding declaration that can be read in // either ASCII or EBDIC style. // Therefore, we must use UTF-8. _encoding = "UTF-8"; } } private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException { int total = 0; while (total < len) { int count = read(buf, startAt + total, len - total); if (count < 0) break; total += count; } return total; } private String sniffFourBytes() throws IOException { mark(4); int skip = 0; try { byte[] buf = new byte[4]; if (readAsMuchAsPossible(buf, 0, 4) < 4) return null; long result = 0xFF000000 & (buf[0] << 24) | 0x00FF0000 & (buf[1] << 16) | 0x0000FF00 & (buf[2] << 8) | 0x000000FF & buf[3]; if (result == 0x0000FEFF) return "UCS-4"; else if (result == 0xFFFE0000) return "UCS-4"; else if (result == 0x0000003C) return "UCS-4BE"; else if (result == 0x3C000000) return "UCS-4LE"; else if (result == 0x003C003F) return "UTF-16BE"; else if (result == 0x3C003F00) return "UTF-16LE"; else if (result == 0x3C3F786D) return null; // looks like US-ASCII with <?xml: sniff else if (result == 0x4C6FA794) return "IBM037"; // Sniff for ebdic codepage else if ((result & 0xFFFF0000) == 0xFEFF0000) return "UTF-16"; else if ((result & 0xFFFF0000) == 0xFFFE0000) return "UTF-16"; else if ((result & 0xFFFFFF00) == 0xEFBBBF00) return "UTF-8"; else return null; } finally { reset(); } } // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it // with the common charsets. private static Charset dummy1 = Charset.forName("UTF-8"); private static Charset dummy2 = Charset.forName("UTF-16"); private static Charset dummy3 = Charset.forName("UTF-16BE"); private static Charset dummy4 = Charset.forName("UTF-16LE"); private static Charset dummy5 = Charset.forName("ISO-8859-1"); private static Charset dummy6 = Charset.forName("US-ASCII"); private static Charset dummy7 = Charset.forName("Cp1252"); private String sniffForXmlDecl(String encoding) throws IOException { mark(MAX_SNIFFED_BYTES); try { byte[] bytebuf = new byte[MAX_SNIFFED_BYTES]; int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES); // BUGBUG in JDK: Charset.forName is not threadsafe. Charset charset = Charset.forName(encoding); Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset); char[] buf = new char[bytelimit]; int limit = 0; while (limit < bytelimit) { int count = reader.read(buf, limit, bytelimit - limit); if (count < 0) break; limit += count; } return extractXmlDeclEncoding(buf, 0, limit); } finally { reset(); } } private String _encoding; public String getXmlEncoding() { return _encoding; } /* package */ static String extractXmlDeclEncoding(char[] buf, int offset, int size) { int limit = offset + size; int xmlpi = firstIndexOf("<?xml", buf, offset, limit); if (xmlpi >= 0) { int i = xmlpi + 5; ScannedAttribute attr = new ScannedAttribute(); while (i < limit) { i = scanAttribute(buf, i, limit, attr); if (i < 0) return null; if (attr.name.equals("encoding")) return attr.value; } } return null; } private static int firstIndexOf(String s, char[] buf, int startAt, int limit) { assert (s.length() > 0); char[] lookFor = s.toCharArray(); char firstchar = lookFor[0]; searching: for (limit -= lookFor.length; startAt < limit; startAt++) { if (buf[startAt] == firstchar) { for (int i = 1; i < lookFor.length; i++) { if (buf[startAt + i] != lookFor[i]) { continue searching; } } return startAt; } } return -1; } private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit) { searching: for (; startAt < limit; startAt++) { int thischar = buf[startAt]; for (int i = 0; i < lookFor.length; i++) if (thischar == lookFor[i]) continue searching; return startAt; } return -1; } private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit) { searching: for (; startAt < limit; startAt++) { int thischar = buf[startAt]; for (int i = 0; i < lookFor.length; i++) if (thischar == lookFor[i]) return startAt; } return -1; } private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit) { searching: for (; startAt < limit; startAt++) { if (buf[startAt] == lookFor) return startAt; } return -1; } private static char[] WHITESPACE = new char[] { ' ', '\r', '\t', '\n' }; private static char[] NOTNAME = new char[] { '=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"' }; private static class ScannedAttribute { public String name; public String value; } private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr) { int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit); if (nameStart < 0) return -1; int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit); if (nameEnd < 0) return -1; int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit); if (equals < 0) return -1; if (buf[equals] != '=') return -1; int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit); if (buf[valQuote] != '\'' && buf[valQuote] != '\"') return -1; int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit); if (valEndquote < 0) return -1; attr.name = new String(buf, nameStart, nameEnd - nameStart); attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1); return valEndquote + 1; } } class SniffedXmlReader extends BufferedReader { // We don't sniff more than 192 bytes. public static int MAX_SNIFFED_CHARS = 192; public SniffedXmlReader(Reader reader) throws IOException { super(reader); _encoding = sniffForXmlDecl(); } private int readAsMuchAsPossible(char[] buf, int startAt, int len) throws IOException { int total = 0; while (total < len) { int count = read(buf, startAt + total, len - total); if (count < 0) break; total += count; } return total; } // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it // with the common charsets. private static Charset dummy1 = Charset.forName("UTF-8"); private static Charset dummy2 = Charset.forName("UTF-16"); private static Charset dummy3 = Charset.forName("UTF-16BE"); private static Charset dummy4 = Charset.forName("UTF-16LE"); private static Charset dummy5 = Charset.forName("ISO-8859-1"); private static Charset dummy6 = Charset.forName("US-ASCII"); private static Charset dummy7 = Charset.forName("Cp1252"); private String sniffForXmlDecl() throws IOException { mark(MAX_SNIFFED_CHARS); try { char[] buf = new char[MAX_SNIFFED_CHARS]; int limit = readAsMuchAsPossible(buf, 0, MAX_SNIFFED_CHARS); return SniffedXmlInputStream.extractXmlDeclEncoding(buf, 0, limit); } finally { reset(); } } private String _encoding; public String getXmlEncoding() { return _encoding; } }