Sniffed Xml InputStream to find out the declaration and file encoding : XML Reader « XML « Java Tutorial

/*   Copyright 2004 The Apache Software Foundation
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *  limitations under the License.
 */

// revised from xmlbeans

import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;

public class SniffedXmlInputStream extends BufferedInputStream
{
    // We don't sniff more than 192 bytes.
    public static int MAX_SNIFFED_BYTES = 192;

    public SniffedXmlInputStream(InputStream stream) throws IOException
    {
        super(stream);

        // read byte order marks and detect EBCDIC etc
        _encoding = sniffFourBytes();

        if (_encoding != null && _encoding.equals("IBM037"))
        {
            // First four bytes suggest EBCDIC with <?xm at start
            String encoding = sniffForXmlDecl(_encoding);
            if (encoding != null)
                _encoding = encoding;
        }

        if (_encoding == null)
        {
            // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
            // assuming we can read it as UTF-8.
            _encoding = sniffForXmlDecl("UTF-8");
        }

        if (_encoding == null)
        {
            // The XML spec says these two things:

            // (1) "In the absence of external character encoding information
            // (such as MIME headers), parsed entities which are stored in an
            // encoding other than UTF-8 or UTF-16 must begin with a text
            // declaration (see 4.3.1 The Text Declaration) containing an
            // encoding declaration:"

            // (2) "In the absence of information provided by an external
            // transport protocol (e.g. HTTP or MIME), it is an error
            // for an entity including an encoding declaration to be
            // presented to the XML processor in an encoding other than
            // that named in the declaration, or for an entity which begins
            // with neither a Byte Order Mark nor an encoding declaration
            // to use an encoding other than UTF-8."

            // Since we're using a sniffed stream, we do not have external
            // character encoding information.

            // Since we're here, we also don't have a recognized byte order
            // mark or an explicit encoding declaration that can be read in
            // either ASCII or EBDIC style.

            // Therefore, we must use UTF-8.

            _encoding = "UTF-8";
        }
    }

    private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException
    {
        int total = 0;
        while (total < len)
        {
            int count = read(buf, startAt + total, len - total);
            if (count < 0)
                break;
            total += count;
        }
        return total;
    }

    private String sniffFourBytes() throws IOException
    {
        mark(4);
        int skip = 0;
        try
        {
            byte[] buf = new byte[4];
            if (readAsMuchAsPossible(buf, 0, 4) < 4)
                return null;
            long result = 0xFF000000 & (buf[0] << 24) | 0x00FF0000 & (buf[1] << 16) | 0x0000FF00 & (buf[2] << 8) | 0x000000FF & buf[3];

            if (result == 0x0000FEFF)
                return "UCS-4";
            else if (result == 0xFFFE0000)
                return "UCS-4";
            else if (result == 0x0000003C)
                return "UCS-4BE";
            else if (result == 0x3C000000)
                return "UCS-4LE";
            else if (result == 0x003C003F)
                return "UTF-16BE";
            else if (result == 0x3C003F00)
                return "UTF-16LE";
            else if (result == 0x3C3F786D)
                return null; // looks like US-ASCII with <?xml: sniff
            else if (result == 0x4C6FA794)
                return "IBM037"; // Sniff for ebdic codepage
            else if ((result & 0xFFFF0000) == 0xFEFF0000)
                return "UTF-16";
            else if ((result & 0xFFFF0000) == 0xFFFE0000)
                return "UTF-16";
            else if ((result & 0xFFFFFF00) == 0xEFBBBF00)
                return "UTF-8";
            else return null;
        }
        finally
        {
            reset();
        }
    }

    // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
    // with the common charsets.

    private static Charset dummy1 = Charset.forName("UTF-8");
    private static Charset dummy2 = Charset.forName("UTF-16");
    private static Charset dummy3 = Charset.forName("UTF-16BE");
    private static Charset dummy4 = Charset.forName("UTF-16LE");
    private static Charset dummy5 = Charset.forName("ISO-8859-1");
    private static Charset dummy6 = Charset.forName("US-ASCII");
    private static Charset dummy7 = Charset.forName("Cp1252");


    private String sniffForXmlDecl(String encoding) throws IOException
    {
        mark(MAX_SNIFFED_BYTES);
        try
        {
            byte[] bytebuf = new byte[MAX_SNIFFED_BYTES];
            int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES);

            // BUGBUG in JDK: Charset.forName is not threadsafe.
            Charset charset = Charset.forName(encoding);
            Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);
            char[] buf = new char[bytelimit];
            int limit = 0;
            while (limit < bytelimit)
            {
                int count = reader.read(buf, limit, bytelimit - limit);
                if (count < 0)
                    break;
                limit += count;
            }

            return extractXmlDeclEncoding(buf, 0, limit);
        }
        finally
        {
            reset();
        }
    }

    private String _encoding;

    public String getXmlEncoding()
    {
        return _encoding;
    }

    /* package */ static String extractXmlDeclEncoding(char[] buf, int offset, int size)
    {
        int limit = offset + size;
        int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
        if (xmlpi >= 0)
        {
            int i = xmlpi + 5;
            ScannedAttribute attr = new ScannedAttribute();
            while (i < limit)
            {
                i = scanAttribute(buf, i, limit, attr);
                if (i < 0)
                    return null;
                if (attr.name.equals("encoding"))
                    return attr.value;
            }
        }
        return null;
    }

    private static int firstIndexOf(String s, char[] buf, int startAt, int limit)
    {
        assert(s.length() > 0);
        char[] lookFor = s.toCharArray();

        char firstchar = lookFor[0];
        searching: for (limit -= lookFor.length; startAt < limit; startAt++)
        {
            if (buf[startAt] == firstchar)
            {
                for (int i = 1; i < lookFor.length; i++)
                {
                    if (buf[startAt + i] != lookFor[i])
                    {
                        continue searching;
                    }
                }
                return startAt;
            }
        }

        return -1;
    }

    private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
    {
        searching: for (; startAt < limit; startAt++)
        {
            int thischar = buf[startAt];
            for (int i = 0; i < lookFor.length; i++)
                if (thischar == lookFor[i])
                    continue searching;
            return startAt;
        }
        return -1;
    }

    private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
    {
        searching: for (; startAt < limit; startAt++)
        {
            int thischar = buf[startAt];
            for (int i = 0; i < lookFor.length; i++)
                if (thischar == lookFor[i])
                    return startAt;
        }
        return -1;
    }

    private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit)
    {
        searching: for (; startAt < limit; startAt++)
        {
            if (buf[startAt] == lookFor)
                return startAt;
        }
        return -1;
    }
    private static char[] WHITESPACE = new char[] { ' ', '\r', '\t', '\n' };
    private static char[] NOTNAME = new char[] { '=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"' };

    private static class ScannedAttribute
    {
        public String name;
        public String value;
    }

    private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr)
    {
        int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
        if (nameStart < 0)
            return -1;
        int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
        if (nameEnd < 0)
            return -1;
        int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
        if (equals < 0)
            return -1;
        if (buf[equals] != '=')
            return -1;
        int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
        if (buf[valQuote] != '\'' && buf[valQuote] != '\"')
            return -1;
        int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
        if (valEndquote < 0)
            return -1;
        attr.name = new String(buf, nameStart, nameEnd - nameStart);
        attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
        return valEndquote + 1;
    }
}
33.30.XML Reader
	33.30.1.	Read Xml from InputStream and return Document
	33.30.2.	Read Xml from Reader and return Document
	33.30.3.	Sniffed Xml Reader
	33.30.4.	Xml Reader To Writer
	33.30.5.	Xml Encoding Sniffer
	33.30.6.	Sniffed Xml InputStream to find out the declaration and file encoding