// Xml Encoding Sniffer : XML Reader « XML « Java Tutorial

/*   Copyright 2004 The Apache Software Foundation
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *  limitations under the License.
 */

// Revised from xml beans

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.nio.charset.Charset;

import com.sun.org.apache.xerces.internal.util.EncodingMap;

/**
 * Determines the character encoding of an XML document supplied either as
 * a byte stream or as a character reader, and hands the (wrapped) source
 * back to the caller as a stream or a reader.
 *
 * The source is handed out only once: after getStream() or getReader()
 * has returned it, both accessors return null.
 */
public class XmlEncodingSniffer
{
    private String      _xmlencoding;
    private String      _javaencoding;
    private InputStream _stream;
    private Reader      _reader;

    /**
     * Builds a sniffer over a byte stream.
     *
     * When an override is supplied it is used as the encoding and the
     * stream is left untouched; otherwise the stream is wrapped in a
     * SniffedXmlInputStream and the encoding it reports is used.
     *
     * @param stream           The stream to sniff
     * @param encodingOverride The XML (IANA) name for the overriding encoding,
     *                         or null to sniff the stream
     * @throws IOException
     * @throws UnsupportedEncodingException
     */
    public XmlEncodingSniffer(InputStream stream, String encodingOverride)
        throws IOException, UnsupportedEncodingException
    {
        _stream = stream;

        // An override may be given as a Java name; prefer its IANA
        // equivalent and fall back to the name exactly as supplied.
        String xmlName = null;
        if (encodingOverride != null)
        {
            xmlName = EncodingMap.getJava2IANAMapping(encodingOverride);
            if (xmlName == null)
                xmlName = encodingOverride;
        }

        // No override: let the sniffing wrapper work it out.
        if (xmlName == null)
        {
            SniffedXmlInputStream sniffed = new SniffedXmlInputStream(_stream);
            xmlName = sniffed.getXmlEncoding();
            assert(xmlName != null);
            _stream = sniffed;
        }
        _xmlencoding = xmlName;

        // Java encoding names are tolerated in XML even though they are
        // not supposed to appear there, so an unmapped name is kept as is.
        String javaName = EncodingMap.getIANA2JavaMapping(_xmlencoding);
        _javaencoding = (javaName != null) ? javaName : _xmlencoding;
    }

    /**
     * Builds a sniffer over a character reader.
     *
     * The reader is wrapped in a SniffedXmlReader; when no encoding is
     * declared in the text, the supplied default is used instead.
     *
     * @param reader           The reader to sniff
     * @param encodingDefault  The Java name for the default encoding to apply, UTF-8 if null.
     * @throws IOException
     * @throws UnsupportedEncodingException
     */
    public XmlEncodingSniffer(Reader reader, String encodingDefault)
            throws IOException, UnsupportedEncodingException
    {
        String fallback = (encodingDefault != null) ? encodingDefault : "UTF-8";

        SniffedXmlReader sniffed = new SniffedXmlReader(reader);
        _reader = sniffed;

        String xmlName = sniffed.getXmlEncoding();
        if (xmlName == null)
        {
            // Nothing declared: translate the default to its IANA name
            // when a mapping exists, otherwise use it unchanged.
            String iana = EncodingMap.getJava2IANAMapping(fallback);
            xmlName = (iana != null) ? iana : fallback;
        }
        _xmlencoding = xmlName;

        // Java encoding names are tolerated in XML even though they are
        // not supposed to appear there, so an unmapped name is kept as is.
        String javaName = EncodingMap.getIANA2JavaMapping(_xmlencoding);
        _javaencoding = (javaName != null) ? javaName : _xmlencoding;
    }

    /** Returns the XML (IANA) name of the detected or supplied encoding. */
    public String getXmlEncoding()
    {
        return _xmlencoding;
    }

    /** Returns the Java name of the detected or supplied encoding. */
    public String getJavaEncoding()
    {
        return _javaencoding;
    }

    /**
     * Hands out the source as a byte stream and forgets it.
     *
     * A stream source is returned directly; a reader source is re-encoded
     * to bytes using the Java encoding. Returns null once the source has
     * already been handed out.
     */
    public InputStream getStream()
            throws UnsupportedEncodingException
    {
        InputStream result = null;

        if (_stream != null)
        {
            result = _stream;
            _stream = null;
        }
        else if (_reader != null)
        {
            result = new ReaderInputStream( _reader, _javaencoding );
            _reader = null;
        }

        return result;
    }


    /**
     * Hands out the source as a character reader and forgets it.
     *
     * A reader source is returned directly; a stream source is decoded
     * using the Java encoding. Returns null once the source has already
     * been handed out.
     */
    public Reader getReader ( )
        throws UnsupportedEncodingException
    {
        Reader result = null;

        if (_reader != null)
        {
            result = _reader;
            _reader = null;
        }
        else if (_stream != null)
        {
            result = new InputStreamReader( _stream, _javaencoding );
            _stream = null;
        }

        return result;
    }
}
/*   Copyright 2004 The Apache Software Foundation
*
*   Licensed under the Apache License, Version 2.0 (the "License");
*   you may not use this file except in compliance with the License.
*   You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
*   Unless required by applicable law or agreed to in writing, software
*   distributed under the License is distributed on an "AS IS" BASIS,
*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*   See the License for the specific language governing permissions and
*  limitations under the License.
*/

/**
 * Presents the characters of a Reader as a stream of bytes in a given
 * encoding. Characters are pulled from the reader on demand and pushed,
 * encoded, into the output side of the underlying PushedInputStream.
 */
class ReaderInputStream extends PushedInputStream
{
   public static int defaultBufferSize = 2048;

   private Reader source;
   private Writer encoder;
   private char[] chars;

   /**
    * Creates a stream over the reader using the default buffer size.
    *
    * @param reader   the characters to encode
    * @param encoding the name of the encoding to produce
    */
   public ReaderInputStream(Reader reader, String encoding) throws UnsupportedEncodingException
   {
       this(reader, encoding, defaultBufferSize);
   }

   /**
    * Creates a stream over the reader.
    *
    * @param reader     the characters to encode
    * @param encoding   the name of the encoding to produce
    * @param bufferSize number of characters read from the reader per step;
    *                   must be positive
    * @throws IllegalArgumentException if bufferSize is not positive
    */
   public ReaderInputStream(Reader reader, String encoding, int bufferSize) throws UnsupportedEncodingException
   {
       if (bufferSize <= 0)
           throw new IllegalArgumentException("Buffer size <= 0");

       source = reader;
       encoder = new OutputStreamWriter(getOutputStream(), encoding);
       chars = new char[bufferSize];
   }

   /**
    * Reads one buffer of characters at a time and writes it through the
    * encoder until at least one byte is available or the reader is
    * exhausted.
    */
   public void fill(int requestedBytes) throws IOException
   {
       while (true)
       {
           int count = source.read(chars);
           if (count < 0)
               return; // reader exhausted: push nothing more

           encoder.write(chars, 0, count);
           encoder.flush();

           // Keep going only if the encoder has not produced any bytes yet.
           if (available() > 0)
               return;
       }
   }
}

/*   Copyright 2004 The Apache Software Foundation
*
*   Licensed under the Apache License, Version 2.0 (the "License");
*   you may not use this file except in compliance with the License.
*   You may obtain a copy of the License at
*
*       http://www.apache.org/licenses/LICENSE-2.0
*
*   Unless required by applicable law or agreed to in writing, software
*   distributed under the License is distributed on an "AS IS" BASIS,
*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*   See the License for the specific language governing permissions and
*  limitations under the License.
*/

/**
 * An input stream whose content is supplied on demand through a linked
 * output stream.
 *
 * Whenever a read needs more bytes than are buffered, fill() is called;
 * the subclass responds by writing bytes to getOutputStream(), which
 * appends them to the internal buffer. mark/reset are supported by
 * retaining the bytes read since the mark in that same buffer.
 */
abstract class PushedInputStream extends InputStream
{
   private static int defaultBufferSize = 2048;
   /** Buffered bytes; the unread bytes are buf[readpos..writepos). */
   protected byte buf[];
   /** Index at which the next pushed byte is stored. */
   protected int writepos;
   /** Index of the next byte to hand to the reader. */
   protected int readpos;
   /** Index of the current mark, or -1 when there is no valid mark. */
   protected int markpos = -1;
   /** Read limit passed to the most recent mark() call. */
   protected int marklimit;
   protected OutputStream outputStream = new InternalOutputStream();

   /**
    * Called when more bytes need to be written into this stream
    * (as an OutputStream).
    *
    * This method must write at least one byte if the stream is
    * not ended, and it must not write any bytes if the stream has
    * already ended.
    */
   protected abstract void fill(int requestedBytes) throws IOException;

   /**
    * Returns the linked output stream.
    *
    * This is the output stream that must be written to whenever
    * the fill method is called.
    */
   public final OutputStream getOutputStream()
   {
       return outputStream;
   }

   /** Creates a stream with the default initial buffer size. */
   public PushedInputStream()
   {
       this(defaultBufferSize);
   }

   /**
    * Creates a stream with the given initial buffer size.
    *
    * @param size initial buffer capacity in bytes
    * @throws IllegalArgumentException if size is negative
    */
   public PushedInputStream(int size)
   {
       if (size < 0)
       {
           throw new IllegalArgumentException("Negative initial buffer size");
       }
       buf = new byte[size];
   }

   /**
    * Makes room for cb more bytes of data, either by sliding the bytes
    * that must be kept to the front of the buffer or by allocating a
    * larger buffer.
    */
   private void shift(int cb)
   {
       // Everything before savepos may be discarded. Normally that is
       // everything already read, but a valid mark pins the bytes read
       // since it was set. Position 0 is a valid mark (only -1 means
       // "no mark"), so the test must be >= 0: testing > 0 would drop
       // the bytes of a mark at index 0 and make reset() rewind to the
       // wrong data.
       int savepos = readpos;
       if (markpos >= 0)
       {
           if (readpos - markpos > marklimit)
               markpos = -1;      // read limit exceeded: the mark is gone
           else
               savepos = markpos;
       }

       int size = writepos - savepos;

       if (savepos > 0 && buf.length - size >= cb && size <= cb)
       {
           // Enough slack once the dead prefix is dropped: compact in place.
           System.arraycopy(buf, savepos, buf, 0, size);
       }
       else
       {
           // Grow to at least double, or to exactly what is needed if larger.
           int newcount = size + cb;
           byte newbuf[] = new byte[Math.max(buf.length << 1, newcount)];
           System.arraycopy(buf, savepos, newbuf, 0, size);
           buf = newbuf;
       }

       // The kept bytes now start at index 0; rebase all positions.
       if (savepos > 0)
       {
           readpos -= savepos;
           if (markpos >= 0)
               markpos -= savepos;
           writepos -= savepos;
       }
   }

   /**
    * Reads a single byte, asking for more data if the buffer is empty.
    *
    * @return the byte as a value in 0..255, or -1 at end of stream
    */
   public synchronized int read() throws IOException
   {
       if (readpos >= writepos)
       {
           fill(1);
           if (readpos >= writepos)
               return -1;
       }
       return buf[readpos++] & 0xff;
   }

   /**
    * Read characters into a portion of an array, reading from the underlying
    * stream at most once if necessary.
    *
    * @return the number of bytes copied, or -1 if no bytes are available
    *         after asking for more
    */
   public synchronized int read(byte[] b, int off, int len) throws IOException
   {
       int avail = writepos - readpos;
       if (avail < len)
       {
           fill(len - avail);
           avail = writepos - readpos;
           if (avail <= 0) return -1;
       }
       int cnt = (avail < len) ? avail : len;
       System.arraycopy(buf, readpos, b, off, cnt);
       readpos += cnt;
       return cnt;
   }

   /**
    * Skips up to n bytes, asking for more data at most once.
    *
    * @return the number of bytes actually skipped (0 if n is not positive
    *         or nothing is available)
    */
   public synchronized long skip(long n) throws IOException
   {
       if (n <= 0)
           return 0;

       long avail = writepos - readpos;

       if (avail < n)
       {
           // Fill in buffer to save bytes for reset
           long req = n - avail;
           if (req > Integer.MAX_VALUE)
               req = Integer.MAX_VALUE;
           fill((int)req);
           avail = writepos - readpos;
           if (avail <= 0)
               return 0;
       }

       long skipped = (avail < n) ? avail : n;
       readpos += skipped;
       return skipped;
   }

   /** Returns the number of buffered bytes not yet read. */
   public synchronized int available()
   {
       return writepos - readpos;
   }

   /**
    * Marks the current read position.
    *
    * @param readlimit how many bytes may be read past the mark before the
    *                  buffer is allowed to discard it
    */
   public synchronized void mark(int readlimit)
   {
       marklimit = readlimit;
       markpos = readpos;
   }

   /**
    * Rewinds to the most recent mark.
    *
    * @throws IOException if no mark was set or it has been invalidated
    */
   public synchronized void reset() throws IOException
   {
       if (markpos < 0)
           throw new IOException("Resetting to invalid mark");
       readpos = markpos;
   }

   public boolean markSupported()
   {
       return true;
   }

   /**
    * The output side of the stream: every byte written here is appended
    * to the enclosing stream's buffer, growing or compacting it as needed.
    */
   private class InternalOutputStream extends OutputStream
   {
       public synchronized void write(int b) throws IOException
       {
           if (writepos + 1 > buf.length)
           {
               shift(1);
           }
           buf[writepos] = (byte)b;
           writepos += 1;
       }

       public synchronized void write(byte b[], int off, int len)
       {
           // (off + len) < 0 catches int overflow of the sum.
           if ((off < 0) || (off > b.length) || (len < 0) ||
               ((off + len) > b.length) || ((off + len) < 0))
               throw new IndexOutOfBoundsException();
           else if (len == 0)
               return;

           if (writepos + len > buf.length)
               shift(len);

           System.arraycopy(b, off, buf, writepos, len);
           writepos += len;
       }
   }
}

/**
 * A buffered stream that inspects the beginning of an XML document to
 * determine its character encoding. Detection looks first at the leading
 * four bytes (byte order marks and characteristic byte patterns), then at
 * the encoding pseudo-attribute of the XML declaration, and finally falls
 * back to UTF-8. The stream is reset after each inspection, so all bytes
 * are still available to the consumer.
 */
class SniffedXmlInputStream extends BufferedInputStream
{
    // We don't sniff more than 192 bytes.
    public static int MAX_SNIFFED_BYTES = 192;

    /**
     * Wraps the stream and determines its encoding immediately.
     *
     * @param stream the XML byte stream to inspect
     * @throws IOException if reading or resetting the stream fails
     */
    public SniffedXmlInputStream(InputStream stream) throws IOException
    {
        super(stream);

        // read byte order marks and detect EBCDIC etc
        _encoding = sniffFourBytes();

        if (_encoding != null && _encoding.equals("IBM037"))
        {
            // First four bytes suggest EBCDIC with <?xm at start
            String encoding = sniffForXmlDecl(_encoding);
            if (encoding != null)
                _encoding = encoding;
        }

        if (_encoding == null)
        {
            // Haven't yet determined encoding: sniff for <?xml encoding="..."?>
            // assuming we can read it as UTF-8.
            _encoding = sniffForXmlDecl("UTF-8");
        }

        if (_encoding == null)
        {
            // The XML spec says these two things:

            // (1) "In the absence of external character encoding information
            // (such as MIME headers), parsed entities which are stored in an
            // encoding other than UTF-8 or UTF-16 must begin with a text
            // declaration (see 4.3.1 The Text Declaration) containing an
            // encoding declaration:"

            // (2) "In the absence of information provided by an external
            // transport protocol (e.g. HTTP or MIME), it is an error
            // for an entity including an encoding declaration to be
            // presented to the XML processor in an encoding other than
            // that named in the declaration, or for an entity which begins
            // with neither a Byte Order Mark nor an encoding declaration
            // to use an encoding other than UTF-8."

            // Since we're using a sniffed stream, we do not have external
            // character encoding information.

            // Since we're here, we also don't have a recognized byte order
            // mark or an explicit encoding declaration that can be read in
            // either ASCII or EBCDIC style.

            // Therefore, we must use UTF-8.

            _encoding = "UTF-8";
        }
    }

    /**
     * Reads until len bytes have been stored or the stream ends.
     *
     * @return the number of bytes actually read (possibly fewer than len)
     */
    private int readAsMuchAsPossible(byte[] buf, int startAt, int len) throws IOException
    {
        int total = 0;
        while (total < len)
        {
            int count = read(buf, startAt + total, len - total);
            if (count < 0)
                break;
            total += count;
        }
        return total;
    }

    /**
     * Classifies the stream by its first four bytes, then resets it.
     *
     * @return an encoding name, or null when fewer than four bytes are
     *         available or the pattern is not conclusive
     */
    private String sniffFourBytes() throws IOException
    {
        mark(4);
        try
        {
            byte[] buf = new byte[4];
            if (readAsMuchAsPossible(buf, 0, 4) < 4)
                return null;

            // Big-endian packing of the four bytes into one int.
            int result = ((buf[0] & 0xFF) << 24) | ((buf[1] & 0xFF) << 16)
                       | ((buf[2] & 0xFF) << 8) | (buf[3] & 0xFF);

            if (result == 0x0000FEFF)
                return "UCS-4";
            else if (result == 0xFFFE0000)
                return "UCS-4";
            else if (result == 0x0000003C)
                return "UCS-4BE";
            else if (result == 0x3C000000)
                return "UCS-4LE";
            else if (result == 0x003C003F)
                return "UTF-16BE";
            else if (result == 0x3C003F00)
                return "UTF-16LE";
            else if (result == 0x3C3F786D)
                return null; // looks like US-ASCII with <?xml: sniff
            else if (result == 0x4C6FA794)
                return "IBM037"; // Sniff for EBCDIC codepage
            else if ((result & 0xFFFF0000) == 0xFEFF0000)
                return "UTF-16";
            else if ((result & 0xFFFF0000) == 0xFFFE0000)
                return "UTF-16";
            else if ((result & 0xFFFFFF00) == 0xEFBBBF00)
                return "UTF-8";
            else return null;
        }
        finally
        {
            reset();
        }
    }

    // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
    // with the common charsets.

    private static Charset dummy1 = Charset.forName("UTF-8");
    private static Charset dummy2 = Charset.forName("UTF-16");
    private static Charset dummy3 = Charset.forName("UTF-16BE");
    private static Charset dummy4 = Charset.forName("UTF-16LE");
    private static Charset dummy5 = Charset.forName("ISO-8859-1");
    private static Charset dummy6 = Charset.forName("US-ASCII");
    private static Charset dummy7 = Charset.forName("Cp1252");


    /**
     * Decodes up to MAX_SNIFFED_BYTES bytes with the given charset and looks
     * for an encoding declaration in them, then resets the stream.
     *
     * @param encoding the charset name used to decode the leading bytes
     * @return the declared encoding, or null if none was found
     */
    private String sniffForXmlDecl(String encoding) throws IOException
    {
        mark(MAX_SNIFFED_BYTES);
        try
        {
            byte[] bytebuf = new byte[MAX_SNIFFED_BYTES];
            int bytelimit = readAsMuchAsPossible(bytebuf, 0, MAX_SNIFFED_BYTES);

            // BUGBUG in JDK: Charset.forName is not threadsafe.
            Charset charset = Charset.forName(encoding);
            Reader reader = new InputStreamReader(new ByteArrayInputStream(bytebuf, 0, bytelimit), charset);
            char[] buf = new char[bytelimit];
            int limit = 0;
            while (limit < bytelimit)
            {
                int count = reader.read(buf, limit, bytelimit - limit);
                if (count < 0)
                    break;
                limit += count;
            }

            return extractXmlDeclEncoding(buf, 0, limit);
        }
        finally
        {
            reset();
        }
    }

    private String _encoding;

    /** Returns the encoding name determined when the stream was constructed. */
    public String getXmlEncoding()
    {
        return _encoding;
    }

    /**
     * Finds the value of the "encoding" pseudo-attribute following the
     * first "<?xml" in buf[offset .. offset+size).
     *
     * @return the attribute value, or null if there is no "<?xml", no
     *         encoding attribute, or the attribute list is malformed or
     *         truncated
     */
    /* package */ static String extractXmlDeclEncoding(char[] buf, int offset, int size)
    {
        int limit = offset + size;
        int xmlpi = firstIndexOf("<?xml", buf, offset, limit);
        if (xmlpi >= 0)
        {
            int i = xmlpi + 5;
            ScannedAttribute attr = new ScannedAttribute();
            while (i < limit)
            {
                i = scanAttribute(buf, i, limit, attr);
                if (i < 0)
                    return null;
                if (attr.name.equals("encoding"))
                    return attr.value;
            }
        }
        return null;
    }

    /**
     * Returns the index of the first occurrence of s that lies entirely
     * within buf[startAt .. limit), or -1 if there is none.
     */
    private static int firstIndexOf(String s, char[] buf, int startAt, int limit)
    {
        assert(s.length() > 0);
        char[] lookFor = s.toCharArray();

        char firstchar = lookFor[0];
        // Last index at which a full match still fits; inclusive, so that a
        // match ending exactly at limit is not missed.
        int lastStart = limit - lookFor.length;
        searching: for (int pos = startAt; pos <= lastStart; pos++)
        {
            if (buf[pos] == firstchar)
            {
                for (int i = 1; i < lookFor.length; i++)
                {
                    if (buf[pos + i] != lookFor[i])
                    {
                        continue searching;
                    }
                }
                return pos;
            }
        }

        return -1;
    }

    /** Index of the first char in [startAt, limit) NOT contained in lookFor, or -1. */
    private static int nextNonmatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
    {
        searching: for (; startAt < limit; startAt++)
        {
            int thischar = buf[startAt];
            for (int i = 0; i < lookFor.length; i++)
                if (thischar == lookFor[i])
                    continue searching;
            return startAt;
        }
        return -1;
    }

    /** Index of the first char in [startAt, limit) contained in lookFor, or -1. */
    private static int nextMatchingByte(char[] lookFor, char[] buf, int startAt, int limit)
    {
        for (; startAt < limit; startAt++)
        {
            int thischar = buf[startAt];
            for (int i = 0; i < lookFor.length; i++)
                if (thischar == lookFor[i])
                    return startAt;
        }
        return -1;
    }

    /** Index of the first char in [startAt, limit) equal to lookFor, or -1. */
    private static int nextMatchingByte(char lookFor, char[] buf, int startAt, int limit)
    {
        for (; startAt < limit; startAt++)
        {
            if (buf[startAt] == lookFor)
                return startAt;
        }
        return -1;
    }
    private static char[] WHITESPACE = new char[] { ' ', '\r', '\t', '\n' };
    private static char[] NOTNAME = new char[] { '=', ' ', '\r', '\t', '\n', '?', '>', '<', '\'', '\"' };

    /** Holder for the name/value pair produced by scanAttribute. */
    private static class ScannedAttribute
    {
        public String name;
        public String value;
    }

    /**
     * Parses one name="value" (or name='value') pair starting at startAt.
     *
     * @param attr receives the name and value on success
     * @return the index just past the closing quote, or -1 if no complete,
     *         well-formed attribute is found before limit
     */
    private static int scanAttribute(char[] buf, int startAt, int limit, ScannedAttribute attr)
    {
        int nameStart = nextNonmatchingByte(WHITESPACE, buf, startAt, limit);
        if (nameStart < 0)
            return -1;
        int nameEnd = nextMatchingByte(NOTNAME, buf, nameStart, limit);
        if (nameEnd < 0)
            return -1;
        int equals = nextNonmatchingByte(WHITESPACE, buf, nameEnd, limit);
        if (equals < 0)
            return -1;
        if (buf[equals] != '=')
            return -1;
        int valQuote = nextNonmatchingByte(WHITESPACE, buf, equals + 1, limit);
        // The sniffed window may end right after '=' (or after trailing
        // whitespace); without this check buf[-1] would be indexed.
        if (valQuote < 0)
            return -1;
        if (buf[valQuote] != '\'' && buf[valQuote] != '\"')
            return -1;
        int valEndquote = nextMatchingByte(buf[valQuote], buf, valQuote + 1, limit);
        if (valEndquote < 0)
            return -1;
        attr.name = new String(buf, nameStart, nameEnd - nameStart);
        attr.value = new String(buf, valQuote + 1, valEndquote - valQuote - 1);
        return valEndquote + 1;
    }
}

/**
 * A buffered reader that inspects the beginning of an XML document for an
 * encoding declaration. The reader is reset after the inspection, so all
 * characters are still available to the consumer.
 */
class SniffedXmlReader extends BufferedReader {
  // We don't sniff more than 192 characters.
  public static int MAX_SNIFFED_CHARS = 192;

  // BUGBUG in JDK: Charset.forName is not threadsafe, so we'll prime it
  // with the common charsets.
  private static Charset dummy1 = Charset.forName("UTF-8");
  private static Charset dummy2 = Charset.forName("UTF-16");
  private static Charset dummy3 = Charset.forName("UTF-16BE");
  private static Charset dummy4 = Charset.forName("UTF-16LE");
  private static Charset dummy5 = Charset.forName("ISO-8859-1");
  private static Charset dummy6 = Charset.forName("US-ASCII");
  private static Charset dummy7 = Charset.forName("Cp1252");

  private String _encoding;

  /**
   * Wraps the reader and looks for an encoding declaration immediately.
   *
   * @param reader the XML character source to inspect
   * @throws IOException if reading or resetting the reader fails
   */
  public SniffedXmlReader(Reader reader) throws IOException {
    super(reader);
    _encoding = sniffForXmlDecl();
  }

  /**
   * Returns the encoding named in the XML declaration, or null if none was
   * found within the sniffed characters.
   */
  public String getXmlEncoding() {
    return _encoding;
  }

  /**
   * Reads up to MAX_SNIFFED_CHARS characters, extracts the declared
   * encoding from them, and rewinds the reader to where it started.
   */
  private String sniffForXmlDecl() throws IOException {
    mark(MAX_SNIFFED_CHARS);
    try {
      char[] window = new char[MAX_SNIFFED_CHARS];

      // Keep reading until the window is full or the reader is exhausted.
      int filled = 0;
      while (filled < MAX_SNIFFED_CHARS) {
        int got = read(window, filled, MAX_SNIFFED_CHARS - filled);
        if (got < 0) {
          break;
        }
        filled += got;
      }

      return SniffedXmlInputStream.extractXmlDeclEncoding(window, 0, filled);
    } finally {
      reset();
    }
  }
}
/*
 * 33.30. XML Reader
 *   33.30.1. Read Xml from InputStream and return Document
 *   33.30.2. Read Xml from Reader and return Document
 *   33.30.3. Sniffed Xml Reader
 *   33.30.4. Xml Reader To Writer
 *   33.30.5. Xml Encoding Sniffer
 *   33.30.6. Sniffed Xml InputStream to find out the declaration and file encoding
 */