mitm.common.util.AutoDetectUnicodeReader.java Source code

Introduction

Here is the source code for mitm.common.util.AutoDetectUnicodeReader.java
Source

/*
 * Copyright (c) 2010-2012, Martijn Brinkers, Djigzo.
 * 
 * This file is part of Djigzo email encryption.
 *
 * Djigzo is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License 
 * version 3, 19 November 2007 as published by the Free Software 
 * Foundation.
 *
 * Djigzo is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public 
 * License along with Djigzo. If not, see <http://www.gnu.org/licenses/>
 *
 * Additional permission under GNU AGPL version 3 section 7
 * 
 * If you modify this Program, or any covered work, by linking or 
 * combining it with aspectjrt.jar, aspectjweaver.jar, tyrex-1.0.3.jar, 
 * freemarker.jar, dom4j.jar, mx4j-jmx.jar, mx4j-tools.jar, 
 * spice-classman-1.0.jar, spice-loggerstore-0.5.jar, spice-salt-0.8.jar, 
 * spice-xmlpolicy-1.0.jar, saaj-api-1.3.jar, saaj-impl-1.3.jar, 
 * wsdl4j-1.6.1.jar (or modified versions of these libraries), 
 * containing parts covered by the terms of Eclipse Public License, 
 * tyrex license, freemarker license, dom4j license, mx4j license,
 * Spice Software License, Common Development and Distribution License
 * (CDDL), Common Public License (CPL) the licensors of this Program grant 
 * you additional permission to convey the resulting work.
 */
package mitm.common.util;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.nio.charset.Charset;

import mitm.common.locale.CharacterEncoding;

import org.apache.commons.lang.ArrayUtils;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Reader that tries to detect the encoding (for example UTF-8) used by the input stream. The first detection
 * step is to check whether the input starts with a byte order mark (BOM). If a BOM is found, the encoding
 * indicated by the BOM is used and the BOM itself is skipped. If a BOM is not found, the encoding is guessed
 * using a {@link CharsetDetector}. If nothing can be guessed, US-ASCII is used.
 * <p>
 * Detection is lazy: it is done on the first call to {@link #read(char[], int, int)}.
 * 
 * @author Martijn Brinkers
 *
 */
public class AutoDetectUnicodeReader extends Reader {
    /*
     * Maximal BOM size
     */
    private static final int BOM_SIZE = 4;

    /*
     * Max bytes used for auto detect
     */
    private static final int AUTO_DETECT_BYTES = SizeUtils.KB * 8;

    /*
     * The encoding to use (null until given by the caller or detected)
     */
    private String encoding;

    /*
     * The confidence level of the detected encoding (0-100). Only set when the encoding was guessed by
     * the CharsetDetector.
     */
    private int confidence;

    /*
     * The input
     */
    private final InputStream input;

    /*
     * Required for BOM and charset detection (bytes inspected during detection are pushed back)
     */
    private final PushbackInputStream pushback;

    /*
     * Reader wrapping the pushback stream. Created on first read.
     */
    private Reader reader;

    /**
     * Creates an instance of AutoDetectUnicodeReader that will detect the encoding using a BOM or, when
     * there is no BOM, based on the bytes of the input using a CharsetDetector.
     * 
     * @param input the stream to read from, must not be null
     */
    public AutoDetectUnicodeReader(InputStream input) {
        this(input, null);
    }

    /**
     * Convenience constructor. If encoding is non-null, the encoding will not be detected but the given
     * encoding will be used. This constructor is added for convenience to make it possible to use this
     * reader when the encoding is known.
     * 
     * @param input the stream to read from, must not be null
     * @param encoding the encoding to use, or null to auto detect the encoding
     */
    public AutoDetectUnicodeReader(InputStream input, String encoding) {
        Check.notNull(input, "input");

        this.input = input;
        this.encoding = encoding;
        this.pushback = new PushbackInputStream(input, AUTO_DETECT_BYTES);
    }

    /*
     * Returns true if at least prefix.length bytes are available and data starts with the given byte values.
     */
    private static boolean startsWith(byte[] data, int available, int... prefix) {
        if (available < prefix.length) {
            return false;
        }

        for (int i = 0; i < prefix.length; i++) {
            if (data[i] != (byte) prefix[i]) {
                return false;
            }
        }

        return true;
    }

    /*
     * Reads up to BOM_SIZE bytes from the input and checks whether they form a Unicode BOM. If so, encoding
     * is set and the BOM is consumed. All bytes which are not part of a BOM are pushed back.
     * 
     * Returns the number of bytes read from the input, or -1 if the input was already at EOF.
     */
    private int detectBOM() throws IOException {
        byte[] bom = new byte[BOM_SIZE];

        /*
         * A single read may return less bytes than requested even if more bytes will follow, so keep
         * reading until the buffer is full or EOF is reached.
         */
        int n = 0;

        while (n < bom.length) {
            int r = input.read(bom, n, bom.length - n);

            if (r < 0) {
                break;
            }

            n += r;
        }

        if (n == 0) {
            return -1;
        }

        /*
         * Only compare against bytes that were really read. The 4 byte BOMs must be checked before the
         * 2 byte BOMs because the UTF-32LE BOM starts with the UTF-16LE BOM.
         */
        int bomLength = 0;

        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        }

        /*
         * Unread everything after the BOM (or all bytes if a BOM was not found)
         */
        int unread = n - bomLength;

        if (unread > 0) {
            pushback.unread(bom, bomLength, unread);
        }

        return n;
    }

    /*
     * Guesses the encoding with a CharsetDetector using at most AUTO_DETECT_BYTES bytes. The bytes used
     * for the detection are pushed back. encoding and confidence are only set if a supported charset
     * was found.
     */
    private void detectCharset() throws IOException {
        CharsetDetector detector = new CharsetDetector();

        byte[] detectionBuffer = new byte[AUTO_DETECT_BYTES];

        int read = pushback.read(detectionBuffer);

        if (read == -1) {
            throw new IOException("EOF during detection.");
        }

        byte[] detectionBytes = ArrayUtils.subarray(detectionBuffer, 0, read);

        /*
         * need to push back the bytes read for detection
         */
        pushback.unread(detectionBytes);

        detector.setText(detectionBytes);

        CharsetMatch[] matches = detector.detectAll();

        if (matches == null) {
            return;
        }

        /*
         * There might be multiple encodings. Find the first valid one.
         */
        for (CharsetMatch match : matches) {
            if (match == null) {
                continue;
            }

            if (Charset.isSupported(match.getName())) {
                encoding = match.getName();
                confidence = match.getConfidence();

                break;
            }
        }
    }

    /*
     * Determines the encoding (if not yet known) and creates the wrapped reader. Does nothing if the
     * reader was already created.
     */
    private void detect() throws IOException {
        if (reader != null) {
            return;
        }

        if (encoding == null) {
            int n = detectBOM();

            /*
             * if n == -1, an EOF was detected during BOM detection, i.e., there is nothing to detect
             */
            if (encoding == null && n != -1) {
                /*
                 * No BOM found so auto detect charset
                 */
                detectCharset();
            }

            if (encoding == null) {
                /*
                 * fallback to ASCII if no encoding is found.
                 */
                encoding = CharacterEncoding.US_ASCII;
            }
        }

        reader = new InputStreamReader(pushback, encoding);
    }

    /**
     * Reads characters using the given or detected encoding. The encoding is detected on the first call.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        detect();

        return reader.read(cbuf, off, len);
    }

    /**
     * Closes the wrapped reader, or the input stream itself if nothing has been read yet.
     */
    @Override
    public void close() throws IOException {
        if (reader != null) {
            reader.close();
        } else {
            input.close();
        }
    }

    /**
     * Returns the encoding in use. If no encoding was given to the constructor, this returns null until
     * the first read has triggered the detection.
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * Returns the confidence (0-100) reported by the CharsetDetector for the detected encoding. The value
     * is 0 if the encoding was given to the constructor, was determined from a BOM, or has not yet been
     * detected.
     */
    public int getConfidence() {
        return confidence;
    }
}