uk.ac.ucl.excites.sapelli.shared.io.text.UnicodeBOM.java Source code

Introduction

Here is the source code for uk.ac.ucl.excites.sapelli.shared.io.text.UnicodeBOM.java
Source

/**
 * Sapelli data collection platform: http://sapelli.org
 * 
 * Copyright 2012-2016 University College London - ExCiteS group
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and 
 * limitations under the License.
 */

package uk.ac.ucl.excites.sapelli.shared.io.text;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;

import org.apache.commons.io.Charsets;

import uk.ac.ucl.excites.sapelli.shared.util.Objects;

/**
 * <p>
 * An instance of the <code>UnicodeBOM</code> class represents the <b>byte order mark (BOM)</b>
 * specific to a Unicode ("UTF-x") encoding. Instances hold a {@link Charset}, corresponding
 * to the specific encoding, and a byte array containing the bytes that make up the BOM.</p>
 * <p>
 * The class has static instances corresponding to each of the 5 Unicode BOMs defined in the relevant
 * <a href="http://www.unicode.org/unicode/faq/utf_bom.html">Unicode FAQ document</a>, as well as a 6th
 * instance, {@link #NO_BOM}, representing the absence of any (known/detectable) BOM.</p>
 * <p>
 * This class also provides static methods to deal with reading, detecting, and writing Unicode BOMs.</p> 
 * <p>
 * <b>Note 1:</b><br/>
 *    Java's {@link OutputStreamWriter} annoyingly does not insert BOMs automatically. The static {@code GetWriter()}
 *    methods in this class solve this problem.</p>
 * <p>
 * <b>Note 2:</b><br/> 
 *    While Java's {@code InputStreamReader} detects and skips the BOM on UTF-16 files, it does not recognise the UTF-8 BOM.
 *    This problem can be solved by wrapping the {@link InputStream} in a {@link UnicodeBOMInputStream} in order to detect
 *    and skip the BOM (and thereby detect the used {@link Charset}).</p> 
 * 
 * @author mstevens
 * 
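 * @see <a href="http://koti.mbnet.fi/akini/java/java_utf8_xml">http://koti.mbnet.fi/akini/java/java_utf8_xml</a>
 * @see <a href="http://koti.mbnet.fi/akini/java/unicodereader">http://koti.mbnet.fi/akini/java/unicodereader</a>
 * @see <a href="http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058">http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4508058</a>
 * @see <a href="http://tripoverit.blogspot.be/2007/04/javas-utf-8-and-unicode-writing-is.html">http://tripoverit.blogspot.be/2007/04/javas-utf-8-and-unicode-writing-is.html</a>
 * @see <a href="http://www.rgagnon.com/javadetails/java-handle-utf8-file-with-bom.html">http://www.rgagnon.com/javadetails/java-handle-utf8-file-with-bom.html</a>
 * @see <a href="https://svn.codehaus.org/jtstand/jtstand/tags/jtstand-1.5.9/jtstand-editor/src/main/java/org/fife/io/UnicodeWriter.java">https://svn.codehaus.org/jtstand/jtstand/tags/jtstand-1.5.9/jtstand-editor/src/main/java/org/fife/io/UnicodeWriter.java</a>
 * @see <a href="https://github.com/gpakosz/UnicodeBOMInputStream">https://github.com/gpakosz/UnicodeBOMInputStream</a>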
 */
public final class UnicodeBOM {

    // STATICS --------------------------------------------------------------------------
    // The charsets are looked up by name. UTF-8, UTF-16LE and UTF-16BE are charsets which every
    // implementation of the Java platform is required to support, so those lookups cannot fail.

    /** Byte order mark of the UTF-8 encoding: {@code EF BB BF}. */
    public static final UnicodeBOM UTF8_BOM = new UnicodeBOM(Charset.forName("UTF-8"),
            new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });

    /** Byte order mark of the UTF-16 little-endian encoding: {@code FF FE}. */
    public static final UnicodeBOM UTF16LE_BOM = new UnicodeBOM(Charset.forName("UTF-16LE"),
            new byte[] { (byte) 0xFF, (byte) 0xFE });

    /** Byte order mark of the UTF-16 big-endian encoding: {@code FE FF}. */
    public static final UnicodeBOM UTF16BE_BOM = new UnicodeBOM(Charset.forName("UTF-16BE"),
            new byte[] { (byte) 0xFE, (byte) 0xFF });

    /** Byte order mark of the UTF-32 little-endian encoding: {@code FF FE 00 00}. */
    public static final UnicodeBOM UTF32LE_BOM = new UnicodeBOM(Charset.forName("UTF-32LE"),
            new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 });

    /** Byte order mark of the UTF-32 big-endian encoding: {@code 00 00 FE FF}. */
    public static final UnicodeBOM UTF32BE_BOM = new UnicodeBOM(Charset.forName("UTF-32BE"),
            new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF });

    /** Represents the absence of any (known/detectable) BOM; it has no charset and zero bytes. */
    public static final UnicodeBOM NO_BOM = new UnicodeBOM(null, new byte[] {});

    /** All known instances; lookups below do not depend on the order of this array. */
    private static final UnicodeBOM[] BOMS = { UTF8_BOM, UTF16LE_BOM, UTF16BE_BOM, UTF32LE_BOM, UTF32BE_BOM,
            NO_BOM };

    /**
     * Null-safe comparison of two charsets by value (i.e. using {@link Charset#equals(Object)})
     * rather than by reference.
     * 
     * @param a the first charset, may be {@code null}
     * @param b the second charset, may be {@code null}
     * @return {@code true} if both are {@code null} or both are equal
     */
    private static boolean sameCharset(Charset a, Charset b) {
        return a == null ? b == null : a.equals(b);
    }

    /**
     * Returns a UnicodeBOM instance for the given (Unicode) character set.
     * 
     * @param charset the charset to find the BOM for; passing {@code null} yields {@link #NO_BOM}
     * @return the UnicodeBOM for the given Charset, or {@code null} if no matching BOM was found
     */
    public static UnicodeBOM GetBOM(Charset charset) {
        for (UnicodeBOM bom : BOMS) {
            if (sameCharset(bom.charset, charset)) {
                return bom;
            }
        }
        return null;
    }

    /**
     * Returns a UnicodeBOM instance for the (Unicode) character set with the given name.
     * 
     * @param charsetName the name (or an alias) of the charset to find the BOM for
     * @return the UnicodeBOM for the named charset, or {@code null} if no matching BOM was found
     * @throws IllegalCharsetNameException if the specified charset name is illegal.
     * @throws UnsupportedCharsetException if the desired charset is not supported by this runtime.
     */
    public static UnicodeBOM GetBOM(String charsetName)
            throws IllegalCharsetNameException, UnsupportedCharsetException {
        return GetBOM(Charset.forName(charsetName));
    }

    /**
     * Returns the UnicodeBOM instance matching the start of the given bytes, or {@link #NO_BOM} if
     * the given byte array does not start with any of the (known) Unicode byte order marks.
     * <p>
     * When several BOMs match, the longest one wins. This matters because the UTF-32LE BOM
     * ({@code FF FE 00 00}) begins with the UTF-16LE BOM ({@code FF FE}): input starting with
     * {@code FF FE 00 00} is reported as {@link #UTF32LE_BOM}, while {@code FF FE} followed by
     * anything else (or by fewer than 2 more bytes) is reported as {@link #UTF16LE_BOM}.</p>
     * 
     * @param bomBytes the (first) bytes of the data to inspect; may be longer than the BOM itself,
     *                 {@code null} is treated like an empty array
     * @return a matching UnicodeBOM instance, never {@code null} but possibly {@link #NO_BOM}
     */
    public static UnicodeBOM GetBOM(byte[] bomBytes) {
        UnicodeBOM longestMatch = NO_BOM;
        if (bomBytes != null) {
            for (UnicodeBOM bom : BOMS) {
                // Only a strictly longer BOM can replace the current candidate
                if (bom.bytes.length > longestMatch.bytes.length && bom.isPrefixOf(bomBytes)) {
                    longestMatch = bom;
                }
            }
        }
        return longestMatch;
    }

    /**
     * @return the maximum length (in number of bytes) of the known Unicode byte order marks
     */
    static /*package*/ int GetMaxBOMLength() {
        int max = 0;
        for (UnicodeBOM bom : BOMS) {
            max = Math.max(max, bom.bytes.length);
        }
        return max;
    }

    /**
     * Returns an {@link OutputStreamWriter} using the system default {@link Charset}.
     * The given {@link OutputStream} is assumed not to be appending.
     * If there is a known UnicodeBOM for the charset then the appropriate BOM bytes will be written to the output.
     * 
     * @param out the {@link OutputStream} to write to
     * @return a new {@link OutputStreamWriter} wrapping {@code out}
     * @throws IllegalArgumentException
     * @throws NullPointerException if {@code out} is {@code null}
     * @throws IOException if an I/O error occurs upon writing the BOM
     */
    public static OutputStreamWriter GetWriter(OutputStream out)
            throws IllegalArgumentException, NullPointerException, IOException {
        return GetWriter(out, Charset.defaultCharset(), false);
    }

    /**
     * Returns an {@link OutputStreamWriter} using the system default {@link Charset}.
     * If there is a known UnicodeBOM for that charset and {@code append} is {@code false} then the appropriate BOM bytes will be written to the output.
     * 
     * @param out the {@link OutputStream} to write to
     * @param append whether the output is being appended to existing data, in which case no BOM is written
     * @return a new {@link OutputStreamWriter} wrapping {@code out}
     * @throws IllegalArgumentException
     * @throws NullPointerException if {@code out} is {@code null}
     * @throws IOException if an I/O error occurs upon writing the BOM
     */
    public static OutputStreamWriter GetWriter(OutputStream out, boolean append)
            throws IllegalArgumentException, NullPointerException, IOException {
        return GetWriter(out, Charset.defaultCharset(), append);
    }

    /**
     * Returns an {@link OutputStreamWriter} using the named {@link Charset}.
     * The given {@link OutputStream} is assumed not to be appending.
     * If there is a known UnicodeBOM for the charset then the appropriate BOM bytes will be written to the output.
     * 
     * @param out the {@link OutputStream} to write to
     * @param charsetName the name (or an alias) of the charset to encode with
     * @return a new {@link OutputStreamWriter} wrapping {@code out}
     * @throws IllegalCharsetNameException if the specified charset name is illegal
     * @throws UnsupportedCharsetException if the desired charset is not supported by this runtime
     * @throws IllegalArgumentException if {@code charsetName} is {@code null}
     * @throws NullPointerException if {@code out} is {@code null}
     * @throws IOException if an I/O error occurs upon writing the BOM
     */
    public static OutputStreamWriter GetWriter(OutputStream out, String charsetName)
            throws IllegalCharsetNameException, UnsupportedCharsetException, IllegalArgumentException,
            NullPointerException, IOException {
        return GetWriter(out, charsetName, false);
    }

    /**
     * Returns an {@link OutputStreamWriter} using the named {@link Charset}.
     * If there is a known UnicodeBOM for that charset and {@code append} is {@code false} then the appropriate BOM bytes will be written to the output.
     * 
     * @param out the {@link OutputStream} to write to
     * @param charsetName the name (or an alias) of the charset to encode with
     * @param append whether the output is being appended to existing data, in which case no BOM is written
     * @return a new {@link OutputStreamWriter} wrapping {@code out}
     * @throws IllegalCharsetNameException if the specified charset name is illegal
     * @throws UnsupportedCharsetException if the desired charset is not supported by this runtime
     * @throws IllegalArgumentException if {@code charsetName} is {@code null}
     * @throws NullPointerException if {@code out} is {@code null}
     * @throws IOException if an I/O error occurs upon writing the BOM
     */
    public static OutputStreamWriter GetWriter(OutputStream out, String charsetName, boolean append)
            throws IllegalCharsetNameException, UnsupportedCharsetException, IllegalArgumentException,
            NullPointerException, IOException {
        return GetWriter(out, Charset.forName(charsetName), append);
    }

    /**
     * Returns an {@link OutputStreamWriter} using the given {@link Charset}.
     * The given {@link OutputStream} is assumed not to be appending.
     * If there is a known UnicodeBOM for the charset then the appropriate BOM bytes will be written to the output.
     * 
     * @param out the {@link OutputStream} to write to
     * @param charset the charset to encode with
     * @return a new {@link OutputStreamWriter} wrapping {@code out}
     * @throws NullPointerException if {@code out} or {@code charset} is {@code null}
     * @throws IOException if an I/O error occurs upon writing the BOM
     */
    public static OutputStreamWriter GetWriter(OutputStream out, Charset charset)
            throws NullPointerException, IOException {
        return GetWriter(out, charset, false);
    }

    /**
     * Returns an {@link OutputStreamWriter} using the given {@link Charset}.
     * If there is a known UnicodeBOM for that charset and {@code append} is {@code false} then the appropriate BOM bytes will be written to the output.
     * 
     * @param out the {@link OutputStream} to write to
     * @param charset the charset to encode with
     * @param append whether the output is being appended to existing data, in which case no BOM is written
     * @return a new {@link OutputStreamWriter} wrapping {@code out}
     * @throws NullPointerException if {@code out} or {@code charset} is {@code null}
     * @throws IOException if an I/O error occurs upon writing the BOM
     */
    public static OutputStreamWriter GetWriter(OutputStream out, Charset charset, boolean append)
            throws NullPointerException, IOException {
        // Write BOM if needed (the BOM goes straight to the stream, before any encoded text):
        if (!append) {
            UnicodeBOM bom = GetBOM(charset);
            if (bom != null) {
                bom.writeBytes(out);
            }
        }
        // Return writer:
        return new OutputStreamWriter(out, charset);
    }

    // DYNAMICS -------------------------------------------------------------------------
    private final Charset charset;
    private final byte[] bytes;

    /**
     * @param charset may be {@code null} if and only if {@code bytes.length} is 0 (-> "NO_BOM")
     * @param bytes should not be {@code null}, may have length 0 if and only if {@code charset} is {@code null} (-> "NO_BOM")
     * @throws IllegalArgumentException if the above conditions are not met
     */
    private UnicodeBOM(Charset charset, byte[] bytes) {
        if (bytes == null || (bytes.length == 0) != (charset == null)) {
            throw new IllegalArgumentException("Invalid charset and/or bytes");
        }
        this.charset = charset;
        this.bytes = bytes;
    }

    /**
     * Checks whether the given data starts with the bytes of this BOM.
     * 
     * @param data the bytes to inspect, must not be {@code null}
     * @return {@code true} if {@code data} is at least as long as this BOM and begins with its bytes
     */
    private boolean isPrefixOf(byte[] data) {
        if (data.length < bytes.length) {
            return false;
        }
        for (int i = 0; i < bytes.length; i++) {
            if (data[i] != bytes[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * @return the name of the charset this BOM belongs to, or {@code "NONE"} if it has no charset ({@link #NO_BOM})
     */
    public String getCharsetName() {
        return charset != null ? charset.name() : "NONE";
    }

    /**
     * @return the charset this BOM belongs to, {@code null} in case of {@link #NO_BOM}
     */
    public Charset getCharset() {
        return charset;
    }

    /**
     * @return a copy of the bytes making up this BOM (modifying it does not affect this instance)
     */
    public byte[] getBytes() {
        return Arrays.copyOf(bytes, bytes.length);
    }

    /**
     * @return the length of this BOM in number of bytes (0 for {@link #NO_BOM})
     */
    public int getLength() {
        return bytes.length;
    }

    /**
     * Writes the bytes of this BOM to the given stream.
     * 
     * @param outputStream the stream to write to
     * @throws IOException if an I/O error occurs upon writing
     */
    public void writeBytes(OutputStream outputStream) throws IOException {
        outputStream.write(bytes);
    }

    @Override
    public String toString() {
        return getClass().getSimpleName() + '_' + getCharsetName();
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof UnicodeBOM)) {
            return false;
        }
        UnicodeBOM that = (UnicodeBOM) obj;
        return sameCharset(this.charset, that.charset) && Arrays.equals(this.bytes, that.bytes);
    }

    @Override
    public int hashCode() {
        int hash = 1;
        hash = 31 * hash + (charset == null ? 0 : charset.hashCode());
        hash = 31 * hash + Arrays.hashCode(bytes);
        return hash;
    }

}