org.apache.commons.io.input.BOMInputStream.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.commons.io.input.BOMInputStream.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.ByteOrderMark;

/**
 * This class is used to wrap a stream that includes an encoded
 * {@link ByteOrderMark} as its first bytes.
 *
 * This class detects these bytes and, if required, can automatically skip them
 * and return the subsequent byte as the first byte in the stream.
 *
 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
 * <ul>
 *   <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 *   <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
 *   <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
 * </ul>
 *
 *
 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
 * <pre>
 *      BOMInputStream bomIn = new BOMInputStream(in);
 *      if (bomIn.hasBOM()) {
 *          // has a UTF-8 BOM
 *      }
 * </pre>
 *
 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
 * <pre>
 *      boolean include = true;
 *      BOMInputStream bomIn = new BOMInputStream(in, include);
 *      if (bomIn.hasBOM()) {
 *          // has a UTF-8 BOM
 *      }
 * </pre>
 *
 * <h3>Example 3 - Detect Multiple BOMs</h3>
 * <pre>
 *      BOMInputStream bomIn = new BOMInputStream(in, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
 *      if (bomIn.hasBOM() == false) {
 *          // No BOM found
 *      } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *          // has a UTF-16LE BOM
 *      } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *          // has a UTF-16BE BOM
 *      }
 * </pre>
 *
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @version $Revision: 1052095 $ $Date: 2010-12-22 23:03:20 +0000 (Wed, 22 Dec 2010) $
 * @since Commons IO 2.0
 */
public class BOMInputStream extends ProxyInputStream {
    private final boolean include;
    private final List<ByteOrderMark> boms;
    private ByteOrderMark byteOrderMark;
    private int[] firstBytes;
    private int fbLength;
    private int fbIndex;
    private int markFbIndex;
    private boolean markedAtStart;

    /**
     * Constructs a new BOM InputStream that excludes
     * a {@link ByteOrderMark#UTF_8} BOM.
     * @param delegate the InputStream to delegate to
     */
    public BOMInputStream(InputStream delegate) {
        this(delegate, false, ByteOrderMark.UTF_8);
    }

    /**
     * Constructs a new BOM InputStream that detects a
     * a {@link ByteOrderMark#UTF_8} and optionally includes it.
     * @param delegate the InputStream to delegate to
     * @param include true to include the UTF-8 BOM or
     * false to exclude it
     */
    public BOMInputStream(InputStream delegate, boolean include) {
        this(delegate, include, ByteOrderMark.UTF_8);
    }

    /**
     * Constructs a new BOM InputStream that excludes
     * the specified BOMs.
     * @param delegate the InputStream to delegate to
     * @param boms The BOMs to detect and exclude
     */
    public BOMInputStream(InputStream delegate, ByteOrderMark... boms) {
        this(delegate, false, boms);
    }

    /**
     * Constructs a new BOM InputStream that detects the
     * specified BOMs and optionally includes them.
     * @param delegate the InputStream to delegate to
     * @param include true to include the specified BOMs or
     * false to exclude them
     * @param boms The BOMs to detect and optionally exclude
     */
    public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
        super(delegate);
        if (boms == null || boms.length == 0) {
            throw new IllegalArgumentException("No BOMs specified");
        }
        this.include = include;
        this.boms = Arrays.asList(boms);
    }

    /**
     * Indicates whether the stream contains one of the specified BOMs.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false
     * if it does not
     * @throws IOException if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM() throws IOException {
        return (getBOM() != null);
    }

    /**
     * Indicates whether the stream contains the specified BOM.
     *
     * @param bom The BOM to check for
     * @return true if the stream has the specified BOM, otherwise false
     * if it does not
     * @throws IllegalArgumentException if the BOM is not one the stream
     * is configured to detect
     * @throws IOException if an error reading the first bytes of the stream occurs
     */
    public boolean hasBOM(ByteOrderMark bom) throws IOException {
        if (!boms.contains(bom)) {
            throw new IllegalArgumentException("Stream not configure to detect " + bom);
        }
        return (byteOrderMark != null && getBOM().equals(bom));
    }

    /**
     * Return the BOM (Byte Order Mark).
     *
     * @return The BOM or null if none
     * @throws IOException if an error reading the first bytes of the stream occurs
     */
    public ByteOrderMark getBOM() throws IOException {
        if (firstBytes == null) {
            int max = 0;
            for (ByteOrderMark bom : boms) {
                max = Math.max(max, bom.length());
            }
            firstBytes = new int[max];
            for (int i = 0; i < firstBytes.length; i++) {
                firstBytes[i] = in.read();
                fbLength++;
                if (firstBytes[i] < 0) {
                    break;
                }

                byteOrderMark = find();
                if (byteOrderMark != null) {
                    if (!include) {
                        fbLength = 0;
                    }
                    break;
                }
            }
        }
        return byteOrderMark;
    }

    /**
     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
     *
     * @return The BOM charset Name or null if no BOM found
     * @throws IOException if an error reading the first bytes of the stream occurs
     * 
     */
    public String getBOMCharsetName() throws IOException {
        getBOM();
        return (byteOrderMark == null ? null : byteOrderMark.getCharsetName());
    }

    /**
     * This method reads and either preserves or skips the first bytes in the
     * stream. It behaves like the single-byte <code>read()</code> method,
     * either returning a valid byte or -1 to indicate that the initial bytes
     * have been processed already.
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException if an I/O error occurs
     */
    private int readFirstBytes() throws IOException {
        getBOM();
        return (fbIndex < fbLength) ? firstBytes[fbIndex++] : -1;
    }

    /**
     * Find a BOM with the specified bytes.
     *
     * @return The matched BOM or null if none matched
     */
    private ByteOrderMark find() {
        for (ByteOrderMark bom : boms) {
            if (matches(bom)) {
                return bom;
            }
        }
        return null;
    }

    /**
     * Check if the bytes match a BOM.
     *
     * @param bom The BOM
     * @return true if the bytes match the bom, otherwise false
     */
    private boolean matches(ByteOrderMark bom) {
        if (bom.length() != fbLength) {
            return false;
        }
        for (int i = 0; i < bom.length(); i++) {
            if (bom.get(i) != firstBytes[i]) {
                return false;
            }
        }
        return true;
    }

    //----------------------------------------------------------------------------
    //  Implementation of InputStream
    //----------------------------------------------------------------------------

    /**
     * Invokes the delegate's <code>read()</code> method, detecting and
     * optionally skipping BOM.
     * @return the byte read (excluding BOM) or -1 if the end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        int b = readFirstBytes();
        return (b >= 0) ? b : in.read();
    }

    /**
     * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting
     * and optionally skipping BOM.
     * @param buf the buffer to read the bytes into
     * @param off The start offset
     * @param len The number of bytes to read (excluding BOM)
     * @return the number of bytes read or -1 if the end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(byte[] buf, int off, int len) throws IOException {
        int firstCount = 0;
        int b = 0;
        while ((len > 0) && (b >= 0)) {
            b = readFirstBytes();
            if (b >= 0) {
                buf[off++] = (byte) (b & 0xFF);
                len--;
                firstCount++;
            }
        }
        int secondCount = in.read(buf, off, len);
        return (secondCount < 0) ? (firstCount > 0 ? firstCount : -1) : firstCount + secondCount;
    }

    /**
     * Invokes the delegate's <code>read(byte[])</code> method, detecting and
     * optionally skipping BOM.
     * @param buf the buffer to read the bytes into
     * @return the number of bytes read (excluding BOM)
     * or -1 if the end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }

    /**
     * Invokes the delegate's <code>mark(int)</code> method.
     * @param readlimit read ahead limit
     */
    @Override
    public synchronized void mark(int readlimit) {
        markFbIndex = fbIndex;
        markedAtStart = (firstBytes == null);
        in.mark(readlimit);
    }

    /**
     * Invokes the delegate's <code>reset()</code> method.
     * @throws IOException if an I/O error occurs
     */
    @Override
    public synchronized void reset() throws IOException {
        fbIndex = markFbIndex;
        if (markedAtStart) {
            firstBytes = null;
        }

        in.reset();
    }

    /**
     * Invokes the delegate's <code>skip(long)</code> method, detecting
     * and optionallyskipping BOM.
     * @param n the number of bytes to skip
     * @return the number of bytes to skipped or -1 if the end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public long skip(long n) throws IOException {
        while ((n > 0) && (readFirstBytes() >= 0)) {
            n--;
        }
        return in.skip(n);
    }
}