Java tutorial
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See License.txt in the repository root.

package com.microsoft.tfs.core.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.text.MessageFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.microsoft.tfs.core.Messages;
import com.microsoft.tfs.core.clients.versioncontrol.path.LocalPath;
import com.microsoft.tfs.core.clients.versioncontrol.path.ServerPath;
import com.microsoft.tfs.core.clients.versioncontrol.path.Wildcard;
import com.microsoft.tfs.core.exceptions.TECoreException;
import com.microsoft.tfs.util.Check;
import com.microsoft.tfs.util.IOUtils;
import com.microsoft.tfs.util.Platform;

/**
 * Static methods which assist in detecting text encodings in disk files. Always
 * returns encodings represented by known {@link FileEncoding} instances (which
 * can be queried for their text names and code page numbers).
 *
 * @since TEE-SDK-10.1
 * @threadsafety immutable
 */
public final class FileEncodingDetector {
    private static final Log tracer = LogFactory.getLog(FileEncodingDetector.class);

    /**
     * This class is static, no one constructs us.
     */
    private FileEncodingDetector() {
    }

    /**
     * Detects the encoding used for a server or local path with hints. For
     * server paths, the contents are never read and only the hint is used.
     * <p>
     * The {@link FileEncoding#AUTOMATICALLY_DETECT} encoding hint is only valid
     * for local (not server) paths that do not contain wildcard characters,
     * exist on disk, and are files (not directories). If it is specified for
     * other kinds of paths, an exception is thrown.
     * <p>
     * Encoding hints are evaluated in the following way:
     * <ul>
     * <li>If the hint is {@link FileEncoding#BINARY} that encoding is returned
     * immediately for all item types.</li>
     * <li>If the hint is {@link FileEncoding#DEFAULT_TEXT} the default encoding
     * for this platform is returned immediately for all item types.</li>
     * <li>If the hint is something other than
     * {@link FileEncoding#AUTOMATICALLY_DETECT}, that encoding is returned
     * immediately for all item types.</li>
     * <li>The hint is {@link FileEncoding#AUTOMATICALLY_DETECT}:
     * <ul>
     * <li>If the path is a server path, is a local directory, or contains
     * wildcards, an exception is thrown.</li>
     * <li>The file's contents are read to determine the encoding, which is
     * returned.</li>
     * </ul>
     * </li>
     * </ul>
     *
     * @param path
     *        the path to detect encoding for (must not be <code>null</code>)
     * @param encodingHint
     *        the encoding hint (must not be <code>null</code>)
     * @return the {@link FileEncoding} that matches the given file's encoding.
     * @throws TECoreException
     *         if the specified encoding hint is not valid for the type of path
     *         given
     */
    public static FileEncoding detectEncoding(final String path, final FileEncoding encodingHint) {
        tracer.trace(MessageFormat.format("path={0}, encodingHint={1}", path, encodingHint)); //$NON-NLS-1$

        Check.notNull(path, "path"); //$NON-NLS-1$
        Check.notNull(encodingHint, "encodingHint"); //$NON-NLS-1$

        // These hints apply immediately to all types of paths.
        if (encodingHint == FileEncoding.BINARY) {
            return encodingHint;
        } else if (encodingHint == FileEncoding.DEFAULT_TEXT) {
            return FileEncoding.getDefaultTextEncoding();
        } else if (encodingHint != FileEncoding.AUTOMATICALLY_DETECT) {
            return encodingHint;
        }

        // The encoding is FileEncoding.AUTOMATICALLY_DETECT

        if (ServerPath.isServerPath(path)) {
            throw new TECoreException(
                MessageFormat.format(Messages.getString("FileEncodingDetector.ServerPathsNotDetectedFormat"), //$NON-NLS-1$
                    path));
        } else if (Wildcard.isWildcard(path)) {
            throw new TECoreException(MessageFormat.format(
                Messages.getString("FileEncodingDetector.LocalItemContainsWildcardsFormat"), //$NON-NLS-1$
                path));
        }

        // The path is a local path without wildcards

        final File file = new File(LocalPath.canonicalize(path));

        if (file.exists() == false) {
            throw new TECoreException(
                MessageFormat.format(Messages.getString("FileEncodingDetector.LocalItemDoesNotExistFormat"), //$NON-NLS-1$
                    path));
        } else if (file.isDirectory()) {
            throw new TECoreException(
                MessageFormat.format(Messages.getString("FileEncodingDetector.LocalItemIsDirectoryFormat"), //$NON-NLS-1$
                    path));
        }

        FileInputStream stream = null;

        try {
            stream = new FileInputStream(path);

            final byte[] buffer = new byte[1024];

            // Look for the Unicode Byte Order Mark (BOM).
            final int read = stream.read(buffer, 0, buffer.length);

            if (read < 0) {
                return FileEncoding.getDefaultTextEncoding();
            }

            /*
             * Examine the BOM for details. Java uses signed bytes, so we use
             * (signed) integers for easier comparison.
             */
            if (read >= 2 && buffer[0] == -2 && buffer[1] == -1) {
                return FileEncoding.UTF_16BE;
            } else if (read >= 2 && buffer[0] == -1 && buffer[1] == -2) {
                if (read >= 4 && buffer[2] == 0 && buffer[3] == 0) {
                    return FileEncoding.UTF_32;
                } else {
                    return FileEncoding.UTF_16;
                }
            } else if (read >= 3 && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65) {
                return FileEncoding.UTF_8;
            } else if (read >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == -2 && buffer[3] == -1) {
                return FileEncoding.UTF_32BE;
            } else if (startsWithPDFHeader(buffer, buffer.length)) {
                /*
                 * We go out of our way to detect PDF files so we can claim
                 * they're all binary. This is because a PDF file can be created
                 * with no non-text bytes in the first 1024 bytes (or in the
                 * whole file itself). However, these files shouldn't ever be
                 * automatically merged, and in the case where the first 1024
                 * bytes are clean, there may lie non-text bytes near the end.
                 */
                return FileEncoding.BINARY;
            }

            /*
             * No encoding determined yet. Search the chunk we read for
             * "non-text" characters that would indicate this is not a text
             * file. The values we search for on z/OS are for EBCDIC, the
             * others use ASCII.
             */
            if (Platform.isCurrentPlatform(Platform.Z_OS)) {
                if (looksLikeEBCDIC(buffer, read) == false) {
                    return FileEncoding.BINARY;
                }
            } else {
                if (looksLikeANSI(buffer, read) == false) {
                    return FileEncoding.BINARY;
                }
            }

            /*
             * If we got down here, we could not identify the file as certainly
             * binary by searching the first block of the file we read, so it's
             * probably simple text. We should use the default text encoding
             * for this platform / user.
             *
             * This covers the case of an empty file, which VSS treats as a
             * text file in the default encoding, and so does TFS, so we should
             * too.
             */
            return FileEncoding.getDefaultTextEncoding();
        } catch (final IOException e) {
            throw new TECoreException(e);
        } finally {
            if (stream != null) {
                IOUtils.closeSafely(stream);
            }
        }
    }

    /**
     * Tests whether the given bytes declare an Adobe PDF file.
     *
     * See the PDF file spec at
     * http://partners.adobe.com/public/developer/pdf/index_reference.html for
     * details on file structure.
     *
     * @param bytes
     *        the bytes to test, where the byte at index 0 is the first byte of
     *        the file.
     * @param size
     *        the maximum number of bytes to evaluate in the given byte array.
     * @return true if the bytes denote a PDF file of any version, false if
     *         they do not.
     */
    private static boolean startsWithPDFHeader(final byte[] bytes, final int size) {
        /*
         * PDF files start with the header "%PDF-X.Y" where X is the major
         * version number and Y is the minor. See the PDF specification for
         * notes on this header and the version scheme.
         */

        // The header is 8 bytes, so smaller files can't be PDF.
        if (size < 8) {
            return false;
        }

        // Make sure the first five bytes are an exact match.
        byte[] firstFive;
        try {
            firstFive = "%PDF-".getBytes("US-ASCII"); //$NON-NLS-1$ //$NON-NLS-2$
        } catch (final UnsupportedEncodingException e) {
            /*
             * Should never happen as all JVMs are required to support
             * US-ASCII.
             */
            throw new RuntimeException(e);
        }

        for (int i = 0; i < firstFive.length; i++) {
            if (firstFive[i] != bytes[i]) {
                return false;
            }
        }

        // Finally, it should end with "X.Y".
        if (Character.isDigit((char) bytes[5]) && bytes[6] == '.' && Character.isDigit((char) bytes[7])) {
            return true;
        }

        return false;
    }

    /**
     * Tests whether the given byte array looks like an ANSI text file with the
     * default text encoding, i.e. whether it can be decoded with the current
     * ANSI character set. In multi-byte character sets (like Japanese, for
     * example) the entire byte array might not be converted entirely, because
     * at the end of the array it might contain a broken multi-byte character.
     * We still accept this kind of file as ANSI if the unconverted remainder
     * of the array is short enough.
     *
     * @param bytes
     *        the bytes to check for ANSI-ness (must not be <code>null</code>)
     * @param limit
     *        the maximum number of bytes to read.
     * @return true if the given bytes look like part of an ANSI text file,
     *         false if they do not (because they contain control characters
     *         or other patterns).
     */
    protected static boolean looksLikeANSI(final byte[] bytes, final int limit) {
        final Charset charSet = CodePageMapping.getCharset(FileEncoding.getDefaultTextEncoding().getCodePage());

        final ByteBuffer byteBuffer = ByteBuffer.wrap(bytes, 0, limit);
        final CharBuffer charBuffer = CharBuffer.allocate(limit);

        final CharsetDecoder decoder = charSet.newDecoder();
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
        decoder.onMalformedInput(CodingErrorAction.REPORT);

        final CoderResult rc = decoder.decode(byteBuffer, charBuffer, true);

        if (!rc.isError()) {
            return true;
        } else {
            return byteBuffer.position() > limit - 5;
        }
    }

    /**
     * Tests whether the given byte array looks like an EBCDIC text file
     * (contains character values that would be present in an EBCDIC text file
     * without the control characters that would not).
     *
     * @param bytes
     *        the bytes to check for EBCDIC-ness (must not be
     *        <code>null</code>)
     * @param limit
     *        the maximum number of bytes to read.
     * @return true if the given bytes look like part of an EBCDIC text file,
     *         false if they do not (because they contain control characters
     *         or other patterns).
     */
    protected static boolean looksLikeEBCDIC(final byte[] bytes, final int limit) {
        /*
         * EBCDIC is like ASCII in that it uses the lower values for control
         * characters, but it uses values up to 62. We test exceptions for a
         * few of those.
         * Because Java uses signed bytes, and EBCDIC uses values > 127, we
         * can ignore the high half of the EBCDIC space (valid characters,
         * mostly) by just accepting all negative values.
         *
         * 5: horizontal tab
         * 6: required new line
         * 9: superscript
         * 11: vertical tab
         * 12: form feed
         * 13: carriage return
         * 21: new line
         * 37: line feed
         * 56: subscript
         * 57: indent tab
         */
        for (int i = 0; i < limit; i++) {
            if ((bytes[i] >= 0 && bytes[i] <= 62)
                && (bytes[i] != 5
                    && bytes[i] != 6
                    && bytes[i] != 9
                    && bytes[i] != 11
                    && bytes[i] != 12
                    && bytes[i] != 13
                    && bytes[i] != 21
                    && bytes[i] != 37
                    && bytes[i] != 56
                    && bytes[i] != 57)) {
                return false;
            }
        }

        return true;
    }
}
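
The listing above depends on SDK-internal types (FileEncoding, CodePageMapping, TECoreException, and so on). As a minimal standalone sketch of the same two ideas, BOM sniffing on the first bytes of a file followed by a CharsetDecoder round-trip, the class below uses only JDK APIs. It is not part of the SDK: the class name EncodingSniffDemo and the fallback path "example.txt" are made up for illustration, and it substitutes Charset.defaultCharset() for the SDK's CodePageMapping lookup.

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Hypothetical demo (not part of the TFS SDK): reads the first 1024 bytes of a
 * file, checks for Unicode byte order marks, and otherwise tries to decode the
 * chunk with the platform default charset, mirroring FileEncodingDetector.
 */
public class EncodingSniffDemo {
    public static void main(final String[] args) throws IOException {
        // Hypothetical input path; pass a real file as the first argument.
        final String path = args.length > 0 ? args[0] : "example.txt";

        final byte[] head = new byte[1024];
        final int read;
        try (InputStream in = Files.newInputStream(Paths.get(path))) {
            read = in.read(head, 0, head.length);
        }

        System.out.println(describe(head, Math.max(read, 0)));
    }

    static String describe(final byte[] b, final int len) {
        // BOM checks use the same signed-byte comparisons as the detector above.
        if (len >= 3 && b[0] == -17 && b[1] == -69 && b[2] == -65) {
            return "UTF-8 (BOM)";
        }
        if (len >= 4 && b[0] == 0 && b[1] == 0 && b[2] == -2 && b[3] == -1) {
            return "UTF-32 big-endian (BOM)";
        }
        if (len >= 2 && b[0] == -2 && b[1] == -1) {
            return "UTF-16 big-endian (BOM)";
        }
        if (len >= 2 && b[0] == -1 && b[1] == -2) {
            return (len >= 4 && b[2] == 0 && b[3] == 0)
                ? "UTF-32 little-endian (BOM)"
                : "UTF-16 little-endian (BOM)";
        }

        // No BOM: try to decode the chunk, reporting (not replacing) any
        // malformed or unmappable input.
        final CharsetDecoder decoder = Charset.defaultCharset().newDecoder();
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);

        final ByteBuffer bytes = ByteBuffer.wrap(b, 0, len);
        final CharBuffer chars = CharBuffer.allocate(len);
        final CoderResult result = decoder.decode(bytes, chars, true);

        // Tolerate a trailing broken multi-byte character, as looksLikeANSI does.
        if (!result.isError() || bytes.position() > len - 5) {
            return "probably text in the platform default charset";
        }
        return "probably binary";
    }
}

Against the real SDK, the equivalent entry point is FileEncodingDetector.detectEncoding(path, FileEncoding.AUTOMATICALLY_DETECT), which applies the hint rules described in the method's javadoc before falling back to content inspection.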