uk.ac.ucl.excites.sapelli.shared.io.text.CharsetHelpers.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.ucl.excites.sapelli.shared.io.text.CharsetHelpers.java

Source

/**
 * Sapelli data collection platform: http://sapelli.org
 * 
 * Copyright 2012-2016 University College London - ExCiteS group
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and 
 * limitations under the License.
 */

package uk.ac.ucl.excites.sapelli.shared.io.text;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.ResourceBundle;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.io.Charsets;
import org.apache.commons.io.filefilter.WildcardFileFilter;

import uk.ac.ucl.excites.sapelli.shared.io.FileHelpers;
import uk.ac.ucl.excites.sapelli.shared.util.ClassHelpers;
import uk.ac.ucl.excites.sapelli.shared.util.IterableEnumeration;
import uk.ac.ucl.excites.sapelli.shared.util.TimeUtils;

/**
 * <p>
 * Helper class which provides consistent (across different Java VMs/runtimes) and conservative (i.e., possibly
 * overestimated) values for the maximum number of bytes used per char(acter) when encoding {@link String}s with
 * any supported {@link Charset}s.</p>
 * <p>
 * Created on 2015-10-21 when I learned that the result of {@code charset.newEncoder().maxBytesPerChar()}
 * can vary across different Java VM/runtimes. For example, on the Oracle JRE v1.8.0_x and v1.7.0_x we get
* {@code 3.0f} for UTF-8 (as on all Android versions as far as I've been able to test), but on JRE v1.6.0_x
 * it was {@code 4.0f} (which was arguably wrong). See:<br/>
 *     - <a href="http://bugs.java.com/view_bug.do?bug_id=6957230">http://bugs.java.com/view_bug.do?bug_id=6957230</a><br/>
 *   - <a href="http://bugs.java.com/view_bug.do?bug_id=8058875">http://bugs.java.com/view_bug.do?bug_id=8058875</a><br/>
 * Further research showed that there are also differences, in the reported maximum bytes per char for certain encodings,
 * between the Oracle JRE and Android.</p>
 * <p>
 * All of this is problematic for our StringColumn class which needs a consistent {@code maxBytesPerChar} value
 * to compute the maximum number of bytes Strings of a certain number of chars will require when encoded using
 * a given {@link Charset}. Hence I created this class to solve this problem.</p>
 * <p>
 * The {@link #GetMaxBytesPerChar(Charset)} and {@link #GetMaxBytesPerChar(String)} methods provide the maximum
 * {@code maxBytesPerChar} value, across several Java VM/runtime implementations/versions, for the a given
 * {@link Charset} (name). The {@code maxBytesPerChar} values for different {@code Charset}s as implemented on
 * different VMs/runtimes are loaded from a properties file (named {@value #CMMBPC_PROPERTIES_FILE_NAME}) which
 * must reside in the same package as this class.</p>
 * <p>
 * The {@link #WriteCharsetInfoFile(File, String, String)} method writes out a ".charsetinfo" file with the
 * {@code maxBytesPerChar} values for all {@link Charset}s supported by the current platform/device.</p>
 * <p>
 * The {@link #GeneratePropertiesFile(String, String, boolean) method loads multiple ".charsetinfo" files
 * and summarises the information (i.e. finding the maximum {@code maxBytesPerChar} for each {@link Charset})
 * and writes it to a new {@value #CMMBPC_PROPERTIES_FILE_NAME} properties file, to be packaged with the
 * library for the usage by {@link #GetMaxBytesPerChar(Charset)} and {@link #GetMaxBytesPerChar(String)} methods.</p>
 * 
 * @author mstevens
 */
public final class CharsetHelpers {

    private CharsetHelpers() {
    }

    private static final Charset CHARSETINFO_FILE_CHARSET = Charsets.UTF_8;

    private static final String CHARSETINFO_FILE_EXTENSION = "charsetinfo";

    private static final Charset PROPERTIES_FILE_CHARSET = Charsets.ISO_8859_1;

    private static final String PROPERTIES_FILE_EXTENSION = "properties";

    private static final char COMMENT_LINE_MARKER = '#';

    private static final String EOL = System.getProperty("line.separator");

    private static final String CMMBPC_PROPERTIES_FILE_NAME = "CharsetMaxMaxBytesPerChar";

    private static final ResourceBundle CharsetMaxMaxBytesPerCharBundle;
    static {
        ResourceBundle bundle = null;
        try {
            bundle = ResourceBundle
                    .getBundle(ClassHelpers.classPackage(CharsetHelpers.class) + "." + CMMBPC_PROPERTIES_FILE_NAME);
            /*if(bundle != null)
               System.out.println("Got bundle, UTF-8 maxBytesPerChar:" + bundle.getString(Charsets.UTF_8.name()));*/
        } catch (Exception e) {
            //e.printStackTrace(System.err);
            System.err.println(
                    "Could not load resource bundle (" + CMMBPC_PROPERTIES_FILE_NAME + ".properties" + ")");
        }
        CharsetMaxMaxBytesPerCharBundle = bundle;
    }

    private static int getNumberOfKnownCharsets() {
        int count = 0;
        if (CharsetMaxMaxBytesPerCharBundle != null)
            for (@SuppressWarnings("unused")
            String charsetName : IterableEnumeration.Make(CharsetMaxMaxBytesPerCharBundle.getKeys()))
                count++;
        return count;
    }

    /**
     * Returns the maximum "maxBytesPerChar" value for the given {@link Charset},
     * across all Java VM/runtime implementations/versions we know about.
     * 
     * @param charset
     * @return
     * @throws IllegalArgumentException
     */
    static public final float GetMaxBytesPerChar(Charset charset) throws IllegalArgumentException {
        float maxBytesPerChar = 0.0f;
        try {
            maxBytesPerChar = GetMaxBytesPerChar(charset.name());
        } catch (IllegalArgumentException iae) { // Fall-back (determine now):
            maxBytesPerChar = new CharsetInfo(charset).maxBytesPerChar;
        }
        // Check:
        if (maxBytesPerChar <= 0.0f)
            throw new IllegalArgumentException(
                    "maxBytesPerChar for Charset \"" + charset.name() + "\" is indeterminable.");
        //else:
        return maxBytesPerChar;
    }

    /**
     * Returns the maximum "maxBytesPerChar" value for the {@link Charset} with the given name,
     * across all Java VM/runtime implementations/versions we know about.
     * 
     * @param charsetName
     * @return
     * @throws IllegalArgumentException
     */
    static public final float GetMaxBytesPerChar(String charsetName) throws IllegalArgumentException {
        float maxBytesPerChar = 0.0f;
        if (CharsetMaxMaxBytesPerCharBundle != null && CharsetMaxMaxBytesPerCharBundle.containsKey(charsetName))
            // Consult resource bundle:
            maxBytesPerChar = Float.valueOf(CharsetMaxMaxBytesPerCharBundle.getString(charsetName));
        // Check:
        if (maxBytesPerChar <= 0.0f)
            throw new IllegalArgumentException(
                    "maxBytesPerChar for Charset \"" + charsetName + "\" is indeterminable.");
        //else:
        return maxBytesPerChar;
    }

    /**
     * Writes a .charsetinfo file containing information about all supported {@link Charset}s on the current machine/platform.
     * 
     * @param outputFolder
     * @param filename the name of the output file, without extension
     * @param header
     * @throws IOException
     */
    static public final void WriteCharsetInfoFile(File outputFolder, String filename, String header)
            throws IOException {
        if (!FileHelpers.createDirectory(outputFolder))
            throw new IOException("Could not create output directory.");
        try (FileOutputStream fos = new FileOutputStream(
                new File(outputFolder, filename + "." + CHARSETINFO_FILE_EXTENSION));
                BufferedWriter writer = new BufferedWriter(UnicodeBOM.GetWriter(fos, CHARSETINFO_FILE_CHARSET))) {
            // Header:
            if (header != null) {
                if (!header.isEmpty())
                    writer.write(COMMENT_LINE_MARKER + " " + header.trim() + EOL);
            }
            // Body:
            for (Charset cs : Charset.availableCharsets().values())
                writer.write(new CharsetInfo(cs).toString() + EOL);
        }
    }

    /**
     * Parses all .charsetinfo files in the input folder and works out which is the 
     * maximum "maxBytesPerChar" value for each known Charset across all input files.
     * If this summarised information is in anyway different from the information in
     * {@link #CharsetMaxMaxBytesPerCharBundle} (or if {@code force} is {@code true})
     * then a new CharsetMaxMaxBytesPerChar.properties is created (or overwritten!)
     * in the output folder.
     * 
     * This method is called from a Groovy script in the pom.xml of the Sapelli Library.
     * 
     * @param inputFolderPath path of a directory containing CharsetInfo files (with *.charsetinfo extension!) to process
     * @param outputFolderPath path of a (resource) directory in which to create the new/updated CharsetMaxMaxBytesPerChar.properties file
     * @param force when {@code true} a new CharsetMaxMaxBytesPerChar.properties file will always be generated
     * @return whether or not a new or updated CharsetMaxMaxBytesPerChar.properties file was created
     * @throws IOException
     */
    static public final boolean GeneratePropertiesFile(String inputFolderPath, String outputFolderPath,
            boolean force) throws IOException {
        File inputFolder = new File(inputFolderPath);
        if (!inputFolder.isDirectory() || !inputFolder.exists())
            throw new IllegalArgumentException("Please provide a valid and existing folder!");

        SortedMap<String, String> fileHeaders = new TreeMap<String, String>();
        SortedMap<String, Float> tempCN2MBPC = new TreeMap<String, Float>();

        // Process all charsetinfo files in the folder:
        for (File charsetFile : inputFolder
                .listFiles((FileFilter) new WildcardFileFilter("*." + CHARSETINFO_FILE_EXTENSION))) {
            try (UnicodeBOMInputStream input = new UnicodeBOMInputStream(new FileInputStream(charsetFile));
                    BufferedReader reader = new BufferedReader(input.getReader(CHARSETINFO_FILE_CHARSET))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.isEmpty())
                        continue; // skip blank lines
                    if (line.charAt(0) == COMMENT_LINE_MARKER) {
                        if (!fileHeaders.containsKey(charsetFile.getName()))
                            fileHeaders.put(charsetFile.getName(), line.substring(1).trim());
                        continue; // skip comment lines
                    }
                    // Parse Charset info:
                    CharsetInfo csInfo = CharsetInfo.Parse(line);
                    // Store/replace max maxBytesPerChar value:
                    if (csInfo.maxBytesPerChar > 0.0f && (!tempCN2MBPC.containsKey(csInfo.name)
                            || tempCN2MBPC.get(csInfo.name).floatValue() < csInfo.maxBytesPerChar))
                        tempCN2MBPC.put(csInfo.name, csInfo.maxBytesPerChar);
                }
                if (!fileHeaders.containsKey(charsetFile.getName()))
                    fileHeaders.put(charsetFile.getName(), "");
            }
        }

        // Compare information loaded from charsetinfo files with that in the resource bundle:
        boolean different = force || CharsetMaxMaxBytesPerCharBundle == null
                || tempCN2MBPC.size() != getNumberOfKnownCharsets();
        if (!different)
            for (Map.Entry<String, Float> mapping : tempCN2MBPC.entrySet())
                try {
                    if (!Float.valueOf(CharsetMaxMaxBytesPerCharBundle.getString(mapping.getKey()))
                            .equals(mapping.getValue())) // getString throws Exception if key is not found
                        throw new Exception("maxBytesPerChar mismatch!");
                } catch (Exception e) {
                    different = true;
                    break;
                }

        // If the information is different...
        if (different) { // Write new properties file (in package-specific subfolder of the given output folder):
            File outputFolder = new File(new File(outputFolderPath),
                    ClassHelpers.classPackageAsResourcePath(CharsetHelpers.class));
            FileHelpers.createDirectory(outputFolder);
            try (FileOutputStream fos = new FileOutputStream(
                    new File(outputFolder, CMMBPC_PROPERTIES_FILE_NAME + "." + PROPERTIES_FILE_EXTENSION));
                    BufferedWriter writer = new BufferedWriter(
                            new OutputStreamWriter(fos, PROPERTIES_FILE_CHARSET))) {
                // Header:
                writer.write(COMMENT_LINE_MARKER + " Generated on "
                        + TimeUtils.getISOTimestamp(System.currentTimeMillis(), false) + " from input files:"
                        + EOL);
                for (Map.Entry<String, String> fileAndHeader : fileHeaders.entrySet()) {
                    writer.write(COMMENT_LINE_MARKER + "\t- " + fileAndHeader.getKey() + ":" + EOL);
                    writer.write(COMMENT_LINE_MARKER + "\t\t" + fileAndHeader.getValue() + EOL);
                }
                writer.write(EOL);
                // Body:
                for (Map.Entry<String, Float> mapping : tempCN2MBPC.entrySet())
                    writer.write(mapping.getKey() + "=" + mapping.getValue().toString() + EOL);
            }
        }

        return different;
    }

    /**
     * @author mstevens
     */
    static private final class CharsetInfo {

        // STATICS --------------------------------------------------
        static final char SEPARATOR = '|';
        static final float UNKNOWN_MAX_BYTES_PER_CHAR = -1.0f;

        static public CharsetInfo Parse(String charInfoString) {
            final int sepPos = charInfoString.indexOf(SEPARATOR);
            return new CharsetInfo(charInfoString.substring(0, sepPos),
                    Float.parseFloat(charInfoString.substring(sepPos + 1)));
        }

        // DYNAMICS -------------------------------------------------
        public final String name;
        public final float maxBytesPerChar;

        /**
         * @param charset
         * @param maxBytesPerChar
         */
        public CharsetInfo(Charset charset) {
            this.name = charset.name();
            float charsetmaxBytesPerChar = UNKNOWN_MAX_BYTES_PER_CHAR;
            try {
                charsetmaxBytesPerChar = charset.newEncoder().maxBytesPerChar();
            } catch (Exception ignore) {
            }
            this.maxBytesPerChar = charsetmaxBytesPerChar;
        }

        /**
         * @param charsetName
         * @param charsetMaxBytesPerChar
         */
        private CharsetInfo(String charsetName, float charsetMaxBytesPerChar) {
            this.name = charsetName;
            this.maxBytesPerChar = charsetMaxBytesPerChar;
        }

        @Override
        public String toString() {
            return name + SEPARATOR + Float.toString(maxBytesPerChar);
        }

    }

}