/*  ClassifierUtils.java

    Copyright (c) 2009-2014 Andrew Rosenberg

    This file is part of the AuToBI prosodic analysis package.

    AuToBI is free software: you can redistribute it and/or modify it under the terms of the Apache License (see
    boilerplate below)

 ***********************************************************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You should have received a copy of the Apache 2.0 License along with AuToBI.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 ***********************************************************************************************************************
 */
package edu.cuny.qc.speech.AuToBI.util;

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import de.bwaldvogel.liblinear.*;
import edu.cuny.qc.speech.AuToBI.classifier.AuToBIClassifier;
import edu.cuny.qc.speech.AuToBI.classifier.WeightFunction;
import edu.cuny.qc.speech.AuToBI.core.*;
// The explicit import resolves the ambiguity between edu.cuny.qc.speech.AuToBI.core.Feature
// and de.bwaldvogel.liblinear.Feature, which are both wildcard-imported above.
import edu.cuny.qc.speech.AuToBI.core.Feature;
import weka.core.*;

import java.io.*;
import java.util.*;

/**
 * A class containing static utility functions for classifiers.
 */
public class ClassifierUtils {

  // Utility classes cannot be instantiated.
  private ClassifierUtils() {
    throw new AssertionError();
  }

  /**
   * Loads a serialized AuToBIClassifier from a file.
   *
   * @param filename the filename
   * @return the stored AuToBIClassifier, or null if it could not be read
   */
  public static AuToBIClassifier readAuToBIClassifier(String filename) {
    FileInputStream fis;
    ObjectInputStream in;
    try {
      fis = new FileInputStream(filename);
      in = new ObjectInputStream(fis);
      Object o = in.readObject();
      if (o instanceof AuToBIClassifier) {
        return (AuToBIClassifier) o;
      }
    } catch (IOException e) {
      AuToBIUtils.error(e.getMessage());
    } catch (ClassNotFoundException e) {
      AuToBIUtils.error(e.getMessage());
    }
    return null;
  }

  /**
   * Writes an AuToBIClassifier to a file.
   *
   * @param filename the filename to write the classifier to
   * @param c        the classifier to store
   * @throws IOException if the file cannot be written
   */
  public static void writeAuToBIClassifier(String filename, AuToBIClassifier c) throws IOException {
    AuToBIUtils.log("writing model to: " + filename);
    FileOutputStream fos;
    ObjectOutputStream out;
    fos = new FileOutputStream(filename);
    out = new ObjectOutputStream(fos);
    out.writeObject(c);
    out.close();
  }
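  /*
   * Usage sketch for the serialization round-trip above. Illustrative only:
   * "/tmp/pitch_accent.model" is a hypothetical path, not a file shipped with AuToBI.
   *
   *   AuToBIClassifier c = ClassifierUtils.readAuToBIClassifier("/tmp/pitch_accent.model");
   *   if (c != null) {
   *     // writeAuToBIClassifier may throw an IOException.
   *     ClassifierUtils.writeAuToBIClassifier("/tmp/pitch_accent.copy.model", c);
   *   }
   */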
  /**
   * Converts a single point to a weka Instance.
   * <p/>
   * This conversion requires a FeatureSet to determine the features to include in the instance.
   *
   * @param point the data point
   * @param fs    the containing feature set
   * @return a weka instance of the point
   * @throws Exception if something goes wrong
   */
  public static Instance convertWordToInstance(Word point, FeatureSet fs) throws Exception {
    return convertWordToInstance(point, fs.getFeatures(), fs.getClassAttribute());
  }

  /**
   * Converts a single point to a weka instance.
   *
   * @param point           the data point
   * @param features        the features to include on the data point
   * @param class_attribute the class attribute
   * @return a weka instance of the point
   * @throws Exception if something goes wrong
   */
  public static Instance convertWordToInstance(Word point, Set<Feature> features, String class_attribute)
      throws Exception {
    ArrayList<Attribute> attributes = generateWekaAttributes(features);
    return constructWekaInstance(attributes, point, class_attribute);
  }

  /**
   * Generates an ArrayList of weka Attributes from a set of features.
   *
   * @param features the set of features
   * @return an ArrayList of weka attributes
   */
  public static ArrayList<Attribute> generateWekaAttributes(Set<Feature> features) {
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();

    for (Feature f : features) {
      String attribute_name = f.getName();
      if (f.isNominal()) {
        List<String> attribute_values = new ArrayList<String>();
        for (String s : f.getNominalValues()) {
          attribute_values.add(s);
        }
        attributes.add(new Attribute(attribute_name, attribute_values, attributes.size()));
      } else if (f.isString()) {
        attributes.add(new weka.core.Attribute(attribute_name, (List<String>) null, attributes.size()));
      } else {
        attributes.add(new weka.core.Attribute(attribute_name, attributes.size()));
      }
    }
    return attributes;
  }

  /**
   * Converts a feature set object to a weka Instances object.
   * <p/>
   * The class is set to the last attribute.
   *
   * @param feature_set the feature set to convert
   * @return a weka instances object
   * @throws Exception if the conversion fails
   */
  public static Instances convertFeatureSetToWekaInstances(FeatureSet feature_set) throws Exception {
    ArrayList<Attribute> attributes = generateWekaAttributes(feature_set.getFeatures());
    Instances instances = new Instances("AuToBI_feature_set", attributes, feature_set.getDataPoints().size());
    for (Word w : feature_set.getDataPoints()) {
      Instance inst = ClassifierUtils.assignWekaAttributes(instances, w);
      instances.add(inst);
    }

    ClassifierUtils.setWekaClassAttribute(instances, feature_set.getClassAttribute());
    return instances;
  }

  /**
   * Converts a feature set object to a weka Instances object.
   * <p/>
   * Uses weka's instance weighting capability to assign a weight to each data point.
   *
   * @param feature_set the feature set to convert
   * @param fn          a weight function
   * @return a weka instances object
   */
  public static Instances convertFeatureSetToWeightedWekaInstances(FeatureSet feature_set, WeightFunction fn) {
    ArrayList<Attribute> attributes = generateWekaAttributes(feature_set.getFeatures());
    Instances instances = new Instances("AuToBI_feature_set", attributes, feature_set.getDataPoints().size());
    for (Word w : feature_set.getDataPoints()) {
      Instance inst = ClassifierUtils.assignWekaAttributes(instances, w);
      inst.setWeight(fn.weight(w));
      instances.add(inst);
    }

    ClassifierUtils.setWekaClassAttribute(instances, feature_set.getClassAttribute());
    return instances;
  }
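  /*
   * Conversion sketch: turning a populated FeatureSet into a weka Instances object for
   * training. Illustrative only; "fs" is assumed to be a FeatureSet whose class attribute
   * has been set by the caller.
   *
   *   Instances data = ClassifierUtils.convertFeatureSetToWekaInstances(fs);
   *   // data's class attribute now reflects fs.getClassAttribute(), falling back to the
   *   // last attribute if no matching attribute was found (see setWekaClassAttribute).
   */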
  /**
   * Converts a data point to a weka instance given an ArrayList of weka attributes and a class attribute.
   *
   * @param attributes      an ArrayList of weka attributes
   * @param data_point      the data point to convert
   * @param class_attribute the class attribute
   * @return a weka instance
   */
  protected static Instance constructWekaInstance(ArrayList<Attribute> attributes, Word data_point,
                                                  String class_attribute) {
    Instances instances = new Instances("single_instance_set", attributes, 0);
    setWekaClassAttribute(instances, class_attribute);
    return assignWekaAttributes(instances, data_point);
  }

  /**
   * Given a (possibly empty) Instances object containing the required weka Attributes, generates a weka Instance for a
   * single data point.
   *
   * @param instances  the weka Instances object containing attributes
   * @param data_point the data point to convert
   * @return a weka instance with assigned attributes
   */
  protected static Instance assignWekaAttributes(Instances instances, Word data_point) {
    double[] instance = new double[instances.numAttributes()];

    for (int i = 0; i < instances.numAttributes(); ++i) {
      Attribute attribute = instances.attribute(i);
      if (data_point.hasAttribute(attribute.name())
          && !data_point.getAttribute(attribute.name()).toString().equals("?")) {
        switch (attribute.type()) {
          case Attribute.NOMINAL:
            int index = attribute.indexOfValue(data_point.getAttribute(attribute.name()).toString());
            instance[i] = (double) index;
            break;
          case Attribute.NUMERIC:
            // Check if the value is really a number.
            try {
              instance[i] = Double.valueOf(data_point.getAttribute(attribute.name()).toString());
            } catch (NumberFormatException e) {
              AuToBIUtils.error("Number expected for feature: " + attribute.name());
            }
            break;
          case Attribute.STRING:
            instance[i] = attribute.addStringValue(data_point.getAttribute(attribute.name()).toString());
            break;
          default:
            AuToBIUtils.error("Unknown attribute type");
        }
      } else {
        instance[i] = Utils.missingValue();
      }
    }

    Instance inst = new DenseInstance(1, instance);
    inst.setDataset(instances);
    return inst;
  }

  /**
   * Assigns a class attribute to a weka Instances object.
   * <p/>
   * If no class attribute is given, or if the class attribute is not found in the list of attributes, the last
   * attribute is set to the class attribute.
   *
   * @param instances       the instances object
   * @param class_attribute the desired class attribute
   */
  static void setWekaClassAttribute(Instances instances, String class_attribute) {
    if (class_attribute != null) {
      int i = 0;
      boolean set = false;
      while (i < instances.numAttributes() && !set) {
        Attribute attr = instances.attribute(i);
        if (class_attribute.equals(attr.name())) {
          instances.setClassIndex(i);
          set = true;
        }
        ++i;
      }
      if (!set) {
        instances.setClassIndex(instances.numAttributes() - 1);
      }
    } else {
      instances.setClassIndex(instances.numAttributes() - 1);
    }
  }

  /**
   * Evaluates classification results by comparing the values of the hypothesized and true features.
   *
   * @param hyp_feature  the hypothesized feature name
   * @param true_feature the true feature name
   * @param fs           the feature set to be evaluated
   * @return a string representation of the evaluation
   * @throws edu.cuny.qc.speech.AuToBI.core.AuToBIException if there is an inconsistency in the evaluation
   */
  public static String evaluateClassification(String hyp_feature, String true_feature, FeatureSet fs)
      throws AuToBIException {
    EvaluationSummary eval = new EvaluationSummary(generateEvaluationResults(hyp_feature, true_feature, fs));
    return eval.toString();
  }
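  /*
   * Evaluation sketch. Illustrative only; the attribute names are hypothetical and assume
   * predictions were stored under "hyp_accent" against gold labels in "nominal_PitchAccent".
   *
   *   String report = ClassifierUtils.evaluateClassification("hyp_accent", "nominal_PitchAccent", fs);
   *   AuToBIUtils.log(report);  // logs the contingency-table based summary
   */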
  /**
   * Generates an EvaluationResults object by comparing the values of the hypothesized and true features.
   * <p/>
   * EvaluationResults objects store contingency tables for the classification.
   *
   * @param hyp_feature  the hypothesized feature name
   * @param true_feature the true feature name
   * @param fs           the feature set to be evaluated
   * @return the evaluation results
   * @throws edu.cuny.qc.speech.AuToBI.core.AuToBIException if there is an inconsistency in the evaluation
   */
  public static EvaluationResults generateEvaluationResults(String hyp_feature, String true_feature, FeatureSet fs)
      throws AuToBIException {
    Feature class_attribute = new Feature(true_feature);
    class_attribute.generateNominalValues(fs.getDataPoints());

    Feature hyp_attribute = new Feature(hyp_feature);
    hyp_attribute.generateNominalValues(fs.getDataPoints());

    Set<String> sorted_values = new LinkedHashSet<String>();
    sorted_values.addAll(class_attribute.getNominalValues());
    sorted_values.addAll(hyp_attribute.getNominalValues());

    EvaluationResults eval = new EvaluationResults(sorted_values);

    for (Word w : fs.getDataPoints()) {
      if (w.hasAttribute("__ignore__") && ((Boolean) w.getAttribute("__ignore__"))) {
        continue;
      }
      if (!w.hasAttribute(hyp_feature)) {
        AuToBIUtils.warn("Word, " + w + ", has no hypothesized attribute: " + hyp_feature);
      } else if (!w.hasAttribute(true_feature)) {
        AuToBIUtils.warn("Word, " + w + ", has no true attribute: " + true_feature);
      } else {
        eval.addInstance(w.getAttribute(hyp_feature).toString(), w.getAttribute(true_feature).toString());
      }
    }
    return eval;
  }

  /**
   * Generates predictions for a set of words using the supplied classifier.
   * <p/>
   * Results are stored in hyp_attribute. If the classifier throws an exception, the default_value is assigned as the
   * hypothesis.
   *
   * @param classifier    the classifier to generate predictions
   * @param hyp_attribute the destination attribute for the hypotheses
   * @param default_value the default classification value
   * @param fs            the featureset to generate predictions for
   */
  public static void generatePredictions(AuToBIClassifier classifier, String hyp_attribute, String default_value,
                                         FeatureSet fs) {
    for (Word w : fs.getDataPoints()) {
      try {
        String result = classifier.classify(w);
        w.setAttribute(hyp_attribute, result);
      } catch (Exception e) {
        w.setAttribute(hyp_attribute, default_value);
        AuToBIUtils.warn("Classifier threw an exception. Assigning default value, " + default_value + ", to word, "
            + w.toString() + "\n" + e.getMessage());
      }
    }
  }
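  /*
   * Prediction sketch. Illustrative only; "classifier" is any trained AuToBIClassifier and
   * "DEACCENTED" a hypothetical fallback label for words the classifier cannot score.
   *
   *   ClassifierUtils.generatePredictions(classifier, "hyp_accent", "DEACCENTED", fs);
   *   // Every word in fs now carries a "hyp_accent" attribute: either a prediction, or
   *   // the fallback value when classification threw an exception.
   */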
  /**
   * Generates predictions for a set of words using the supplied classifier.
   * <p/>
   * Results are stored in hyp_attribute. If the classifier throws an exception, the default_value is assigned as the
   * hypothesis.
   * <p/>
   * Confidence scores are stored in a separate attribute.
   *
   * @param classifier     the classifier to generate predictions
   * @param hyp_attribute  the destination attribute for the hypotheses
   * @param conf_attribute the destination attribute for confidence in the hypothesis
   * @param default_value  the default classification value
   * @param fs             the featureset to generate predictions for
   */
  public static void generatePredictionsWithConfidenceScores(AuToBIClassifier classifier, String hyp_attribute,
                                                             String conf_attribute, String default_value,
                                                             FeatureSet fs) {
    for (Word w : fs.getDataPoints()) {
      try {
        Distribution dist = classifier.distributionForInstance(w);
        String result = dist.getKeyWithMaximumValue();
        Double conf = dist.get(result);
        w.setAttribute(hyp_attribute, result);
        w.setAttribute(conf_attribute, conf);
      } catch (Exception e) {
        w.setAttribute(hyp_attribute, default_value);
        w.setAttribute(conf_attribute, 0.5);
        AuToBIUtils.warn("Classifier threw an exception. Assigning default value, " + default_value + ", to word, "
            + w.toString() + "\n" + e.getMessage());
      }
    }
  }

  /**
   * Generates a prediction distribution for a set of words using the supplied classifier.
   * <p/>
   * Results are stored in dist_attribute. If the classifier throws an exception, the default_value is assigned
   * instead.
   *
   * @param classifier     the classifier to generate predictions
   * @param dist_attribute the destination attribute for the hypothesized distributions
   * @param default_value  the default value to assign if the classifier throws an exception
   * @param fs             the featureset to generate predictions for
   */
  public static void generatePredictionDistribution(AuToBIClassifier classifier, String dist_attribute,
                                                    String default_value, FeatureSet fs) {
    for (Word w : fs.getDataPoints()) {
      try {
        Distribution dist = classifier.distributionForInstance(w);
        w.setAttribute(dist_attribute, dist);
      } catch (Exception e) {
        w.setAttribute(dist_attribute, default_value);
        AuToBIUtils.warn("Classifier threw an exception. Assigning default value, " + default_value + ", to word, "
            + w.toString() + "\n" + e.getMessage());
      }
    }
  }

  /**
   * Converts a FeatureSet to an array of LibLinear labels.
   *
   * @param feature_set  the feature set
   * @param class_values an array of class values describing the indexing of the labels
   * @return an array of doubles corresponding to labels
   */
  public static double[] convertFeatureSetToLibLinearLabels(FeatureSet feature_set, String[] class_values) {
    String class_attribute = feature_set.getClassAttribute();
    double[] labels = new double[feature_set.getDataPoints().size()];
    int i = 0;
    for (Word w : feature_set.getDataPoints()) {
      String s = w.getAttribute(class_attribute).toString();
      // LibLinear labels are one-based: the label is the index of the class value plus one.
      labels[i] = java.util.Arrays.asList(class_values).indexOf(s) + 1;
      i++;
    }
    return labels;
  }

  /**
   * Converts a FeatureSet to an array of LibLinear Feature[] descriptions.
   *
   * @param feature_set the feature set to convert
   * @param feature_map a map of features to indices
   * @return an array of Feature[] descriptions
   * @throws AuToBIException if a feature has a type that LibLinear does not support
   */
  public static de.bwaldvogel.liblinear.Feature[][] convertFeatureSetToLibLinearFeatures(FeatureSet feature_set,
      HashBiMap<Feature, Integer> feature_map) throws AuToBIException {
    int n = feature_set.getDataPoints().size();
    de.bwaldvogel.liblinear.Feature[][] features = new de.bwaldvogel.liblinear.Feature[n][];
    int i = 0;
    for (Word w : feature_set.getDataPoints()) {
      features[i] = convertWordToLibLinearFeatures(w, feature_map);
      i++;
    }
    return features;
  }
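  /*
   * Worked example of the one-based label indexing used above. With a hypothetical
   * ordering class_values = {"ACCENTED", "DEACCENTED"}, a word whose class attribute is
   * "ACCENTED" receives label 1.0, "DEACCENTED" receives 2.0, and an unseen value receives
   * 0.0 (indexOf returns -1, plus one).
   *
   *   double[] labels = ClassifierUtils.convertFeatureSetToLibLinearLabels(
   *       fs, new String[] {"ACCENTED", "DEACCENTED"});
   */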
  /**
   * Converts a FeatureSet to an array of LibLinear Feature[] descriptions, generating a fresh feature-to-index map.
   *
   * @param feature_set the feature set to convert
   * @return an array of Feature[] descriptions
   * @throws AuToBIException if a feature has a type that LibLinear does not support
   */
  public static de.bwaldvogel.liblinear.Feature[][] convertFeatureSetToLibLinearFeatures(FeatureSet feature_set)
      throws AuToBIException {
    HashBiMap<Feature, Integer> feature_map = HashBiMap.create();
    int i = 1;
    for (Feature f : feature_set.getFeatures()) {
      feature_map.put(f, i);
      i++;
    }
    return convertFeatureSetToLibLinearFeatures(feature_set, feature_map);
  }

  /**
   * Converts a single word to an array of LibLinear features.
   *
   * @param w           the word to convert
   * @param feature_map a map of features to (one-based) indices
   * @return an array of LibLinear features
   * @throws AuToBIException if a feature is a string feature, which LibLinear does not support
   */
  public static de.bwaldvogel.liblinear.Feature[] convertWordToLibLinearFeatures(Word w,
      HashBiMap<Feature, Integer> feature_map) throws AuToBIException {
    ArrayList<FeatureNode> fs = new ArrayList<FeatureNode>();
    BiMap<Integer, Feature> map_feature = feature_map.inverse();
    // Indices in feature_map are one-based, running from 1 to feature_map.size() inclusive.
    for (int i = 1; i <= feature_map.size(); i++) {
      Feature feature = map_feature.get(i);
      if (w.hasAttribute(feature.getName())) {
        if (feature.isString()) {
          throw new AuToBIException("Feature, " + feature.getName()
              + ", is a 'string' feature. LibLinear does not support this feature type.");
        } else if (feature.isNominal()) {
          double idx = feature.getNominalIndex((String) w.getAttribute(feature.getName()));
          fs.add(new FeatureNode(i, idx));
        } else {
          Double value = (Double) w.getAttribute(feature.getName());
          if (!Double.isNaN(value)) {
            fs.add(new FeatureNode(i, value));
          }
        }
      }
    }
    return fs.toArray(new de.bwaldvogel.liblinear.Feature[fs.size()]);
  }

  /**
   * Normalizes features to zero mean and unit variance using precomputed statistics.
   *
   * @param features    input (unscaled) features
   * @param feature_map a map of feature indices to features
   * @param norm_params a map from feature names to aggregations holding normalization statistics
   * @return normalized features
   */
  public static de.bwaldvogel.liblinear.Feature[][] normalizeLibLinearFeatures(
      de.bwaldvogel.liblinear.Feature[][] features, BiMap<Integer, Feature> feature_map,
      HashMap<String, Aggregation> norm_params) {
    de.bwaldvogel.liblinear.Feature[][] f_out = features.clone();
    int i = 0;
    for (de.bwaldvogel.liblinear.Feature[] f : features) {
      f_out[i] = normalizeLibLinearFeatures(f, feature_map, norm_params);
      i++;
    }
    return f_out;
  }

  /**
   * Normalizes features so they have zero mean and unit variance.
   *
   * @param features    input (unscaled) features
   * @param feature_map a map of feature indices to features
   * @param norm_params a map from feature names to aggregations holding normalization statistics
   * @return normalized features
   */
  public static de.bwaldvogel.liblinear.Feature[] normalizeLibLinearFeatures(
      de.bwaldvogel.liblinear.Feature[] features, BiMap<Integer, Feature> feature_map,
      HashMap<String, Aggregation> norm_params) {
    de.bwaldvogel.liblinear.Feature[] f_out = features.clone();
    for (de.bwaldvogel.liblinear.Feature fn : f_out) {
      if (!feature_map.containsKey(fn.getIndex())) {
        fn.setValue(0);
      } else {
        Aggregation agg = norm_params.get(feature_map.get(fn.getIndex()).getName());
        if (agg.getSize() < 2) {
          // Too few observations to estimate a standard deviation.
          fn.setValue(0);
        } else {
          double u = agg.getMean();
          double sd = agg.getStdev();
          fn.setValue((fn.getValue() - u) / sd);
          if (Double.isNaN(fn.getValue())) {
            fn.setValue(0);
          }
        }
      }
    }
    return f_out;
  }
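  /*
   * Normalization sketch: z-scoring previously converted LibLinear features using
   * statistics from the same feature set. Illustrative only; "x" and "map" are assumed to
   * come from the conversion utilities in this class.
   *
   *   HashMap<String, Aggregation> params = ClassifierUtils.generateNormParams(fs);
   *   x = ClassifierUtils.normalizeLibLinearFeatures(x, map.inverse(), params);
   *   // Features with fewer than two observations, or with unmapped indices, are zeroed.
   */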
  /**
   * Generates normalization parameters based on the data available in a feature set.
   *
   * @param feature_set the feature set to analyze
   * @return a hash containing aggregations to be used for normalization
   */
  public static HashMap<String, Aggregation> generateNormParams(FeatureSet feature_set) {
    HashMap<String, Aggregation> norm_params = new HashMap<String, Aggregation>();
    for (Word w : feature_set.getDataPoints()) {
      for (String f : feature_set.getFeatureNames()) {
        if (!norm_params.containsKey(f)) {
          norm_params.put(f, new Aggregation());
        }
        if (w.hasAttribute(f)) {
          Object v = w.getAttribute(f);
          if (v instanceof Number) {
            Double value = ((Number) v).doubleValue();
            if (!Double.isNaN(value)) {
              norm_params.get(f).insert(value);
            }
          }
        }
      }
    }
    return norm_params;
  }

  /**
   * Creates an invertible mapping from Features to LibLinear-compatible (one-based) feature indices.
   *
   * @param feature_set the feature set
   * @return a bidirectional map between features and indices
   */
  public static HashBiMap<Feature, Integer> generateFeatureMap(FeatureSet feature_set) {
    HashBiMap<Feature, Integer> feature_map = HashBiMap.create();
    int idx = 1;
    for (edu.cuny.qc.speech.AuToBI.core.Feature f : feature_set.getFeatures()) {
      feature_map.put(f, idx);
      idx++;
    }
    return feature_map;
  }
}
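/*
 * End-to-end sketch of the LibLinear conversion path defined above. Illustrative only;
 * "fs" is an assumed, populated FeatureSet and "class_values" a caller-chosen ordering of
 * its class labels.
 *
 *   HashBiMap<Feature, Integer> map = ClassifierUtils.generateFeatureMap(fs);
 *   de.bwaldvogel.liblinear.Feature[][] x = ClassifierUtils.convertFeatureSetToLibLinearFeatures(fs, map);
 *   double[] y = ClassifierUtils.convertFeatureSetToLibLinearLabels(fs, class_values);
 *   // x and y can be assembled into a de.bwaldvogel.liblinear.Problem for training.
 */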