weka.attributeSelection.ConsistencySubsetEval.java Source code

Java tutorial

Introduction

Here is the source code for weka.attributeSelection.ConsistencySubsetEval.java

Source

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ConsistencySubsetEval.java
 *    Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.attributeSelection;

import java.io.Serializable;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Hashtable;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

/**
 * <!-- globalinfo-start --> ConsistencySubsetEval :<br/>
 * <br/>
 * Evaluates the worth of a subset of attributes by the level of consistency in
 * the class values when the training instances are projected onto the subset of
 * attributes. <br/>
 * <br/>
 * Consistency of any subset can never be lower than that of the full set of
 * attributes, hence the usual practice is to use this subset evaluator in
 * conjunction with a Random or Exhaustive search which looks for the smallest
 * subset with consistency equal to that of the full set of attributes.<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * H. Liu, R. Setiono: A probabilistic approach to feature selection - A filter
 * solution. In: 13th International Conference on Machine Learning, 319-327,
 * 1996.
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * <!-- technical-bibtex-start --> BibTeX:
 * 
 * <pre>
 * &#64;inproceedings{Liu1996,
 *    author = {H. Liu and R. Setiono},
 *    booktitle = {13th International Conference on Machine Learning},
 *    pages = {319-327},
 *    title = {A probabilistic approach to feature selection - A filter solution},
 *    year = {1996}
 * }
 * </pre>
 * <p/>
 * <!-- technical-bibtex-end -->
 * 
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision$
 * @see Discretize
 */
public class ConsistencySubsetEval extends ASEvaluation implements SubsetEvaluator, TechnicalInformationHandler {

    /** for serialization */
    static final long serialVersionUID = -2880323763295270402L;

    /**
     * Training instances. Null until buildEvaluator has run; buildEvaluator
     * stores a copy of the supplied data with missing-class instances removed
     * and then replaces it with the output of the discretization filter.
     */
    private Instances m_trainInstances;

    /** Index of the class attribute in the training data */
    private int m_classIndex;

    /** Number of attributes in the training data */
    private int m_numAttribs;

    /**
     * Number of instances in the training data, taken after instances with a
     * missing class value have been removed
     */
    private int m_numInstances;

    /** Supervised discretization filter applied to the training data */
    private Discretize m_disTransform;

    /**
     * Hash table for evaluating feature subsets: maps the attribute values of an
     * instance (restricted to the subset) to a per-class-value array of
     * accumulated instance weights. Recreated on every evaluateSubset call.
     */
    private Hashtable<hashKey, double[]> m_table;

    /**
     * Key for the hash table that groups instances sharing the same attribute
     * values. A key holds one value per position plus a flag marking positions
     * whose value is missing. Two keys are equal when they have the same length,
     * the same missing positions and equal values at every non-missing position.
     */
    public class hashKey implements Serializable, RevisionHandler {

        /** for serialization */
        static final long serialVersionUID = 6144138512017017408L;

        /** Value held by {@link #key} while no hash code has been cached. */
        private static final int UNCACHED = -999;

        /** Array of attribute values for an instance */
        private final double[] attributes;

        /** True for an index if the corresponding attribute value is missing. */
        private final boolean[] missing;

        /** Cached hash code, or {@link #UNCACHED} if it has not been computed. */
        private int key;

        /**
         * Builds a key from the first {@code numAtts} attributes of an instance.
         * The class attribute position is always flagged as missing so that it
         * never contributes a value to the key.
         *
         * @param t an instance from which to generate a key
         * @param numAtts the number of attributes
         * @throws Exception if something goes wrong
         */
        public hashKey(Instance t, int numAtts) throws Exception {
            final int classPos = t.classIndex();

            key = UNCACHED;
            attributes = new double[numAtts];
            missing = new boolean[numAtts];
            for (int i = 0; i < numAtts; i++) {
                // short-circuit: the instance is not queried for the class position
                missing[i] = (i == classPos) || t.isMissing(i);
                if (!missing[i]) {
                    attributes[i] = t.value(i);
                }
            }
        }

        /**
         * Builds a key from an array of feature values. The array is copied, so
         * the caller may reuse it afterwards. An element equal to
         * {@link Double#MAX_VALUE} marks a missing value.
         *
         * @param t an array of feature values
         */
        public hashKey(double[] t) {
            final int length = t.length;

            key = UNCACHED;
            attributes = new double[length];
            missing = new boolean[length];
            for (int i = 0; i < length; i++) {
                missing[i] = (t[i] == Double.MAX_VALUE);
                if (!missing[i]) {
                    attributes[i] = t[i];
                }
            }
        }

        /**
         * Convert a hash entry to a string. The class attribute position is
         * skipped; every other position is rendered as "?" (missing) or as the
         * attribute's label for the stored value, padded with spaces to
         * {@code maxColWidth + 1} characters. Labels longer than that are not
         * truncated.
         *
         * @param t the set of instances providing the attribute definitions
         * @param maxColWidth width to make the fields
         * @return the hash entry as string
         */
        public String toString(Instances t, int maxColWidth) {
            final int classPos = t.classIndex();
            final StringBuilder text = new StringBuilder();

            for (int i = 0; i < attributes.length; i++) {
                if (i == classPos) {
                    continue;
                }
                // stored values are used as label indices of the attribute
                final String label = missing[i] ? "?" : t.attribute(i).value((int) attributes[i]);
                text.append(label);
                for (int pad = label.length(); pad <= maxColWidth; pad++) {
                    text.append(' ');
                }
            }
            return text.toString();
        }

        /**
         * Calculates a hash code. The value is computed once and cached in
         * {@link #key}; position 0 contributes nothing because every term is
         * scaled by its index.
         *
         * @return the hash code as an integer
         */
        @Override
        public int hashCode() {
            if (key != UNCACHED) {
                return key;
            }

            int hv = 0;
            for (int i = 0; i < attributes.length; i++) {
                if (missing[i]) {
                    hv += (i * 13);
                } else {
                    // compound assignment narrows the double sum back to int
                    hv += (i * 5 * (attributes[i] + 1));
                }
            }
            key = hv;
            return hv;
        }

        /**
         * Tests if two keys are equal.
         *
         * @param b a key to compare with
         * @return true if {@code b} is a key of the same class with the same
         *         length, the same missing positions and the same values at all
         *         non-missing positions
         */
        @Override
        public boolean equals(Object b) {
            if (this == b) {
                return true;
            }
            if ((b == null) || !(b.getClass().equals(this.getClass()))) {
                return false;
            }

            final hashKey other = (hashKey) b;
            // keys of different length can never match; without this check a
            // longer key could equal a shorter one on its prefix only, and a
            // shorter argument would trigger an index error
            if (attributes.length != other.attributes.length) {
                return false;
            }
            for (int i = 0; i < attributes.length; i++) {
                if (missing[i] != other.missing[i]) {
                    return false;
                }
                if (!missing[i] && attributes[i] != other.attributes[i]) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Prints the hash code to standard output
         */
        public void print_hash_code() {

            System.out.println("Hash val: " + hashCode());
        }

        /**
         * Returns the revision string.
         *
         * @return the revision
         */
        @Override
        public String getRevision() {
            return RevisionUtils.extract("$Revision$");
        }
    }

    /**
     * Returns a string describing this subset evaluator, followed by the
     * technical reference it is based on.
     *
     * @return a description of the evaluator suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String globalInfo() {
        final String summary = "ConsistencySubsetEval :\n\nEvaluates the worth of a subset of "
                + "attributes by the level of consistency in the class values when the "
                + "training instances are projected onto the subset of attributes. ";
        final String usage = "\n\nConsistency of any subset can never be lower than that of the "
                + "full set of attributes, hence the usual practice is to use this "
                + "subset evaluator in conjunction with a Random or Exhaustive search "
                + "which looks for the smallest subset with consistency equal to that "
                + "of the full set of attributes.\n\n";
        final String reference = "For more information see:\n\n" + getTechnicalInformation().toString();

        return summary + usage + reference;
    }

    /**
     * Returns the bibliographic record of the paper this evaluator is based on
     * (Liu and Setiono, 1996), as an in-proceedings entry.
     *
     * @return the technical information about this class
     */
    @Override
    public TechnicalInformation getTechnicalInformation() {
        final TechnicalInformation paper = new TechnicalInformation(Type.INPROCEEDINGS);

        paper.setValue(Field.AUTHOR, "H. Liu and R. Setiono");
        paper.setValue(Field.TITLE, "A probabilistic approach to feature selection - A filter solution");
        paper.setValue(Field.BOOKTITLE, "13th International Conference on Machine Learning");
        paper.setValue(Field.YEAR, "1996");
        paper.setValue(Field.PAGES, "319-327");
        return paper;
    }

    /**
     * Constructor. Calls resetOptions to put the evaluator into its default,
     * not-yet-built state.
     */
    public ConsistencySubsetEval() {
        this.resetOptions();
    }

    /**
     * Resets the evaluator to its defaults by dropping the reference to the
     * training instances.
     */
    private void resetOptions() {
        this.m_trainInstances = null;
    }

    /**
     * Returns the capabilities of this evaluator: nominal, numeric and date
     * attributes with missing values, and a nominal class that may have missing
     * values.
     *
     * @return the capabilities of this evaluator
     * @see Capabilities
     */
    @Override
    public Capabilities getCapabilities() {
        final Capabilities caps = super.getCapabilities();
        caps.disableAll();

        // attribute capabilities first, then class capabilities
        final Capability[] supported = {
                Capability.NOMINAL_ATTRIBUTES,
                Capability.NUMERIC_ATTRIBUTES,
                Capability.DATE_ATTRIBUTES,
                Capability.MISSING_VALUES,
                Capability.NOMINAL_CLASS,
                Capability.MISSING_CLASS_VALUES };

        for (final Capability capability : supported) {
            caps.enable(capability);
        }
        return caps;
    }

    /**
     * Builds the evaluator from training data. The data is checked against the
     * evaluator's capabilities and copied, instances with a missing class value
     * are removed from the copy, its dimensions are recorded, and the copy is
     * finally passed through a supervised discretization filter.
     *
     * @param data set of instances serving as training data
     * @throws Exception if the evaluator has not been generated successfully
     */
    @Override
    public void buildEvaluator(Instances data) throws Exception {

        // can evaluator handle data?
        getCapabilities().testWithFail(data);

        final Instances working = new Instances(data);
        m_trainInstances = working;
        working.deleteWithMissingClass();

        // dimensions are recorded after the missing-class instances are gone
        m_classIndex = working.classIndex();
        m_numAttribs = working.numAttributes();
        m_numInstances = working.numInstances();

        final Discretize discretizer = new Discretize();
        m_disTransform = discretizer;
        discretizer.setUseBetterEncoding(true);
        discretizer.setInputFormat(working);
        m_trainInstances = Filter.useFilter(working, discretizer);
    }

    /**
     * Evaluates a subset of attributes. Every training instance is projected
     * onto the selected attributes and inserted into a freshly created hash
     * table keyed by the projected values; the consistency of that table is the
     * merit of the subset.
     *
     * @param subset a bitset representing the attribute subset to be evaluated;
     *          bits at or beyond the number of training attributes are ignored
     * @return the consistency of the training data projected onto the subset
     * @throws Exception if the subset could not be evaluated, in particular an
     *           IllegalArgumentException if the subset contains the class
     *           attribute
     */
    @Override
    public double evaluateSubset(BitSet subset) throws Exception {
        // Gather the selected attribute indices in one pass. The subset is
        // validated here, once, instead of once per instance and attribute, so
        // an invalid subset is rejected even when there is no training data.
        final int[] selected = new int[m_numAttribs];
        int count = 0;
        for (int att = subset.nextSetBit(0); att >= 0 && att < m_numAttribs; att = subset.nextSetBit(att + 1)) {
            if (att == m_classIndex) {
                throw new IllegalArgumentException("A subset should not contain the class!");
            }
            selected[count++] = att;
        }

        // create new hash table
        m_table = new Hashtable<hashKey, double[]>((int) (m_numInstances * 1.5));

        // One buffer is reused for all instances; the hashKey constructor
        // copies the values it is given.
        final double[] projected = new double[count];
        for (int i = 0; i < m_numInstances; i++) {
            final Instance inst = m_trainInstances.instance(i);
            for (int j = 0; j < count; j++) {
                final int att = selected[j];
                // Double.MAX_VALUE is the marker hashKey treats as "missing"
                projected[j] = inst.isMissing(att) ? Double.MAX_VALUE : inst.value(att);
            }
            insertIntoTable(inst, projected);
        }

        return consistencyCount();
    }

    /**
     * Calculates the level of consistency of the current hash table. Each table
     * entry holds a class distribution; its inconsistency is the total of that
     * distribution minus its largest element. The result is 1.0 minus the summed
     * inconsistencies divided by the number of training instances.
     * <p>
     * NOTE(review): the distributions are accumulated from instance weights
     * while the divisor is an instance count, so with non-unit weights the
     * result may leave the range [0, 1] - confirm this is intended.
     *
     * @return the consistency of the hash table as a value between 0 and 1.
     */
    private double consistencyCount() {
        double inconsistent = 0.0;

        for (final double[] dist : m_table.values()) {
            inconsistent += Utils.sum(dist);
            inconsistent -= dist[Utils.maxIndex(dist)];
        }

        return 1.0 - (inconsistent / m_numInstances);
    }

    /**
     * Inserts an instance into the hash table. The instance's weight is added to
     * the element for its class value in the class distribution stored under the
     * key built from {@code instA}; a new distribution is created and stored if
     * the key is not in the table yet.
     *
     * @param inst instance to be inserted
     * @param instA the instance to be inserted as an array of attribute values.
     * @throws Exception if the instance can't be inserted
     */
    private void insertIntoTable(Instance inst, double[] instA) throws Exception {
        final hashKey thekey = new hashKey(instA);
        final int classValue = (int) inst.classValue();

        // see if this one is already in the table
        final double[] existing = m_table.get(thekey);
        if (existing == null) {
            final double[] newDist = new double[m_trainInstances.classAttribute().numValues()];
            newDist[classValue] = inst.weight();

            // add to the table
            m_table.put(thekey, newDist);
        } else {
            // the stored array is updated in place, so no second put is needed
            existing[classValue] += inst.weight();
        }
    }

    /**
     * Returns a one-line description of the evaluator, which differs depending
     * on whether training instances are currently set.
     *
     * @return a description of the evaluator as a String.
     */
    @Override
    public String toString() {
        final boolean notBuilt = (m_trainInstances == null);

        return notBuilt
                ? "\tConsistency subset evaluator has not been built yet\n"
                : "\tConsistency Subset Evaluator\n";
    }

    /**
     * Returns the revision string extracted from the revision keyword.
     *
     * @return the revision
     */
    @Override
    public String getRevision() {
        final String revisionKeyword = "$Revision$";
        return RevisionUtils.extract(revisionKeyword);
    }

    /**
     * Releases the training data to save memory by replacing it with
     * {@code new Instances(m_trainInstances, 0)}. Does nothing if the evaluator
     * has not been built, in which case there is nothing to release.
     * <p>
     * NOTE(review): the recorded instance count is not reset here, so the
     * evaluator presumably must be rebuilt before subsets are evaluated again -
     * confirm against callers.
     */
    @Override
    public void clean() {
        if (m_trainInstances == null) {
            // not built yet: avoid dereferencing a null dataset
            return;
        }

        // save memory
        m_trainInstances = new Instances(m_trainInstances, 0);
    }

    /**
     * Main method for testing this class: runs a new evaluator with the given
     * command-line options.
     *
     * @param args the options
     */
    public static void main(String[] args) {
        final ConsistencySubsetEval evaluator = new ConsistencySubsetEval();
        runEvaluator(evaluator, args);
    }
}