weka.attributeSelection.GainRatioAttributeEval.java Source code

Java tutorial

Introduction

Here is the source code for weka.attributeSelection.GainRatioAttributeEval.java

Source

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    GainRatioAttributeEval.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.attributeSelection;

import java.util.Enumeration;
import java.util.Vector;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

/**
 * <!-- globalinfo-start --> GainRatioAttributeEval :<br/>
 * <br/>
 * Evaluates the worth of an attribute by measuring the gain ratio with respect
 * to the class.<br/>
 * <br/>
 * GainR(Class, Attribute) = (H(Class) - H(Class | Attribute)) / H(Attribute).<br/>
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 * -M
 *  treat missing values as a seperate value.
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision$
 * @see Discretize
 */
public class GainRatioAttributeEval extends ASEvaluation implements AttributeEvaluator, OptionHandler {

    /** for serialization */
    static final long serialVersionUID = -8504656625598579926L;

    /** The training instances */
    private Instances m_trainInstances;

    /** The class index */
    private int m_classIndex;

    /** The number of instances */
    private int m_numInstances;

    /** The number of classes */
    private int m_numClasses;

    /** Merge missing values */
    private boolean m_missing_merge;

    /**
     * Returns a string describing this attribute evaluator
     * 
     * @return a description of the evaluator suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String globalInfo() {
        return "GainRatioAttributeEval :\n\nEvaluates the worth of an attribute "
                + "by measuring the gain ratio with respect to the class.\n\n"
                + "GainR(Class, Attribute) = (H(Class) - H(Class | Attribute)) / " + "H(Attribute).\n";
    }

    /**
     * Constructor
     */
    public GainRatioAttributeEval() {
        resetOptions();
    }

    /**
     * Returns an enumeration describing the available options.
     * 
     * @return an enumeration of all the available options.
     **/
    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> newVector = new Vector<Option>(1);
        newVector.addElement(new Option("\ttreat missing values as a seperate " + "value.", "M", 0, "-M"));
        return newVector.elements();
    }

    /**
     * Parses a given list of options.
     * <p/>
     * 
     * <!-- options-start --> Valid options are:
     * <p/>
     * 
     * <pre>
     * -M
     *  treat missing values as a seperate value.
     * </pre>
     * 
     * <!-- options-end -->
     * 
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     **/
    @Override
    public void setOptions(String[] options) throws Exception {
        resetOptions();
        setMissingMerge(!(Utils.getFlag('M', options)));
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String missingMergeTipText() {
        return "Distribute counts for missing values. Counts are distributed "
                + "across other values in proportion to their frequency. Otherwise, "
                + "missing is treated as a separate value.";
    }

    /**
     * distribute the counts for missing values across observed values
     * 
     * @param b true=distribute missing values.
     */
    public void setMissingMerge(boolean b) {
        m_missing_merge = b;
    }

    /**
     * get whether missing values are being distributed or not
     * 
     * @return true if missing values are being distributed.
     */
    public boolean getMissingMerge() {
        return m_missing_merge;
    }

    /**
     * Gets the current settings of WrapperSubsetEval.
     * 
     * @return an array of strings suitable for passing to setOptions()
     */
    @Override
    public String[] getOptions() {
        String[] options = new String[1];

        if (!getMissingMerge()) {
            options[0] = "-M";
        } else {
            options[0] = "";
        }

        return options;
    }

    /**
     * Returns the capabilities of this evaluator.
     * 
     * @return the capabilities of this evaluator
     * @see Capabilities
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();

        // attributes
        result.enable(Capability.NOMINAL_ATTRIBUTES);
        result.enable(Capability.NUMERIC_ATTRIBUTES);
        result.enable(Capability.DATE_ATTRIBUTES);
        result.enable(Capability.MISSING_VALUES);

        // class
        result.enable(Capability.NOMINAL_CLASS);
        result.enable(Capability.MISSING_CLASS_VALUES);

        return result;
    }

    /**
     * Initializes a gain ratio attribute evaluator. Discretizes all attributes
     * that are numeric.
     * 
     * @param data set of instances serving as training data
     * @throws Exception if the evaluator has not been generated successfully
     */
    @Override
    public void buildEvaluator(Instances data) throws Exception {

        // can evaluator handle data?
        getCapabilities().testWithFail(data);

        m_trainInstances = data;
        m_classIndex = m_trainInstances.classIndex();
        m_numInstances = m_trainInstances.numInstances();
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(m_trainInstances);
        m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);
        m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
    }

    /**
     * reset options to default values
     */
    protected void resetOptions() {
        m_trainInstances = null;
        m_missing_merge = true;
    }

    /**
     * evaluates an individual attribute by measuring the gain ratio of the class
     * given the attribute.
     * 
     * @param attribute the index of the attribute to be evaluated
     * @return the gain ratio
     * @throws Exception if the attribute could not be evaluated
     */
    @Override
    public double evaluateAttribute(int attribute) throws Exception {
        int i, j, ii, jj;
        int ni, nj;
        double sum = 0.0;
        ni = m_trainInstances.attribute(attribute).numValues() + 1;
        nj = m_numClasses + 1;
        double[] sumi, sumj;
        Instance inst;
        double temp = 0.0;
        sumi = new double[ni];
        sumj = new double[nj];
        double[][] counts = new double[ni][nj];
        sumi = new double[ni];
        sumj = new double[nj];

        for (i = 0; i < ni; i++) {
            sumi[i] = 0.0;

            for (j = 0; j < nj; j++) {
                sumj[j] = 0.0;
                counts[i][j] = 0.0;
            }
        }

        // Fill the contingency table
        for (i = 0; i < m_numInstances; i++) {
            inst = m_trainInstances.instance(i);

            if (inst.isMissing(attribute)) {
                ii = ni - 1;
            } else {
                ii = (int) inst.value(attribute);
            }

            if (inst.isMissing(m_classIndex)) {
                jj = nj - 1;
            } else {
                jj = (int) inst.value(m_classIndex);
            }

            counts[ii][jj] += inst.weight();
        }

        // get the row totals
        for (i = 0; i < ni; i++) {
            sumi[i] = 0.0;

            for (j = 0; j < nj; j++) {
                sumi[i] += counts[i][j];
                sum += counts[i][j];
            }
        }

        // get the column totals
        for (j = 0; j < nj; j++) {
            sumj[j] = 0.0;

            for (i = 0; i < ni; i++) {
                sumj[j] += counts[i][j];
            }
        }

        // distribute missing counts
        if (m_missing_merge && (sumi[ni - 1] < sum) && (sumj[nj - 1] < sum)) {
            double[] i_copy = new double[sumi.length];
            double[] j_copy = new double[sumj.length];
            double[][] counts_copy = new double[sumi.length][sumj.length];

            for (i = 0; i < ni; i++) {
                System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length);
            }

            System.arraycopy(sumi, 0, i_copy, 0, sumi.length);
            System.arraycopy(sumj, 0, j_copy, 0, sumj.length);
            double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]);

            // do the missing i's
            if (sumi[ni - 1] > 0.0) {
                for (j = 0; j < nj - 1; j++) {
                    if (counts[ni - 1][j] > 0.0) {
                        for (i = 0; i < ni - 1; i++) {
                            temp = ((i_copy[i] / (sum - i_copy[ni - 1])) * counts[ni - 1][j]);
                            counts[i][j] += temp;
                            sumi[i] += temp;
                        }

                        counts[ni - 1][j] = 0.0;
                    }
                }
            }

            sumi[ni - 1] = 0.0;

            // do the missing j's
            if (sumj[nj - 1] > 0.0) {
                for (i = 0; i < ni - 1; i++) {
                    if (counts[i][nj - 1] > 0.0) {
                        for (j = 0; j < nj - 1; j++) {
                            temp = ((j_copy[j] / (sum - j_copy[nj - 1])) * counts[i][nj - 1]);
                            counts[i][j] += temp;
                            sumj[j] += temp;
                        }

                        counts[i][nj - 1] = 0.0;
                    }
                }
            }

            sumj[nj - 1] = 0.0;

            // do the both missing
            if (counts[ni - 1][nj - 1] > 0.0 && total_missing < sum) {
                for (i = 0; i < ni - 1; i++) {
                    for (j = 0; j < nj - 1; j++) {
                        temp = (counts_copy[i][j] / (sum - total_missing)) * counts_copy[ni - 1][nj - 1];
                        counts[i][j] += temp;
                        sumi[i] += temp;
                        sumj[j] += temp;
                    }
                }

                counts[ni - 1][nj - 1] = 0.0;
            }
        }

        return ContingencyTables.gainRatio(counts);
    }

    /**
     * Return a description of the evaluator
     * 
     * @return description as a string
     */
    @Override
    public String toString() {
        StringBuffer text = new StringBuffer();

        if (m_trainInstances == null) {
            text.append("\tGain Ratio evaluator has not been built");
        } else {
            text.append("\tGain Ratio feature evaluator");

            if (!m_missing_merge) {
                text.append("\n\tMissing values treated as seperate");
            }
        }

        text.append("\n");
        return text.toString();
    }

    /**
     * Returns the revision string.
     * 
     * @return the revision
     */
    @Override
    public String getRevision() {
        return RevisionUtils.extract("$Revision$");
    }

    @Override
    public int[] postProcess(int[] attributeSet) {

        // save memory
        m_trainInstances = new Instances(m_trainInstances, 0);

        return attributeSet;
    }

    /**
     * Main method.
     * 
     * @param args the options -t training file
     */
    public static void main(String[] args) {
        runEvaluator(new GainRatioAttributeEval(), args);
    }
}