weka.core.stats.NominalStats.java Source code

Java tutorial

Introduction

Here is the source code for weka.core.stats.NominalStats.java

Source

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    NominalStats
 *    Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.stats;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import weka.core.Attribute;
import weka.core.Utils;
import weka.distributed.CSVToARFFHeaderMapTask;

/**
 * Class for computing nominal statistics (primarily frequency counts).
 * <p>
 * Counts are kept per label in a sorted map, together with a separate tally
 * of missing values. The statistics can be encoded as a nominal summary
 * attribute via {@link #makeAttribute()} (each value has the form
 * <code>&lt;label&gt;_&lt;count&gt;</code>) and decoded again via
 * {@link #attributeToStats(Attribute)}.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public class NominalStats extends Stats implements Serializable {

    /** A map of values to counts (a TreeMap, so labels iterate in sorted order) */
    protected Map<String, NominalStats.Count> m_counts = new TreeMap<String, NominalStats.Count>();

    /** The number of missing values for this nominal attribute */
    protected double m_numMissing;

    /** A "label" to use when storing the number of missing values */
    public static final String MISSING_LABEL = "**missing**";

    /** For serialization */
    private static final long serialVersionUID = -6176046647546730423L;

    /**
     * Class that encapsulates a count for nominal value
     * 
     * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
     */
    public static class Count implements Serializable {

        /** For serialization */
        private static final long serialVersionUID = 4310467271632108735L;

        /** The value of the count */
        public double m_count;
    }

    /**
     * Constructs a new NominalStats
     * 
     * @param attributeName the name of the data attribute that these stats
     *          pertain to
     */
    public NominalStats(String attributeName) {
        super(attributeName);
    }

    /**
     * Convert a summary meta attribute to a NominalStats. Each value of the
     * attribute is expected to have the form <code>&lt;label&gt;_&lt;count&gt;</code>;
     * the text after the last underscore is parsed as the count, so labels may
     * themselves contain underscores. A label equal to {@link #MISSING_LABEL}
     * is added to the missing-value tally rather than to the label counts.
     * 
     * @param a the attribute to convert
     * @return a NominalStats
     * @throws IllegalArgumentException if the attribute is not nominal, if a
     *           value contains no underscore separator, or if the count part
     *           of a value cannot be parsed as a number
     */
    public static NominalStats attributeToStats(Attribute a) throws IllegalArgumentException {

        if (!a.isNominal()) {
            throw new IllegalArgumentException("Stats attribute is not nominal!");
        }

        NominalStats ns = new NominalStats(a.name());
        for (int j = 0; j < a.numValues(); j++) {
            String v = a.value(j);

            // Locate the separator once. Without this check a value lacking an
            // underscore would surface as a StringIndexOutOfBoundsException
            // from substring() instead of the documented exception type.
            int separator = v.lastIndexOf('_');
            if (separator < 0) {
                throw new IllegalArgumentException("Summary value '" + v + "' of attribute '" + a.name()
                        + "' is not of the form <label>_<count>");
            }

            String label = v.substring(0, separator);
            String freqCount = v.substring(separator + 1);
            try {
                double fC = Double.parseDouble(freqCount);
                if (label.equals(NominalStats.MISSING_LABEL)) {
                    // a null label routes the count to the missing-value tally
                    ns.add(null, fC);
                } else {
                    ns.add(label, fC);
                }
            } catch (NumberFormatException n) {
                throw new IllegalArgumentException(n);
            }
        }

        return ns;
    }

    /**
     * Adds to the count for a given label. If the label is null then it adds to
     * the count for missing. A label that has not been seen before is created
     * with an initial count of zero before the value is added.
     * 
     * @param label the label to add the count to
     * @param value the count to add
     */
    public void add(String label, double value) {

        if (label == null) {
            m_numMissing += value;
        } else {

            NominalStats.Count c = m_counts.get(label);
            if (c == null) {
                c = new Count();
                m_counts.put(label, c);
            }

            c.m_count += value;
        }
    }

    /**
     * Get the set of labels seen by this NominalStats. The returned set is the
     * key set of the underlying map (not a copy), so it reflects labels added
     * later.
     * 
     * @return the set of labels
     */
    public Set<String> getLabels() {
        return m_counts.keySet();
    }

    /**
     * Get the count for a given label
     * 
     * @param label the label to get the count for
     * @return the count or missing value if the label is unknown (a null label
     *         is always unknown; use {@link #getNumMissing()} for the number of
     *         missing values)
     */
    public double getCount(String label) {
        // add() never stores a null label, and a naturally ordered TreeMap
        // throws NullPointerException on a null lookup, so treat null as an
        // unknown label up front.
        if (label == null) {
            return Utils.missingValue();
        }

        NominalStats.Count c = m_counts.get(label);

        if (c == null) {
            return Utils.missingValue();
        }

        return c.m_count;
    }

    /**
     * Get the number of missing values for this attribute
     * 
     * @return the number of missing values seen
     */
    public double getNumMissing() {
        return m_numMissing;
    }

    /**
     * Get the index of the mode. Ties are resolved in favour of the label that
     * comes first in the iteration order of the label map.
     * 
     * @return the index (in the sorted list of labels) of the mode, or -1 if no
     *         labels have been seen
     */
    public int getMode() {
        double max = -1;
        int maxIndex = -1;

        int index = 0;
        for (Map.Entry<String, NominalStats.Count> e : m_counts.entrySet()) {
            // strict comparison keeps the earliest label on ties
            if (e.getValue().m_count > max) {
                max = e.getValue().m_count;
                maxIndex = index;
            }
            index++;
        }

        return maxIndex;
    }

    /**
     * Get the most frequent label (not including missing values). Ties are
     * resolved in favour of the label that comes first in the iteration order
     * of the label map.
     * 
     * @return the most frequent label, or the empty string if no labels have
     *         been seen
     */
    public String getModeLabel() {
        double max = -1;
        String maxLabel = "";

        for (Map.Entry<String, NominalStats.Count> e : m_counts.entrySet()) {
            // strict comparison keeps the earliest label on ties
            if (e.getValue().m_count > max) {
                max = e.getValue().m_count;
                maxLabel = e.getKey();
            }
        }

        return maxLabel;
    }

    /**
     * Set the number of missing values for this attribute
     * 
     * @param missing the number of missing values
     */
    public void setNumMissing(double missing) {
        m_numMissing = missing;
    }

    /**
     * Encode these statistics as a nominal summary attribute. One value of the
     * form <code>&lt;label&gt;_&lt;count&gt;</code> is produced per label, in the
     * iteration order of the label map, followed by a final value holding the
     * missing count under {@link #MISSING_LABEL}. The attribute name is the
     * summary attribute prefix followed by the name of the attribute these
     * stats pertain to.
     * 
     * @return the summary attribute
     */
    @Override
    public Attribute makeAttribute() {
        ArrayList<String> vals = new ArrayList<String>();

        for (Map.Entry<String, NominalStats.Count> e : m_counts.entrySet()) {
            vals.add(e.getKey() + "_" + e.getValue().m_count);
        }

        // the missing count is always emitted, and always last
        vals.add(MISSING_LABEL + "_" + m_numMissing);

        Attribute a = new Attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + m_attributeName, vals);

        return a;
    }
}