// Java tutorial
/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * ConsistencySubsetEval.java * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand * */ package weka.attributeSelection; import java.io.Serializable; import java.util.BitSet; import java.util.Enumeration; import java.util.Hashtable; import weka.core.Capabilities; import weka.core.Capabilities.Capability; import weka.core.Instance; import weka.core.Instances; import weka.core.RevisionHandler; import weka.core.RevisionUtils; import weka.core.TechnicalInformation; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.filters.Filter; import weka.filters.supervised.attribute.Discretize; /** * <!-- globalinfo-start --> ConsistencySubsetEval :<br/> * <br/> * Evaluates the worth of a subset of attributes by the level of consistency in * the class values when the training instances are projected onto the subset of * attributes. <br/> * <br/> * Consistency of any subset can never be lower than that of the full set of * attributes, hence the usual practice is to use this subset evaluator in * conjunction with a Random or Exhaustive search which looks for the smallest * subset with consistency equal to that of the full set of attributes.<br/> * <br/> * For more information see:<br/> * <br/> * H. Liu, R. 
Setiono: A probabilistic approach to feature selection - A filter * solution. In: 13th International Conference on Machine Learning, 319-327, * 1996. * <p/> * <!-- globalinfo-end --> * * <!-- technical-bibtex-start --> BibTeX: * * <pre> * @inproceedings{Liu1996, * author = {H. Liu and R. Setiono}, * booktitle = {13th International Conference on Machine Learning}, * pages = {319-327}, * title = {A probabilistic approach to feature selection - A filter solution}, * year = {1996} * } * </pre> * <p/> * <!-- technical-bibtex-end --> * * @author Mark Hall (mhall@cs.waikato.ac.nz) * @version $Revision$ * @see Discretize */ public class ConsistencySubsetEval extends ASEvaluation implements SubsetEvaluator, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = -2880323763295270402L; /** training instances */ private Instances m_trainInstances; /** class index */ private int m_classIndex; /** number of attributes in the training data */ private int m_numAttribs; /** number of instances in the training data */ private int m_numInstances; /** Discretise numeric attributes */ private Discretize m_disTransform; /** Hash table for evaluating feature subsets */ private Hashtable<hashKey, double[]> m_table; /** * Class providing keys to the hash table. */ public class hashKey implements Serializable, RevisionHandler { /** for serialization */ static final long serialVersionUID = 6144138512017017408L; /** Array of attribute values for an instance */ private final double[] attributes; /** True for an index if the corresponding attribute value is missing. 
*/ private final boolean[] missing; /** The key */ private int key; /** * Constructor for a hashKey * * @param t an instance from which to generate a key * @param numAtts the number of attributes * @throws Exception if something goes wrong */ public hashKey(Instance t, int numAtts) throws Exception { int i; int cindex = t.classIndex(); key = -999; attributes = new double[numAtts]; missing = new boolean[numAtts]; for (i = 0; i < numAtts; i++) { if (i == cindex) { missing[i] = true; } else { if ((missing[i] = t.isMissing(i)) == false) { attributes[i] = t.value(i); } } } } /** * Convert a hash entry to a string * * @param t the set of instances * @param maxColWidth width to make the fields * @return the hash entry as string */ public String toString(Instances t, int maxColWidth) { int i; int cindex = t.classIndex(); StringBuffer text = new StringBuffer(); for (i = 0; i < attributes.length; i++) { if (i != cindex) { if (missing[i]) { text.append("?"); for (int j = 0; j < maxColWidth; j++) { text.append(" "); } } else { String ss = t.attribute(i).value((int) attributes[i]); StringBuffer sb = new StringBuffer(ss); for (int j = 0; j < (maxColWidth - ss.length() + 1); j++) { sb.append(" "); } text.append(sb); } } } return text.toString(); } /** * Constructor for a hashKey * * @param t an array of feature values */ public hashKey(double[] t) { int i; int l = t.length; key = -999; attributes = new double[l]; missing = new boolean[l]; for (i = 0; i < l; i++) { if (t[i] == Double.MAX_VALUE) { missing[i] = true; } else { missing[i] = false; attributes[i] = t[i]; } } } /** * Calculates a hash code * * @return the hash code as an integer */ @Override public int hashCode() { int hv = 0; if (key != -999) { return key; } for (int i = 0; i < attributes.length; i++) { if (missing[i]) { hv += (i * 13); } else { hv += (i * 5 * (attributes[i] + 1)); } } if (key == -999) { key = hv; } return hv; } /** * Tests if two instances are equal * * @param b a key to compare with * @return true if 
the objects are equal */ @Override public boolean equals(Object b) { if ((b == null) || !(b.getClass().equals(this.getClass()))) { return false; } boolean ok = true; boolean l; if (b instanceof hashKey) { hashKey n = (hashKey) b; for (int i = 0; i < attributes.length; i++) { l = n.missing[i]; if (missing[i] || l) { if ((missing[i] && !l) || (!missing[i] && l)) { ok = false; break; } } else { if (attributes[i] != n.attributes[i]) { ok = false; break; } } } } else { return false; } return ok; } /** * Prints the hash code */ public void print_hash_code() { System.out.println("Hash val: " + hashCode()); } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } } /** * Returns a string describing this search method * * @return a description of the search suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "ConsistencySubsetEval :\n\nEvaluates the worth of a subset of " + "attributes by the level of consistency in the class values when the " + "training instances are projected onto the subset of attributes. " + "\n\nConsistency of any subset can never be lower than that of the " + "full set of attributes, hence the usual practice is to use this " + "subset evaluator in conjunction with a Random or Exhaustive search " + "which looks for the smallest subset with consistency equal to that " + "of the full set of attributes.\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "H. 
Liu and R. Setiono"); result.setValue(Field.TITLE, "A probabilistic approach to feature selection - A filter solution"); result.setValue(Field.BOOKTITLE, "13th International Conference on Machine Learning"); result.setValue(Field.YEAR, "1996"); result.setValue(Field.PAGES, "319-327"); return result; } /** * Constructor. Calls restOptions to set default options **/ public ConsistencySubsetEval() { resetOptions(); } /** * reset to defaults */ private void resetOptions() { m_trainInstances = null; } /** * Returns the capabilities of this evaluator. * * @return the capabilities of this evaluator * @see Capabilities */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); return result; } /** * Generates a attribute evaluator. Has to initialize all fields of the * evaluator that are not being set via options. * * @param data set of instances serving as training data * @throws Exception if the evaluator has not been generated successfully */ @Override public void buildEvaluator(Instances data) throws Exception { // can evaluator handle data? 
getCapabilities().testWithFail(data); m_trainInstances = new Instances(data); m_trainInstances.deleteWithMissingClass(); m_classIndex = m_trainInstances.classIndex(); m_numAttribs = m_trainInstances.numAttributes(); m_numInstances = m_trainInstances.numInstances(); m_disTransform = new Discretize(); m_disTransform.setUseBetterEncoding(true); m_disTransform.setInputFormat(m_trainInstances); m_trainInstances = Filter.useFilter(m_trainInstances, m_disTransform); } /** * Evaluates a subset of attributes * * @param subset a bitset representing the attribute subset to be evaluated * @throws Exception if the subset could not be evaluated */ @Override public double evaluateSubset(BitSet subset) throws Exception { int[] fs; int i; int count = 0; for (i = 0; i < m_numAttribs; i++) { if (subset.get(i)) { count++; } } double[] instArray = new double[count]; int index = 0; fs = new int[count]; for (i = 0; i < m_numAttribs; i++) { if (subset.get(i)) { fs[index++] = i; } } // create new hash table m_table = new Hashtable<hashKey, double[]>((int) (m_numInstances * 1.5)); for (i = 0; i < m_numInstances; i++) { Instance inst = m_trainInstances.instance(i); for (int j = 0; j < fs.length; j++) { if (fs[j] == m_classIndex) { throw new Exception("A subset should not contain the class!"); } if (inst.isMissing(fs[j])) { instArray[j] = Double.MAX_VALUE; } else { instArray[j] = inst.value(fs[j]); } } insertIntoTable(inst, instArray); } return consistencyCount(); } /** * calculates the level of consistency in a dataset using a subset of * features. The consistency of a hash table entry is the total number of * instances hashed to that location minus the number of instances in the * largest class hashed to that location. The total consistency is 1.0 minus * the sum of the individual consistencies divided by the total number of * instances. * * @return the consistency of the hash table as a value between 0 and 1. 
*/ private double consistencyCount() { Enumeration<hashKey> e = m_table.keys(); double[] classDist; double count = 0.0; while (e.hasMoreElements()) { hashKey tt = e.nextElement(); classDist = m_table.get(tt); count += Utils.sum(classDist); int max = Utils.maxIndex(classDist); count -= classDist[max]; } count /= m_numInstances; return (1.0 - count); } /** * Inserts an instance into the hash table * * @param inst instance to be inserted * @param instA the instance to be inserted as an array of attribute values. * @throws Exception if the instance can't be inserted */ private void insertIntoTable(Instance inst, double[] instA) throws Exception { double[] tempClassDist2; double[] newDist; hashKey thekey; thekey = new hashKey(instA); // see if this one is already in the table tempClassDist2 = m_table.get(thekey); if (tempClassDist2 == null) { newDist = new double[m_trainInstances.classAttribute().numValues()]; newDist[(int) inst.classValue()] = inst.weight(); // add to the table m_table.put(thekey, newDist); } else { // update the distribution for this instance tempClassDist2[(int) inst.classValue()] += inst.weight(); // update the table m_table.put(thekey, tempClassDist2); } } /** * returns a description of the evaluator * * @return a description of the evaluator as a String. */ @Override public String toString() { StringBuffer text = new StringBuffer(); if (m_trainInstances == null) { text.append("\tConsistency subset evaluator has not been built yet\n"); } else { text.append("\tConsistency Subset Evaluator\n"); } return text.toString(); } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } @Override public void clean() { // save memory m_trainInstances = new Instances(m_trainInstances, 0); } /** * Main method for testing this class. * * @param args the options */ public static void main(String[] args) { runEvaluator(new ConsistencySubsetEval(), args); } }