Java tutorial
/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * VFI.java * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand * */ package weka.classifiers.misc; import java.util.Collections; import java.util.Enumeration; import java.util.Vector; import weka.classifiers.AbstractClassifier; import weka.core.Capabilities; import weka.core.Capabilities.Capability; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.RevisionUtils; import weka.core.TechnicalInformation; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.core.WeightedInstancesHandler; /** * <!-- globalinfo-start --> Classification by voting feature intervals. * Intervals are constucted around each class for each attribute (basically * discretization). Class counts are recorded for each interval on each * attribute. Classification is by voting. For more info see:<br/> * <br/> * G. Demiroz, A. Guvenir: Classification by voting feature intervals. In: 9th * European Conference on Machine Learning, 85-92, 1997.<br/> * <br/> * Have added a simple attribute weighting scheme. Higher weight is assigned to * more confident intervals, where confidence is a function of entropy:<br/> * weight (att_i) = (entropy of class distrib att_i / max uncertainty)^-bias * <p/> * <!-- globalinfo-end --> * * <!-- technical-bibtex-start --> BibTeX: * * <pre> * @inproceedings{Demiroz1997, * author = {G. Demiroz and A. Guvenir}, * booktitle = {9th European Conference on Machine Learning}, * pages = {85-92}, * publisher = {Springer}, * title = {Classification by voting feature intervals}, * year = {1997} * } * </pre> * <p/> * <!-- technical-bibtex-end --> * * Faster than NaiveBayes but slower than HyperPipes. * <p> * <p> * * <pre> * Confidence: 0.01 (two tailed) * * Dataset (1) VFI '-B | (2) Hyper (3) Naive * ------------------------------------ * anneal.ORIG (10) 74.56 | 97.88 v 74.77 * anneal (10) 71.83 | 97.88 v 86.51 v * audiology (10) 51.69 | 66.26 v 72.25 v * autos (10) 57.63 | 62.79 v 57.76 * balance-scale (10) 68.72 | 46.08 * 90.5 v * breast-cancer (10) 67.25 | 69.84 v 73.12 v * wisconsin-breast-cancer (10) 95.72 | 88.31 * 96.05 v * horse-colic.ORIG (10) 66.13 | 70.41 v 66.12 * horse-colic (10) 78.36 | 62.07 * 78.28 * credit-rating (10) 85.17 | 44.58 * 77.84 * * german_credit (10) 70.81 | 69.89 * 74.98 v * pima_diabetes (10) 62.13 | 65.47 v 75.73 v * Glass (10) 56.82 | 50.19 * 47.43 * * cleveland-14-heart-diseas (10) 80.01 | 55.18 * 83.83 v * hungarian-14-heart-diseas (10) 82.8 | 65.55 * 84.37 v * heart-statlog (10) 79.37 | 55.56 * 84.37 v * hepatitis (10) 83.78 | 63.73 * 83.87 * hypothyroid (10) 92.64 | 93.33 v 95.29 v * ionosphere (10) 94.16 | 35.9 * 82.6 * * iris (10) 96.2 | 91.47 * 95.27 * * kr-vs-kp (10) 88.22 | 54.1 * 87.84 * * labor (10) 86.73 | 87.67 93.93 v * lymphography (10) 78.48 | 58.18 * 83.24 v * mushroom (10) 99.85 | 99.77 * 95.77 * * primary-tumor (10) 29 | 24.78 * 49.35 v * segment (10) 77.42 | 75.15 * 80.1 v * sick (10) 65.92 | 93.85 v 92.71 v * sonar (10) 58.02 | 57.17 67.97 v * soybean (10) 86.81 | 86.12 * 92.9 v * splice (10) 88.61 | 41.97 * 95.41 v * vehicle (10) 52.94 | 32.77 * 44.8 * * vote (10) 91.5 | 61.38 * 90.19 * * vowel (10) 57.56 | 36.34 * 62.81 v * waveform (10) 56.33 | 46.11 * 80.02 v * zoo (10) 94.05 | 94.26 95.04 v * ------------------------------------ * (v| |*) | (9|3|23) (22|5|8) * </pre> * <p> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -C * Don't weight voting intervals by confidence * </pre> * * <pre> * -B <bias> * Set exponential bias towards confident intervals * (default = 0.6) * </pre> * * <!-- options-end --> * * @author Mark Hall (mhall@cs.waikato.ac.nz) * @version $Revision$ */ public class VFI extends AbstractClassifier implements OptionHandler, WeightedInstancesHandler, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = 8081692166331321866L; /** The index of the class attribute */ protected int m_ClassIndex; /** The number of classes */ protected int m_NumClasses; /** The training data */ protected Instances m_Instances = null; /** The class counts for each interval of each attribute */ protected double[][][] m_counts; /** The global class counts */ protected double[] m_globalCounts; /** The lower bounds for each attribute */ protected double[][] m_intervalBounds; /** The maximum entropy for the class */ protected double m_maxEntrop; /** Exponentially bias more confident intervals */ protected boolean m_weightByConfidence = true; /** Bias towards more confident intervals */ protected double m_bias = -0.6; private double TINY = 0.1e-10; /** * Returns a string describing this search method * * @return a description of the search method suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Classification by voting feature intervals. Intervals are " + "constucted around each class for each attribute (" + "basically discretization). Class counts are " + "recorded for each interval on each attribute. Classification is by " + "voting. For more info see:\n\n" + getTechnicalInformation().toString() + "\n\n" + "Have added a simple attribute weighting scheme. Higher weight is " + "assigned to more confident intervals, where confidence is a function " + "of entropy:\nweight (att_i) = (entropy of class distrib att_i / " + "max uncertainty)^-bias"; } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "G. Demiroz and A. Guvenir"); result.setValue(Field.TITLE, "Classification by voting feature intervals"); result.setValue(Field.BOOKTITLE, "9th European Conference on Machine Learning"); result.setValue(Field.YEAR, "1997"); result.setValue(Field.PAGES, "85-92"); result.setValue(Field.PUBLISHER, "Springer"); return result; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration<Option> listOptions() { Vector<Option> newVector = new Vector<Option>(2); newVector.addElement(new Option("\tDon't weight voting intervals by confidence", "C", 0, "-C")); newVector.addElement(new Option( "\tSet exponential bias towards confident intervals\n" + "\t(default = 0.6)", "B", 1, "-B <bias>")); newVector.addAll(Collections.list(super.listOptions())); return newVector.elements(); } /** * Parses a given list of options. * <p/> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -C * Don't weight voting intervals by confidence * </pre> * * <pre> * -B <bias> * Set exponential bias towards confident intervals * (default = 0.6) * </pre> * * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String optionString; setWeightByConfidence(!Utils.getFlag('C', options)); optionString = Utils.getOption('B', options); if (optionString.length() != 0) { Double temp = new Double(optionString); setBias(temp.doubleValue()); } super.setOptions(options); Utils.checkForRemainingOptions(options); } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String weightByConfidenceTipText() { return "Weight feature intervals by confidence"; } /** * Set weighting by confidence * * @param c true if feature intervals are to be weighted by confidence */ public void setWeightByConfidence(boolean c) { m_weightByConfidence = c; } /** * Get whether feature intervals are being weighted by confidence * * @return true if weighting by confidence is selected */ public boolean getWeightByConfidence() { return m_weightByConfidence; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String biasTipText() { return "Strength of bias towards more confident features"; } /** * Set the value of the exponential bias towards more confident intervals * * @param b the value of the bias parameter */ public void setBias(double b) { m_bias = -b; } /** * Get the value of the bias parameter * * @return the bias parameter */ public double getBias() { return -m_bias; } /** * Gets the current settings of VFI * * @return an array of strings suitable for passing to setOptions() */ @Override public String[] getOptions() { Vector<String> options = new Vector<String>(); if (!getWeightByConfidence()) { options.add("-C"); } options.add("-B"); options.add("" + getBias()); Collections.addAll(options, super.getOptions()); return options.toArray(new String[0]); } /** * Returns default capabilities of the classifier. * * @return the capabilities of this classifier */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); // instances result.setMinimumNumberInstances(0); return result; } /** * Generates the classifier. * * @param instances set of instances serving as training data * @throws Exception if the classifier has not been generated successfully */ @Override public void buildClassifier(Instances instances) throws Exception { if (!m_weightByConfidence) { TINY = 0.0; } // can classifier handle the data? getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_ClassIndex = instances.classIndex(); m_NumClasses = instances.numClasses(); m_globalCounts = new double[m_NumClasses]; m_maxEntrop = Math.log(m_NumClasses) / Math.log(2); m_Instances = new Instances(instances, 0); // Copy the structure for ref m_intervalBounds = new double[instances.numAttributes()][2 + (2 * m_NumClasses)]; for (int j = 0; j < instances.numAttributes(); j++) { boolean alt = false; for (int i = 0; i < m_NumClasses * 2 + 2; i++) { if (i == 0) { m_intervalBounds[j][i] = Double.NEGATIVE_INFINITY; } else if (i == m_NumClasses * 2 + 1) { m_intervalBounds[j][i] = Double.POSITIVE_INFINITY; } else { if (alt) { m_intervalBounds[j][i] = Double.NEGATIVE_INFINITY; alt = false; } else { m_intervalBounds[j][i] = Double.POSITIVE_INFINITY; alt = true; } } } } // find upper and lower bounds for numeric attributes for (int j = 0; j < instances.numAttributes(); j++) { if (j != m_ClassIndex && instances.attribute(j).isNumeric()) { for (int i = 0; i < instances.numInstances(); i++) { Instance inst = instances.instance(i); if (!inst.isMissing(j)) { if (inst.value(j) < m_intervalBounds[j][((int) inst.classValue() * 2 + 1)]) { m_intervalBounds[j][((int) inst.classValue() * 2 + 1)] = inst.value(j); } if (inst.value(j) > m_intervalBounds[j][((int) inst.classValue() * 2 + 2)]) { m_intervalBounds[j][((int) inst.classValue() * 2 + 2)] = inst.value(j); } } } } } m_counts = new double[instances.numAttributes()][][]; // sort intervals for (int i = 0; i < instances.numAttributes(); i++) { if (instances.attribute(i).isNumeric()) { int[] sortedIntervals = Utils.sort(m_intervalBounds[i]); // remove any duplicate bounds int count = 1; for (int j = 1; j < sortedIntervals.length; j++) { if (m_intervalBounds[i][sortedIntervals[j]] != m_intervalBounds[i][sortedIntervals[j - 1]]) { count++; } } double[] reordered = new double[count]; count = 1; reordered[0] = m_intervalBounds[i][sortedIntervals[0]]; for (int j = 1; j < sortedIntervals.length; j++) { if (m_intervalBounds[i][sortedIntervals[j]] != m_intervalBounds[i][sortedIntervals[j - 1]]) { reordered[count] = m_intervalBounds[i][sortedIntervals[j]]; count++; } } m_intervalBounds[i] = reordered; m_counts[i] = new double[count][m_NumClasses]; } else if (i != m_ClassIndex) { // nominal attribute m_counts[i] = new double[instances.attribute(i).numValues()][m_NumClasses]; } } // collect class counts for (int i = 0; i < instances.numInstances(); i++) { Instance inst = instances.instance(i); m_globalCounts[(int) instances.instance(i).classValue()] += inst.weight(); for (int j = 0; j < instances.numAttributes(); j++) { if (!inst.isMissing(j) && j != m_ClassIndex) { if (instances.attribute(j).isNumeric()) { double val = inst.value(j); int k; for (k = m_intervalBounds[j].length - 1; k >= 0; k--) { if (val > m_intervalBounds[j][k]) { m_counts[j][k][(int) inst.classValue()] += inst.weight(); break; } else if (val == m_intervalBounds[j][k]) { m_counts[j][k][(int) inst.classValue()] += (inst.weight() / 2.0); m_counts[j][k - 1][(int) inst.classValue()] += (inst.weight() / 2.0); ; break; } } } else { // nominal attribute m_counts[j][(int) inst.value(j)][(int) inst.classValue()] += inst.weight(); ; } } } } } /** * Returns a description of this classifier. * * @return a description of this classifier as a string. */ @Override public String toString() { if (m_Instances == null) { return "FVI: Classifier not built yet!"; } StringBuffer sb = new StringBuffer("Voting feature intervals classifier\n"); /* Output the intervals and class counts for each attribute */ /* * for (int i = 0; i < m_Instances.numAttributes(); i++) { if (i != * m_ClassIndex) { sb.append("\n"+m_Instances.attribute(i).name()+" :\n"); * if (m_Instances.attribute(i).isNumeric()) { for (int j = 0; j < * m_intervalBounds[i].length; j++) { * sb.append(m_intervalBounds[i][j]).append("\n"); if (j != * m_intervalBounds[i].length-1) { for (int k = 0; k < m_NumClasses; k++) { * sb.append(m_counts[i][j][k]+" "); } } sb.append("\n"); } } else { for * (int j = 0; j < m_Instances.attribute(i).numValues(); j++) { * sb.append(m_Instances.attribute(i).value(j)).append("\n"); for (int k = * 0; k < m_NumClasses; k++) { sb.append(m_counts[i][j][k]+" "); } * sb.append("\n"); } } } } */ return sb.toString(); } /** * Classifies the given test instance. * * @param instance the instance to be classified * @return the predicted class for the instance * @throws Exception if the instance can't be classified */ @Override public double[] distributionForInstance(Instance instance) throws Exception { double[] dist = new double[m_NumClasses]; double[] temp = new double[m_NumClasses]; double weight = 1.0; for (int i = 0; i < instance.numAttributes(); i++) { if (i != m_ClassIndex && !instance.isMissing(i)) { double val = instance.value(i); boolean ok = false; if (instance.attribute(i).isNumeric()) { int k; for (k = m_intervalBounds[i].length - 1; k >= 0; k--) { if (val > m_intervalBounds[i][k]) { for (int j = 0; j < m_NumClasses; j++) { if (m_globalCounts[j] > 0) { temp[j] = ((m_counts[i][k][j] + TINY) / (m_globalCounts[j] + TINY)); } } ok = true; break; } else if (val == m_intervalBounds[i][k]) { for (int j = 0; j < m_NumClasses; j++) { if (m_globalCounts[j] > 0) { temp[j] = ((m_counts[i][k][j] + m_counts[i][k - 1][j]) / 2.0) + TINY; temp[j] /= (m_globalCounts[j] + TINY); } } ok = true; break; } } if (!ok) { throw new Exception("This shouldn't happen"); } } else { // nominal attribute ok = true; for (int j = 0; j < m_NumClasses; j++) { if (m_globalCounts[j] > 0) { temp[j] = ((m_counts[i][(int) val][j] + TINY) / (m_globalCounts[j] + TINY)); } } } double sum = Utils.sum(temp); if (sum <= 0) { for (int j = 0; j < temp.length; j++) { temp[j] = 1.0 / temp.length; } } else { Utils.normalize(temp, sum); } if (m_weightByConfidence) { weight = weka.core.ContingencyTables.entropy(temp); weight = Math.pow(weight, m_bias); if (weight < 1.0) { weight = 1.0; } } for (int j = 0; j < m_NumClasses; j++) { dist[j] += (temp[j] * weight); } } } double sum = Utils.sum(dist); if (sum <= 0) { for (int j = 0; j < dist.length; j++) { dist[j] = 1.0 / dist.length; } return dist; } else { Utils.normalize(dist, sum); return dist; } } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } /** * Main method for testing this class. * * @param args should contain command line arguments for evaluation (see * Evaluation). */ public static void main(String[] args) { runClassifier(new VFI(), args); } }