/*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * ClassifierSubsetEval.java
 * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.attributeSelection;

import java.io.File;
import java.util.BitSet;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.AbstractClassifier;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.evaluation.AbstractEvaluationMetric;
import weka.classifiers.evaluation.InformationRetrievalEvaluationMetric;
import weka.classifiers.rules.ZeroR;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 * <!-- globalinfo-start -->
 * Classifier subset evaluator:<br>
 * <br>
 * Evaluates attribute subsets on training data or a separate hold out testing
 * set. Uses a classifier to estimate the 'merit' of a set of attributes.
 * <br><br>
 * <!-- globalinfo-end -->
 *
 * <!-- options-start -->
 * Valid options are: <p>
 *
 * <pre> -B <classifier>
 *  class name of the classifier to use for accuracy estimation.
 *  Place any classifier options LAST on the command line
 *  following a "--". eg.:
 *   -B weka.classifiers.bayes.NaiveBayes ... -- -K
 *  (default: weka.classifiers.rules.ZeroR)</pre>
 *
 * <pre> -T
 *  Use the training data to estimate accuracy.</pre>
 *
 * <pre> -H <filename>
 *  Name of the hold out/test set to
 *  estimate accuracy on.</pre>
 *
 * <pre> -percentage-split
 *  Perform a percentage split on the training data.
 *  Use in conjunction with -T.</pre>
 *
 * <pre> -P
 *  Split percentage to use (default = 90).</pre>
 *
 * <pre> -S
 *  Random seed for percentage split (default = 1).</pre>
 *
 * <pre> -E <DEFAULT|ACC|RMSE|MAE|F-MEAS|AUC|AUPRC|CORR-COEFF>
 *  Performance evaluation measure to use for selecting attributes.
 *  (Default = default: accuracy for discrete class and rmse for numeric class)</pre>
 *
 * <pre> -IRclass <label | index>
 *  Optional class value (label or 1-based index) to use in conjunction with
 *  IR statistics (f-meas, auc or auprc). Omitting this option will use
 *  the class-weighted average.</pre>
 *
 * <pre>
 * Options specific to scheme weka.classifiers.rules.ZeroR:
 * </pre>
 *
 * <pre> -output-debug-info
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 * <pre> -do-not-check-capabilities
 *  If set, classifier capabilities are not checked before classifier is built
 *  (use with caution).</pre>
 *
 * <pre> -num-decimal-places
 *  The number of decimal places for the output of numbers in the model (default 2).</pre>
 *
 * <pre> -batch-size
 *  The desired batch size for batch prediction (default 100).</pre>
 * <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision: 10332 $
 */
public class ClassifierSubsetEval extends HoldOutSubsetEvaluator implements
  OptionHandler, ErrorBasedMeritEvaluator {

  /** for serialization */
  static final long serialVersionUID = 7532217899385278710L;

  /** training instances */
  private Instances m_trainingInstances;

  /** class index */
  private int m_classIndex;

  /** number of attributes in the training data */
  private int m_numAttribs;

  /** number of training instances */
  // private int m_numInstances; NOT USED

  /** holds the template classifier to use for error estimates */
  private Classifier m_ClassifierTemplate = new ZeroR();

  /**
   * Holds the classifier used when evaluating single hold-out instances - this
   * is used by RaceSearch and the trained classifier may need to persist
   * between calls to that particular method.
   */
  private Classifier m_Classifier = new ZeroR();

  /** the file that contains hold out/test instances */
  private File m_holdOutFile = new File("Click to set hold out or "
    + "test instances");

  /** the instances to test on */
  private Instances m_holdOutInstances;

  /** evaluate on training data rather than separate hold out/test set */
  private boolean m_useTraining = true;

  /** Whether to hold out a percentage of the training data */
  protected boolean m_usePercentageSplit;

  /** Seed for randomizing prior to splitting training data */
  protected int m_seed = 1;

  /** The split to use if doing a percentage split */
  protected String m_splitPercent = "90";

  public static final int EVAL_DEFAULT = 1;
  public static final int EVAL_ACCURACY = 2;
  public static final int EVAL_RMSE = 3;
  public static final int EVAL_MAE = 4;
  public static final int EVAL_FMEASURE = 5;
  public static final int EVAL_AUC = 6;
  public static final int EVAL_AUPRC = 7;
  public static final int EVAL_CORRELATION = 8;
  public static final int EVAL_PLUGIN = 9;

  protected static List<AbstractEvaluationMetric> PLUGIN_METRICS =
    AbstractEvaluationMetric.getPluginMetrics();

  /** Holds all tags for metrics */
  public static final Tag[] TAGS_EVALUATION;

  static {
    int totalPluginCount = 0;
    if (PLUGIN_METRICS != null) {
      for (AbstractEvaluationMetric m : PLUGIN_METRICS) {
        totalPluginCount += m.getStatisticNames().size();
      }
    }

    TAGS_EVALUATION = new Tag[8 + totalPluginCount];
    TAGS_EVALUATION[0] = new Tag(EVAL_DEFAULT, "default",
      "Default: accuracy (discrete class); RMSE (numeric class)");
    TAGS_EVALUATION[1] = new Tag(EVAL_ACCURACY, "acc",
      "Accuracy (discrete class only)");
    TAGS_EVALUATION[2] = new Tag(EVAL_RMSE, "rmse",
      "RMSE (of the class probabilities for discrete class)");
    TAGS_EVALUATION[3] = new Tag(EVAL_MAE, "mae",
      "MAE (of the class probabilities for discrete class)");
    TAGS_EVALUATION[4] = new Tag(EVAL_FMEASURE, "f-meas",
      "F-measure (discrete class only)");
    TAGS_EVALUATION[5] = new Tag(EVAL_AUC, "auc",
      "AUC (area under the ROC curve - discrete class only)");
    TAGS_EVALUATION[6] = new Tag(EVAL_AUPRC, "auprc",
      "AUPRC (area under the precision-recall curve - discrete class only)");
    TAGS_EVALUATION[7] = new Tag(EVAL_CORRELATION, "corr-coeff",
      "Correlation coefficient - numeric class only");

    if (PLUGIN_METRICS != null) {
      int index = 8;
      for (AbstractEvaluationMetric m : PLUGIN_METRICS) {
        for (String stat : m.getStatisticNames()) {
          TAGS_EVALUATION[index++] =
            new WrapperSubsetEval.PluginTag(index + 1, m, stat);
        }
      }
    }
  }

  /** The evaluation measure to use */
  protected Tag m_evaluationMeasure = TAGS_EVALUATION[0];

  /**
   * If >= 0, and an IR metric is being used, then evaluate with respect to this
   * class value (0-based index)
   */
  protected int m_IRClassVal = -1;

  /** User supplied option for IR class value (either name or 1-based index) */
  protected String m_IRClassValS = "";

  /**
   * Returns a string describing this attribute evaluator
   *
   * @return a description of the evaluator suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Classifier subset evaluator:\n\nEvaluates attribute subsets on "
      + "training data or a separate hold out testing set. Uses a classifier "
      + "to estimate the 'merit' of a set of attributes.";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   **/
  @Override
  public Enumeration<Option> listOptions() {
    Vector<Option> newVector = new Vector<Option>(8);

    newVector.addElement(new Option(
      "\tclass name of the classifier to use for accuracy estimation.\n"
        + "\tPlace any classifier options LAST on the command line\n"
        + "\tfollowing a \"--\". eg.:\n"
        + "\t\t-B weka.classifiers.bayes.NaiveBayes ... -- -K\n"
        + "\t(default: weka.classifiers.rules.ZeroR)", "B", 1,
      "-B <classifier>"));

    newVector.addElement(new Option("\tUse the training data to estimate"
      + " accuracy.", "T", 0, "-T"));

    newVector.addElement(new Option("\tName of the hold out/test set to "
      + "\n\testimate accuracy on.", "H", 1, "-H <filename>"));

    newVector.addElement(new Option("\tPerform a percentage split on the "
      + "training data.\n\tUse in conjunction with -T.", "percentage-split",
      0, "-percentage-split"));

    newVector.addElement(new Option("\tSplit percentage to use (default = 90).",
      "P", 1, "-P"));

    newVector.addElement(new Option(
      "\tRandom seed for percentage split (default = 1).", "S", 1, "-S"));

    newVector.addElement(new Option(
      "\tPerformance evaluation measure to use for selecting attributes.\n"
        + "\t(Default = default: accuracy for discrete class and rmse for "
        + "numeric class)", "E", 1, "-E " + Tag.toOptionList(TAGS_EVALUATION)));

    newVector.addElement(new Option(
      "\tOptional class value (label or 1-based index) to use in conjunction with\n"
        + "\tIR statistics (f-meas, auc or auprc). Omitting this option will use\n"
        + "\tthe class-weighted average.", "IRclass", 1,
      "-IRclass <label | index>"));

    if ((m_ClassifierTemplate != null)
      && (m_ClassifierTemplate instanceof OptionHandler)) {
      newVector.addElement(new Option("", "", 0, "\nOptions specific to "
        + "scheme " + m_ClassifierTemplate.getClass().getName() + ":"));
      newVector.addAll(Collections.list(((OptionHandler) m_ClassifierTemplate)
        .listOptions()));
    }

    return newVector.elements();
  }
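  /*
   * Example (not part of the original source): a minimal sketch of
   * configuring this evaluator from code rather than via the option strings
   * listed above. NaiveBayes is an arbitrary placeholder classifier.
   *
   *   ClassifierSubsetEval eval = new ClassifierSubsetEval();
   *   eval.setClassifier(new weka.classifiers.bayes.NaiveBayes());
   *   eval.setUseTraining(true); // equivalent to -T
   *   // pick the measure by its ID from TAGS_EVALUATION, e.g. -E rmse
   *   eval.setEvaluationMeasure(new SelectedTag(EVAL_RMSE, TAGS_EVALUATION));
   */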
  /**
   * Parses a given list of options.
   * <p/>
   * <!-- options-start -->
   * Valid options are: <p>
   *
   * <pre> -B <classifier>
   *  class name of the classifier to use for accuracy estimation.
   *  Place any classifier options LAST on the command line
   *  following a "--". eg.:
   *   -B weka.classifiers.bayes.NaiveBayes ... -- -K
   *  (default: weka.classifiers.rules.ZeroR)</pre>
   *
   * <pre> -T
   *  Use the training data to estimate accuracy.</pre>
   *
   * <pre> -H <filename>
   *  Name of the hold out/test set to
   *  estimate accuracy on.</pre>
   *
   * <pre> -percentage-split
   *  Perform a percentage split on the training data.
   *  Use in conjunction with -T.</pre>
   *
   * <pre> -P
   *  Split percentage to use (default = 90).</pre>
   *
   * <pre> -S
   *  Random seed for percentage split (default = 1).</pre>
   *
   * <pre> -E <DEFAULT|ACC|RMSE|MAE|F-MEAS|AUC|AUPRC|CORR-COEFF>
   *  Performance evaluation measure to use for selecting attributes.
   *  (Default = default: accuracy for discrete class and rmse for numeric class)</pre>
   *
   * <pre> -IRclass <label | index>
   *  Optional class value (label or 1-based index) to use in conjunction with
   *  IR statistics (f-meas, auc or auprc). Omitting this option will use
   *  the class-weighted average.</pre>
   *
   * <pre>
   * Options specific to scheme weka.classifiers.rules.ZeroR:
   * </pre>
   *
   * <pre> -output-debug-info
   *  If set, classifier is run in debug mode and
   *  may output additional info to the console</pre>
   *
   * <pre> -do-not-check-capabilities
   *  If set, classifier capabilities are not checked before classifier is built
   *  (use with caution).</pre>
   *
   * <pre> -num-decimal-places
   *  The number of decimal places for the output of numbers in the model (default 2).</pre>
   *
   * <pre> -batch-size
   *  The desired batch size for batch prediction (default 100).</pre>
   * <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {
    String optionString;
    resetOptions();

    optionString = Utils.getOption('B', options);
    if (optionString.length() == 0) {
      optionString = ZeroR.class.getName();
    }
    setClassifier(AbstractClassifier.forName(optionString,
      Utils.partitionOptions(options)));

    optionString = Utils.getOption('H', options);
    if (optionString.length() != 0) {
      setHoldOutFile(new File(optionString));
    }

    setUsePercentageSplit(Utils.getFlag("percentage-split", options));

    optionString = Utils.getOption('P', options);
    if (optionString.length() > 0) {
      setSplitPercent(optionString);
    }

    setUseTraining(Utils.getFlag('T', options));

    optionString = Utils.getOption('E', options);
    if (optionString.length() != 0) {
      for (Tag t : TAGS_EVALUATION) {
        if (t.getIDStr().equalsIgnoreCase(optionString)) {
          setEvaluationMeasure(new SelectedTag(t.getIDStr(), TAGS_EVALUATION));
          break;
        }
      }
    }

    // flag spelling matches the "-IRclass" option registered in listOptions()
    optionString = Utils.getOption("IRclass", options);
    if (optionString.length() > 0) {
      setIRClassValue(optionString);
    }

    optionString = Utils.getOption("S", options);
    if (optionString.length() > 0) {
      setSeed(Integer.parseInt(optionString));
    }
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String seedTipText() {
    return "The random seed to use for randomizing the training data "
      + "prior to performing a percentage split";
  }

  /**
   * Set the random seed used to randomize the data before performing a
   * percentage split
   *
   * @param s the seed to use
   */
  public void setSeed(int s) {
    m_seed = s;
  }

  /**
   * Get the random seed used to randomize the data before performing a
   * percentage split
   *
   * @return the seed to use
   */
  public int getSeed() {
    return m_seed;
  }
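  /*
   * Example (not part of the original source): an option string as parsed by
   * setOptions() above, shown as it would be passed on the command line. J48
   * is an illustrative placeholder classifier.
   *
   *   String[] opts = weka.core.Utils.splitOptions(
   *     "-B weka.classifiers.trees.J48 -T -percentage-split -P 80 -S 42 -E auc");
   *   ClassifierSubsetEval eval = new ClassifierSubsetEval();
   *   eval.setOptions(opts);
   */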
  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String usePercentageSplitTipText() {
    return "Evaluate using a percentage split on the training data";
  }

  /**
   * Set whether to perform a percentage split on the training data for
   * evaluation
   *
   * @param p true if a percentage split is to be performed
   */
  public void setUsePercentageSplit(boolean p) {
    m_usePercentageSplit = p;
  }

  /**
   * Get whether to perform a percentage split on the training data for
   * evaluation
   *
   * @return true if a percentage split is to be performed
   */
  public boolean getUsePercentageSplit() {
    return m_usePercentageSplit;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String splitPercentTipText() {
    return "The percentage split to use";
  }

  /**
   * Set the split percentage to use
   *
   * @param sp the split percentage to use
   */
  public void setSplitPercent(String sp) {
    m_splitPercent = sp;
  }

  /**
   * Get the split percentage to use
   *
   * @return the split percentage to use
   */
  public String getSplitPercent() {
    return m_splitPercent;
  }

  /**
   * Set the class value (label or index) to use with IR metric evaluation of
   * subsets. Leaving this unset will result in the class weighted average for
   * the IR metric being used.
   *
   * @param val the class label or 1-based index of the class label to use when
   *          evaluating subsets with an IR metric
   */
  public void setIRClassValue(String val) {
    m_IRClassValS = val;
  }

  /**
   * Get the class value (label or index) to use with IR metric evaluation of
   * subsets. Leaving this unset will result in the class weighted average for
   * the IR metric being used.
   *
   * @return the class label or 1-based index of the class label to use when
   *         evaluating subsets with an IR metric
   */
  public String getIRClassValue() {
    return m_IRClassValS;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String IRClassValueTipText() {
    return "The class label, or 1-based index of the class label, to use "
      + "when evaluating subsets with an IR metric (such as f-measure "
      + "or AUC). Leaving this unset will result in the class frequency "
      + "weighted average of the metric being used.";
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String evaluationMeasureTipText() {
    return "The measure used to evaluate the performance of attribute combinations.";
  }

  /**
   * Gets the currently set performance evaluation measure used for selecting
   * attributes
   *
   * @return the performance evaluation measure
   */
  public SelectedTag getEvaluationMeasure() {
    return new SelectedTag(m_evaluationMeasure.getIDStr(), TAGS_EVALUATION);
  }

  /**
   * Sets the performance evaluation measure to use for selecting attributes
   *
   * @param newMethod the new performance evaluation metric to use
   */
  public void setEvaluationMeasure(SelectedTag newMethod) {
    if (newMethod.getTags() == TAGS_EVALUATION) {
      m_evaluationMeasure = newMethod.getSelectedTag();
    }
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String classifierTipText() {
    return "Classifier to use for estimating the accuracy of subsets";
  }
  /**
   * Set the classifier to use for accuracy estimation
   *
   * @param newClassifier the Classifier to use.
   */
  public void setClassifier(Classifier newClassifier) {
    m_ClassifierTemplate = newClassifier;
    m_Classifier = newClassifier;
  }

  /**
   * Get the classifier used as the base learner.
   *
   * @return the classifier used as the classifier
   */
  public Classifier getClassifier() {
    return m_ClassifierTemplate;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String holdOutFileTipText() {
    return "File containing hold out/test instances.";
  }

  /**
   * Gets the file that holds hold out/test instances.
   *
   * @return File that contains hold out instances
   */
  public File getHoldOutFile() {
    return m_holdOutFile;
  }

  /**
   * Set the file that contains hold out/test instances
   *
   * @param h the hold out file
   */
  public void setHoldOutFile(File h) {
    m_holdOutFile = h;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String useTrainingTipText() {
    return "Use training data instead of hold out/test instances.";
  }

  /**
   * Get if training data is to be used instead of hold out/test data
   *
   * @return true if training data is to be used instead of hold out data
   */
  public boolean getUseTraining() {
    return m_useTraining;
  }

  /**
   * Set if training data is to be used instead of hold out/test data
   *
   * @param t true if training data is to be used instead of hold out data
   */
  public void setUseTraining(boolean t) {
    m_useTraining = t;
  }

  /**
   * Gets the current settings of ClassifierSubsetEval
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  @Override
  public String[] getOptions() {
    Vector<String> options = new Vector<String>();

    if (getClassifier() != null) {
      options.add("-B");
      options.add(getClassifier().getClass().getName());
    }

    if (getUseTraining()) {
      options.add("-T");
    }
    options.add("-H");
    options.add(getHoldOutFile().getPath());

    if (getUsePercentageSplit()) {
      options.add("-percentage-split");
      options.add("-P");
      options.add(m_splitPercent);
      options.add("-S");
      options.add("" + getSeed());
    }

    options.add("-E");
    options.add(m_evaluationMeasure.getIDStr());

    if (m_IRClassValS != null && m_IRClassValS.length() > 0) {
      // emit the flag with the same spelling that setOptions() parses
      options.add("-IRclass");
      options.add(m_IRClassValS);
    }

    if ((m_ClassifierTemplate != null)
      && (m_ClassifierTemplate instanceof OptionHandler)) {
      String[] classifierOptions =
        ((OptionHandler) m_ClassifierTemplate).getOptions();
      if (classifierOptions.length > 0) {
        options.add("--");
        Collections.addAll(options, classifierOptions);
      }
    }

    return options.toArray(new String[0]);
  }

  /**
   * Returns the capabilities of this evaluator.
   *
   * @return the capabilities of this evaluator
   * @see Capabilities
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result;

    if (getClassifier() == null) {
      result = super.getCapabilities();
      result.disableAll();
    } else {
      result = getClassifier().getCapabilities();
    }

    // set dependencies
    for (Capability cap : Capability.values()) {
      result.enableDependency(cap);
    }

    return result;
  }
  /**
   * Generates an attribute evaluator. Has to initialize all fields of the
   * evaluator that are not being set via options.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the evaluator has not been generated successfully
   */
  @Override
  public void buildEvaluator(Instances data) throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    m_trainingInstances = new Instances(data);
    m_classIndex = m_trainingInstances.classIndex();
    m_numAttribs = m_trainingInstances.numAttributes();
    // m_numInstances = m_trainingInstances.numInstances(); NOT USED

    // load the testing data
    if (!m_useTraining
      && (!getHoldOutFile().getPath().startsWith("Click to set"))) {
      java.io.Reader r = new java.io.BufferedReader(new java.io.FileReader(
        getHoldOutFile().getPath()));
      m_holdOutInstances = new Instances(r);
      m_holdOutInstances.setClassIndex(m_trainingInstances.classIndex());
      if (m_trainingInstances.equalHeaders(m_holdOutInstances) == false) {
        throw new Exception("Hold out/test set is not compatible with "
          + "training data.\n"
          + m_trainingInstances.equalHeadersMsg(m_holdOutInstances));
      }
    } else if (m_usePercentageSplit) {
      int splitPercentage = 90; // default
      try {
        splitPercentage = Integer.parseInt(m_splitPercent);
      } catch (NumberFormatException n) {
        // ignore a malformed percentage and keep the default
      }

      m_trainingInstances.randomize(new Random(m_seed));
      // use float arithmetic so that Math.round() actually rounds rather
      // than receiving an already-truncated integer expression
      int trainSize = Math.round(m_trainingInstances.numInstances()
        * splitPercentage / 100f);
      int testSize = m_trainingInstances.numInstances() - trainSize;

      m_holdOutInstances = new Instances(m_trainingInstances, trainSize,
        testSize);
      m_trainingInstances = new Instances(m_trainingInstances, 0, trainSize);
    }

    if (m_IRClassValS != null && m_IRClassValS.length() > 0) {
      // try to parse as a number first
      try {
        m_IRClassVal = Integer.parseInt(m_IRClassValS);
        // make zero-based
        m_IRClassVal--;
      } catch (NumberFormatException e) {
        // now try as a named class label
        m_IRClassVal =
          m_trainingInstances.classAttribute().indexOfValue(m_IRClassValS);
      }
    }
  }
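  /*
   * Example (not part of the original source): a minimal end-to-end sketch of
   * building the evaluator and scoring one attribute subset. The data file
   * path is a placeholder; error handling is omitted.
   *
   *   Instances data = new Instances(new java.io.BufferedReader(
   *     new java.io.FileReader("/path/to/train.arff")));
   *   data.setClassIndex(data.numAttributes() - 1);
   *
   *   ClassifierSubsetEval eval = new ClassifierSubsetEval();
   *   eval.setUseTraining(true);
   *   eval.buildEvaluator(data);
   *
   *   BitSet subset = new BitSet(data.numAttributes());
   *   subset.set(0); // include the first attribute
   *   subset.set(2); // and the third (the class is added automatically)
   *   double merit = eval.evaluateSubset(subset); // higher is better
   */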
  /**
   * Evaluates a subset of attributes
   *
   * @param subset a bitset representing the attribute subset to be evaluated
   * @return the "merit" of the subset
   * @throws Exception if the subset could not be evaluated
   */
  @Override
  public double evaluateSubset(BitSet subset) throws Exception {
    int i, j;
    double evalMetric = 0;
    int numAttributes = 0;
    Instances trainCopy = null;
    Instances testCopy = null;
    String[] cOpts = null;
    Evaluation evaluation = null;
    if (m_ClassifierTemplate instanceof OptionHandler) {
      cOpts = ((OptionHandler) m_ClassifierTemplate).getOptions();
    }
    Classifier classifier = AbstractClassifier.forName(
      m_ClassifierTemplate.getClass().getName(), cOpts);

    Remove delTransform = new Remove();
    delTransform.setInvertSelection(true);
    // copy the training instances
    trainCopy = new Instances(m_trainingInstances);

    if (!m_useTraining) {
      if (m_holdOutInstances == null) {
        throw new Exception("Must specify a set of hold out/test instances "
          + "with -H");
      }
      // copy the test instances
      testCopy = new Instances(m_holdOutInstances);
    } else if (m_usePercentageSplit) {
      testCopy = new Instances(m_holdOutInstances);
    }

    // count attributes set in the BitSet
    for (i = 0; i < m_numAttribs; i++) {
      if (subset.get(i)) {
        numAttributes++;
      }
    }

    // set up an array of attribute indexes for the filter (+1 for the class)
    int[] featArray = new int[numAttributes + 1];

    for (i = 0, j = 0; i < m_numAttribs; i++) {
      if (subset.get(i)) {
        featArray[j++] = i;
      }
    }

    featArray[j] = m_classIndex;
    delTransform.setAttributeIndicesArray(featArray);
    delTransform.setInputFormat(trainCopy);
    trainCopy = Filter.useFilter(trainCopy, delTransform);
    if (!m_useTraining || m_usePercentageSplit) {
      testCopy = Filter.useFilter(testCopy, delTransform);
    }

    // build the classifier
    classifier.buildClassifier(trainCopy);

    evaluation = new Evaluation(trainCopy);
    if (!m_useTraining || m_usePercentageSplit) {
      evaluation.evaluateModel(classifier, testCopy);
    } else {
      evaluation.evaluateModel(classifier, trainCopy);
    }

    String metricName = null;
    String statName = null;
    AbstractEvaluationMetric pluginMetric = null;

    switch (m_evaluationMeasure.getID()) {
    case EVAL_DEFAULT:
      evalMetric = evaluation.errorRate();
      break;
    case EVAL_ACCURACY:
      evalMetric = evaluation.errorRate();
      break;
    case EVAL_RMSE:
      evalMetric = evaluation.rootMeanSquaredError();
      break;
    case EVAL_MAE:
      evalMetric = evaluation.meanAbsoluteError();
      break;
    case EVAL_FMEASURE:
      if (m_IRClassVal < 0) {
        evalMetric = evaluation.weightedFMeasure();
      } else {
        evalMetric = evaluation.fMeasure(m_IRClassVal);
      }
      break;
    case EVAL_AUC:
      if (m_IRClassVal < 0) {
        evalMetric = evaluation.weightedAreaUnderROC();
      } else {
        evalMetric = evaluation.areaUnderROC(m_IRClassVal);
      }
      break;
    case EVAL_AUPRC:
      if (m_IRClassVal < 0) {
        evalMetric = evaluation.weightedAreaUnderPRC();
      } else {
        evalMetric = evaluation.areaUnderPRC(m_IRClassVal);
      }
      break;
    case EVAL_CORRELATION:
      evalMetric = evaluation.correlationCoefficient();
      break;
    default:
      if (m_evaluationMeasure.getID() >= EVAL_PLUGIN) {
        metricName =
          ((WrapperSubsetEval.PluginTag) m_evaluationMeasure).getMetricName();
        statName =
          ((WrapperSubsetEval.PluginTag) m_evaluationMeasure)
            .getStatisticName();
        pluginMetric = evaluation.getPluginMetric(metricName);
        if (pluginMetric == null) {
          throw new Exception("Metric " + metricName + " does not seem to be "
            + "available");
        }
      }

      if (pluginMetric instanceof InformationRetrievalEvaluationMetric) {
        if (m_IRClassVal < 0) {
          evalMetric =
            ((InformationRetrievalEvaluationMetric) pluginMetric)
              .getClassWeightedAverageStatistic(statName);
        } else {
          evalMetric =
            ((InformationRetrievalEvaluationMetric) pluginMetric).getStatistic(
              statName, m_IRClassVal);
        }
      } else {
        evalMetric = pluginMetric.getStatistic(statName);
      }
      break;
    }

    switch (m_evaluationMeasure.getID()) {
    case EVAL_DEFAULT:
    case EVAL_ACCURACY:
    case EVAL_RMSE:
    case EVAL_MAE:
      if (m_trainingInstances.classAttribute().isNominal()
        && (m_evaluationMeasure.getID() == EVAL_DEFAULT
          || m_evaluationMeasure.getID() == EVAL_ACCURACY)) {
        evalMetric = 1 - evalMetric;
      } else {
        evalMetric = -evalMetric; // maximize
      }
      break;
    default:
      if (pluginMetric != null
        && !pluginMetric.statisticIsMaximisable(statName)) {
        evalMetric = -evalMetric; // maximize
      }
    }

    return evalMetric;
  }
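  /*
   * Example (not part of the original source): scoring the same subset
   * against an explicit hold-out set using the overload defined below,
   * assuming "eval" and "subset" from the earlier sketch and a compatible
   * placeholder set of instances "holdOut".
   *
   *   double merit = eval.evaluateSubset(subset, holdOut);
   */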
  /**
   * Evaluates a subset of attributes with respect to a set of instances.
   * Calling this function overrides any test/hold out instances set from
   * setHoldOutFile.
   *
   * @param subset a bitset representing the attribute subset to be evaluated
   * @param holdOut a set of instances (possibly separate and distinct from
   *          those used to build/train the evaluator) with which to evaluate
   *          the merit of the subset
   * @return the "merit" of the subset on the holdOut data
   * @throws Exception if the subset cannot be evaluated
   */
  public double evaluateSubset(BitSet subset, Instances holdOut)
    throws Exception {
    int i, j;
    double evalMetric = 0;
    int numAttributes = 0;
    Instances trainCopy = null;
    Instances testCopy = null;
    String[] cOpts = null;
    Evaluation evaluation = null;
    if (m_ClassifierTemplate instanceof OptionHandler) {
      cOpts = ((OptionHandler) m_ClassifierTemplate).getOptions();
    }
    Classifier classifier = AbstractClassifier.forName(
      m_ClassifierTemplate.getClass().getName(), cOpts);

    if (m_trainingInstances.equalHeaders(holdOut) == false) {
      throw new Exception("evaluateSubset : Incompatible instance types.\n"
        + m_trainingInstances.equalHeadersMsg(holdOut));
    }

    Remove delTransform = new Remove();
    delTransform.setInvertSelection(true);
    // copy the training instances
    trainCopy = new Instances(m_trainingInstances);

    testCopy = new Instances(holdOut);

    // count attributes set in the BitSet
    for (i = 0; i < m_numAttribs; i++) {
      if (subset.get(i)) {
        numAttributes++;
      }
    }

    // set up an array of attribute indexes for the filter (+1 for the class)
    int[] featArray = new int[numAttributes + 1];

    for (i = 0, j = 0; i < m_numAttribs; i++) {
      if (subset.get(i)) {
        featArray[j++] = i;
      }
    }

    featArray[j] = m_classIndex;
    delTransform.setAttributeIndicesArray(featArray);
    delTransform.setInputFormat(trainCopy);
    trainCopy = Filter.useFilter(trainCopy, delTransform);
    testCopy = Filter.useFilter(testCopy, delTransform);

    // build the classifier
    classifier.buildClassifier(trainCopy);

    evaluation = new Evaluation(trainCopy);
    evaluation.evaluateModel(classifier, testCopy);

    String metricName = null;
    String statName = null;
    AbstractEvaluationMetric pluginMetric = null;

    switch (m_evaluationMeasure.getID()) {
    case EVAL_DEFAULT:
      evalMetric = evaluation.errorRate();
      break;
    case EVAL_ACCURACY:
      evalMetric = evaluation.errorRate();
      break;
    case EVAL_RMSE:
      evalMetric = evaluation.rootMeanSquaredError();
      break;
    case EVAL_MAE:
      evalMetric = evaluation.meanAbsoluteError();
      break;
    case EVAL_FMEASURE:
      if (m_IRClassVal < 0) {
        evalMetric = evaluation.weightedFMeasure();
      } else {
        evalMetric = evaluation.fMeasure(m_IRClassVal);
      }
      break;
    case EVAL_AUC:
      if (m_IRClassVal < 0) {
        evalMetric = evaluation.weightedAreaUnderROC();
      } else {
        evalMetric = evaluation.areaUnderROC(m_IRClassVal);
      }
      break;
    case EVAL_AUPRC:
      if (m_IRClassVal < 0) {
        evalMetric = evaluation.weightedAreaUnderPRC();
      } else {
        evalMetric = evaluation.areaUnderPRC(m_IRClassVal);
      }
      break;
    case EVAL_CORRELATION:
      evalMetric = evaluation.correlationCoefficient();
      break;
    default:
      if (m_evaluationMeasure.getID() >= EVAL_PLUGIN) {
        metricName =
          ((WrapperSubsetEval.PluginTag) m_evaluationMeasure).getMetricName();
        statName =
          ((WrapperSubsetEval.PluginTag) m_evaluationMeasure)
            .getStatisticName();
        pluginMetric = evaluation.getPluginMetric(metricName);
        if (pluginMetric == null) {
          throw new Exception("Metric " + metricName + " does not seem to be "
            + "available");
        }
      }

      if (pluginMetric instanceof InformationRetrievalEvaluationMetric) {
        if (m_IRClassVal < 0) {
          evalMetric =
            ((InformationRetrievalEvaluationMetric) pluginMetric)
              .getClassWeightedAverageStatistic(statName);
        } else {
          evalMetric =
            ((InformationRetrievalEvaluationMetric) pluginMetric).getStatistic(
              statName, m_IRClassVal);
        }
      } else {
        evalMetric = pluginMetric.getStatistic(statName);
      }
      break;
    }
    switch (m_evaluationMeasure.getID()) {
    case EVAL_DEFAULT:
    case EVAL_ACCURACY:
    case EVAL_RMSE:
    case EVAL_MAE:
      if (m_trainingInstances.classAttribute().isNominal()
        && (m_evaluationMeasure.getID() == EVAL_DEFAULT
          || m_evaluationMeasure.getID() == EVAL_ACCURACY)) {
        evalMetric = 1 - evalMetric;
      } else {
        evalMetric = -evalMetric; // maximize
      }
      break;
    default:
      if (pluginMetric != null
        && !pluginMetric.statisticIsMaximisable(statName)) {
        evalMetric = -evalMetric; // maximize
      }
    }

    return evalMetric;
  }

  /**
   * Evaluates a subset of attributes with respect to a single instance.
   * Calling this function overrides any hold out/test instances set through
   * setHoldOutFile.
   *
   * @param subset a bitset representing the attribute subset to be evaluated
   * @param holdOut a single instance (possibly not one of those used to
   *          build/train the evaluator) with which to evaluate the merit of
   *          the subset
   * @param retrain true if the classifier should be retrained with respect to
   *          the new subset before testing on the holdOut instance.
   * @return the "merit" of the subset on the holdOut instance
   * @throws Exception if the subset cannot be evaluated
   */
  @Override
  public double evaluateSubset(BitSet subset, Instance holdOut, boolean retrain)
    throws Exception {
    if (m_evaluationMeasure.getID() != EVAL_DEFAULT) {
      throw new Exception(
        "Can only use default evaluation measure in the method");
    }
    int i, j;
    double error;
    int numAttributes = 0;
    Instances trainCopy = null;
    Instance testCopy = null;
    if (m_trainingInstances.equalHeaders(holdOut.dataset()) == false) {
      throw new Exception("evaluateSubset : Incompatible instance types.\n"
        + m_trainingInstances.equalHeadersMsg(holdOut.dataset()));
    }

    Remove delTransform = new Remove();
    delTransform.setInvertSelection(true);
    // copy the training instances
    trainCopy = new Instances(m_trainingInstances);

    testCopy = (Instance) holdOut.copy();

    // count attributes set in the BitSet
    for (i = 0; i < m_numAttribs; i++) {
      if (subset.get(i)) {
        numAttributes++;
      }
    }

    // set up an array of attribute indexes for the filter (+1 for the class)
    int[] featArray = new int[numAttributes + 1];
    for (i = 0, j = 0; i < m_numAttribs; i++) {
      if (subset.get(i)) {
        featArray[j++] = i;
      }
    }
    featArray[j] = m_classIndex;

    delTransform.setAttributeIndicesArray(featArray);
    delTransform.setInputFormat(trainCopy);

    if (retrain) {
      trainCopy = Filter.useFilter(trainCopy, delTransform);
      // build the classifier
      m_Classifier.buildClassifier(trainCopy);
    }

    delTransform.input(testCopy);
    testCopy = delTransform.output();

    double pred;
    double[] distrib;
    distrib = m_Classifier.distributionForInstance(testCopy);
    if (m_trainingInstances.classAttribute().isNominal()) {
      pred = distrib[(int) testCopy.classValue()];
    } else {
      pred = distrib[0];
    }

    if (m_trainingInstances.classAttribute().isNominal()) {
      error = 1.0 - pred;
    } else {
      error = testCopy.classValue() - pred;
    }

    // return the negative of the error as search methods need to
    // maximize something
    return -error;
  }
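  /*
   * Example (not part of the original source): how a racing search might use
   * the single-instance overload above - retrain once when the subset
   * changes, then stream further hold-out instances without retraining.
   * "holdOutData" is a placeholder set of instances compatible with the
   * training data; "eval" and "subset" are as in the earlier sketches.
   *
   *   double summedMerit = 0;
   *   for (int k = 0; k < holdOutData.numInstances(); k++) {
   *     summedMerit +=
   *       eval.evaluateSubset(subset, holdOutData.instance(k), k == 0);
   *   }
   */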
text.append("\tScheme options: "); String[] classifierOptions = new String[0]; if (m_ClassifierTemplate instanceof OptionHandler) { classifierOptions = ((OptionHandler) m_ClassifierTemplate).getOptions(); for (String classifierOption : classifierOptions) { text.append(classifierOption + " "); } } text.append("\n"); text.append("\tHold out/test set: "); if (!m_useTraining) { if (getHoldOutFile().getPath().startsWith("Click to set")) { text.append("none\n"); } else { text.append(getHoldOutFile().getPath() + '\n'); } } else { if (m_usePercentageSplit) { text.append("Percentage split: " + m_splitPercent + "\n"); } else { text.append("Training data\n"); } } String IRClassL = ""; if (m_IRClassVal >= 0) { IRClassL = "(class value: " + m_trainingInstances.classAttribute().value(m_IRClassVal) + ")"; } switch (m_evaluationMeasure.getID()) { case EVAL_DEFAULT: case EVAL_ACCURACY: if (m_trainingInstances.attribute(m_classIndex).isNumeric()) { text.append("\tSubset evaluation: RMSE\n"); } else { text.append("\tSubset evaluation: classification error\n"); } break; case EVAL_RMSE: if (m_trainingInstances.attribute(m_classIndex).isNumeric()) { text.append("\tSubset evaluation: RMSE\n"); } else { text.append("\tSubset evaluation: RMSE (probability estimates)\n"); } break; case EVAL_MAE: if (m_trainingInstances.attribute(m_classIndex).isNumeric()) { text.append("\tSubset evaluation: MAE\n"); } else { text.append("\tSubset evaluation: MAE (probability estimates)\n"); } break; case EVAL_FMEASURE: text.append("\tSubset evaluation: F-measure " + (m_IRClassVal >= 0 ? IRClassL : "") + "\n"); break; case EVAL_AUC: text.append("\tSubset evaluation: area under the ROC curve " + (m_IRClassVal >= 0 ? IRClassL : "") + "\n"); break; case EVAL_AUPRC: text.append("\tSubset evalation: area under the precision-recal curve " + (m_IRClassVal >= 0 ? IRClassL : "") + "\n"); break; case EVAL_CORRELATION: text.append("\tSubset evaluation: correlation coefficient\n"); break; default: text.append("\tSubset evaluation: " + m_evaluationMeasure.getReadable()); if (((WrapperSubsetEval.PluginTag) m_evaluationMeasure) .getMetric() instanceof InformationRetrievalEvaluationMetric) { text.append(" " + (m_IRClassVal > 0 ? IRClassL : "")); } text.append("\n"); break; } } return text.toString(); } /** * reset to defaults */ protected void resetOptions() { m_trainingInstances = null; m_ClassifierTemplate = new ZeroR(); m_holdOutFile = new File("Click to set hold out or test instances"); m_holdOutInstances = null; m_useTraining = false; m_splitPercent = "90"; m_usePercentageSplit = false; m_evaluationMeasure = TAGS_EVALUATION[0]; m_IRClassVal = -1; } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision: 10332 $"); } /** * Main method for testing this class. * * @param args the options */ public static void main(String[] args) { runEvaluator(new ClassifierSubsetEval(), args); } }