Java tutorial
/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * JRip.java * Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand */ package weka.classifiers.rules; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import weka.classifiers.AbstractClassifier; import weka.core.AdditionalMeasureProducer; import weka.core.Attribute; import weka.core.Capabilities; import weka.core.Capabilities.Capability; import weka.core.Copyable; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.RevisionHandler; import weka.core.RevisionUtils; import weka.core.TechnicalInformation; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.core.WeightedInstancesHandler; import weka.filters.Filter; import weka.filters.supervised.attribute.ClassOrder; /** * <!-- globalinfo-start --> This class implements a propositional rule learner, * Repeated Incremental Pruning to Produce Error Reduction (RIPPER), which was * proposed by William W. Cohen as an optimized version of IREP. <br/> * <br/> * The algorithm is briefly described as follows: <br/> * <br/> * Initialize RS = {}, and for each class from the less prevalent one to the * more frequent one, DO: <br/> * <br/> * 1. Building stage:<br/> * Repeat 1.1 and 1.2 until the descrition length (DL) of the ruleset and * examples is 64 bits greater than the smallest DL met so far, or there are no * positive examples, or the error rate >= 50%. <br/> * <br/> * 1.1. Grow phase:<br/> * Grow one rule by greedily adding antecedents (or conditions) to the rule * until the rule is perfect (i.e. 100% accurate). The procedure tries every * possible value of each attribute and selects the condition with highest * information gain: p(log(p/t)-log(P/T)).<br/> * <br/> * 1.2. Prune phase:<br/> * Incrementally prune each rule and allow the pruning of any final sequences of * the antecedents;The pruning metric is (p-n)/(p+n) -- but it's actually * 2p/(p+n) -1, so in this implementation we simply use p/(p+n) (actually * (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).<br/> * <br/> * 2. Optimization stage:<br/> * after generating the initial ruleset {Ri}, generate and prune two variants of * each rule Ri from randomized data using procedure 1.1 and 1.2. But one * variant is generated from an empty rule while the other is generated by * greedily adding antecedents to the original rule. Moreover, the pruning * metric used here is (TP+TN)/(P+N).Then the smallest possible DL for each * variant and the original rule is computed. The variant with the minimal DL is * selected as the final representative of Ri in the ruleset.After all the rules * in {Ri} have been examined and if there are still residual positives, more * rules are generated based on the residual positives using Building Stage * again. <br/> * 3. Delete the rules from the ruleset that would increase the DL of the whole * ruleset if it were in it. and add resultant ruleset to RS. <br/> * ENDDO<br/> * <br/> * Note that there seem to be 2 bugs in the original ripper program that would * affect the ruleset size and accuracy slightly. This implementation avoids * these bugs and thus is a little bit different from Cohen's original * implementation. Even after fixing the bugs, since the order of classes with * the same frequency is not defined in ripper, there still seems to be some * trivial difference between this implementation and the original ripper, * especially for audiology data in UCI repository, where there are lots of * classes of few instances.<br/> * <br/> * Details please see:<br/> * <br/> * William W. Cohen: Fast Effective Rule Induction. In: Twelfth International * Conference on Machine Learning, 115-123, 1995.<br/> * <br/> * PS. We have compared this implementation with the original ripper * implementation in aspects of accuracy, ruleset size and running time on both * artificial data "ab+bcd+defg" and UCI datasets. In all these aspects it seems * to be quite comparable to the original ripper implementation. However, we * didn't consider memory consumption optimization in this implementation.<br/> * <br/> * <p/> * <!-- globalinfo-end --> * * <!-- technical-bibtex-start --> BibTeX: * * <pre> * @inproceedings{Cohen1995, * author = {William W. Cohen}, * booktitle = {Twelfth International Conference on Machine Learning}, * pages = {115-123}, * publisher = {Morgan Kaufmann}, * title = {Fast Effective Rule Induction}, * year = {1995} * } * </pre> * <p/> * <!-- technical-bibtex-end --> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -F <number of folds> * Set number of folds for REP * One fold is used as pruning set. * (default 3) * </pre> * * <pre> * -N <min. weights> * Set the minimal weights of instances * within a split. * (default 2.0) * </pre> * * <pre> * -O <number of runs> * Set the number of runs of * optimizations. (Default: 2) * </pre> * * <pre> * -D * Set whether turn on the * debug mode (Default: false) * </pre> * * <pre> * -S <seed> * The seed of randomization * (Default: 1) * </pre> * * <pre> * -E * Whether NOT check the error rate>=0.5 * in stopping criteria (default: check) * </pre> * * <pre> * -P * Whether NOT use pruning * (default: use pruning) * </pre> * * <!-- options-end --> * * @author Xin Xu (xx5@cs.waikato.ac.nz) * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @version $Revision$ */ public class JRip extends AbstractClassifier implements AdditionalMeasureProducer, WeightedInstancesHandler, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = -6589312996832147161L; /** The limit of description length surplus in ruleset generation */ private static double MAX_DL_SURPLUS = 64.0; /** The class attribute of the data */ private Attribute m_Class; /** The ruleset */ private ArrayList<Rule> m_Ruleset; /** The predicted class distribution */ private ArrayList<double[]> m_Distributions; /** Runs of optimizations */ private int m_Optimizations = 2; /** Random object used in this class */ private Random m_Random = null; /** # of all the possible conditions in a rule */ private double m_Total = 0; /** The seed to perform randomization */ private long m_Seed = 1; /** The number of folds to split data into Grow and Prune for IREP */ private int m_Folds = 3; /** The minimal number of instance weights within a split */ private double m_MinNo = 2.0; /** Whether in a debug mode */ private boolean m_Debug = false; /** Whether check the error rate >= 0.5 in stopping criteria */ private boolean m_CheckErr = true; /** Whether use pruning, i.e. the data is clean or not */ private boolean m_UsePruning = true; /** The filter used to randomize the class order */ private Filter m_Filter = null; /** The RuleStats for the ruleset of each class value */ private ArrayList<RuleStats> m_RulesetStats; /** * Returns a string describing classifier * * @return a description suitable for displaying in the explorer/experimenter * gui */ public String globalInfo() { return "This class implements a propositional rule learner, Repeated Incremental " + "Pruning to Produce Error Reduction (RIPPER), which was proposed by William " + "W. Cohen as an optimized version of IREP. \n\n" + "The algorithm is briefly described as follows: \n\n" + "Initialize RS = {}, and for each class from the less prevalent one to " + "the more frequent one, DO: \n\n" + "1. Building stage:\nRepeat 1.1 and 1.2 until the descrition length (DL) " + "of the ruleset and examples is 64 bits greater than the smallest DL " + "met so far, or there are no positive examples, or the error rate >= 50%. " + "\n\n" + "1.1. Grow phase:\n" + "Grow one rule by greedily adding antecedents (or conditions) to " + "the rule until the rule is perfect (i.e. 100% accurate). The " + "procedure tries every possible value of each attribute and selects " + "the condition with highest information gain: p(log(p/t)-log(P/T))." + "\n\n" + "1.2. Prune phase:\n" + "Incrementally prune each rule and allow the pruning of any " + "final sequences of the antecedents;" + "The pruning metric is (p-n)/(p+n) -- but it's actually " + "2p/(p+n) -1, so in this implementation we simply use p/(p+n) " + "(actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).\n\n" + "2. Optimization stage:\n after generating the initial ruleset {Ri}, " + "generate and prune two variants of each rule Ri from randomized data " + "using procedure 1.1 and 1.2. But one variant is generated from an " + "empty rule while the other is generated by greedily adding antecedents " + "to the original rule. Moreover, the pruning metric used here is " + "(TP+TN)/(P+N)." + "Then the smallest possible DL for each variant and the original rule " + "is computed. The variant with the minimal DL is selected as the final " + "representative of Ri in the ruleset." + "After all the rules in {Ri} have been examined and if there are still " + "residual positives, more rules are generated based on the residual " + "positives using Building Stage again. \n" + "3. Delete the rules from the ruleset that would increase the DL of the " + "whole ruleset if it were in it. and add resultant ruleset to RS. \n" + "ENDDO\n\n" + "Note that there seem to be 2 bugs in the original ripper program that would " + "affect the ruleset size and accuracy slightly. This implementation avoids " + "these bugs and thus is a little bit different from Cohen's original " + "implementation. Even after fixing the bugs, since the order of classes with " + "the same frequency is not defined in ripper, there still seems to be " + "some trivial difference between this implementation and the original ripper, " + "especially for audiology data in UCI repository, where there are lots of " + "classes of few instances.\n\n" + "Details please see:\n\n" + getTechnicalInformation().toString() + "\n\n" + "PS. We have compared this implementation with the original ripper " + "implementation in aspects of accuracy, ruleset size and running time " + "on both artificial data \"ab+bcd+defg\" and UCI datasets. In all these " + "aspects it seems to be quite comparable to the original ripper " + "implementation. However, we didn't consider memory consumption " + "optimization in this implementation.\n\n"; } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "William W. Cohen"); result.setValue(Field.TITLE, "Fast Effective Rule Induction"); result.setValue(Field.BOOKTITLE, "Twelfth International Conference on Machine Learning"); result.setValue(Field.YEAR, "1995"); result.setValue(Field.PAGES, "115-123"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann"); return result; } /** * Returns an enumeration describing the available options Valid options are: * <p> * * -F number <br> * The number of folds for reduced error pruning. One fold is used as the * pruning set. (Default: 3) * <p> * * -N number <br> * The minimal weights of instances within a split. (Default: 2) * <p> * * -O number <br> * Set the number of runs of optimizations. (Default: 2) * <p> * * -D <br> * Whether turn on the debug mode * * -S number <br> * The seed of randomization used in Ripper.(Default: 1) * <p> * * -E <br> * Whether NOT check the error rate >= 0.5 in stopping criteria. (default: * check) * <p> * * -P <br> * Whether NOT use pruning. (default: use pruning) * <p> * * @return an enumeration of all the available options */ @Override public Enumeration<Option> listOptions() { Vector<Option> newVector = new Vector<Option>(7); newVector.add(new Option( "\tSet number of folds for REP\n" + "\tOne fold is used as pruning set.\n" + "\t(default 3)", "F", 1, "-F <number of folds>")); newVector.add( new Option("\tSet the minimal weights of instances\n" + "\twithin a split.\n" + "\t(default 2.0)", "N", 1, "-N <min. weights>")); newVector.add(new Option("\tSet the number of runs of\n" + "\toptimizations. (Default: 2)", "O", 1, "-O <number of runs>")); newVector.add(new Option("\tSet whether turn on the\n" + "\tdebug mode (Default: false)", "D", 0, "-D")); newVector.add(new Option("\tThe seed of randomization\n" + "\t(Default: 1)", "S", 1, "-S <seed>")); newVector.add(new Option( "\tWhether NOT check the error rate>=0.5\n" + "\tin stopping criteria " + "\t(default: check)", "E", 0, "-E")); newVector.add(new Option("\tWhether NOT use pruning\n" + "\t(default: use pruning)", "P", 0, "-P")); newVector.addAll(Collections.list(super.listOptions())); return newVector.elements(); } /** * Parses a given list of options. * <p/> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -F <number of folds> * Set number of folds for REP * One fold is used as pruning set. * (default 3) * </pre> * * <pre> * -N <min. weights> * Set the minimal weights of instances * within a split. * (default 2.0) * </pre> * * <pre> * -O <number of runs> * Set the number of runs of * optimizations. (Default: 2) * </pre> * * <pre> * -D * Set whether turn on the * debug mode (Default: false) * </pre> * * <pre> * -S <seed> * The seed of randomization * (Default: 1) * </pre> * * <pre> * -E * Whether NOT check the error rate>=0.5 * in stopping criteria (default: check) * </pre> * * <pre> * -P * Whether NOT use pruning * (default: use pruning) * </pre> * * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String numFoldsString = Utils.getOption('F', options); if (numFoldsString.length() != 0) { m_Folds = Integer.parseInt(numFoldsString); } else { m_Folds = 3; } String minNoString = Utils.getOption('N', options); if (minNoString.length() != 0) { m_MinNo = Double.parseDouble(minNoString); } else { m_MinNo = 2.0; } String seedString = Utils.getOption('S', options); if (seedString.length() != 0) { m_Seed = Long.parseLong(seedString); } else { m_Seed = 1; } String runString = Utils.getOption('O', options); if (runString.length() != 0) { m_Optimizations = Integer.parseInt(runString); } else { m_Optimizations = 2; } m_Debug = Utils.getFlag('D', options); m_CheckErr = !Utils.getFlag('E', options); m_UsePruning = !Utils.getFlag('P', options); super.setOptions(options); Utils.checkForRemainingOptions(options); } /** * Gets the current settings of the Classifier. * * @return an array of strings suitable for passing to setOptions */ @Override public String[] getOptions() { Vector<String> options = new Vector<String>(); options.add("-F"); options.add("" + m_Folds); options.add("-N"); options.add("" + m_MinNo); options.add("-O"); options.add("" + m_Optimizations); options.add("-S"); options.add("" + m_Seed); if (m_Debug) { options.add("-D"); } if (!m_CheckErr) { options.add("-E"); } if (!m_UsePruning) { options.add("-P"); } Collections.addAll(options, super.getOptions()); return options.toArray(new String[0]); } /** * Returns an enumeration of the additional measure names * * @return an enumeration of the measure names */ @Override public Enumeration<String> enumerateMeasures() { Vector<String> newVector = new Vector<String>(1); newVector.add("measureNumRules"); return newVector.elements(); } /** * Returns the value of the named measure * * @param additionalMeasureName the name of the measure to query for its value * @return the value of the named measure * @throws IllegalArgumentException if the named measure is not supported */ @Override public double getMeasure(String additionalMeasureName) { if (additionalMeasureName.compareToIgnoreCase("measureNumRules") == 0) { return m_Ruleset.size(); } else { throw new IllegalArgumentException(additionalMeasureName + " not supported (RIPPER)"); } } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String foldsTipText() { return "Determines the amount of data used for pruning. One fold is used for " + "pruning, the rest for growing the rules."; } /** * Sets the number of folds to use * * @param fold the number of folds */ public void setFolds(int fold) { m_Folds = fold; } /** * Gets the number of folds * * @return the number of folds */ public int getFolds() { return m_Folds; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String minNoTipText() { return "The minimum total weight of the instances in a rule."; } /** * Sets the minimum total weight of the instances in a rule * * @param m the minimum total weight of the instances in a rule */ public void setMinNo(double m) { m_MinNo = m; } /** * Gets the minimum total weight of the instances in a rule * * @return the minimum total weight of the instances in a rule */ public double getMinNo() { return m_MinNo; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String seedTipText() { return "The seed used for randomizing the data."; } /** * Sets the seed value to use in randomizing the data * * @param s the new seed value */ public void setSeed(long s) { m_Seed = s; } /** * Gets the current seed value to use in randomizing the data * * @return the seed value */ public long getSeed() { return m_Seed; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String optimizationsTipText() { return "The number of optimization runs."; } /** * Sets the number of optimization runs * * @param run the number of optimization runs */ public void setOptimizations(int run) { m_Optimizations = run; } /** * Gets the the number of optimization runs * * @return the number of optimization runs */ public int getOptimizations() { return m_Optimizations; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ @Override public String debugTipText() { return "Whether debug information is output to the console."; } /** * Sets whether debug information is output to the console * * @param d whether debug information is output to the console */ @Override public void setDebug(boolean d) { m_Debug = d; } /** * Gets whether debug information is output to the console * * @return whether debug information is output to the console */ @Override public boolean getDebug() { return m_Debug; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String checkErrorRateTipText() { return "Whether check for error rate >= 1/2 is included" + " in stopping criterion."; } /** * Sets whether to check for error rate is in stopping criterion * * @param d whether to check for error rate is in stopping criterion */ public void setCheckErrorRate(boolean d) { m_CheckErr = d; } /** * Gets whether to check for error rate is in stopping criterion * * @return true if checking for error rate is in stopping criterion */ public boolean getCheckErrorRate() { return m_CheckErr; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String usePruningTipText() { return "Whether pruning is performed."; } /** * Sets whether pruning is performed * * @param d Whether pruning is performed */ public void setUsePruning(boolean d) { m_UsePruning = d; } /** * Gets whether pruning is performed * * @return true if pruning is performed */ public boolean getUsePruning() { return m_UsePruning; } /** * Get the ruleset generated by Ripper * * @return the ruleset */ public ArrayList<Rule> getRuleset() { return m_Ruleset; } /** * Get the statistics of the ruleset in the given position * * @param pos the position of the stats, assuming correct * @return the statistics of the ruleset in the given position */ public RuleStats getRuleStats(int pos) { return m_RulesetStats.get(pos); } /** * The single antecedent in the rule, which is composed of an attribute and * the corresponding value. There are two inherited classes, namely * NumericAntd and NominalAntd in which the attributes are numeric and nominal * respectively. */ public abstract class Antd implements WeightedInstancesHandler, Copyable, Serializable, RevisionHandler { /** for serialization */ private static final long serialVersionUID = -8929754772994154334L; /** The attribute of the antecedent */ protected Attribute att; /** * The attribute value of the antecedent. For numeric attribute, value is * either 0(1st bag) or 1(2nd bag) */ protected double value; /** * The maximum infoGain achieved by this antecedent test in the growing data */ protected double maxInfoGain; /** The accurate rate of this antecedent test on the growing data */ protected double accuRate; /** The coverage of this antecedent in the growing data */ protected double cover; /** The accurate data for this antecedent in the growing data */ protected double accu; /** * Constructor */ public Antd(Attribute a) { att = a; value = Double.NaN; maxInfoGain = 0; accuRate = Double.NaN; cover = Double.NaN; accu = Double.NaN; } /* The abstract members for inheritance */ public abstract Instances[] splitData(Instances data, double defAcRt, double cla); public abstract boolean covers(Instance inst); @Override public abstract String toString(); /** * Implements Copyable * * @return a copy of this object */ @Override public abstract Object copy(); /* Get functions of this antecedent */ public Attribute getAttr() { return att; } public double getAttrValue() { return value; } public double getMaxInfoGain() { return maxInfoGain; } public double getAccuRate() { return accuRate; } public double getAccu() { return accu; } public double getCover() { return cover; } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } } /** * The antecedent with numeric attribute */ public class NumericAntd extends Antd { /** for serialization */ static final long serialVersionUID = 5699457269983735442L; /** The split point for this numeric antecedent */ private double splitPoint; /** * Constructor */ public NumericAntd(Attribute a) { super(a); splitPoint = Double.NaN; } /** * Get split point of this numeric antecedent * * @return the split point of this numeric antecedent */ public double getSplitPoint() { return splitPoint; } /** * Implements Copyable * * @return a copy of this object */ @Override public Object copy() { NumericAntd na = new NumericAntd(getAttr()); na.value = this.value; na.splitPoint = this.splitPoint; return na; } /** * Implements the splitData function. This procedure is to split the data * into two bags according to the information gain of the numeric attribute * value The maximum infoGain is also calculated. * * @param insts the data to be split * @param defAcRt the default accuracy rate for data * @param cl the class label to be predicted * @return the array of data after split */ @Override public Instances[] splitData(Instances insts, double defAcRt, double cl) { Instances data = insts; int total = data.numInstances();// Total number of instances without // missing value for att int split = 1; // Current split position int prev = 0; // Previous split position int finalSplit = split; // Final split position maxInfoGain = 0; value = 0; double fstCover = 0, sndCover = 0, fstAccu = 0, sndAccu = 0; data.sort(att); // Find the las instance without missing value for (int x = 0; x < data.numInstances(); x++) { Instance inst = data.instance(x); if (inst.isMissing(att)) { total = x; break; } sndCover += inst.weight(); if (Utils.eq(inst.classValue(), cl)) { sndAccu += inst.weight(); } } if (total == 0) { return null; // Data all missing for the attribute } splitPoint = data.instance(total - 1).value(att); for (; split <= total; split++) { if ((split == total) || (data.instance(split).value(att) > // Can't // split // within data.instance(prev).value(att))) { // same value for (int y = prev; y < split; y++) { Instance inst = data.instance(y); fstCover += inst.weight(); if (Utils.eq(data.instance(y).classValue(), cl)) { fstAccu += inst.weight(); // First bag positive# ++ } } double fstAccuRate = (fstAccu + 1.0) / (fstCover + 1.0), sndAccuRate = (sndAccu + 1.0) / (sndCover + 1.0); /* Which bag has higher information gain? */ boolean isFirst; double fstInfoGain, sndInfoGain; double accRate, infoGain, coverage, accurate; fstInfoGain = // Utils.eq(defAcRt, 1.0) ? // fstAccu/(double)numConds : fstAccu * (Utils.log2(fstAccuRate) - Utils.log2(defAcRt)); sndInfoGain = // Utils.eq(defAcRt, 1.0) ? // sndAccu/(double)numConds : sndAccu * (Utils.log2(sndAccuRate) - Utils.log2(defAcRt)); if (fstInfoGain > sndInfoGain) { isFirst = true; infoGain = fstInfoGain; accRate = fstAccuRate; accurate = fstAccu; coverage = fstCover; } else { isFirst = false; infoGain = sndInfoGain; accRate = sndAccuRate; accurate = sndAccu; coverage = sndCover; } /* Check whether so far the max infoGain */ if (infoGain > maxInfoGain) { splitPoint = data.instance(prev).value(att); value = (isFirst) ? 0 : 1; accuRate = accRate; accu = accurate; cover = coverage; maxInfoGain = infoGain; finalSplit = (isFirst) ? split : prev; } for (int y = prev; y < split; y++) { Instance inst = data.instance(y); sndCover -= inst.weight(); if (Utils.eq(data.instance(y).classValue(), cl)) { sndAccu -= inst.weight(); // Second bag positive# -- } } prev = split; } } /* Split the data */ Instances[] splitData = new Instances[2]; splitData[0] = new Instances(data, 0, finalSplit); splitData[1] = new Instances(data, finalSplit, total - finalSplit); return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is covered by * this antecedent */ @Override public boolean covers(Instance inst) { boolean isCover = true; if (!inst.isMissing(att)) { if ((int) value == 0) { // First bag if (inst.value(att) > splitPoint) { isCover = false; } } else if (inst.value(att) < splitPoint) { isCover = false; } } else { isCover = false; } return isCover; } /** * Prints this antecedent * * @return a textual description of this antecedent */ @Override public String toString() { String symbol = ((int) value == 0) ? " <= " : " >= "; return (att.name() + symbol + Utils.doubleToString(splitPoint, 6)); } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } } /** * The antecedent with nominal attribute */ public class NominalAntd extends Antd { /** for serialization */ static final long serialVersionUID = -9102297038837585135L; /* * The parameters of infoGain calculated for each attribute value in the * growing data */ private final double[] accurate; private final double[] coverage; /** * Constructor */ public NominalAntd(Attribute a) { super(a); int bag = att.numValues(); accurate = new double[bag]; coverage = new double[bag]; } /** * Implements Copyable * * @return a copy of this object */ @Override public Object copy() { Antd antec = new NominalAntd(getAttr()); antec.value = this.value; return antec; } /** * Implements the splitData function. This procedure is to split the data * into bags according to the nominal attribute value The infoGain for each * bag is also calculated. * * @param data the data to be split * @param defAcRt the default accuracy rate for data * @param cl the class label to be predicted * @return the array of data after split */ @Override public Instances[] splitData(Instances data, double defAcRt, double cl) { int bag = att.numValues(); Instances[] splitData = new Instances[bag]; for (int x = 0; x < bag; x++) { splitData[x] = new Instances(data, data.numInstances()); accurate[x] = 0; coverage[x] = 0; } for (int x = 0; x < data.numInstances(); x++) { Instance inst = data.instance(x); if (!inst.isMissing(att)) { int v = (int) inst.value(att); splitData[v].add(inst); coverage[v] += inst.weight(); if ((int) inst.classValue() == (int) cl) { accurate[v] += inst.weight(); } } } for (int x = 0; x < bag; x++) { double t = coverage[x] + 1.0; double p = accurate[x] + 1.0; double infoGain = // Utils.eq(defAcRt, 1.0) ? // accurate[x]/(double)numConds : accurate[x] * (Utils.log2(p / t) - Utils.log2(defAcRt)); if (infoGain > maxInfoGain) { maxInfoGain = infoGain; cover = coverage[x]; accu = accurate[x]; accuRate = p / t; value = x; } } return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is covered by * this antecedent */ @Override public boolean covers(Instance inst) { boolean isCover = false; if (!inst.isMissing(att)) { if ((int) inst.value(att) == (int) value) { isCover = true; } } return isCover; } /** * Prints this antecedent * * @return a textual description of this antecedent */ @Override public String toString() { return (att.name() + " = " + att.value((int) value)); } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } } /** * This class implements a single rule that predicts specified class. * * A rule consists of antecedents "AND"ed together and the consequent (class * value) for the classification. In this class, the Information Gain * (p*[log(p/t) - log(P/T)]) is used to select an antecedent and Reduced Error * Prunning (REP) with the metric of accuracy rate p/(p+n) or (TP+TN)/(P+N) is * used to prune the rule. */ public class RipperRule extends Rule { /** for serialization */ static final long serialVersionUID = -2410020717305262952L; /** The internal representation of the class label to be predicted */ private double m_Consequent = -1; /** The vector of antecedents of this rule */ protected ArrayList<Antd> m_Antds = null; /** Constructor */ public RipperRule() { m_Antds = new ArrayList<Antd>(); } /** * Removes redundant tests in the rule. * * @param data an instance object that contains the appropriate header information for the attributes. */ public void cleanUp(Instances data) { double[] mins = new double[data.numAttributes()]; double[] maxs = new double[data.numAttributes()]; for (int i = 0; i < data.numAttributes(); i++) { mins[i] = Double.MAX_VALUE; maxs[i] = -Double.MAX_VALUE; } for (int i = m_Antds.size() - 1; i >= 0; i--) { Attribute att = m_Antds.get(i).getAttr(); if (att.isNumeric()) { double splitPoint = ((NumericAntd) m_Antds.get(i)).getSplitPoint(); if (m_Antds.get(i).getAttrValue() == 0) { if (splitPoint < mins[att.index()]) { mins[att.index()] = splitPoint; } else { m_Antds.remove(i); } } else { if (splitPoint > maxs[att.index()]) { maxs[att.index()] = splitPoint; } else { m_Antds.remove(i); } } } } } /** * Sets the internal representation of the class label to be predicted * * @param cl the internal representation of the class label to be predicted */ public void setConsequent(double cl) { m_Consequent = cl; } /** * Gets the internal representation of the class label to be predicted * * @return the internal representation of the class label to be predicted */ @Override public double getConsequent() { return m_Consequent; } /** * Get a shallow copy of this rule * * @return the copy */ @Override public Object copy() { RipperRule copy = new RipperRule(); copy.setConsequent(getConsequent()); copy.m_Antds = new ArrayList<Antd>(this.m_Antds.size()); for (Antd a : this.m_Antds) { copy.m_Antds.add((Antd) a.copy()); } return copy; } /** * Whether the instance covered by this rule * * @param datum the instance in question * @return the boolean value indicating whether the instance is covered by * this rule */ @Override public boolean covers(Instance datum) { boolean isCover = true; for (int i = 0; i < m_Antds.size(); i++) { Antd antd = m_Antds.get(i); if (!antd.covers(datum)) { isCover = false; break; } } return isCover; } /** * Whether this rule has antecedents, i.e. whether it is a default rule * * @return the boolean value indicating whether the rule has antecedents */ @Override public boolean hasAntds() { if (m_Antds == null) { return false; } else { return (m_Antds.size() > 0); } } /** * Return the antecedents * * @return the vector of antecedents */ public ArrayList<Antd> getAntds() { return m_Antds; } /** * the number of antecedents of the rule * * @return the size of this rule */ @Override public double size() { return m_Antds.size(); } /** * Private function to compute default number of accurate instances in the * specified data for the consequent of the rule * * @param data the data in question * @return the default accuracy number */ private double computeDefAccu(Instances data) { double defAccu = 0; for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); if ((int) inst.classValue() == (int) m_Consequent) { defAccu += inst.weight(); } } return defAccu; } /** * Build one rule using the growing data * * @param data the growing data used to build the rule * @throws Exception if the consequent is not set yet */ @Override public void grow(Instances data) throws Exception { if (m_Consequent == -1) { throw new Exception(" Consequent not set yet."); } Instances growData = data; double sumOfWeights = growData.sumOfWeights(); if (!Utils.gr(sumOfWeights, 0.0)) { return; } /* Compute the default accurate rate of the growing data */ double defAccu = computeDefAccu(growData); double defAcRt = (defAccu + 1.0) / (sumOfWeights + 1.0); /* Keep the record of which attributes have already been used */ boolean[] used = new boolean[growData.numAttributes()]; for (int k = 0; k < used.length; k++) { used[k] = false; } int numUnused = used.length; // If there are already antecedents existing for (int j = 0; j < m_Antds.size(); j++) { Antd antdj = m_Antds.get(j); if (!antdj.getAttr().isNumeric()) { used[antdj.getAttr().index()] = true; numUnused--; } } double maxInfoGain; while (Utils.gr(growData.numInstances(), 0.0) && (numUnused > 0) && Utils.sm(defAcRt, 1.0)) { // We require that infoGain be positive /* * if(numAntds == originalSize) maxInfoGain = 0.0; // At least one * condition allowed else maxInfoGain = Utils.eq(defAcRt, 1.0) ? * defAccu/(double)numAntds : 0.0; */ maxInfoGain = 0.0; /* Build a list of antecedents */ Antd oneAntd = null; Instances coverData = null; Enumeration<Attribute> enumAttr = growData.enumerateAttributes(); /* Build one condition based on all attributes not used yet */ while (enumAttr.hasMoreElements()) { Attribute att = (enumAttr.nextElement()); if (m_Debug) { System.err.println("\nOne condition: size = " + growData.sumOfWeights()); } Antd antd = null; if (att.isNumeric()) { antd = new NumericAntd(att); } else { antd = new NominalAntd(att); } if (!used[att.index()]) { /* * Compute the best information gain for each attribute, it's stored * in the antecedent formed by this attribute. This procedure * returns the data covered by the antecedent */ Instances coveredData = computeInfoGain(growData, defAcRt, antd); if (coveredData != null) { double infoGain = antd.getMaxInfoGain(); if (m_Debug) { System.err.println("Test of \'" + antd.toString() + "\': infoGain = " + infoGain + " | Accuracy = " + antd.getAccuRate() + "=" + antd.getAccu() + "/" + antd.getCover() + " def. accuracy: " + defAcRt); } if (infoGain > maxInfoGain) { oneAntd = antd; coverData = coveredData; maxInfoGain = infoGain; } } } } if (oneAntd == null) { break; // Cannot find antds } if (Utils.sm(oneAntd.getAccu(), m_MinNo)) { break;// Too low coverage } // Numeric attributes can be used more than once if (!oneAntd.getAttr().isNumeric()) { used[oneAntd.getAttr().index()] = true; numUnused--; } m_Antds.add(oneAntd); growData = coverData;// Grow data size is shrinking defAcRt = oneAntd.getAccuRate(); } } /** * Compute the best information gain for the specified antecedent * * @param instances the data based on which the infoGain is computed * @param defAcRt the default accuracy rate of data * @param antd the specific antecedent * @return the data covered by the antecedent */ private Instances computeInfoGain(Instances instances, double defAcRt, Antd antd) { Instances data = instances; /* * Split the data into bags. The information gain of each bag is also * calculated in this procedure */ Instances[] splitData = antd.splitData(data, defAcRt, m_Consequent); /* Get the bag of data to be used for next antecedents */ if (splitData != null) { return splitData[(int) antd.getAttrValue()]; } else { return null; } } /** * Prune all the possible final sequences of the rule using the pruning * data. The measure used to prune the rule is based on flag given. * * @param pruneData the pruning data used to prune the rule * @param useWhole flag to indicate whether use the error rate of the whole * pruning data instead of the data covered */ public void prune(Instances pruneData, boolean useWhole) { Instances data = pruneData; double total = data.sumOfWeights(); if (!Utils.gr(total, 0.0)) { return; } /* The default accurate # and rate on pruning data */ double defAccu = computeDefAccu(data); if (m_Debug) { System.err.println("Pruning with " + defAccu + " positive data out of " + total + " instances"); } int size = m_Antds.size(); if (size == 0) { return; // Default rule before pruning } double[] worthRt = new double[size]; double[] coverage = new double[size]; double[] worthValue = new double[size]; for (int w = 0; w < size; w++) { worthRt[w] = coverage[w] = worthValue[w] = 0.0; } /* Calculate accuracy parameters for all the antecedents in this rule */ double tn = 0.0; // True negative if useWhole for (int x = 0; x < size; x++) { Antd antd = m_Antds.get(x); Instances newData = data; data = new Instances(newData, 0); // Make data empty for (int y = 0; y < newData.numInstances(); y++) { Instance ins = newData.instance(y); if (antd.covers(ins)) { // Covered by this antecedent coverage[x] += ins.weight(); data.add(ins); // Add to data for further pruning if ((int) ins.classValue() == (int) m_Consequent) { worthValue[x] += ins.weight(); } } else if (useWhole) { // Not covered if ((int) ins.classValue() != (int) m_Consequent) { tn += ins.weight(); } } } if (useWhole) { worthValue[x] += tn; worthRt[x] = worthValue[x] / total; } else { worthRt[x] = (worthValue[x] + 1.0) / (coverage[x] + 2.0); } } double maxValue = (defAccu + 1.0) / (total + 2.0); int maxIndex = -1; for (int i = 0; i < worthValue.length; i++) { if (m_Debug) { double denom = useWhole ? total : coverage[i]; System.err.println(i + "(useAccuray? " + !useWhole + "): " + worthRt[i] + "=" + worthValue[i] + "/" + denom); } if (worthRt[i] > maxValue) { // Prefer to the maxValue = worthRt[i]; // shorter rule maxIndex = i; } } /* Prune the antecedents according to the accuracy parameters */ for (int z = size - 1; z > maxIndex; z--) { m_Antds.remove(z); } } /** * Prints this rule * * @param classAttr the class attribute in the data * @return a textual description of this rule */ public String toString(Attribute classAttr) { StringBuffer text = new StringBuffer(); if (m_Antds.size() > 0) { for (int j = 0; j < (m_Antds.size() - 1); j++) { text.append("(" + (m_Antds.get(j)).toString() + ") and "); } text.append("(" + (m_Antds.get(m_Antds.size() - 1)).toString() + ")"); } text.append(" => " + classAttr.name() + "=" + classAttr.value((int) m_Consequent)); return text.toString(); } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } } /** * Returns default capabilities of the classifier. * * @return the capabilities of this classifier */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); // instances result.setMinimumNumberInstances(m_Folds); return result; } /** * Builds Ripper in the order of class frequencies. For each class it's built * in two stages: building and optimization * * @param instances the training data * @throws Exception if classifier can't be built successfully */ @Override public void buildClassifier(Instances instances) throws Exception { // can classifier handle the data? getCapabilities().testWithFail(instances); // remove instances with missing class instances = new Instances(instances); instances.deleteWithMissingClass(); m_Random = instances.getRandomNumberGenerator(m_Seed); m_Total = RuleStats.numAllConditions(instances); if (m_Debug) { System.err.println("Number of all possible conditions = " + m_Total); } Instances data = null; m_Filter = new ClassOrder(); ((ClassOrder) m_Filter).setSeed(m_Random.nextInt()); ((ClassOrder) m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND); m_Filter.setInputFormat(instances); data = Filter.useFilter(instances, m_Filter); if (data == null) { throw new Exception(" Unable to randomize the class orders."); } m_Class = data.classAttribute(); m_Ruleset = new ArrayList<Rule>(); m_RulesetStats = new ArrayList<RuleStats>(); m_Distributions = new ArrayList<double[]>(); // Sort by classes frequency double[] orderedClasses = ((ClassOrder) m_Filter).getClassCounts(); if (m_Debug) { System.err.println("Sorted classes:"); for (int x = 0; x < m_Class.numValues(); x++) { System.err.println(x + ": " + m_Class.value(x) + " has " + orderedClasses[x] + " instances."); } } // Iterate from less prevalent class to more frequent one oneClass: for (int y = 0; y < data.numClasses() - 1; y++) { // For each // class double classIndex = y; if (m_Debug) { int ci = (int) classIndex; System.err.println("\n\nClass " + m_Class.value(ci) + "(" + ci + "): " + orderedClasses[y] + "instances\n" + "=====================================\n"); } if (Utils.eq(orderedClasses[y], 0.0)) { continue oneClass; } // The expected FP/err is the proportion of the class double all = 0; for (int i = y; i < orderedClasses.length; i++) { all += orderedClasses[i]; } double expFPRate = orderedClasses[y] / all; double classYWeights = 0, totalWeights = 0; for (int j = 0; j < data.numInstances(); j++) { Instance datum = data.instance(j); totalWeights += datum.weight(); if ((int) datum.classValue() == y) { classYWeights += datum.weight(); } } // DL of default rule, no theory DL, only data DL double defDL; if (classYWeights > 0) { defDL = RuleStats.dataDL(expFPRate, 0.0, totalWeights, 0.0, classYWeights); } else { continue oneClass; // Subsumed by previous rules } if (Double.isNaN(defDL) || Double.isInfinite(defDL)) { throw new Exception("Should never happen: " + "defDL NaN or infinite!"); } if (m_Debug) { System.err.println("The default DL = " + defDL); } data = rulesetForOneClass(expFPRate, data, classIndex, defDL); } // Remove redundant numeric tests from the rules for (Rule rule : m_Ruleset) { ((RipperRule) rule).cleanUp(data); } // Set the default rule RipperRule defRule = new RipperRule(); defRule.setConsequent(data.numClasses() - 1); m_Ruleset.add(defRule); RuleStats defRuleStat = new RuleStats(); defRuleStat.setData(data); defRuleStat.setNumAllConds(m_Total); defRuleStat.addAndUpdate(defRule); m_RulesetStats.add(defRuleStat); for (int z = 0; z < m_RulesetStats.size(); z++) { RuleStats oneClass = m_RulesetStats.get(z); for (int xyz = 0; xyz < oneClass.getRulesetSize(); xyz++) { double[] classDist = oneClass.getDistributions(xyz); Utils.normalize(classDist); if (classDist != null) { m_Distributions.add(((ClassOrder) m_Filter).distributionsByOriginalIndex(classDist)); } } } // free up memory for (int i = 0; i < m_RulesetStats.size(); i++) { (m_RulesetStats.get(i)).cleanUp(); } } /** * Classify the test instance with the rule learner and provide the class * distributions * * @param datum the instance to be classified * @return the distribution */ @Override public double[] distributionForInstance(Instance datum) { try { for (int i = 0; i < m_Ruleset.size(); i++) { Rule rule = m_Ruleset.get(i); if (rule.covers(datum)) { return m_Distributions.get(i); } } } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); } System.err.println("Should never happen!"); return new double[datum.classAttribute().numValues()]; } /** * Build a ruleset for the given class according to the given data * * @param expFPRate the expected FP/(FP+FN) used in DL calculation * @param data the given data * @param classIndex the given class index * @param defDL the default DL in the data * @throws Exception if the ruleset can be built properly */ protected Instances rulesetForOneClass(double expFPRate, Instances data, double classIndex, double defDL) throws Exception { Instances newData = data, growData, pruneData; boolean stop = false; ArrayList<Rule> ruleset = new ArrayList<Rule>(); double dl = defDL, minDL = defDL; RuleStats rstats = null; double[] rst; // Check whether data have positive examples boolean defHasPositive = true; // No longer used boolean hasPositive = defHasPositive; /********************** Building stage ***********************/ if (m_Debug) { System.err.println("\n*** Building stage ***"); } while ((!stop) && hasPositive) { // Generate new rules until // stopping criteria met RipperRule oneRule; if (m_UsePruning) { /* Split data into Grow and Prune */ // We should have stratified the data, but ripper seems // to have a bug that makes it not to do so. In order // to simulate it more precisely, we do the same thing. // newData.randomize(m_Random); newData = RuleStats.stratify(newData, m_Folds, m_Random); Instances[] part = RuleStats.partition(newData, m_Folds); growData = part[0]; pruneData = part[1]; // growData=newData.trainCV(m_Folds, m_Folds-1); // pruneData=newData.testCV(m_Folds, m_Folds-1); oneRule = new RipperRule(); oneRule.setConsequent(classIndex); // Must set first if (m_Debug) { System.err.println("\nGrowing a rule ..."); } oneRule.grow(growData); // Build the rule if (m_Debug) { System.err.println("One rule found before pruning:" + oneRule.toString(m_Class)); } if (m_Debug) { System.err.println("\nPruning the rule ..."); } oneRule.prune(pruneData, false); // Prune the rule if (m_Debug) { System.err.println("One rule found after pruning:" + oneRule.toString(m_Class)); } } else { oneRule = new RipperRule(); oneRule.setConsequent(classIndex); // Must set first if (m_Debug) { System.err.println("\nNo pruning: growing a rule ..."); } oneRule.grow(newData); // Build the rule if (m_Debug) { System.err.println("No pruning: one rule found:\n" + oneRule.toString(m_Class)); } } // Compute the DL of this ruleset if (rstats == null) { // First rule rstats = new RuleStats(); rstats.setNumAllConds(m_Total); rstats.setData(newData); } rstats.addAndUpdate(oneRule); int last = rstats.getRuleset().size() - 1; // Index of last rule dl += rstats.relativeDL(last, expFPRate, m_CheckErr); if (Double.isNaN(dl) || Double.isInfinite(dl)) { throw new Exception("Should never happen: dl in " + "building stage NaN or infinite!"); } if (m_Debug) { System.err.println("Before optimization(" + last + "): the dl = " + dl + " | best: " + minDL); } if (dl < minDL) { minDL = dl; // The best dl so far } rst = rstats.getSimpleStats(last); if (m_Debug) { System.err.println("The rule covers: " + rst[0] + " | pos = " + rst[2] + " | neg = " + rst[4] + "\nThe rule doesn't cover: " + rst[1] + " | pos = " + rst[5]); } stop = checkStop(rst, minDL, dl); if (!stop) { ruleset.add(oneRule); // Accepted newData = rstats.getFiltered(last)[1];// Data not covered hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining? if (m_Debug) { System.err.println("One rule added: has positive? " + hasPositive); } } else { if (m_Debug) { System.err.println("Quit rule"); } rstats.removeLast(); // Remove last to be re-used } } // while !stop /******************** Optimization stage *******************/ RuleStats finalRulesetStat = null; if (m_UsePruning) { for (int z = 0; z < m_Optimizations; z++) { if (m_Debug) { System.err.println("\n*** Optimization: run #" + z + " ***"); } newData = data; finalRulesetStat = new RuleStats(); finalRulesetStat.setData(newData); finalRulesetStat.setNumAllConds(m_Total); int position = 0; stop = false; boolean isResidual = false; hasPositive = defHasPositive; dl = minDL = defDL; oneRule: while (!stop && hasPositive) { isResidual = (position >= ruleset.size()); // Cover residual positive // examples // Re-do shuffling and stratification // newData.randomize(m_Random); newData = RuleStats.stratify(newData, m_Folds, m_Random); Instances[] part = RuleStats.partition(newData, m_Folds); growData = part[0]; pruneData = part[1]; // growData=newData.trainCV(m_Folds, m_Folds-1); // pruneData=newData.testCV(m_Folds, m_Folds-1); RipperRule finalRule; if (m_Debug) { System.err.println("\nRule #" + position + "| isResidual?" + isResidual + "| data size: " + newData.sumOfWeights()); } if (isResidual) { RipperRule newRule = new RipperRule(); newRule.setConsequent(classIndex); if (m_Debug) { System.err.println("\nGrowing and pruning" + " a new rule ..."); } newRule.grow(growData); newRule.prune(pruneData, false); finalRule = newRule; if (m_Debug) { System.err.println("\nNew rule found: " + newRule.toString(m_Class)); } } else { RipperRule oldRule = (RipperRule) ruleset.get(position); boolean covers = false; // Test coverage of the next old rule for (int i = 0; i < newData.numInstances(); i++) { if (oldRule.covers(newData.instance(i))) { covers = true; break; } } if (!covers) {// Null coverage, no variants can be generated finalRulesetStat.addAndUpdate(oldRule); position++; continue oneRule; } // 2 variants if (m_Debug) { System.err.println("\nGrowing and pruning" + " Replace ..."); } RipperRule replace = new RipperRule(); replace.setConsequent(classIndex); replace.grow(growData); // Remove the pruning data covered by the following // rules, then simply compute the error rate of the // current rule to prune it. According to Ripper, // it's equivalent to computing the error of the // whole ruleset -- is it true? pruneData = RuleStats.rmCoveredBySuccessives(pruneData, ruleset, position); replace.prune(pruneData, true); if (m_Debug) { System.err.println("\nGrowing and pruning" + " Revision ..."); } RipperRule revision = (RipperRule) oldRule.copy(); // For revision, first rm the data covered by the old rule Instances newGrowData = new Instances(growData, 0); for (int b = 0; b < growData.numInstances(); b++) { Instance inst = growData.instance(b); if (revision.covers(inst)) { newGrowData.add(inst); } } revision.grow(newGrowData); revision.prune(pruneData, true); double[][] prevRuleStats = new double[position][6]; for (int c = 0; c < position; c++) { prevRuleStats[c] = finalRulesetStat.getSimpleStats(c); } // Now compare the relative DL of variants ArrayList<Rule> tempRules = new ArrayList<Rule>(ruleset.size()); for (Rule r : ruleset) { tempRules.add((Rule) r.copy()); } tempRules.set(position, replace); RuleStats repStat = new RuleStats(data, tempRules); repStat.setNumAllConds(m_Total); repStat.countData(position, newData, prevRuleStats); // repStat.countData(); rst = repStat.getSimpleStats(position); if (m_Debug) { System.err.println("Replace rule covers: " + rst[0] + " | pos = " + rst[2] + " | neg = " + rst[4] + "\nThe rule doesn't cover: " + rst[1] + " | pos = " + rst[5]); } double repDL = repStat.relativeDL(position, expFPRate, m_CheckErr); if (m_Debug) { System.err.println("\nReplace: " + replace.toString(m_Class) + " |dl = " + repDL); } if (Double.isNaN(repDL) || Double.isInfinite(repDL)) { throw new Exception( "Should never happen: repDL" + "in optmz. stage NaN or " + "infinite!"); } tempRules.set(position, revision); RuleStats revStat = new RuleStats(data, tempRules); revStat.setNumAllConds(m_Total); revStat.countData(position, newData, prevRuleStats); // revStat.countData(); double revDL = revStat.relativeDL(position, expFPRate, m_CheckErr); if (m_Debug) { System.err.println("Revision: " + revision.toString(m_Class) + " |dl = " + revDL); } if (Double.isNaN(revDL) || Double.isInfinite(revDL)) { throw new Exception( "Should never happen: revDL" + "in optmz. stage NaN or " + "infinite!"); } rstats = new RuleStats(data, ruleset); rstats.setNumAllConds(m_Total); rstats.countData(position, newData, prevRuleStats); // rstats.countData(); double oldDL = rstats.relativeDL(position, expFPRate, m_CheckErr); if (Double.isNaN(oldDL) || Double.isInfinite(oldDL)) { throw new Exception( "Should never happen: oldDL" + "in optmz. stage NaN or " + "infinite!"); } if (m_Debug) { System.err.println("Old rule: " + oldRule.toString(m_Class) + " |dl = " + oldDL); } if (m_Debug) { System.err.println("\nrepDL: " + repDL + "\nrevDL: " + revDL + "\noldDL: " + oldDL); } if ((oldDL <= revDL) && (oldDL <= repDL)) { finalRule = oldRule; // Old the best } else if (revDL <= repDL) { finalRule = revision; // Revision the best } else { finalRule = replace; // Replace the best } } finalRulesetStat.addAndUpdate(finalRule); rst = finalRulesetStat.getSimpleStats(position); if (isResidual) { dl += finalRulesetStat.relativeDL(position, expFPRate, m_CheckErr); if (m_Debug) { System.err.println("After optimization: the dl" + "=" + dl + " | best: " + minDL); } if (dl < minDL) { minDL = dl; // The best dl so far } stop = checkStop(rst, minDL, dl); if (!stop) { ruleset.add(finalRule); // Accepted } else { finalRulesetStat.removeLast(); // Remove last to be re-used position--; } } else { ruleset.set(position, finalRule); // Accepted } if (m_Debug) { System.err.println("The rule covers: " + rst[0] + " | pos = " + rst[2] + " | neg = " + rst[4] + "\nThe rule doesn't cover: " + rst[1] + " | pos = " + rst[5]); System.err.println("\nRuleset so far: "); for (int x = 0; x < ruleset.size(); x++) { System.err.println(x + ": " + ((RipperRule) ruleset.get(x)).toString(m_Class)); } System.err.println(); } // Data not covered if (finalRulesetStat.getRulesetSize() > 0) { newData = finalRulesetStat.getFiltered(position)[1]; } hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining? position++; } // while !stop && hasPositive if (ruleset.size() > (position + 1)) { // Hasn't gone through yet for (int k = position + 1; k < ruleset.size(); k++) { finalRulesetStat.addAndUpdate(ruleset.get(k)); } } if (m_Debug) { System.err.println("\nDeleting rules to decrease" + " DL of the whole ruleset ..."); } finalRulesetStat.reduceDL(expFPRate, m_CheckErr); if (m_Debug) { int del = ruleset.size() - finalRulesetStat.getRulesetSize(); System.err.println(del + " rules are deleted" + " after DL reduction procedure"); } ruleset = finalRulesetStat.getRuleset(); rstats = finalRulesetStat; } // For each run of optimization } // if pruning is used // Concatenate the ruleset for this class to the whole ruleset if (m_Debug) { System.err.println("\nFinal ruleset: "); for (int x = 0; x < ruleset.size(); x++) { System.err.println(x + ": " + ((RipperRule) ruleset.get(x)).toString(m_Class)); } System.err.println(); } m_Ruleset.addAll(ruleset); m_RulesetStats.add(rstats); if (ruleset.size() > 0) { return rstats.getFiltered(ruleset.size() - 1)[1]; // Data not } else { return data; } } /** * Check whether the stopping criterion meets * * @param rst the statistic of the ruleset * @param minDL the min description length so far * @param dl the current description length of the ruleset * @return true if stop criterion meets, false otherwise */ private boolean checkStop(double[] rst, double minDL, double dl) { if (dl > minDL + MAX_DL_SURPLUS) { if (m_Debug) { System.err.println("DL too large: " + dl + " | " + minDL); } return true; } else if (!Utils.gr(rst[2], 0.0)) {// Covered positives if (m_Debug) { System.err.println("Too few positives."); } return true; } else if ((rst[4] / rst[0]) >= 0.5) {// Err rate if (m_CheckErr) { if (m_Debug) { System.err.println("Error too large: " + rst[4] + "/" + rst[0]); } return true; } else { return false; } } else {// Not stops if (m_Debug) { System.err.println("Continue."); } return false; } } /** * Prints the all the rules of the rule learner. * * @return a textual description of the classifier */ @Override public String toString() { if (m_Ruleset == null) { return "JRIP: No model built yet."; } StringBuffer sb = new StringBuffer("JRIP rules:\n" + "===========\n\n"); for (int j = 0; j < m_RulesetStats.size(); j++) { RuleStats rs = m_RulesetStats.get(j); ArrayList<Rule> rules = rs.getRuleset(); for (int k = 0; k < rules.size(); k++) { double[] simStats = rs.getSimpleStats(k); sb.append(((RipperRule) rules.get(k)).toString(m_Class) + " (" + simStats[0] + "/" + simStats[4] + ")\n"); } } if (m_Debug) { System.err.println("Inside m_Ruleset"); for (int i = 0; i < m_Ruleset.size(); i++) { System.err.println(((RipperRule) m_Ruleset.get(i)).toString(m_Class)); } } sb.append("\nNumber of Rules : " + m_Ruleset.size() + "\n"); return sb.toString(); } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } /** * Main method. * * @param args the options for the classifier */ public static void main(String[] args) { runClassifier(new JRip(), args); } }