Java tutorial
/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * Discretize.java * Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand * */ package weka.filters.supervised.attribute; import java.util.*; import weka.core.*; import weka.core.Capabilities.Capability; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.filters.Filter; import weka.filters.SupervisedFilter; /** <!-- globalinfo-start --> * An instance filter that discretizes a range of numeric attributes in the dataset into nominal attributes. Discretization is by Fayyad & Irani's MDL method (the default).<br/> * <br/> * For more information, see:<br/> * <br/> * Usama M. Fayyad, Keki B. Irani: Multi-interval discretization of continuousvalued attributes for classification learning. In: Thirteenth International Joint Conference on Articial Intelligence, 1022-1027, 1993.<br/> * <br/> * Igor Kononenko: On Biases in Estimating Multi-Valued Attributes. In: 14th International Joint Conference on Articial Intelligence, 1034-1040, 1995. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @inproceedings{Fayyad1993, * author = {Usama M. Fayyad and Keki B. Irani}, * booktitle = {Thirteenth International Joint Conference on Articial Intelligence}, * pages = {1022-1027}, * publisher = {Morgan Kaufmann Publishers}, * title = {Multi-interval discretization of continuousvalued attributes for classification learning}, * volume = {2}, * year = {1993} * } * * @inproceedings{Kononenko1995, * author = {Igor Kononenko}, * booktitle = {14th International Joint Conference on Articial Intelligence}, * pages = {1034-1040}, * title = {On Biases in Estimating Multi-Valued Attributes}, * year = {1995}, * PS = {http://ai.fri.uni-lj.si/papers/kononenko95-ijcai.ps.gz} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -R <col1,col2-col4,...> * Specifies list of columns to Discretize. First and last are valid indexes. * (default none)</pre> * * <pre> -V * Invert matching sense of column indexes.</pre> * * <pre> -D * Output binary attributes for discretized attributes.</pre> * * <pre> -Y * Use bin numbers rather than ranges for discretized attributes.</pre> * * <pre> -E * Use better encoding of split point for MDL.</pre> * * <pre> -K * Use Kononenko's MDL criterion.</pre> * * <pre> -precision <integer> * Precision for bin boundary labels. * (default = 6 decimal places).</pre> * * <pre>-spread-attribute-weight * When generating binary attributes, spread weight of old * attribute across new attributes. Do not give each new attribute the old weight.</pre> * <!-- options-end --> * * @author Len Trigg (trigg@cs.waikato.ac.nz) * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @version $Revision$ */ public class Discretize extends Filter implements SupervisedFilter, OptionHandler, WeightedInstancesHandler, WeightedAttributesHandler, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = -3141006402280129097L; /** Stores which columns to Discretize */ protected Range m_DiscretizeCols = new Range(); /** Store the current cutpoints */ protected double[][] m_CutPoints = null; /** Output binary attributes for discretized attributes. */ protected boolean m_MakeBinary = false; /** Use bin numbers rather than ranges for discretized attributes. */ protected boolean m_UseBinNumbers = false; /** Use better encoding of split point for MDL. */ protected boolean m_UseBetterEncoding = false; /** Use Kononenko's MDL criterion instead of Fayyad et al.'s */ protected boolean m_UseKononenko = false; /** Precision for bin range labels */ protected int m_BinRangePrecision = 6; /** Whether to spread attribute weight when creating binary attributes */ protected boolean m_SpreadAttributeWeight = false; /** Constructor - initialises the filter */ public Discretize() { setAttributeIndices("first-last"); } /** * Gets an enumeration describing the available options. * * @return an enumeration of all the available options. */ @Override public Enumeration<Option> listOptions() { Vector<Option> newVector = new Vector<Option>(6); newVector.addElement(new Option("\tSpecifies list of columns to Discretize. First" + " and last are valid indexes.\n" + "\t(default none)", "R", 1, "-R <col1,col2-col4,...>")); newVector.addElement(new Option("\tInvert matching sense of column indexes.", "V", 0, "-V")); newVector.addElement(new Option("\tOutput binary attributes for discretized attributes.", "D", 0, "-D")); newVector.addElement( new Option("\tUse bin numbers rather than ranges for discretized attributes.", "Y", 0, "-Y")); newVector.addElement(new Option("\tUse better encoding of split point for MDL.", "E", 0, "-E")); newVector.addElement(new Option("\tUse Kononenko's MDL criterion.", "K", 0, "-K")); newVector .addElement(new Option("\tPrecision for bin boundary labels.\n\t" + "(default = 6 decimal places).", "precision", 1, "-precision <integer>")); newVector.addElement(new Option( "\tWhen generating binary attributes, spread weight of old " + "attribute across new attributes. Do not give each new attribute the old weight.\n\t", "spread-attribute-weight", 0, "-spread-attribute-weight")); return newVector.elements(); } /** * Parses a given list of options. * <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -R <col1,col2-col4,...> * Specifies list of columns to Discretize. First and last are valid indexes. * (default none)</pre> * * <pre> -V * Invert matching sense of column indexes.</pre> * * <pre> -D * Output binary attributes for discretized attributes.</pre> * * <pre> -Y * Use bin numbers rather than ranges for discretized attributes.</pre> * * <pre> -E * Use better encoding of split point for MDL.</pre> * * <pre> -K * Use Kononenko's MDL criterion.</pre> * * <pre> -precision <integer> * Precision for bin boundary labels. * (default = 6 decimal places).</pre> * * <pre>-spread-attribute-weight * When generating binary attributes, spread weight of old * attribute across new attributes. Do not give each new attribute the old weight.</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { setMakeBinary(Utils.getFlag('D', options)); setUseBinNumbers(Utils.getFlag('Y', options)); setUseBetterEncoding(Utils.getFlag('E', options)); setUseKononenko(Utils.getFlag('K', options)); setInvertSelection(Utils.getFlag('V', options)); String convertList = Utils.getOption('R', options); if (convertList.length() != 0) { setAttributeIndices(convertList); } else { setAttributeIndices("first-last"); } String precisionS = Utils.getOption("precision", options); if (precisionS.length() > 0) { setBinRangePrecision(Integer.parseInt(precisionS)); } setSpreadAttributeWeight(Utils.getFlag("spread-attribute-weight", options)); if (getInputFormat() != null) { setInputFormat(getInputFormat()); } Utils.checkForRemainingOptions(options); } /** * Gets the current settings of the filter. * * @return an array of strings suitable for passing to setOptions */ @Override public String[] getOptions() { List<String> options = new ArrayList<String>(); if (getMakeBinary()) { options.add("-D"); } if (getUseBinNumbers()) { options.add("-Y"); } if (getUseBetterEncoding()) { options.add("-E"); } if (getUseKononenko()) { options.add("-K"); } if (getInvertSelection()) { options.add("-V"); } if (!getAttributeIndices().equals("")) { options.add("-R"); options.add(getAttributeIndices()); } options.add("-precision"); options.add("" + getBinRangePrecision()); if (getSpreadAttributeWeight()) { options.add("-spread-attribute-weight"); } return options.toArray(new String[options.size()]); } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enableAllAttributes(); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); return result; } /** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input instance * structure (any instances contained in the object are ignored - * only the structure is required). * @return true if the outputFormat may be collected immediately * @throws Exception if the input format can't be set successfully */ @Override public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); m_DiscretizeCols.setUpper(instanceInfo.numAttributes() - 1); m_CutPoints = null; // If we implement loading cutfiles, then load // them here and set the output format return false; } /** * Input an instance for filtering. Ordinarily the instance is processed and * made available for output immediately. Some filters require all instances * be read before producing output. * * @param instance the input instance * @return true if the filtered instance may now be collected with output(). * @throws IllegalStateException if no input format has been defined. */ @Override public boolean input(Instance instance) { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_NewBatch) { resetQueue(); m_NewBatch = false; } if (m_CutPoints != null) { convertInstance(instance); return true; } bufferInput(instance); return false; } /** * Signifies that this batch of input to the filter is finished. If the filter * requires all instances prior to filtering, output() may now be called to * retrieve the filtered instances. * * @return true if there are instances pending output * @throws IllegalStateException if no input structure has been defined */ @Override public boolean batchFinished() { if (getInputFormat() == null) { throw new IllegalStateException("No input instance format defined"); } if (m_CutPoints == null) { calculateCutPoints(); setOutputFormat(); // If we implement saving cutfiles, save the cuts here // Convert pending input instances for (int i = 0; i < getInputFormat().numInstances(); i++) { convertInstance(getInputFormat().instance(i)); } } flushInput(); m_NewBatch = true; return (numPendingOutput() != 0); } /** * Returns a string describing this filter * * @return a description of the filter suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "An instance filter that discretizes a range of numeric" + " attributes in the dataset into nominal attributes." + " Discretization is by Fayyad & Irani's MDL method (the default).\n\n" + "For more information, see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; TechnicalInformation additional; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Usama M. Fayyad and Keki B. Irani"); result.setValue(Field.TITLE, "Multi-interval discretization of continuousvalued attributes for classification learning"); result.setValue(Field.BOOKTITLE, "Thirteenth International Joint Conference on Articial Intelligence"); result.setValue(Field.YEAR, "1993"); result.setValue(Field.VOLUME, "2"); result.setValue(Field.PAGES, "1022-1027"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann Publishers"); additional = result.add(Type.INPROCEEDINGS); additional.setValue(Field.AUTHOR, "Igor Kononenko"); additional.setValue(Field.TITLE, "On Biases in Estimating Multi-Valued Attributes"); additional.setValue(Field.BOOKTITLE, "14th International Joint Conference on Articial Intelligence"); additional.setValue(Field.YEAR, "1995"); additional.setValue(Field.PAGES, "1034-1040"); additional.setValue(Field.PS, "http://ai.fri.uni-lj.si/papers/kononenko95-ijcai.ps.gz"); return result; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String spreadAttributeWeightTipText() { return "When generating binary attributes, spread weight of old attribute across new attributes. " + "Do not give each new attribute the old weight."; } /** * If true, when generating binary attributes, spread weight of old * attribute across new attributes. Do not give each new attribute the old weight. * * @param p whether weight is spread */ public void setSpreadAttributeWeight(boolean p) { m_SpreadAttributeWeight = p; } /** * If true, when generating binary attributes, spread weight of old * attribute across new attributes. Do not give each new attribute the old weight. * * @return whether weight is spread */ public boolean getSpreadAttributeWeight() { return m_SpreadAttributeWeight; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String binRangePrecisionTipText() { return "The number of decimal places for cut points to use when generating bin labels"; } /** * Set the precision for bin boundaries. Only affects the boundary values used * in the labels for the converted attributes; internal cutpoints are at full * double precision. * * @param p the precision for bin boundaries */ public void setBinRangePrecision(int p) { m_BinRangePrecision = p; } /** * Get the precision for bin boundaries. Only affects the boundary values used * in the labels for the converted attributes; internal cutpoints are at full * double precision. * * @return the precision for bin boundaries */ public int getBinRangePrecision() { return m_BinRangePrecision; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String makeBinaryTipText() { return "Make resulting attributes binary."; } /** * Gets whether binary attributes should be made for discretized ones. * * @return true if attributes will be binarized */ public boolean getMakeBinary() { return m_MakeBinary; } /** * Sets whether binary attributes should be made for discretized ones. * * @param makeBinary if binary attributes are to be made */ public void setMakeBinary(boolean makeBinary) { m_MakeBinary = makeBinary; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String useBinNumbersTipText() { return "Use bin numbers (eg BXofY) rather than ranges fordiscretized attributes"; } /** * Gets whether bin numbers rather than ranges should be used for discretized * attributes. * * @return true if bin numbers should be used */ public boolean getUseBinNumbers() { return m_UseBinNumbers; } /** * Sets whether bin numbers rather than ranges should be used for discretized * attributes. * * @param useBinNumbers if bin numbers should be used */ public void setUseBinNumbers(boolean useBinNumbers) { m_UseBinNumbers = useBinNumbers; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String useKononenkoTipText() { return "Use Kononenko's MDL criterion. If set to false" + " uses the Fayyad & Irani criterion."; } /** * Gets whether Kononenko's MDL criterion is to be used. * * @return true if Kononenko's criterion will be used. */ public boolean getUseKononenko() { return m_UseKononenko; } /** * Sets whether Kononenko's MDL criterion is to be used. * * @param useKon true if Kononenko's one is to be used */ public void setUseKononenko(boolean useKon) { m_UseKononenko = useKon; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String useBetterEncodingTipText() { return "Uses a more efficient split point encoding."; } /** * Gets whether better encoding is to be used for MDL. * * @return true if the better MDL encoding will be used */ public boolean getUseBetterEncoding() { return m_UseBetterEncoding; } /** * Sets whether better encoding is to be used for MDL. * * @param useBetterEncoding true if better encoding to be used. */ public void setUseBetterEncoding(boolean useBetterEncoding) { m_UseBetterEncoding = useBetterEncoding; } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String invertSelectionTipText() { return "Set attribute selection mode. If false, only selected" + " (numeric) attributes in the range will be discretized; if" + " true, only non-selected attributes will be discretized."; } /** * Gets whether the supplied columns are to be removed or kept * * @return true if the supplied columns will be kept */ public boolean getInvertSelection() { return m_DiscretizeCols.getInvert(); } /** * Sets whether selected columns should be removed or kept. If true the * selected columns are kept and unselected columns are deleted. If false * selected columns are deleted and unselected columns are kept. * * @param invert the new invert setting */ public void setInvertSelection(boolean invert) { m_DiscretizeCols.setInvert(invert); } /** * Returns the tip text for this property * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String attributeIndicesTipText() { return "Specify range of attributes to act on." + " This is a comma separated list of attribute indices, with" + " \"first\" and \"last\" valid values. Specify an inclusive" + " range with \"-\". E.g: \"first-3,5,6-10,last\"."; } /** * Gets the current range selection * * @return a string containing a comma separated list of ranges */ public String getAttributeIndices() { return m_DiscretizeCols.getRanges(); } /** * Sets which attributes are to be Discretized (only numeric attributes among * the selection will be Discretized). * * @param rangeList a string representing the list of attributes. Since the * string will typically come from a user, attributes are indexed * from 1. <br> * eg: first-3,5,6-last * @throws IllegalArgumentException if an invalid range list is supplied */ public void setAttributeIndices(String rangeList) { m_DiscretizeCols.setRanges(rangeList); } /** * Sets which attributes are to be Discretized (only numeric attributes among * the selection will be Discretized). * * @param attributes an array containing indexes of attributes to Discretize. * Since the array will typically come from a program, attributes are * indexed from 0. * @throws IllegalArgumentException if an invalid set of ranges is supplied */ public void setAttributeIndicesArray(int[] attributes) { setAttributeIndices(Range.indicesToRangeList(attributes)); } /** * Gets the cut points for an attribute * * @param attributeIndex the index (from 0) of the attribute to get the cut * points of * @return an array containing the cutpoints (or null if the attribute * requested isn't being Discretized */ public double[] getCutPoints(int attributeIndex) { if (m_CutPoints == null) { return null; } return m_CutPoints[attributeIndex]; } /** * Gets the bin ranges string for an attribute * * @param attributeIndex the index (from 0) of the attribute to get the bin * ranges string of * @return the bin ranges string (or null if the attribute requested has been * discretized into only one interval.) */ public String getBinRangesString(int attributeIndex) { if (m_CutPoints == null) { return null; } double[] cutPoints = m_CutPoints[attributeIndex]; if (cutPoints == null) { return "All"; } StringBuilder sb = new StringBuilder(); boolean first = true; for (int j = 0, n = cutPoints.length; j <= n; ++j) { if (first) { first = false; } else { sb.append(','); } sb.append(binRangeString(cutPoints, j, m_BinRangePrecision)); } return sb.toString(); } /** * Get a bin range string for a specified bin of some attribute's cut points. * * @param cutPoints The attribute's cut points; never null. * @param j The bin number (zero based); never out of range. * @param precision the precision for the range values * * @return The bin range string. */ private static String binRangeString(double[] cutPoints, int j, int precision) { assert cutPoints != null; int n = cutPoints.length; assert 0 <= j && j <= n; return j == 0 ? "" + "(" + "-inf" + "-" + Utils.doubleToString(cutPoints[0], precision) + "]" : j == n ? "" + "(" + Utils.doubleToString(cutPoints[n - 1], precision) + "-" + "inf" + ")" : "" + "(" + Utils.doubleToString(cutPoints[j - 1], precision) + "-" + Utils.doubleToString(cutPoints[j], precision) + "]"; } /** Generate the cutpoints for each attribute */ protected void calculateCutPoints() { Instances copy = null; m_CutPoints = new double[getInputFormat().numAttributes()][]; for (int i = getInputFormat().numAttributes() - 1; i >= 0; i--) { if ((m_DiscretizeCols.isInRange(i)) && (getInputFormat().attribute(i).isNumeric())) { // Use copy to preserve order if (copy == null) { copy = new Instances(getInputFormat()); } calculateCutPointsByMDL(i, copy); } } } /** * Set cutpoints for a single attribute using MDL. * * @param index the index of the attribute to set cutpoints for * @param data the data to work with */ protected void calculateCutPointsByMDL(int index, Instances data) { // Sort instances data.sort(data.attribute(index)); // Find first instances that's missing int firstMissing = data.numInstances(); for (int i = 0; i < data.numInstances(); i++) { if (data.instance(i).isMissing(index)) { firstMissing = i; break; } } m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing); } /** * Test using Kononenko's MDL criterion. * * @param priorCounts * @param bestCounts * @param numInstances * @param numCutPoints * @return true if the split is acceptable */ private boolean KononenkosMDL(double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) { double distPrior, instPrior, distAfter = 0, sum, instAfter = 0; double before, after; int numClassesTotal; // Number of classes occuring in the set numClassesTotal = 0; for (double priorCount : priorCounts) { if (priorCount > 0) { numClassesTotal++; } } // Encode distribution prior to split distPrior = SpecialFunctions.log2Binomial(numInstances + numClassesTotal - 1, numClassesTotal - 1); // Encode instances prior to split. instPrior = SpecialFunctions.log2Multinomial(numInstances, priorCounts); before = instPrior + distPrior; // Encode distributions and instances after split. for (double[] bestCount : bestCounts) { sum = Utils.sum(bestCount); distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1, numClassesTotal - 1); instAfter += SpecialFunctions.log2Multinomial(sum, bestCount); } // Coding cost after split after = Utils.log2(numCutPoints) + distAfter + instAfter; // Check if split is to be accepted return (before > after); } /** * Test using Fayyad and Irani's MDL criterion. * * @param priorCounts * @param bestCounts * @param numInstances * @param numCutPoints * @return true if the splits is acceptable */ private boolean FayyadAndIranisMDL(double[] priorCounts, double[][] bestCounts, double numInstances, int numCutPoints) { double priorEntropy, entropy, gain; double entropyLeft, entropyRight, delta; int numClassesTotal, numClassesRight, numClassesLeft; // Compute entropy before split. priorEntropy = ContingencyTables.entropy(priorCounts); // Compute entropy after split. entropy = ContingencyTables.entropyConditionedOnRows(bestCounts); // Compute information gain. gain = priorEntropy - entropy; // Number of classes occuring in the set numClassesTotal = 0; for (double priorCount : priorCounts) { if (priorCount > 0) { numClassesTotal++; } } // Number of classes occuring in the left subset numClassesLeft = 0; for (int i = 0; i < bestCounts[0].length; i++) { if (bestCounts[0][i] > 0) { numClassesLeft++; } } // Number of classes occuring in the right subset numClassesRight = 0; for (int i = 0; i < bestCounts[1].length; i++) { if (bestCounts[1][i] > 0) { numClassesRight++; } } // Entropy of the left and the right subsets entropyLeft = ContingencyTables.entropy(bestCounts[0]); entropyRight = ContingencyTables.entropy(bestCounts[1]); // Compute terms for MDL formula delta = Utils.log2(Math.pow(3, numClassesTotal) - 2) - ((numClassesTotal * priorEntropy) - (numClassesRight * entropyRight) - (numClassesLeft * entropyLeft)); // Check if split is to be accepted return (gain > (Utils.log2(numCutPoints) + delta) / numInstances); } /** * Selects cutpoints for sorted subset. * * @param instances * @param attIndex * @param first * @param lastPlusOne * @return */ private double[] cutPointsForSubset(Instances instances, int attIndex, int first, int lastPlusOne) { double[][] counts, bestCounts; double[] priorCounts, left, right, cutPoints; double currentCutPoint = -Double.MAX_VALUE, bestCutPoint = -1, currentEntropy, bestEntropy, priorEntropy, gain; int bestIndex = -1, numCutPoints = 0; double numInstances = 0; // Compute number of instances in set if ((lastPlusOne - first) < 2) { return null; } // Compute class counts. counts = new double[2][instances.numClasses()]; for (int i = first; i < lastPlusOne; i++) { numInstances += instances.instance(i).weight(); counts[1][(int) instances.instance(i).classValue()] += instances.instance(i).weight(); } // Save prior counts priorCounts = new double[instances.numClasses()]; System.arraycopy(counts[1], 0, priorCounts, 0, instances.numClasses()); // Entropy of the full set priorEntropy = ContingencyTables.entropy(priorCounts); bestEntropy = priorEntropy; // Find best entropy. bestCounts = new double[2][instances.numClasses()]; for (int i = first; i < (lastPlusOne - 1); i++) { counts[0][(int) instances.instance(i).classValue()] += instances.instance(i).weight(); counts[1][(int) instances.instance(i).classValue()] -= instances.instance(i).weight(); if (instances.instance(i).value(attIndex) < instances.instance(i + 1).value(attIndex)) { currentCutPoint = (instances.instance(i).value(attIndex) + instances.instance(i + 1).value(attIndex)) / 2.0; currentEntropy = ContingencyTables.entropyConditionedOnRows(counts); if (currentEntropy < bestEntropy) { bestCutPoint = currentCutPoint; bestEntropy = currentEntropy; bestIndex = i; System.arraycopy(counts[0], 0, bestCounts[0], 0, instances.numClasses()); System.arraycopy(counts[1], 0, bestCounts[1], 0, instances.numClasses()); } numCutPoints++; } } // Use worse encoding? if (!m_UseBetterEncoding) { numCutPoints = (lastPlusOne - first) - 1; } // Checks if gain is zero gain = priorEntropy - bestEntropy; if (gain <= 0) { return null; } // Check if split is to be accepted if ((m_UseKononenko && KononenkosMDL(priorCounts, bestCounts, numInstances, numCutPoints)) || (!m_UseKononenko && FayyadAndIranisMDL(priorCounts, bestCounts, numInstances, numCutPoints))) { // Select split points for the left and right subsets left = cutPointsForSubset(instances, attIndex, first, bestIndex + 1); right = cutPointsForSubset(instances, attIndex, bestIndex + 1, lastPlusOne); // Merge cutpoints and return them if ((left == null) && (right) == null) { cutPoints = new double[1]; cutPoints[0] = bestCutPoint; } else if (right == null) { cutPoints = new double[left.length + 1]; System.arraycopy(left, 0, cutPoints, 0, left.length); cutPoints[left.length] = bestCutPoint; } else if (left == null) { cutPoints = new double[1 + right.length]; cutPoints[0] = bestCutPoint; System.arraycopy(right, 0, cutPoints, 1, right.length); } else { cutPoints = new double[left.length + right.length + 1]; System.arraycopy(left, 0, cutPoints, 0, left.length); cutPoints[left.length] = bestCutPoint; System.arraycopy(right, 0, cutPoints, left.length + 1, right.length); } return cutPoints; } else { return null; } } /** * Set the output format. Takes the currently defined cutpoints and * m_InputFormat and calls setOutputFormat(Instances) appropriately. */ protected void setOutputFormat() { if (m_CutPoints == null) { setOutputFormat(null); return; } ArrayList<Attribute> attributes = new ArrayList<Attribute>(getInputFormat().numAttributes()); int classIndex = getInputFormat().classIndex(); for (int i = 0, m = getInputFormat().numAttributes(); i < m; ++i) { if ((m_DiscretizeCols.isInRange(i)) && (getInputFormat().attribute(i).isNumeric())) { Set<String> cutPointsCheck = new HashSet<String>(); double[] cutPoints = m_CutPoints[i]; if (!m_MakeBinary) { ArrayList<String> attribValues; if (cutPoints == null) { attribValues = new ArrayList<String>(1); attribValues.add("'All'"); } else { attribValues = new ArrayList<String>(cutPoints.length + 1); if (m_UseBinNumbers) { for (int j = 0, n = cutPoints.length; j <= n; ++j) { attribValues.add("'B" + (j + 1) + "of" + (n + 1) + "'"); } } else { for (int j = 0, n = cutPoints.length; j <= n; ++j) { String newBinRangeString = binRangeString(cutPoints, j, m_BinRangePrecision); if (!cutPointsCheck.add(newBinRangeString)) { throw new IllegalArgumentException( "A duplicate bin range was detected. Try increasing the bin range precision."); } attribValues.add("'" + newBinRangeString + "'"); } } } Attribute newAtt = new Attribute(getInputFormat().attribute(i).name(), attribValues); newAtt.setWeight(getInputFormat().attribute(i).weight()); attributes.add(newAtt); } else { if (cutPoints == null) { ArrayList<String> attribValues = new ArrayList<String>(1); attribValues.add("'All'"); Attribute newAtt = new Attribute(getInputFormat().attribute(i).name(), attribValues); newAtt.setWeight(getInputFormat().attribute(i).weight()); attributes.add(newAtt); } else { if (i < getInputFormat().classIndex()) { classIndex += cutPoints.length - 1; } for (int j = 0, n = cutPoints.length; j < n; ++j) { ArrayList<String> attribValues = new ArrayList<String>(2); if (m_UseBinNumbers) { attribValues.add("'B1of2'"); attribValues.add("'B2of2'"); } else { double[] binaryCutPoint = { cutPoints[j] }; String newBinRangeString1 = binRangeString(binaryCutPoint, 0, m_BinRangePrecision); String newBinRangeString2 = binRangeString(binaryCutPoint, 1, m_BinRangePrecision); if (newBinRangeString1.equals(newBinRangeString2)) { throw new IllegalArgumentException( "A duplicate bin range was detected. Try increasing the bin range precision."); } attribValues.add("'" + newBinRangeString1 + "'"); attribValues.add("'" + newBinRangeString2 + "'"); } Attribute newAtt = new Attribute(getInputFormat().attribute(i).name() + "_" + (j + 1), attribValues); if (getSpreadAttributeWeight()) { newAtt.setWeight(getInputFormat().attribute(i).weight() / cutPoints.length); } else { newAtt.setWeight(getInputFormat().attribute(i).weight()); } attributes.add(newAtt); } } } } else { attributes.add((Attribute) getInputFormat().attribute(i).copy()); } } Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0); outputFormat.setClassIndex(classIndex); setOutputFormat(outputFormat); } /** * Convert a single instance over. The converted instance is added to the end * of the output queue. * * @param instance the instance to convert */ protected void convertInstance(Instance instance) { int index = 0; double[] vals = new double[outputFormatPeek().numAttributes()]; // Copy and convert the values for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (m_DiscretizeCols.isInRange(i) && getInputFormat().attribute(i).isNumeric()) { int j; double currentVal = instance.value(i); if (m_CutPoints[i] == null) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else { vals[index] = 0; } index++; } else { if (!m_MakeBinary) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else { for (j = 0; j < m_CutPoints[i].length; j++) { if (currentVal <= m_CutPoints[i][j]) { break; } } vals[index] = j; } index++; } else { for (j = 0; j < m_CutPoints[i].length; j++) { if (instance.isMissing(i)) { vals[index] = Utils.missingValue(); } else if (currentVal <= m_CutPoints[i][j]) { vals[index] = 0; } else { vals[index] = 1; } index++; } } } } else { vals[index] = instance.value(i); index++; } } Instance inst = null; if (instance instanceof SparseInstance) { inst = new SparseInstance(instance.weight(), vals); } else { inst = new DenseInstance(instance.weight(), vals); } copyValues(inst, false, instance.dataset(), outputFormatPeek()); push(inst); // No need to copy instance } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } /** * Main method for testing this class. * * @param argv should contain arguments to the filter: use -h for help */ public static void main(String[] argv) { runFilter(new Discretize(), argv); } }