Java tutorial
/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * XMeans.java * Copyright (C) 2000 University of Waikato, Hamilton, New Zealand * */ package weka.clusterers; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.PrintWriter; import java.io.Reader; import java.util.Collections; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import weka.core.AlgVector; import weka.core.Capabilities; import weka.core.Capabilities.Capability; import weka.core.DistanceFunction; import weka.core.EuclideanDistance; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.RevisionUtils; import weka.core.TechnicalInformation; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.core.neighboursearch.KDTree; import weka.filters.Filter; import weka.filters.unsupervised.attribute.ReplaceMissingValues; /** * <!-- globalinfo-start --> Cluster data using the X-means algorithm.<br/> * <br/> * X-Means is K-Means extended by an Improve-Structure part In this part of the * algorithm the centers are attempted to be split in its region. 
The decision * between the children of each center and itself is done comparing the * BIC-values of the two structures.<br/> * <br/> * For more information see:<br/> * <br/> * Dan Pelleg, Andrew W. Moore: X-means: Extending K-means with Efficient * Estimation of the Number of Clusters. In: Seventeenth International * Conference on Machine Learning, 727-734, 2000. * <p/> * <!-- globalinfo-end --> * * <!-- technical-bibtex-start --> BibTeX: * * <pre> * @inproceedings{Pelleg2000, * author = {Dan Pelleg and Andrew W. Moore}, * booktitle = {Seventeenth International Conference on Machine Learning}, * pages = {727-734}, * publisher = {Morgan Kaufmann}, * title = {X-means: Extending K-means with Efficient Estimation of the Number of Clusters}, * year = {2000} * } * </pre> * <p/> * <!-- technical-bibtex-end --> * * <!-- options-start --> Valid options are: * <p/> * * <pre> * -I <num> * maximum number of overall iterations * (default 1). * </pre> * * <pre> * -M <num> * maximum number of iterations in the kMeans loop in * the Improve-Parameter part * (default 1000). * </pre> * * <pre> * -J <num> * maximum number of iterations in the kMeans loop * for the splitted centroids in the Improve-Structure part * (default 1000). * </pre> * * <pre> * -L <num> * minimum number of clusters * (default 2). * </pre> * * <pre> * -H <num> * maximum number of clusters * (default 4). * </pre> * * <pre> * -B <value> * distance value for binary attributes * (default 1.0). * </pre> * * <pre> * -use-kdtree * Uses the KDTree internally * (default no). * </pre> * * <pre> * -K <KDTree class specification> * Full class name of KDTree class to use, followed * by scheme options. * eg: "weka.core.neighboursearch.kdtrees.KDTree -P" * (default no KDTree class used). * </pre> * * <pre> * -C <value> * cutoff factor, takes the given percentage of the splitted * centroids if none of the children win * (default 0.0). 
* </pre> * * <pre> * -D <distance function class specification> * Full class name of Distance function class to use, followed * by scheme options. * (default weka.core.EuclideanDistance). * </pre> * * <pre> * -N <file name> * file to read starting centers from (ARFF format). * </pre> * * <pre> * -O <file name> * file to write centers to (ARFF format). * </pre> * * <pre> * -U <int> * The debug level. * (default 0) * </pre> * * <pre> * -Y <file name> * The debug vectors file. * </pre> * * <pre> * -S <num> * Random number seed. * (default 10) * </pre> * * <!-- options-end --> * * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Malcolm Ware (mfw4@cs.waikato.ac.nz) * @version $Revision$ * @see RandomizableClusterer */ public class XMeans extends RandomizableClusterer implements TechnicalInformationHandler { /* * major TODOS: * * make BIC-Score replaceable by other scores */ /** for serialization. */ private static final long serialVersionUID = -7941793078404132616L; /** training instances. */ protected Instances m_Instances = null; /** model information, should increase readability. */ protected Instances m_Model = null; /** replace missing values in training instances. */ protected ReplaceMissingValues m_ReplaceMissingFilter; /** * Distance value between true and false of binary attributes and "same" and * "different" of nominal attributes (default = 1.0). */ protected double m_BinValue = 1.0; /** BIC-Score of the current model. */ protected double m_Bic = Double.MIN_VALUE; /** Distortion. */ protected double[] m_Mle = null; /** maximum overall iterations. */ protected int m_MaxIterations = 1; /** * maximum iterations to perform Kmeans part if negative, iterations are not * checked. */ protected int m_MaxKMeans = 1000; /** * see above, but for kMeans of splitted clusters. */ protected int m_MaxKMeansForChildren = 1000; /** The actual number of clusters. 
*/
protected int m_NumClusters = 2;

/**
 * Minimum number of clusters to generate. buildClusterer starts with this
 * many centers unless the starting centers are read from m_CenterInput.
 */
protected int m_MinNumClusters = 2;

/**
 * Maximum number of clusters to generate. buildClusterer stops its main loop
 * once the number of centers reaches this value.
 */
protected int m_MaxNumClusters = 4;

/**
 * The distance function used. buildClusterer replaces a null value with a new
 * EuclideanDistance.
 */
protected DistanceFunction m_DistanceF = new EuclideanDistance();

/** The current cluster centers. */
protected Instances m_ClusterCenters;

/**
 * File name of the input file for the cluster centers.
 * NOTE(review): this comment previously said "output file"; the field name
 * and the separate m_OutputCenterFile field suggest "input" - confirm.
 */
protected File m_InputCenterFile = new File(System.getProperty("user.dir"));

/* --> DebugVectors - USED FOR DEBUGGING */
/** Input reader for the random vectors --> USED FOR DEBUGGING. */
protected Reader m_DebugVectorsInput = null;

/** The index of the current debug vector. */
protected int m_DebugVectorsIndex = 0;

/** All the debug vectors. */
protected Instances m_DebugVectors = null;

/**
 * File name of the input file for the random vectors. Debug vectors are only
 * used when this path exists and is a regular file; the initial value is the
 * "user.dir" system property.
 */
protected File m_DebugVectorsFile = new File(System.getProperty("user.dir"));

/**
 * Input reader for the cluster centers. When non-null, buildClusterer reads
 * the starting centers from it instead of choosing them randomly.
 */
protected transient Reader m_CenterInput = null;

/** File name of the output file for the cluster centers. */
protected File m_OutputCenterFile = new File(System.getProperty("user.dir"));

/**
 * Output writer for the cluster centers. When non-null, buildClusterer prints
 * the final centers to it, closes it and resets it to null.
 */
protected transient PrintWriter m_CenterOutput = null;

/**
 * Temporary variable holding cluster assignments while iterating.
 */
protected int[] m_ClusterAssignments;

/**
 * Cutoff factor - fraction of the centers that is still split in the
 * Improve-Structure part; only relevant if all children lost.
 * NOTE(review): the initial value here is 0.5, while the option list in the
 * class documentation gives the default of -C as 0.0 - confirm which one is
 * intended.
 */
protected double m_CutOffFactor = 0.5;

/** Index in ranges for LOW. */
public static int R_LOW = 0;

/** Index in ranges for HIGH. */
public static int R_HIGH = 1;

/** Index in ranges for WIDTH. */
public static int R_WIDTH = 2;

/**
 * KDTree instance, used if KDTrees are enabled (see m_UseKDTree).
 */
protected KDTree m_KDTree = new KDTree();

/**
 * Whether to use the KDTree (the KDTree is only initialized to be
 * configurable from the GUI).
 */
protected boolean m_UseKDTree = false;

/** counts iterations done in main loop.
*/ protected int m_IterationCount = 0; /** counter to say how often kMeans was stopped by loop counter. */ protected int m_KMeansStopped = 0; /** Number of splits prepared. */ protected int m_NumSplits = 0; /** Number of splits accepted (including cutoff factor decisions). */ protected int m_NumSplitsDone = 0; /** Number of splits accepted just because of cutoff factor. */ protected int m_NumSplitsStillDone = 0; /** * level of debug output, 0 is no output. */ protected int m_DebugLevel = 0; /** print the centers. */ public static int D_PRINTCENTERS = 1; /** follows the splitting of the centers. */ public static int D_FOLLOWSPLIT = 2; /** have a closer look at converge children. */ public static int D_CONVCHCLOSER = 3; /** check on random vectors. */ public static int D_RANDOMVECTOR = 4; /** check on kdtree. */ public static int D_KDTREE = 5; /** follow iterations. */ public static int D_ITERCOUNT = 6; /** functions were maybe misused. */ public static int D_METH_MISUSE = 80; /** for current debug. */ public static int D_CURR = 88; /** general debugging. */ public static int D_GENERAL = 99; /** Flag: I'm debugging. */ public boolean m_CurrDebugFlag = true; /** * the default constructor. */ public XMeans() { super(); m_SeedDefault = 10; setSeed(m_SeedDefault); } /** * Returns a string describing this clusterer. * * @return a description of the evaluator suitable for displaying in the * explorer/experimenter gui */ public String globalInfo() { return "Cluster data using the X-means algorithm.\n\n" + "X-Means is K-Means extended by an Improve-Structure part In this " + "part of the algorithm the centers are attempted to be split in " + "its region. 
The decision between the children of each center and " + "itself is done comparing the BIC-values of the two structures.\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing detailed * information about the technical background of this class, e.g., paper * reference or book this class is based on. * * @return the technical information about this class */ @Override public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Dan Pelleg and Andrew W. Moore"); result.setValue(Field.TITLE, "X-means: Extending K-means with Efficient Estimation of the Number of Clusters"); result.setValue(Field.BOOKTITLE, "Seventeenth International Conference on Machine Learning"); result.setValue(Field.YEAR, "2000"); result.setValue(Field.PAGES, "727-734"); result.setValue(Field.PUBLISHER, "Morgan Kaufmann"); return result; } /** * Returns default capabilities of the clusterer. * * @return the capabilities of this clusterer */ @Override public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); result.enable(Capability.NO_CLASS); // attributes result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); return result; } /** * Generates the X-Means clusterer. * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been generated successfully */ @Override public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? 
getCapabilities().testWithFail(data);

// reject an inconsistent cluster range before doing any work
if (m_MinNumClusters > m_MaxNumClusters) {
  throw new Exception(
    "XMeans: min number of clusters "
      + "can't be greater than max number of clusters!");
}

// reset the split statistics of a previous run
m_NumSplits = 0;
m_NumSplitsDone = 0;
m_NumSplitsStillDone = 0;

// replace missing values; all further work is done on the filtered copy
m_ReplaceMissingFilter = new ReplaceMissingValues();
m_ReplaceMissingFilter.setInputFormat(data);
m_Instances = Filter.useFilter(data, m_ReplaceMissingFilter);

// initialize random function
Random random0 = new Random(m_Seed);

// number of clusters to start with
m_NumClusters = m_MinNumClusters;

// set distance function to default
if (m_DistanceF == null) {
  m_DistanceF = new EuclideanDistance();
}

m_DistanceF.setInstances(m_Instances);

checkInstances();

// debug vectors are only read when the configured path is a regular file
if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile()) {
  initDebugVectorsInput();
}

// make list of indexes for m_Instances
int[] allInstList = new int[m_Instances.numInstances()];
for (int i = 0; i < m_Instances.numInstances(); i++) {
  allInstList[i] = i;
}

// set model used (just for convenience): an empty copy of the data header
m_Model = new Instances(m_Instances, 0);

// produce the starting centers
if (m_CenterInput != null) {
  // read centers from file; the number of clusters then follows the file
  m_ClusterCenters = new Instances(m_CenterInput);
  m_NumClusters = m_ClusterCenters.numInstances();
} else {
  // makes the first centers randomly
  m_ClusterCenters = makeCentersRandomly(random0, m_Instances,
    m_NumClusters);
}
PFD(D_FOLLOWSPLIT, "\n*** Starting centers ");
for (int k = 0; k < m_ClusterCenters.numInstances(); k++) {
  PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k));
}

PrCentersFD(D_PRINTCENTERS);

boolean finished = false;
Instances children;

// builds up a KDTree
if (m_UseKDTree) {
  m_KDTree.setInstances(m_Instances);
}

// loop counter of main loop
m_IterationCount = 0;

/*
 * "finished" becomes true as soon as:
 * 1. the number of clusters gets >= m_MaxNumClusters, or
 * 2. in the last round, the number of centers did not change.
 *
 * Because "finished" is only evaluated at the end of the loop body, part 1
 * (= Improve-Params) is done at least once even if the number of clusters
 * is already >= m_MaxNumClusters (provided m_MaxIterations allows one
 * iteration).
 */
while (!finished && !stopIteration(m_IterationCount, m_MaxIterations)) {

  /*
   * ====================================================================
   * 1. Improve-Params: conventional K-means
   */

  PFD(D_FOLLOWSPLIT, "\nBeginning of main loop - centers:");
  PrCentersFD(D_FOLLOWSPLIT);

  PFD(D_ITERCOUNT, "\n*** 1. Improve-Params " + m_IterationCount
    + ". time");
  m_IterationCount++;

  // prepare to converge
  boolean converged = false;

  // initialize assignments to -1
  m_ClusterAssignments = initAssignments(m_Instances.numInstances());
  // stores a list of indexes of instances belonging to each center
  int[][] instOfCent = new int[m_ClusterCenters.numInstances()][];

  // KMeans loop counter
  int kMeansIteration = 0;

  // converge in conventional K-means ----------------------------------
  PFD(D_FOLLOWSPLIT, "\nConverge in K-Means:");
  while (!converged
    && !stopKMeansIteration(kMeansIteration, m_MaxKMeans)) {

    kMeansIteration++;
    // NOTE(review): this store is dead, it is overwritten two lines below
    converged = true;

    // assign instances to centers -------------------------------------
    // NOTE(review): the returned flag is overwritten by the result of
    // recomputeCenters below, so only the center comparison decides
    // whether the K-means loop stops
    converged = assignToCenters(m_UseKDTree ? m_KDTree : null,
      m_ClusterCenters, instOfCent, allInstList, m_ClusterAssignments,
      kMeansIteration);

    PFD(D_FOLLOWSPLIT, "\nMain loop - Assign - centers:");
    PrCentersFD(D_FOLLOWSPLIT);
    // compute new centers = centers of mass of points
    converged = recomputeCenters(m_ClusterCenters, // clusters
      instOfCent, // their instances
      m_Model); // model information
    PFD(D_FOLLOWSPLIT, "\nMain loop - Recompute - centers:");
    PrCentersFD(D_FOLLOWSPLIT);
  }
  PFD(D_FOLLOWSPLIT, "");
  PFD(D_FOLLOWSPLIT,
    "End of Part: 1. Improve-Params - conventional K-means");

  /*
   * =====================================================================
   * 2. Improve-Structure
   */

  // BIC of the current model, computed before any center is split
  m_Mle = distortion(instOfCent, m_ClusterCenters);
  m_Bic = calculateBIC(instOfCent, m_ClusterCenters, m_Mle);
  PFD(D_FOLLOWSPLIT, "m_Bic " + m_Bic);

  int currNumCent = m_ClusterCenters.numInstances();
  // holds two entries per current center: its two children (or dummies)
  Instances splitCenters = new Instances(m_ClusterCenters, currNumCent * 2);

  // store BIC values of parent and children
  double[] pbic = new double[currNumCent];
  double[] cbic = new double[currNumCent];

  // split each center
  for (int i = 0; i < currNumCent
  // this could help to optimize the algorithm
  // && currNumCent + numSplits <= m_MaxNumClusters
  ; i++) {

    PFD(D_FOLLOWSPLIT, "\nsplit center " + i + " "
      + m_ClusterCenters.instance(i));
    Instance currCenter = m_ClusterCenters.instance(i);
    int[] currInstList = instOfCent[i];
    int currNumInst = instOfCent[i].length;

    // not enough instances; then continue with next.
    // The parent BIC is set to the largest double and the children BIC to
    // 0.0, so the comparison "cbic > pbic" can never favour the children.
    if (currNumInst <= 2) {
      pbic[i] = Double.MAX_VALUE;
      cbic[i] = 0.0;
      // add center itself as dummy (twice, to keep two entries per center)
      splitCenters.add(currCenter);
      splitCenters.add(currCenter);
      continue;
    }

    // split centers ----------------------------------------------
    double variance = m_Mle[i] / currNumInst;
    children = splitCenter(random0, currCenter, variance, m_Model);

    // initialize assignments to -1
    int[] oneCentAssignments = initAssignments(currNumInst);
    int[][] instOfChCent = new int[2][]; // todo maybe split didn't work

    // converge the children --------------------------------------
    // local K-means restricted to the instances of the current center
    converged = false;
    int kMeansForChildrenIteration = 0;
    PFD(D_FOLLOWSPLIT, "\nConverge, K-Means for children: " + i);
    while (!converged
      && !stopKMeansIteration(kMeansForChildrenIteration,
        m_MaxKMeansForChildren)) {
      kMeansForChildrenIteration++;

      converged = assignToCenters(children, instOfChCent, currInstList,
        oneCentAssignments);

      if (!converged) {
        recomputeCentersFast(children, instOfChCent, m_Model);
      }
    }

    // store new centers for later decision if they are taken
    splitCenters.add(children.instance(0));
    splitCenters.add(children.instance(1));

    PFD(D_FOLLOWSPLIT, "\nconverged cildren ");
    PFD(D_FOLLOWSPLIT, " " + children.instance(0));
    PFD(D_FOLLOWSPLIT, " " + children.instance(1));

    // compare parent and children model by their BIC-value
    pbic[i] = calculateBIC(currInstList, currCenter, m_Mle[i], m_Model);
    double[] chMLE = distortion(instOfChCent, children);
    cbic[i] = calculateBIC(instOfChCent, children, chMLE);

  } // end of loop over clusters

  // decide which one to split and make new list of cluster centers
  Instances newClusterCenters = null;
  newClusterCenters = newCentersAfterSplit(pbic, cbic, m_CutOffFactor,
    splitCenters);

  /*
   * Compare with the model before Improve-Structure: the split model is
   * only adopted if its overall BIC is higher than the current one.
   */
  int newNumClusters = newClusterCenters.numInstances();
  if (newNumClusters != m_NumClusters) {

    PFD(D_FOLLOWSPLIT, "Compare with non-split");

    // initialize assignments to -1
    int[] newClusterAssignments = initAssignments(m_Instances
      .numInstances());

    // stores a list of indexes of instances belonging to each center
    int[][] newInstOfCent = new int[newClusterCenters.numInstances()][];

    // assign instances to centers -------------------------------------
    converged = assignToCenters(m_UseKDTree ?
m_KDTree : null, newClusterCenters, newInstOfCent, allInstList, newClusterAssignments, m_IterationCount); double[] newMle = distortion(newInstOfCent, newClusterCenters); double newBic = calculateBIC(newInstOfCent, newClusterCenters, newMle); PFD(D_FOLLOWSPLIT, "newBic " + newBic); if (newBic > m_Bic) { PFD(D_FOLLOWSPLIT, "*** decide for new clusters"); m_Bic = newBic; m_ClusterCenters = newClusterCenters; m_ClusterAssignments = newClusterAssignments; } else { PFD(D_FOLLOWSPLIT, "*** keep old clusters"); } } newNumClusters = m_ClusterCenters.numInstances(); // decide if finished: max num cluster reached // or last centers where not split at all if ((newNumClusters >= m_MaxNumClusters) || (newNumClusters == m_NumClusters)) { finished = true; } m_NumClusters = newNumClusters; } if (m_ClusterCenters.numInstances() > 0 && m_CenterOutput != null) { m_CenterOutput.println(m_ClusterCenters.toString()); m_CenterOutput.close(); m_CenterOutput = null; } } /** * Checks for nominal attributes in the dataset. Class attribute is ignored. * * @param data the data to check * @return false if no nominal attributes are present */ public boolean checkForNominalAttributes(Instances data) { int i = 0; while (i < data.numAttributes()) { if ((i != data.classIndex()) && data.attribute(i++).isNominal()) { return true; } } return false; } /** * Set array of int, used to store assignments, to -1. * * @param ass integer array used for storing assignments * @return integer array used for storing assignments */ protected int[] initAssignments(int[] ass) { for (int i = 0; i < ass.length; i++) { ass[i] = -1; } return ass; } /** * Creates and initializes integer array, used to store assignments. 
* * @param numInstances length of array used for assignments * @return integer array used for storing assignments */ protected int[] initAssignments(int numInstances) { int[] ass = new int[numInstances]; for (int i = 0; i < numInstances; i++) { ass[i] = -1; } return ass; } /** * Creates and initializes boolean array. * * @param len length of new array * @return the new array */ boolean[] initBoolArray(int len) { boolean[] boolArray = new boolean[len]; for (int i = 0; i < len; i++) { boolArray[i] = false; } return boolArray; } /** * Returns new center list. * * The following steps 1. and 2. both take care that the number of centers * does not exceed maxCenters. * * 1. Compare BIC values of parent and children and takes the one as new * centers which do win (= BIC-value is smaller). * * 2. If in 1. none of the children are chosen && and cutoff factor is > 0 * cutoff factor is taken as the percentage of "best" centers that are still * taken. * * @param pbic array of parents BIC-values * @param cbic array of childrens BIC-values * @param cutoffFactor cutoff factor * @param splitCenters all children * @return the new centers */ protected Instances newCentersAfterSplit(double[] pbic, double[] cbic, double cutoffFactor, Instances splitCenters) { // store if split won boolean splitPerCutoff = false; boolean takeSomeAway = false; boolean[] splitWon = initBoolArray(m_ClusterCenters.numInstances()); int numToSplit = 0; Instances newCenters = null; // how many would be split, because the children have a better bic value for (int i = 0; i < cbic.length; i++) { if (cbic[i] > pbic[i]) { // decide for splitting ---------------------------------------- splitWon[i] = true; numToSplit++; PFD(D_FOLLOWSPLIT, "Center " + i + " decide for children"); } else { // decide for parents and finished stays true ----------------- PFD(D_FOLLOWSPLIT, "Center " + i + " decide for parent"); } } // no splits yet so split per cutoff factor if ((numToSplit == 0) && (cutoffFactor > 0)) { splitPerCutoff 
= true; // how many to split per cutoff factor numToSplit = (int) (m_ClusterCenters.numInstances() * m_CutOffFactor); } // prepare indexes of values in ascending order double[] diff = new double[m_NumClusters]; for (int j = 0; j < diff.length; j++) { diff[j] = pbic[j] - cbic[j]; } int[] sortOrder = Utils.sort(diff); // check if maxNumClusters would be exceeded int possibleToSplit = m_MaxNumClusters - m_NumClusters; if (possibleToSplit > numToSplit) { // still enough possible, do the whole amount possibleToSplit = numToSplit; } else { takeSomeAway = true; } // prepare for splitting the one that are supposed to be split if (splitPerCutoff) { for (int j = 0; (j < possibleToSplit) && (cbic[sortOrder[j]] > 0.0); j++) { splitWon[sortOrder[j]] = true; } m_NumSplitsStillDone += possibleToSplit; } else { // take some splits away if max number of clusters would be exceeded if (takeSomeAway) { int count = 0; int j = 0; for (; j < splitWon.length && count < possibleToSplit; j++) { if (splitWon[sortOrder[j]] == true) { count++; } } while (j < splitWon.length) { splitWon[sortOrder[j]] = false; j++; } } } // finally split if (possibleToSplit > 0) { newCenters = newCentersAfterSplit(splitWon, splitCenters); } else { newCenters = m_ClusterCenters; } return newCenters; } /** * Returns new centers. Depending on splitWon: if true takes children, if * false takes parent = current center. 
* * @param splitWon array of boolean to indicate to take split or not * @param splitCenters list of splitted centers * @return the new centers */ protected Instances newCentersAfterSplit(boolean[] splitWon, Instances splitCenters) { Instances newCenters = new Instances(splitCenters, 0); int sIndex = 0; for (int i = 0; i < splitWon.length; i++) { if (splitWon[i]) { m_NumSplitsDone++; newCenters.add(splitCenters.instance(sIndex++)); newCenters.add(splitCenters.instance(sIndex++)); } else { sIndex++; sIndex++; newCenters.add(m_ClusterCenters.instance(i)); } } return newCenters; } /** * Controls that counter does not exceed max iteration value. Special function * for kmeans iterations. * * @param iterationCount current value of counter * @param max maximum value for counter * @return true if iteration should be stopped */ protected boolean stopKMeansIteration(int iterationCount, int max) { boolean stopIterate = false; if (max >= 0) { stopIterate = (iterationCount >= max); } if (stopIterate) { m_KMeansStopped++; } return stopIterate; } /** * Checks if iterationCount has to be checked and if yes (this means max is > * 0) compares it with max. * * @param iterationCount the current iteration count * @param max the maximum number of iterations * @return true if maximum has been reached */ protected boolean stopIteration(int iterationCount, int max) { boolean stopIterate = false; if (max >= 0) { stopIterate = (iterationCount >= max); } return stopIterate; } /** * Recompute the new centers. New cluster center is center of mass of its * instances. Returns true if cluster stays the same. * * @param centers the input and output centers * @param instOfCent the instances to the centers * @param model data model information * @return true if converged. 
*/ protected boolean recomputeCenters(Instances centers, int[][] instOfCent, Instances model) { boolean converged = true; for (int i = 0; i < centers.numInstances(); i++) { double val; for (int j = 0; j < model.numAttributes(); j++) { val = meanOrMode(m_Instances, instOfCent[i], j); for (int k = 0; k < instOfCent[i].length; k++) { if (converged && m_ClusterCenters.instance(i).value(j) != val) { converged = false; } } if (!converged) { m_ClusterCenters.instance(i).setValue(j, val); } } } return converged; } /** * Recompute the new centers - 2nd version Same as recomputeCenters, but does * not check if center stays the same. * * @param centers the input center and output centers * @param instOfCentIndexes the indexes of the instances to the centers * @param model data model information */ protected void recomputeCentersFast(Instances centers, int[][] instOfCentIndexes, Instances model) { for (int i = 0; i < centers.numInstances(); i++) { double val; for (int j = 0; j < model.numAttributes(); j++) { val = meanOrMode(m_Instances, instOfCentIndexes[i], j); centers.instance(i).setValue(j, val); } } } /** * Computes Mean Or Mode of one attribute on a subset of m_Instances. The * subset is defined by an index list. 
* * @param instances all instances * @param instList the indexes of the instances the mean is computed from * @param attIndex the index of the attribute * @return mean value */ protected double meanOrMode(Instances instances, int[] instList, int attIndex) { double result, found; int[] counts; int numInst = instList.length; if (instances.attribute(attIndex).isNumeric()) { result = found = 0; for (int j = 0; j < numInst; j++) { Instance currInst = instances.instance(instList[j]); if (!currInst.isMissing(attIndex)) { found += currInst.weight(); result += currInst.weight() * currInst.value(attIndex); } } if (Utils.eq(found, 0)) { return 0; } else { return result / found; } } else if (instances.attribute(attIndex).isNominal()) { counts = new int[instances.attribute(attIndex).numValues()]; for (int j = 0; j < numInst; j++) { Instance currInst = instances.instance(instList[j]); if (!currInst.isMissing(attIndex)) { counts[(int) currInst.value(attIndex)] += currInst.weight(); } } return Utils.maxIndex(counts); } else { return 0; } } /** * Assigns instances to centers. * * @param tree KDTree on all instances * @param centers all the input centers * @param instOfCent the instances to each center * @param allInstList list of all instances * @param assignments assignments of instances to centers * @param iterationCount the number of iteration * @return true if converged * @throws Exception is something goes wrong */ protected boolean assignToCenters(KDTree tree, Instances centers, int[][] instOfCent, int[] allInstList, int[] assignments, int iterationCount) throws Exception { boolean converged = true; if (tree != null) { // using KDTree structure for assigning converged = assignToCenters(tree, centers, instOfCent, assignments, iterationCount); } else { converged = assignToCenters(centers, instOfCent, allInstList, assignments); } return converged; } /** * Assign instances to centers using KDtree. 
First part of conventionell * K-Means, returns true if new assignment is the same as the last one. * * @param kdtree KDTree on all instances * @param centers all the input centers * @param instOfCent the instances to each center * @param assignments assignments of instances to centers * @param iterationCount the number of iteration * @return true if converged * @throws Exception in case instances are not assigned to cluster */ protected boolean assignToCenters(KDTree kdtree, Instances centers, int[][] instOfCent, int[] assignments, int iterationCount) throws Exception { int numCent = centers.numInstances(); int numInst = m_Instances.numInstances(); int[] oldAssignments = new int[numInst]; // WARNING: assignments is "input/output-parameter" // should not be null if (assignments == null) { assignments = new int[numInst]; for (int i = 0; i < numInst; i++) { assignments[0] = -1; } } // WARNING: instOfCent is "input/output-parameter" // should not be null if (instOfCent == null) { instOfCent = new int[numCent][]; } // save old assignments for (int i = 0; i < assignments.length; i++) { oldAssignments[i] = assignments[i]; } // use tree to get new assignments kdtree.centerInstances(centers, assignments, Math.pow(.8, iterationCount)); boolean converged = true; // compare with previous assignment for (int i = 0; converged && (i < assignments.length); i++) { converged = (oldAssignments[i] == assignments[i]); if (assignments[i] == -1) { throw new Exception("Instance " + i + " has not been assigned to cluster."); } } if (!converged) { int[] numInstOfCent = new int[numCent]; for (int i = 0; i < numCent; i++) { numInstOfCent[i] = 0; } // count num of assignments per center for (int i = 0; i < numInst; i++) { numInstOfCent[assignments[i]]++; } // prepare instancelists per center for (int i = 0; i < numCent; i++) { instOfCent[i] = new int[numInstOfCent[i]]; } // write instance lists per center for (int i = 0; i < numCent; i++) { int index = -1; for (int j = 0; j < numInstOfCent[i]; 
j++) { index = nextAssignedOne(i, index, assignments); instOfCent[i][j] = index; } } } return converged; } /** * Assign instances to centers. Part of conventionell K-Means, returns true if * new assignment is the same as the last one. * * @param centers all the input centers * @param instOfCent the instances to each center * @param allInstList list of all indexes * @param assignments assignments of instances to centers * @return true if converged * @throws Exception if something goes wrong */ protected boolean assignToCenters(Instances centers, int[][] instOfCent, int[] allInstList, int[] assignments) throws Exception { // todo: undecided situations boolean converged = true; // true if new assignment is the same // as the old one int numInst = allInstList.length; int numCent = centers.numInstances(); int[] numInstOfCent = new int[numCent]; for (int i = 0; i < numCent; i++) { numInstOfCent[i] = 0; } // WARNING: assignments is "input/output-parameter" // should not be null if (assignments == null) { assignments = new int[numInst]; for (int i = 0; i < numInst; i++) { assignments[i] = -1; } } // WARNING: instOfCent is "input/output-parameter" // should not be null if (instOfCent == null) { instOfCent = new int[numCent][]; } // set assignments for (int i = 0; i < numInst; i++) { Instance inst = m_Instances.instance(allInstList[i]); int newC = clusterProcessedInstance(inst, centers); if (converged && newC != assignments[i]) { converged = false; } numInstOfCent[newC]++; if (!converged) { assignments[i] = newC; } } // the following is only done // if assignments are not the same, because too much effort if (!converged) { PFD(D_FOLLOWSPLIT, "assignToCenters -> it has NOT converged"); for (int i = 0; i < numCent; i++) { instOfCent[i] = new int[numInstOfCent[i]]; } for (int i = 0; i < numCent; i++) { int index = -1; for (int j = 0; j < numInstOfCent[i]; j++) { index = nextAssignedOne(i, index, assignments); instOfCent[i][j] = allInstList[index]; } } } else { 
PFD(D_FOLLOWSPLIT, "assignToCenters -> it has converged"); } return converged; } /** * Searches along the assignment array for the next entry of the center in * question. * * @param cent index of the center * @param lastIndex index to start searching * @param assignments assignments * @return index of the instance the center cent is assigned to */ protected int nextAssignedOne(int cent, int lastIndex, int[] assignments) { int len = assignments.length; int index = lastIndex + 1; while (index < len) { if (assignments[index] == cent) { return (index); } index++; } return (-1); } /** * Split centers in their region. Generates random vector of length = variance * and adds and substractsx to cluster vector to get two new clusters. * * @param random random function * @param center the center that is split here * @param variance variance of the cluster * @param model data model valid * @return a pair of new centers * @throws Exception something in AlgVector goes wrong */ protected Instances splitCenter(Random random, Instance center, double variance, Instances model) throws Exception { m_NumSplits++; AlgVector r = null; Instances children = new Instances(model, 2); if (m_DebugVectorsFile.exists() && m_DebugVectorsFile.isFile()) { Instance nextVector = getNextDebugVectorsInstance(model); PFD(D_RANDOMVECTOR, "Random Vector from File " + nextVector); r = new AlgVector(nextVector); } else { // random vector of length = variance r = new AlgVector(model, random); } r.changeLength(Math.pow(variance, 0.5)); PFD(D_RANDOMVECTOR, "random vector *variance " + r); // add random vector to center AlgVector c = new AlgVector(center); AlgVector c2 = (AlgVector) c.clone(); c = c.add(r); Instance newCenter = c.getAsInstance(model, random); children.add(newCenter); PFD(D_FOLLOWSPLIT, "first child " + newCenter); // substract random vector to center c2 = c2.substract(r); newCenter = c2.getAsInstance(model, random); children.add(newCenter); PFD(D_FOLLOWSPLIT, "second child " + newCenter); 
return children; } /** * Split centers in their region. (*Alternative version of splitCenter()*) * * @param random the random number generator * @param instances of the region * @param model the model for the centers (should be the same as that of * instances) * @return a pair of new centers */ protected Instances splitCenters(Random random, Instances instances, Instances model) { Instances children = new Instances(model, 2); int instIndex = Math.abs(random.nextInt()) % instances.numInstances(); children.add(instances.instance(instIndex)); int instIndex2 = instIndex; int count = 0; while ((instIndex2 == instIndex) && count < 10) { count++; instIndex2 = Math.abs(random.nextInt()) % instances.numInstances(); } children.add(instances.instance(instIndex2)); return children; } /** * Generates new centers randomly. Used for starting centers. * * @param random0 random number generator * @param model data model of the instances * @param numClusters number of clusters * @return new centers */ protected Instances makeCentersRandomly(Random random0, Instances model, int numClusters) { Instances clusterCenters = new Instances(model, numClusters); m_NumClusters = numClusters; // makes the new centers randomly for (int i = 0; i < numClusters; i++) { int instIndex = Math.abs(random0.nextInt()) % m_Instances.numInstances(); clusterCenters.add(m_Instances.instance(instIndex)); } return clusterCenters; } /** * Returns the BIC-value for the given center and instances. * * @param instList The indices of the instances that belong to the center * @param center the center. 
* @param mle maximum likelihood * @param model the data model * @return the BIC value */ protected double calculateBIC(int[] instList, Instance center, double mle, Instances model) { int[][] w1 = new int[1][instList.length]; for (int i = 0; i < instList.length; i++) { w1[0][i] = instList[i]; } double[] m = { mle }; Instances w2 = new Instances(model, 1); w2.add(center); return calculateBIC(w1, w2, m); } /** * Calculates the BIC for the given set of centers and instances. * * @param instOfCent The instances that belong to their respective centers * @param centers the centers * @param mle maximum likelihood * @return The BIC for the input. */ protected double calculateBIC(int[][] instOfCent, Instances centers, double[] mle) { double loglike = 0.0; int numInstTotal = 0; int numCenters = centers.numInstances(); int numDimensions = centers.numAttributes(); int numParameters = (numCenters - 1) + // probabilities numCenters * numDimensions + // means numCenters; // variance params for (int i = 0; i < centers.numInstances(); i++) { loglike += logLikelihoodEstimate(instOfCent[i].length, centers.instance(i), mle[i], centers.numInstances() * 2); numInstTotal += instOfCent[i].length; } /* * diff thats how we did it loglike -= ((centers.numAttributes() + 1.0) * * centers.numInstances() * 1) Math.log(count); */ loglike -= numInstTotal * Math.log(numInstTotal); // System.out.println ("numInstTotal " + numInstTotal + // "calculateBIC res " + loglike); loglike -= (numParameters / 2.0) * Math.log(numInstTotal); // System.out.println ("numParam " + // + numParameters + // " calculateBIC res " + loglike); return loglike; } /** * Calculates the log-likelihood of the data for the given model, taken at the * maximum likelihood point. 
* * @param numInst number of instances that belong to the center * @param center the center * @param distortion distortion * @param numCent number of centers * @return the likelihood estimate */ protected double logLikelihoodEstimate(int numInst, Instance center, double distortion, int numCent) { // R(n) num of instances of the center -> numInst // K num of centers -> not used // // todo take the diff comments away double loglike = 0; /* if is new */ if (numInst > 1) { /* diff variance is new */ // // distortion = Sum over instances x of the center(x-center) // different to paper; sum should be squared // // (Sum of distances to center) / R(n) - 1.0 // different to paper; should be R(n)-K double variance = distortion / (numInst - 1.0); // // -R(n)/2 * log(pi*2) // double p1 = -(numInst / 2.0) * Math.log(Math.PI * 2.0); /* * diff thats how we had it double p2 = -((ni * center.numAttributes()) / * 2) * distortion; */ // // -(R(n)*M)/2 * log(variance) // double p2 = -(numInst * center.numAttributes()) / 2 * Math.log(variance); /* * diff thats how we had it, the difference is a bug in x-means double p3 * = - (numInst - numCent) / 2; */ // // -(R(n)-1)/2 // double p3 = -(numInst - 1.0) / 2.0; // // R(n)*log(R(n)) // double p4 = numInst * Math.log(numInst); /* * diff x-means doesn't have this part double p5 = - numInst * * Math.log(numInstTotal); */ /* * loglike = -(ni / 2) * Math.log(Math.PI * 2) - (ni * * center.numAttributes()) / 2.0) * logdistortion - (ni - k) / 2.0 + ni * * Math.log(ni) - ni * Math.log(r); */ loglike = p1 + p2 + p3 + p4; // diff + p5; // the log(r) is something that can be reused. // as is the log(2 PI), these could provide extra speed up later on. // since distortion is so expensive to compute, I only do that once. } return loglike; } /** * Calculates the maximum likelihood estimate for the variance. * * @param instOfCent indices of instances to each center * @param centers the centers * @return the list of distortions distortion. 
*/ protected double[] distortion(int[][] instOfCent, Instances centers) { double[] distortion = new double[centers.numInstances()]; for (int i = 0; i < centers.numInstances(); i++) { distortion[i] = 0.0; for (int j = 0; j < instOfCent[i].length; j++) { distortion[i] += m_DistanceF.distance(m_Instances.instance(instOfCent[i][j]), centers.instance(i)); } } /* * diff not done in x-means res *= 1.0 / (count - centers.numInstances()); */ return distortion; } /** * Clusters an instance. * * @param instance the instance to assign a cluster to. * @param centers the centers to cluster the instance to. * @return a cluster index. */ protected int clusterProcessedInstance(Instance instance, Instances centers) { double minDist = Integer.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < centers.numInstances(); i++) { double dist = m_DistanceF.distance(instance, centers.instance(i)); if (dist < minDist) { minDist = dist; bestCluster = i; } } ; return bestCluster; } /** * Clusters an instance that has been through the filters. * * @param instance the instance to assign a cluster to * @return a cluster number */ protected int clusterProcessedInstance(Instance instance) { double minDist = Integer.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < m_NumClusters; i++) { double dist = m_DistanceF.distance(instance, m_ClusterCenters.instance(i)); if (dist < minDist) { minDist = dist; bestCluster = i; } } return bestCluster; } /** * Classifies a given instance. * * @param instance the instance to be assigned to a cluster * @return the number of the assigned cluster as an integer if the class is * enumerated, otherwise the predicted value * @throws Exception if instance could not be classified successfully */ @Override public int clusterInstance(Instance instance) throws Exception { m_ReplaceMissingFilter.input(instance); Instance inst = m_ReplaceMissingFilter.output(); return clusterProcessedInstance(inst); } /** * Returns the number of clusters. 
* * @return the number of clusters generated for a training dataset. */ @Override public int numberOfClusters() { return m_NumClusters; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options **/ @Override public Enumeration<Option> listOptions() { Vector<Option> result = new Vector<Option>(); result.addElement( new Option("\tmaximum number of overall iterations\n" + "\t(default 1).", "I", 1, "-I <num>")); result.addElement(new Option("\tmaximum number of iterations in the kMeans loop in\n" + "\tthe Improve-Parameter part \n" + "\t(default 1000).", "M", 1, "-M <num>")); result.addElement(new Option( "\tmaximum number of iterations in the kMeans loop\n" + "\tfor the splitted centroids in the Improve-Structure part \n" + "\t(default 1000).", "J", 1, "-J <num>")); result.addElement(new Option("\tminimum number of clusters\n" + "\t(default 2).", "L", 1, "-L <num>")); result.addElement(new Option("\tmaximum number of clusters\n" + "\t(default 4).", "H", 1, "-H <num>")); result.addElement( new Option("\tdistance value for binary attributes\n" + "\t(default 1.0).", "B", 1, "-B <value>")); result.addElement( new Option("\tUses the KDTree internally\n" + "\t(default no).", "use-kdtree", 0, "-use-kdtree")); result.addElement(new Option("\tFull class name of KDTree class to use, followed\n" + "\tby scheme options.\n" + "\teg: \"weka.core.neighboursearch.kdtrees.KDTree -P\"\n" + "\t(default no KDTree class used).", "K", 1, "-K <KDTree class specification>")); result.addElement(new Option("\tcutoff factor, takes the given percentage of the splitted \n" + "\tcentroids if none of the children win\n" + "\t(default 0.0).", "C", 1, "-C <value>")); result.addElement(new Option( "\tFull class name of Distance function class to use, followed\n" + "\tby scheme options.\n" + "\t(default weka.core.EuclideanDistance).", "D", 1, "-D <distance function class specification>")); result.addElement( new Option("\tfile to read 
starting centers from (ARFF format).", "N", 1, "-N <file name>")); result.addElement(new Option("\tfile to write centers to (ARFF format).", "O", 1, "-O <file name>")); result.addElement(new Option("\tThe debug level.\n" + "\t(default 0)", "U", 1, "-U <int>")); result.addElement(new Option("\tThe debug vectors file.", "Y", 1, "-Y <file name>")); result.addAll(Collections.list(super.listOptions())); return result.elements(); } /** * Returns the tip text for this property. * * @return tip text for this property */ public String minNumClustersTipText() { return "set minimum number of clusters"; } /** * Sets the minimum number of clusters to generate. * * @param n the minimum number of clusters to generate */ public void setMinNumClusters(int n) { m_MinNumClusters = n; } /** * Gets the minimum number of clusters to generate. * * @return the minimum number of clusters to generate */ public int getMinNumClusters() { return m_MinNumClusters; } /** * Returns the tip text for this property. * * @return tip text for this property */ public String maxNumClustersTipText() { return "set maximum number of clusters"; } /** * Sets the maximum number of clusters to generate. * * @param n the maximum number of clusters to generate */ public void setMaxNumClusters(int n) { if (n >= m_MinNumClusters) { m_MaxNumClusters = n; } } /** * Gets the maximum number of clusters to generate. * * @return the maximum number of clusters to generate */ public int getMaxNumClusters() { return m_MaxNumClusters; } /** * Returns the tip text for this property. * * @return tip text for this property */ public String maxIterationsTipText() { return "the maximum number of iterations to perform"; } /** * Sets the maximum number of iterations to perform. 
* * @param i the number of iterations * @throws Exception if i is less than 1 */ public void setMaxIterations(int i) throws Exception { if (i < 0) { throw new Exception("Only positive values for iteration number" + " allowed (Option I)."); } m_MaxIterations = i; } /** * Gets the maximum number of iterations. * * @return the number of iterations */ public int getMaxIterations() { return m_MaxIterations; } /** * Returns the tip text for this property. * * @return tip text for this property */ public String maxKMeansTipText() { return "the maximum number of iterations to perform in KMeans"; } /** * Set the maximum number of iterations to perform in KMeans. * * @param i the number of iterations */ public void setMaxKMeans(int i) { m_MaxKMeans = i; m_MaxKMeansForChildren = i; } /** * Gets the maximum number of iterations in KMeans. * * @return the number of iterations */ public int getMaxKMeans() { return m_MaxKMeans; } /** * Returns the tip text for this property. * * @return tip text for this property */ public String maxKMeansForChildrenTipText() { return "the maximum number of iterations KMeans that is performed on the child centers"; } /** * Sets the maximum number of iterations KMeans that is performed on the child * centers. * * @param i the number of iterations */ public void setMaxKMeansForChildren(int i) { m_MaxKMeansForChildren = i; } /** * Gets the maximum number of iterations in KMeans. * * @return the number of iterations */ public int getMaxKMeansForChildren() { return m_MaxKMeansForChildren; } /** * Returns the tip text for this property. * * @return tip text for this property */ public String cutOffFactorTipText() { return "the cut-off factor to use"; } /** * Sets a new cutoff factor. * * @param i the new cutoff factor */ public void setCutOffFactor(double i) { m_CutOffFactor = i; } /** * Gets the cutoff factor. * * @return the cutoff factor */ public double getCutOffFactor() { return m_CutOffFactor; } /** * Returns the tip text for this property. 
* * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String binValueTipText() { return "Set the value that represents true in the new attributes."; } /** * Gets value that represents true in a new numeric attribute. (False is * always represented by 0.0.) * * @return the value that represents true in a new numeric attribute */ public double getBinValue() { return m_BinValue; } /** * Sets the distance value between true and false of binary attributes. and * "same" and "different" of nominal attributes * * @param value the distance */ public void setBinValue(double value) { m_BinValue = value; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String distanceFTipText() { return "The distance function to use."; } /** * gets the "binary" distance value. * * @param distanceF the distance function with all options set */ public void setDistanceF(DistanceFunction distanceF) { m_DistanceF = distanceF; } /** * Gets the distance function. * * @return the distance function */ public DistanceFunction getDistanceF() { return m_DistanceF; } /** * Gets the distance function specification string, which contains the class * name of the distance function class and any options to it. * * @return the distance function specification string */ protected String getDistanceFSpec() { DistanceFunction d = getDistanceF(); if (d instanceof OptionHandler) { return d.getClass().getName() + " " + Utils.joinOptions(((OptionHandler) d).getOptions()); } return d.getClass().getName(); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String debugVectorsFileTipText() { return "The file containing the debug vectors (only for debugging!)."; } /** * Sets the file that has the random vectors stored. Only used for debugging * reasons. 
* * @param value the file to read the random vectors from */ public void setDebugVectorsFile(File value) { m_DebugVectorsFile = value; } /** * Gets the file name for a file that has the random vectors stored. Only used * for debugging purposes. * * @return the file to read the vectors from */ public File getDebugVectorsFile() { return m_DebugVectorsFile; } /** * Initialises the debug vector input. * * @throws Exception if there is error opening the debug input file. */ public void initDebugVectorsInput() throws Exception { m_DebugVectorsInput = new BufferedReader(new FileReader(m_DebugVectorsFile)); m_DebugVectors = new Instances(m_DebugVectorsInput); m_DebugVectorsIndex = 0; } /** * Read an instance from debug vectors file. * * @param model the data model for the instance. * @throws Exception if there are no debug vector in m_DebugVectors. * @return the next debug vector. */ public Instance getNextDebugVectorsInstance(Instances model) throws Exception { if (m_DebugVectorsIndex >= m_DebugVectors.numInstances()) { throw new Exception("no more prefabricated Vectors"); } Instance nex = m_DebugVectors.instance(m_DebugVectorsIndex); nex.setDataset(model); m_DebugVectorsIndex++; return nex; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String inputCenterFileTipText() { return "The file to read the list of centers from."; } /** * Sets the file to read the list of centers from. * * @param value the file to read centers from */ public void setInputCenterFile(File value) { m_InputCenterFile = value; } /** * Gets the file to read the list of centers from. * * @return the file to read the centers from */ public File getInputCenterFile() { return m_InputCenterFile; } /** * Returns the tip text for this property. 
* * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String outputCenterFileTipText() { return "The file to write the list of centers to."; } /** * Sets file to write the list of centers to. * * @param value file to write centers to */ public void setOutputCenterFile(File value) { m_OutputCenterFile = value; } /** * Gets the file to write the list of centers to. * * @return filename of the file to write centers to */ public File getOutputCenterFile() { return m_OutputCenterFile; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String KDTreeTipText() { return "The KDTree to use."; } /** * Sets the KDTree class. * * @param k a KDTree object with all options set */ public void setKDTree(KDTree k) { m_KDTree = k; } /** * Gets the KDTree class. * * @return the configured KDTree */ public KDTree getKDTree() { return m_KDTree; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String useKDTreeTipText() { return "Whether to use the KDTree."; } /** * Sets whether to use the KDTree or not. * * @param value if true the KDTree is used */ public void setUseKDTree(boolean value) { m_UseKDTree = value; } /** * Gets whether the KDTree is used or not. * * @return true if KDTrees are used */ public boolean getUseKDTree() { return m_UseKDTree; } /** * Gets the KDTree specification string, which contains the class name of the * KDTree class and any options to the KDTree. * * @return the KDTree string. */ protected String getKDTreeSpec() { KDTree c = getKDTree(); if (c instanceof OptionHandler) { return c.getClass().getName() + " " + Utils.joinOptions(((OptionHandler) c).getOptions()); } return c.getClass().getName(); } /** * Returns the tip text for this property. 
*
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String debugLevelTipText() {
    return "The debug level to use.";
  }

  /**
   * Sets the debug level. A debug level of 0 means no output.
   * 
   * @param d debuglevel
   */
  public void setDebugLevel(int d) {
    m_DebugLevel = d;
  }

  /**
   * Gets the debug level.
   * 
   * @return debug level
   */
  public int getDebugLevel() {
    return m_DebugLevel;
  }

  /**
   * Checks the instances. Currently does nothing: the only check, a call to
   * the check of the distance function, is commented out.
   */
  protected void checkInstances() {

    // m_DistanceF.checkInstances();
  }

  /**
   * Parses a given list of options.
   * <p/>
   * 
   * <!-- options-start --> Valid options are:
   * <p/>
   * 
   * <pre>
   * -I <num>
   *  maximum number of overall iterations
   *  (default 1).
   * </pre>
   * 
   * <pre>
   * -M <num>
   *  maximum number of iterations in the kMeans loop in
   *  the Improve-Parameter part 
   *  (default 1000).
   * </pre>
   * 
   * <pre>
   * -J <num>
   *  maximum number of iterations in the kMeans loop
   *  for the splitted centroids in the Improve-Structure part 
   *  (default 1000).
   * </pre>
   * 
   * <pre>
   * -L <num>
   *  minimum number of clusters
   *  (default 2).
   * </pre>
   * 
   * <pre>
   * -H <num>
   *  maximum number of clusters
   *  (default 4).
   * </pre>
   * 
   * <pre>
   * -B <value>
   *  distance value for binary attributes
   *  (default 1.0).
   * </pre>
   * 
   * <pre>
   * -use-kdtree
   *  Uses the KDTree internally
   *  (default no).
   * </pre>
   * 
   * <pre>
   * -K <KDTree class specification>
   *  Full class name of KDTree class to use, followed
   *  by scheme options.
   *  eg: "weka.core.neighboursearch.kdtrees.KDTree -P"
   *  (default no KDTree class used).
   * </pre>
   * 
   * <pre>
   * -C <value>
   *  cutoff factor, takes the given percentage of the splitted 
   *  centroids if none of the children win
   *  (default 0.0).
   * </pre>
   * 
   * <pre>
   * -D <distance function class specification>
   *  Full class name of Distance function class to use, followed
   *  by scheme options.
   *  (default weka.core.EuclideanDistance).
* </pre> * * <pre> * -N <file name> * file to read starting centers from (ARFF format). * </pre> * * <pre> * -O <file name> * file to write centers to (ARFF format). * </pre> * * <pre> * -U <int> * The debug level. * (default 0) * </pre> * * <pre> * -Y <file name> * The debug vectors file. * </pre> * * <pre> * -S <num> * Random number seed. * (default 10) * </pre> * * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String optionString; String funcString; optionString = Utils.getOption('I', options); if (optionString.length() != 0) { setMaxIterations(Integer.parseInt(optionString)); } else { setMaxIterations(1); } optionString = Utils.getOption('M', options); if (optionString.length() != 0) { setMaxKMeans(Integer.parseInt(optionString)); } else { setMaxKMeans(1000); } optionString = Utils.getOption('J', options); if (optionString.length() != 0) { setMaxKMeansForChildren(Integer.parseInt(optionString)); } else { setMaxKMeansForChildren(1000); } optionString = Utils.getOption('L', options); if (optionString.length() != 0) { setMinNumClusters(Integer.parseInt(optionString)); } else { setMinNumClusters(2); } optionString = Utils.getOption('H', options); if (optionString.length() != 0) { setMaxNumClusters(Integer.parseInt(optionString)); } else { setMaxNumClusters(4); } optionString = Utils.getOption('B', options); if (optionString.length() != 0) { setBinValue(Double.parseDouble(optionString)); } else { setBinValue(1.0); } setUseKDTree(Utils.getFlag("use-kdtree", options)); if (getUseKDTree()) { funcString = Utils.getOption('K', options); if (funcString.length() != 0) { String[] funcSpec = Utils.splitOptions(funcString); if (funcSpec.length == 0) { throw new Exception("Invalid function specification string"); } String funcName = funcSpec[0]; funcSpec[0] = ""; setKDTree((KDTree) Utils.forName(KDTree.class, funcName, 
funcSpec)); } else { setKDTree(new KDTree()); } } else { setKDTree(new KDTree()); } optionString = Utils.getOption('C', options); if (optionString.length() != 0) { setCutOffFactor(Double.parseDouble(optionString)); } else { setCutOffFactor(0.0); } funcString = Utils.getOption('D', options); if (funcString.length() != 0) { String[] funcSpec = Utils.splitOptions(funcString); if (funcSpec.length == 0) { throw new Exception("Invalid function specification string"); } String funcName = funcSpec[0]; funcSpec[0] = ""; setDistanceF((DistanceFunction) Utils.forName(DistanceFunction.class, funcName, funcSpec)); } else { setDistanceF(new EuclideanDistance()); } optionString = Utils.getOption('N', options); if (optionString.length() != 0) { setInputCenterFile(new File(optionString)); m_CenterInput = new BufferedReader(new FileReader(optionString)); } else { setInputCenterFile(new File(System.getProperty("user.dir"))); m_CenterInput = null; } optionString = Utils.getOption('O', options); if (optionString.length() != 0) { setOutputCenterFile(new File(optionString)); m_CenterOutput = new PrintWriter(new FileOutputStream(optionString)); } else { setOutputCenterFile(new File(System.getProperty("user.dir"))); m_CenterOutput = null; } optionString = Utils.getOption('U', options); int debugLevel = 0; if (optionString.length() != 0) { try { debugLevel = Integer.parseInt(optionString); } catch (NumberFormatException e) { throw new Exception(optionString + "is an illegal value for option -U"); } } setDebugLevel(debugLevel); optionString = Utils.getOption('Y', options); if (optionString.length() != 0) { setDebugVectorsFile(new File(optionString)); } else { setDebugVectorsFile(new File(System.getProperty("user.dir"))); m_DebugVectorsInput = null; m_DebugVectors = null; } super.setOptions(options); Utils.checkForRemainingOptions(options); } /** * Gets the current settings of SimpleKMeans. 
*
   * @return an array of strings suitable for passing to setOptions
   */
  @Override
  public String[] getOptions() {
    Vector<String> opts = new Vector<String>();

    opts.add("-I");
    opts.add(String.valueOf(getMaxIterations()));

    opts.add("-M");
    opts.add(String.valueOf(getMaxKMeans()));

    opts.add("-J");
    opts.add(String.valueOf(getMaxKMeansForChildren()));

    opts.add("-L");
    opts.add(String.valueOf(getMinNumClusters()));

    opts.add("-H");
    opts.add(String.valueOf(getMaxNumClusters()));

    opts.add("-B");
    opts.add(String.valueOf(getBinValue()));

    // the KDTree specification is only reported while the KDTree is in use
    if (getUseKDTree()) {
      opts.add("-use-kdtree");
      opts.add("-K");
      opts.add(String.valueOf(getKDTreeSpec()));
    }

    opts.add("-C");
    opts.add(String.valueOf(getCutOffFactor()));

    if (getDistanceF() != null) {
      opts.add("-D");
      opts.add(String.valueOf(getDistanceFSpec()));
    }

    // file options are only reported if they point to an existing file
    File centersIn = getInputCenterFile();
    if (centersIn.exists() && centersIn.isFile()) {
      opts.add("-N");
      opts.add(String.valueOf(centersIn));
    }

    File centersOut = getOutputCenterFile();
    if (centersOut.exists() && centersOut.isFile()) {
      opts.add("-O");
      opts.add(String.valueOf(centersOut));
    }

    int level = getDebugLevel();
    if (level > 0) {
      opts.add("-U");
      opts.add(String.valueOf(level));
    }

    File debugVectors = getDebugVectorsFile();
    if (debugVectors.exists() && debugVectors.isFile()) {
      opts.add("-Y");
      opts.add(String.valueOf(debugVectors));
    }

    Collections.addAll(opts, super.getOptions());

    return opts.toArray(new String[opts.size()]);
  }

  /**
   * Return a string describing this clusterer.
* * @return a description of the clusterer as a string */ @Override public String toString() { StringBuffer temp = new StringBuffer(); temp.append("\nXMeans\n======\n"); temp.append("Requested iterations : " + m_MaxIterations + "\n"); temp.append("Iterations performed : " + m_IterationCount + "\n"); if (m_KMeansStopped > 0) { temp.append("kMeans did not converge\n"); temp.append(" but was stopped by max-loops " + m_KMeansStopped + " times (max kMeans-iter)\n"); } temp.append("Splits prepared : " + m_NumSplits + "\n"); temp.append("Splits performed : " + m_NumSplitsDone + "\n"); temp.append("Cutoff factor : " + m_CutOffFactor + "\n"); double perc; if (m_NumSplitsDone > 0) { perc = (((double) m_NumSplitsStillDone) / ((double) m_NumSplitsDone)) * 100.0; } else { perc = 0.0; } temp.append("Percentage of splits accepted \n" + "by cutoff factor : " + Utils.doubleToString(perc, 2) + " %\n"); temp.append("------\n"); temp.append("Cutoff factor : " + m_CutOffFactor + "\n"); temp.append("------\n"); temp.append("\nCluster centers : " + m_NumClusters + " centers\n"); for (int i = 0; i < m_NumClusters; i++) { temp.append("\nCluster " + i + "\n "); for (int j = 0; j < m_ClusterCenters.numAttributes(); j++) { if (m_ClusterCenters.attribute(j).isNominal()) { temp.append( " " + m_ClusterCenters.attribute(j).value((int) m_ClusterCenters.instance(i).value(j))); } else { temp.append(" " + m_ClusterCenters.instance(i).value(j)); } } } if (m_Mle != null) { temp.append("\n\nDistortion: " + Utils.doubleToString(Utils.sum(m_Mle), 6) + "\n"); } temp.append("BIC-Value : " + Utils.doubleToString(m_Bic, 6) + "\n"); return temp.toString(); } /** * Return the centers of the clusters as an Instances object * * @return the cluster centers. */ public Instances getClusterCenters() { return m_ClusterCenters; } /** * Print centers for debug. 
* * @param debugLevel level that gives according messages */ protected void PrCentersFD(int debugLevel) { if (debugLevel == m_DebugLevel) { for (int i = 0; i < m_ClusterCenters.numInstances(); i++) { System.out.println(m_ClusterCenters.instance(i)); } } } /** * Tests on debug status. * * @param debugLevel level that gives according messages * @return true if debug level is set */ protected boolean TFD(int debugLevel) { return (debugLevel == m_DebugLevel); } /** * Does debug printouts. * * @param debugLevel level that gives according messages * @param output string that is printed */ protected void PFD(int debugLevel, String output) { if (debugLevel == m_DebugLevel) { System.out.println(output); } } /** * Does debug printouts. * * @param output string that is printed */ protected void PFD_CURR(String output) { if (m_CurrDebugFlag) { System.out.println(output); } } /** * Returns the revision string. * * @return the revision */ @Override public String getRevision() { return RevisionUtils.extract("$Revision$"); } /** * Main method for testing this class. * * @param argv should contain options */ public static void main(String[] argv) { runClusterer(new XMeans(), argv); } }