core.ClusterEvaluationEX.java Source code

Introduction

Here is the source code for core.ClusterEvaluationEX.java
Source

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    ClusterEvaluation.java
 *    Copyright (C) 1999-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package core;

import weka.core.Attribute;
import weka.core.Drawable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

import java.beans.BeanInfo;
import java.beans.Introspector;
import java.beans.MethodDescriptor;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.Serializable;
import java.lang.reflect.Method;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import weka.clusterers.*;

/**
 * Class for evaluating clustering models.<p/>
 *
 * Valid options are: <p/>
 *
 * -t name of the training file <br/>
 * Specify the training file. <p/>
 *
 * -T name of the test file <br/>
 * Specify the test file to apply clusterer to. <p/>
 *
 * -d name of file to save clustering model to <br/>
 * Specify output file. <p/>
 *
 * -l name of file to load clustering model from <br/>
 * Specifiy input file. <p/>
 *
 * -p attribute range <br/>
 * Output predictions. Predictions are for the training file if only the
 * training file is specified, otherwise they are for the test file. The range
 * specifies attribute values to be output with the predictions.
 * Use '-p 0' for none. <p/>
 *
 * -x num folds <br/>
 * Set the number of folds for a cross validation of the training data.
 * Cross validation can only be done for distribution clusterers and will
 * be performed if the test file is missing. <p/>
 * 
 * -s num <br/>
 * Sets the seed for randomizing the data for cross-validation. <p/>
 *
 * -c class <br/>
 * Set the class attribute. If set, then class based evaluation of clustering
 * is performed. <p/>
 * 
 * -g name of graph file <br/>
 * Outputs the graph representation of the clusterer to the file. Only for
 * clusterer that implemented the <code>weka.core.Drawable</code> interface.
 * <p/>
 *
 * @author   Mark Hall (mhall@cs.waikato.ac.nz)
 * @version  $Revision: 8034 $
 * @see        weka.core.Drawable
 */
public class ClusterEvaluationEX implements Serializable, RevisionHandler {

    /** for serialization */
    static final long serialVersionUID = -830188327319128005L;

    /** the clusterer */
    private Clusterer m_Clusterer;

    /** holds a string describing the results of clustering the training data */
    private StringBuffer m_clusteringResults;

    /** holds the number of clusters found by the clusterer */
    private int m_numClusters;

    /** holds the assigments of instances to clusters for a particular testing
        dataset */
    private double[] m_clusterAssignments;

    /** holds the average log likelihood for a particular testing dataset
       if the clusterer is a DensityBasedClusterer */
    private double m_logL;

    /** will hold the mapping of classes to clusters (for class based 
        evaluation) */
    private int[] m_classToCluster = null;

    public Instances ID;

    public Instances noise;

    /**
     * set the clusterer
     * @param clusterer the clusterer to use
     */
    public void setClusterer(Clusterer clusterer) {
        m_Clusterer = clusterer;
    }

    /**
     * return the results of clustering.
     * @return a string detailing the results of clustering a data set
     */
    public String clusterResultsToString() {
        return m_clusteringResults.toString();
    }

    /**
     * Return the number of clusters found for the most recent call to
     * evaluateClusterer
     * @return the number of clusters found
     */
    public int getNumClusters() {
        return m_numClusters;
    }

    /**
     * Return an array of cluster assignments corresponding to the most
     * recent set of instances clustered.
     * @return an array of cluster assignments
     */
    public double[] getClusterAssignments() {
        return m_clusterAssignments;
    }

    /**
     * Return the array (ordered by cluster number) of minimum error class to
     * cluster mappings
     * @return an array of class to cluster mappings
     */
    public int[] getClassesToClusters() {
        return m_classToCluster;
    }

    public Instances DeleteNoise(Instances data) {
        noise = data.stringFreeStructure();
        for (int i = 0; i < data.numInstances(); i++) {
            if (data.instance(i).value(1) == -1) {
                noise.add(data.instance(i));
                data.delete(i);
                i--;
            }
        }
        return data;
    }

    /**
     * Return the instances with ID
     * 
     */
    public Instances getID(Instances data, int num) {
        ID = new CommonMethords().addIDInfo(data, new CommonMethords().InstanceToArrays(data));
        for (int i = 0; i < m_clusterAssignments.length; i++) {
            if (m_clusterAssignments[i] != -1) {
                ID.instance(i).setValue(0, m_clusterAssignments[i] + num);
            } else
                ID.instance(i).setValue(0, m_clusterAssignments[i]);
        }
        return ID;
    }

    /**
     * num??
     * Returns the Centroids
     * */
    public Instances getCentroids(int num) {
        FastVector atts = new FastVector();
        Attribute clusterID = new Attribute("clusterID");
        atts.addElement(clusterID);
        Instances data = new Instances("centroids", atts, m_numClusters);
        for (int i = 0; i < ID.numAttributes() - 1; i++) {
            Attribute att = new Attribute("Subject" + String.valueOf(i));
            atts.addElement(att);
        }
        double[] map = new double[m_numClusters];
        double[] temp = new double[m_clusterAssignments.length];
        System.arraycopy(m_clusterAssignments, 0, temp, 0, m_clusterAssignments.length);
        int n = map.length;
        for (int i = 0; i < m_clusterAssignments.length; i++) {
            double id = temp[i];
            if (id == -1)
                continue;
            boolean flag = true;
            for (int j = 0; j < temp.length; j++) {
                if (temp[j] == id) {
                    temp[j] = -1;
                }
            }
            if (flag && n != -1) {
                map[map.length - n] = id + num;
                n--;
            } else if (n != -1) {
                continue;
            } else {
                break;
            }
        }
        for (int i = 0; i < map.length; i++) {
            double id = map[i];
            double[] averatts = new double[ID.numAttributes()];
            int count = 0;
            for (int j = 0; j < ID.numInstances(); j++) {
                Instance iter = ID.instance(j);
                if (iter.value(0) == id) {
                    averatts = CommonMethords.add(averatts, iter.toDoubleArray());
                    count++;
                }
            }
            averatts = CommonMethords.calAver(averatts, count);
            Instance ins = new Instance(1, averatts);
            data.add(ins);
        }
        return data;
    }

    /**
     * Return the log likelihood corresponding to the most recent
     * set of instances clustered.
     *
     * @return a <code>double</code> value
     */
    public double getLogLikelihood() {
        return m_logL;
    }

    /**
     * Constructor. Sets defaults for each member variable. Default Clusterer
     * is EM.
     */
    public ClusterEvaluationEX() {
        setClusterer(new SimpleKMeans());
        m_clusteringResults = new StringBuffer();
        m_clusterAssignments = null;
    }

    /**
     * Evaluate the clusterer on a set of instances. Calculates clustering
     * statistics and stores cluster assigments for the instances in
     * m_clusterAssignments
     * 
     * @param test the set of instances to cluster
     * @throws Exception if something goes wrong
     */
    public void evaluateClusterer(Instances test) throws Exception {
        evaluateClusterer(test, "");
    }

    /**
     * Evaluate the clusterer on a set of instances. Calculates clustering
     * statistics and stores cluster assigments for the instances in
     * m_clusterAssignments
     * 
     * @param test the set of instances to cluster
     * @param testFileName the name of the test file for incremental testing, 
     * if "" or null then not used
     * 
     * @throws Exception if something goes wrong
     */
    public void evaluateClusterer(Instances test, String testFileName) throws Exception {
        evaluateClusterer(test, testFileName, true);
    }

    /**
     * Evaluate the clusterer on a set of instances. Calculates clustering
     * statistics and stores cluster assigments for the instances in
     * m_clusterAssignments
     * 
     * @param test the set of instances to cluster
     * @param testFileName the name of the test file for incremental testing, 
     * if "" or null then not used
     * @param outputModel true if the clustering model is to be output as well
     * as the stats
     * 
     * @throws Exception if something goes wrong
     */
    public void evaluateClusterer(Instances test, String testFileName, boolean outputModel) throws Exception {
        int i = 0;
        int cnum;
        double loglk = 0.0;
        int cc = m_Clusterer.numberOfClusters();
        m_numClusters = cc;
        double[] instanceStats = new double[cc];
        Instances testRaw = null;
        boolean hasClass = (test.classIndex() >= 0);
        int unclusteredInstances = 0;
        Vector<Double> clusterAssignments = new Vector<Double>();
        Filter filter = null;
        DataSource source = null;
        Instance inst;

        if (testFileName == null)
            testFileName = "";

        // load data
        if (testFileName.length() != 0)
            source = new DataSource(testFileName);
        else
            source = new DataSource(test);
        testRaw = source.getStructure(test.classIndex());

        // If class is set then do class based evaluation as well
        if (hasClass) {
            if (testRaw.classAttribute().isNumeric())
                throw new Exception("ClusterEvaluation: Class must be nominal!");

            filter = new Remove();
            ((Remove) filter).setAttributeIndices("" + (testRaw.classIndex() + 1));
            ((Remove) filter).setInvertSelection(false);
            filter.setInputFormat(testRaw);
        }

        i = 0;
        while (source.hasMoreElements(testRaw)) {
            // next instance
            inst = source.nextElement(testRaw);
            if (filter != null) {
                filter.input(inst);
                filter.batchFinished();
                inst = filter.output();
            }

            cnum = -1;
            try {
                if (m_Clusterer instanceof DensityBasedClusterer) {
                    loglk += ((DensityBasedClusterer) m_Clusterer).logDensityForInstance(inst);
                    cnum = m_Clusterer.clusterInstance(inst);
                    clusterAssignments.add((double) cnum);
                } else {
                    cnum = m_Clusterer.clusterInstance(inst);
                    clusterAssignments.add((double) cnum);
                }
            } catch (Exception e) {
                clusterAssignments.add(-1.0);
                unclusteredInstances++;
            }

            if (cnum != -1) {
                instanceStats[cnum]++;
            }
        }

        double sum = Utils.sum(instanceStats);
        loglk /= sum;
        m_logL = loglk;
        m_clusterAssignments = new double[clusterAssignments.size()];
        for (i = 0; i < clusterAssignments.size(); i++) {
            m_clusterAssignments[i] = clusterAssignments.get(i);
        }
        int numInstFieldWidth = (int) ((Math.log(clusterAssignments.size()) / Math.log(10)) + 1);

        if (outputModel) {
            m_clusteringResults.append(m_Clusterer.toString());
        }
        m_clusteringResults.append("Clustered Instances\n\n");
        int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1);
        for (i = 0; i < cc; i++) {
            if (instanceStats[i] > 0)
                m_clusteringResults.append(Utils.doubleToString((double) i, clustFieldWidth, 0) + "      "
                        + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) + " ("
                        + Utils.doubleToString((instanceStats[i] / sum * 100.0), 3, 0) + "%)\n");
        }

        if (unclusteredInstances > 0)
            m_clusteringResults.append("\nUnclustered instances : " + unclusteredInstances);

        if (m_Clusterer instanceof DensityBasedClusterer)
            m_clusteringResults.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");

        if (hasClass) {
            evaluateClustersWithRespectToClass(test, testFileName);
        }
    }

    /**
     * Evaluates cluster assignments with respect to actual class labels.
     * Assumes that m_Clusterer has been trained and tested on 
     * inst (minus the class).
     * 
     * @param inst the instances (including class) to evaluate with respect to
     * @param fileName the name of the test file for incremental testing, 
     * if "" or null then not used
     * @throws Exception if something goes wrong
     */
    private void evaluateClustersWithRespectToClass(Instances inst, String fileName) throws Exception {

        int numClasses = inst.classAttribute().numValues();
        int[][] counts = new int[m_numClusters][numClasses];
        int[] clusterTotals = new int[m_numClusters];
        double[] best = new double[m_numClusters + 1];
        double[] current = new double[m_numClusters + 1];
        DataSource source = null;
        Instances instances = null;
        Instance instance = null;
        int i;
        int numInstances;

        if (fileName == null)
            fileName = "";

        if (fileName.length() != 0) {
            source = new DataSource(fileName);
        } else
            source = new DataSource(inst);
        instances = source.getStructure(inst.classIndex());

        i = 0;
        while (source.hasMoreElements(instances)) {
            instance = source.nextElement(instances);
            if (m_clusterAssignments[i] >= 0) {
                counts[(int) m_clusterAssignments[i]][(int) instance.classValue()]++;
                clusterTotals[(int) m_clusterAssignments[i]]++;
            }
            i++;
        }
        numInstances = i;

        best[m_numClusters] = Double.MAX_VALUE;
        mapClasses(m_numClusters, 0, counts, clusterTotals, current, best, 0);

        m_clusteringResults.append("\n\nClass attribute: " + inst.classAttribute().name() + "\n");
        m_clusteringResults.append("Classes to Clusters:\n");
        String matrixString = toMatrixString(counts, clusterTotals, new Instances(inst, 0));
        m_clusteringResults.append(matrixString).append("\n");

        int Cwidth = 1 + (int) (Math.log(m_numClusters) / Math.log(10));
        // add the minimum error assignment
        for (i = 0; i < m_numClusters; i++) {
            if (clusterTotals[i] > 0) {
                m_clusteringResults.append("Cluster " + Utils.doubleToString((double) i, Cwidth, 0));
                m_clusteringResults.append(" <-- ");

                if (best[i] < 0) {
                    m_clusteringResults.append("No class\n");
                } else {
                    m_clusteringResults.append(inst.classAttribute().value((int) best[i])).append("\n");
                }
            }
        }
        m_clusteringResults.append("\nIncorrectly clustered instances :\t" + best[m_numClusters] + "\t"
                + (Utils.doubleToString((best[m_numClusters] / numInstances * 100.0), 8, 4)) + " %\n");

        // copy the class assignments
        m_classToCluster = new int[m_numClusters];
        for (i = 0; i < m_numClusters; i++) {
            m_classToCluster[i] = (int) best[i];
        }
    }

    /**
     * Returns a "confusion" style matrix of classes to clusters assignments
     * @param counts the counts of classes for each cluster
     * @param clusterTotals total number of examples in each cluster
     * @param inst the training instances (with class)
     * @return the "confusion" style matrix as string
     * @throws Exception if matrix can't be generated
     */
    private String toMatrixString(int[][] counts, int[] clusterTotals, Instances inst) throws Exception {
        StringBuffer ms = new StringBuffer();

        int maxval = 0;
        for (int i = 0; i < m_numClusters; i++) {
            for (int j = 0; j < counts[i].length; j++) {
                if (counts[i][j] > maxval) {
                    maxval = counts[i][j];
                }
            }
        }

        int Cwidth = 1
                + Math.max((int) (Math.log(maxval) / Math.log(10)), (int) (Math.log(m_numClusters) / Math.log(10)));

        ms.append("\n");

        for (int i = 0; i < m_numClusters; i++) {
            if (clusterTotals[i] > 0) {
                ms.append(" ").append(Utils.doubleToString((double) i, Cwidth, 0));
            }
        }
        ms.append("  <-- assigned to cluster\n");

        for (int i = 0; i < counts[0].length; i++) {

            for (int j = 0; j < m_numClusters; j++) {
                if (clusterTotals[j] > 0) {
                    ms.append(" ").append(Utils.doubleToString((double) counts[j][i], Cwidth, 0));
                }
            }
            ms.append(" | ").append(inst.classAttribute().value(i)).append("\n");
        }

        return ms.toString();
    }

    /**
     * Finds the minimum error mapping of classes to clusters. Recursively
     * considers all possible class to cluster assignments.
     * 
     * @param numClusters the number of clusters
     * @param lev the cluster being processed
     * @param counts the counts of classes in clusters
     * @param clusterTotals the total number of examples in each cluster
     * @param current the current path through the class to cluster assignment
     * tree
     * @param best the best assignment path seen
     * @param error accumulates the error for a particular path
     */
    public static void mapClasses(int numClusters, int lev, int[][] counts, int[] clusterTotals, double[] current,
            double[] best, int error) {
        // leaf
        if (lev == numClusters) {
            if (error < best[numClusters]) {
                best[numClusters] = error;
                for (int i = 0; i < numClusters; i++) {
                    best[i] = current[i];
                }
            }
        } else {
            // empty cluster -- ignore
            if (clusterTotals[lev] == 0) {
                current[lev] = -1; // cluster ignored
                mapClasses(numClusters, lev + 1, counts, clusterTotals, current, best, error);
            } else {
                // first try no class assignment to this cluster
                current[lev] = -1; // cluster assigned no class (ie all errors)
                mapClasses(numClusters, lev + 1, counts, clusterTotals, current, best, error + clusterTotals[lev]);
                // now loop through the classes in this cluster
                for (int i = 0; i < counts[0].length; i++) {
                    if (counts[lev][i] > 0) {
                        boolean ok = true;
                        // check to see if this class has already been assigned
                        for (int j = 0; j < lev; j++) {
                            if ((int) current[j] == i) {
                                ok = false;
                                break;
                            }
                        }
                        if (ok) {
                            current[lev] = i;
                            mapClasses(numClusters, lev + 1, counts, clusterTotals, current, best,
                                    (error + (clusterTotals[lev] - counts[lev][i])));
                        }
                    }
                }
            }
        }
    }

    /**
     * Evaluates a clusterer with the options given in an array of
     * strings. It takes the string indicated by "-t" as training file, the
     * string indicated by "-T" as test file.
     * If the test file is missing, a stratified ten-fold
     * cross-validation is performed (distribution clusterers only).
     * Using "-x" you can change the number of
     * folds to be used, and using "-s" the random seed.
     * If the "-p" option is present it outputs the classification for
     * each test instance. If you provide the name of an object file using
     * "-l", a clusterer will be loaded from the given file. If you provide the
     * name of an object file using "-d", the clusterer built from the
     * training data will be saved to the given file.
     *
     * @param clusterer machine learning clusterer
     * @param options the array of string containing the options
     * @throws Exception if model could not be evaluated successfully
     * @return a string describing the results 
     */
    public static String evaluateClusterer(Clusterer clusterer, String[] options) throws Exception {

        int seed = 1, folds = 10;
        boolean doXval = false;
        Instances train = null;
        Random random;
        String trainFileName, testFileName, seedString, foldsString;
        String objectInputFileName, objectOutputFileName, attributeRangeString;
        String graphFileName;
        String[] savedOptions = null;
        boolean printClusterAssignments = false;
        Range attributesToOutput = null;
        StringBuffer text = new StringBuffer();
        int theClass = -1; // class based evaluation of clustering
        boolean updateable = (clusterer instanceof UpdateableClusterer);
        DataSource source = null;
        Instance inst;

        if (Utils.getFlag('h', options) || Utils.getFlag("help", options)) {

            // global info requested as well?
            boolean globalInfo = Utils.getFlag("synopsis", options) || Utils.getFlag("info", options);

            throw new Exception("Help requested." + makeOptionString(clusterer, globalInfo));
        }

        try {
            // Get basic options (options the same for all clusterers
            //printClusterAssignments = Utils.getFlag('p', options);
            objectInputFileName = Utils.getOption('l', options);
            objectOutputFileName = Utils.getOption('d', options);
            trainFileName = Utils.getOption('t', options);
            testFileName = Utils.getOption('T', options);
            graphFileName = Utils.getOption('g', options);

            // Check -p option
            try {
                attributeRangeString = Utils.getOption('p', options);
            } catch (Exception e) {
                throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. "
                        + "It now expects a parameter specifying a range of attributes "
                        + "to list with the predictions. Use '-p 0' for none.");
            }
            if (attributeRangeString.length() != 0) {
                printClusterAssignments = true;
                if (!attributeRangeString.equals("0"))
                    attributesToOutput = new Range(attributeRangeString);
            }

            if (trainFileName.length() == 0) {
                if (objectInputFileName.length() == 0) {
                    throw new Exception("No training file and no object " + "input file given.");
                }

                if (testFileName.length() == 0) {
                    throw new Exception("No training file and no test file given.");
                }
            } else {
                if ((objectInputFileName.length() != 0) && (printClusterAssignments == false)) {
                    throw new Exception("Can't use both train and model file " + "unless -p specified.");
                }
            }

            seedString = Utils.getOption('s', options);

            if (seedString.length() != 0) {
                seed = Integer.parseInt(seedString);
            }

            foldsString = Utils.getOption('x', options);

            if (foldsString.length() != 0) {
                folds = Integer.parseInt(foldsString);
                doXval = true;
            }
        } catch (Exception e) {
            throw new Exception('\n' + e.getMessage() + makeOptionString(clusterer, false));
        }

        try {
            if (trainFileName.length() != 0) {
                source = new DataSource(trainFileName);
                train = source.getStructure();

                String classString = Utils.getOption('c', options);
                if (classString.length() != 0) {
                    if (classString.compareTo("last") == 0)
                        theClass = train.numAttributes();
                    else if (classString.compareTo("first") == 0)
                        theClass = 1;
                    else
                        theClass = Integer.parseInt(classString);

                    if (theClass != -1) {
                        if (doXval || testFileName.length() != 0)
                            throw new Exception("Can only do class based evaluation on the " + "training data");

                        if (objectInputFileName.length() != 0)
                            throw new Exception("Can't load a clusterer and do class based " + "evaluation");

                        if (objectOutputFileName.length() != 0)
                            throw new Exception("Can't do class based evaluation and save clusterer");
                    }
                } else {
                    // if the dataset defines a class attribute, use it
                    if (train.classIndex() != -1) {
                        theClass = train.classIndex() + 1;
                        System.err
                                .println("Note: using class attribute from dataset, i.e., attribute #" + theClass);
                    }
                }

                if (theClass != -1) {
                    if (theClass < 1 || theClass > train.numAttributes())
                        throw new Exception("Class is out of range!");

                    if (!train.attribute(theClass - 1).isNominal())
                        throw new Exception("Class must be nominal!");

                    train.setClassIndex(theClass - 1);
                }
            }
        } catch (Exception e) {
            throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
        }

        // Save options
        if (options != null) {
            savedOptions = new String[options.length];
            System.arraycopy(options, 0, savedOptions, 0, options.length);
        }

        if (objectInputFileName.length() != 0)
            Utils.checkForRemainingOptions(options);

        // Set options for clusterer
        if (clusterer instanceof OptionHandler)
            ((OptionHandler) clusterer).setOptions(options);

        Utils.checkForRemainingOptions(options);

        Instances trainHeader = train;
        if (objectInputFileName.length() != 0) {
            // Load the clusterer from file
            //      clusterer = (Clusterer) SerializationHelper.read(objectInputFileName);
            java.io.ObjectInputStream ois = new java.io.ObjectInputStream(
                    new java.io.BufferedInputStream(new java.io.FileInputStream(objectInputFileName)));
            clusterer = (Clusterer) ois.readObject();
            // try and get the training header
            try {
                trainHeader = (Instances) ois.readObject();
            } catch (Exception ex) {
                // don't moan if we cant
            }
        } else {
            // Build the clusterer if no object file provided
            if (theClass == -1) {
                if (updateable) {
                    clusterer.buildClusterer(source.getStructure());
                    while (source.hasMoreElements(train)) {
                        inst = source.nextElement(train);
                        ((UpdateableClusterer) clusterer).updateClusterer(inst);
                    }
                    ((UpdateableClusterer) clusterer).updateFinished();
                } else {
                    clusterer.buildClusterer(source.getDataSet());
                }
            } else {
                Remove removeClass = new Remove();
                removeClass.setAttributeIndices("" + theClass);
                removeClass.setInvertSelection(false);
                removeClass.setInputFormat(train);
                if (updateable) {
                    Instances clusterTrain = Filter.useFilter(train, removeClass);
                    clusterer.buildClusterer(clusterTrain);
                    trainHeader = clusterTrain;
                    while (source.hasMoreElements(train)) {
                        inst = source.nextElement(train);
                        removeClass.input(inst);
                        removeClass.batchFinished();
                        Instance clusterTrainInst = removeClass.output();
                        ((UpdateableClusterer) clusterer).updateClusterer(clusterTrainInst);
                    }
                    ((UpdateableClusterer) clusterer).updateFinished();
                } else {
                    Instances clusterTrain = Filter.useFilter(source.getDataSet(), removeClass);
                    clusterer.buildClusterer(clusterTrain);
                    trainHeader = clusterTrain;
                }
                ClusterEvaluationEX ce = new ClusterEvaluationEX();
                ce.setClusterer(clusterer);
                ce.evaluateClusterer(train, trainFileName);

                return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
            }
        }

        /* Output cluster predictions only (for the test data if specified,
           otherwise for the training data */
        if (printClusterAssignments) {
            return printClusterings(clusterer, trainFileName, testFileName, attributesToOutput);
        }

        text.append(clusterer.toString());
        text.append(
                "\n\n=== Clustering stats for training data ===\n\n" + printClusterStats(clusterer, trainFileName));

        if (testFileName.length() != 0) {
            // check header compatibility
            DataSource test = new DataSource(testFileName);
            Instances testStructure = test.getStructure();
            if (!trainHeader.equalHeaders(testStructure)) {
                throw new Exception("Training and testing data are not compatible\n");
            }

            text.append("\n\n=== Clustering stats for testing data ===\n\n"
                    + printClusterStats(clusterer, testFileName));
        }

        if ((clusterer instanceof DensityBasedClusterer) && (doXval == true) && (testFileName.length() == 0)
                && (objectInputFileName.length() == 0)) {
            // cross validate the log likelihood on the training data
            random = new Random(seed);
            random.setSeed(seed);
            train = source.getDataSet();
            train.randomize(random);
            text.append(crossValidateModel(clusterer.getClass().getName(), train, folds, savedOptions, random));
        }

        // Save the clusterer if an object output file is provided
        if (objectOutputFileName.length() != 0) {
            //SerializationHelper.write(objectOutputFileName, clusterer);
            saveClusterer(objectOutputFileName, clusterer, trainHeader);
        }

        // If classifier is drawable output string describing graph
        if ((clusterer instanceof Drawable) && (graphFileName.length() != 0)) {
            BufferedWriter writer = new BufferedWriter(new FileWriter(graphFileName));
            writer.write(((Drawable) clusterer).graph());
            writer.newLine();
            writer.flush();
            writer.close();
        }

        return text.toString();
    }

    public static void saveClusterer(String fileName, Clusterer clusterer, Instances header) throws Exception {
        java.io.ObjectOutputStream oos = new java.io.ObjectOutputStream(
                new java.io.BufferedOutputStream(new java.io.FileOutputStream(fileName)));

        oos.writeObject(clusterer);
        if (header != null) {
            oos.writeObject(header);
        }
        oos.flush();
        oos.close();
    }

    /**
     * Perform a cross-validation for DensityBasedClusterer on a set of instances.
     *
     * @param clusterer the clusterer to use
     * @param data the training data
     * @param numFolds number of folds of cross validation to perform
     * @param random random number seed for cross-validation
     * @return the cross-validated log-likelihood
     * @throws Exception if an error occurs
     */
    public static double crossValidateModel(DensityBasedClusterer clusterer, Instances data, int numFolds,
            Random random) throws Exception {
        Instances train, test;
        double foldAv = 0;
        ;
        data = new Instances(data);
        data.randomize(random);
        //    double sumOW = 0;
        for (int i = 0; i < numFolds; i++) {
            // Build and test clusterer
            train = data.trainCV(numFolds, i, random);

            clusterer.buildClusterer(train);

            test = data.testCV(numFolds, i);

            for (int j = 0; j < test.numInstances(); j++) {
                try {
                    foldAv += ((DensityBasedClusterer) clusterer).logDensityForInstance(test.instance(j));
                    //     sumOW += test.instance(j).weight();
                    //   double temp = Utils.sum(tempDist);
                } catch (Exception ex) {
                    // unclustered instances
                }
            }
        }

        //    return foldAv / sumOW;
        return foldAv / data.numInstances();
    }

    /**
     * Performs a cross-validation 
     * for a DensityBasedClusterer clusterer on a set of instances.
     *
     * @param clustererString a string naming the class of the clusterer
     * @param data the data on which the cross-validation is to be 
     * performed 
     * @param numFolds the number of folds for the cross-validation
     * @param options the options to the clusterer
     * @param random a random number generator
     * @return a string containing the cross validated log likelihood
     * @throws Exception if a clusterer could not be generated 
     */
    public static String crossValidateModel(String clustererString, Instances data, int numFolds, String[] options,
            Random random) throws Exception {
        Clusterer clusterer = null;
        String[] savedOptions = null;
        double CvAv = 0.0;
        StringBuffer CvString = new StringBuffer();

        if (options != null) {
            savedOptions = new String[options.length];
        }

        data = new Instances(data);

        // create clusterer
        try {
            clusterer = (Clusterer) Class.forName(clustererString).newInstance();
        } catch (Exception e) {
            throw new Exception("Can't find class with name " + clustererString + '.');
        }

        if (!(clusterer instanceof DensityBasedClusterer)) {
            throw new Exception(clustererString + " must be a distrinbution " + "clusterer.");
        }

        // Save options
        if (options != null) {
            System.arraycopy(options, 0, savedOptions, 0, options.length);
        }

        // Parse options
        if (clusterer instanceof OptionHandler) {
            try {
                ((OptionHandler) clusterer).setOptions(savedOptions);
                Utils.checkForRemainingOptions(savedOptions);
            } catch (Exception e) {
                throw new Exception("Can't parse given options in " + "cross-validation!");
            }
        }
        CvAv = crossValidateModel((DensityBasedClusterer) clusterer, data, numFolds, random);

        CvString.append("\n" + numFolds + " fold CV Log Likelihood: " + Utils.doubleToString(CvAv, 6, 4) + "\n");
        return CvString.toString();
    }

    // ===============
    // Private methods
    // ===============
    /**
     * Print the cluster statistics for either the training
     * or the testing data.
     *
     * @param clusterer the clusterer to use for generating statistics.
     * @param fileName the file to load
     * @return a string containing cluster statistics.
     * @throws Exception if statistics can't be generated.
     */
    private static String printClusterStats(Clusterer clusterer, String fileName) throws Exception {
        StringBuffer text = new StringBuffer();
        int i = 0;
        int cnum;
        double loglk = 0.0;
        int cc = clusterer.numberOfClusters();
        double[] instanceStats = new double[cc];
        int unclusteredInstances = 0;

        if (fileName.length() != 0) {
            DataSource source = new DataSource(fileName);
            Instances structure = source.getStructure();
            Instance inst;
            while (source.hasMoreElements(structure)) {
                inst = source.nextElement(structure);
                try {
                    cnum = clusterer.clusterInstance(inst);

                    if (clusterer instanceof DensityBasedClusterer) {
                        loglk += ((DensityBasedClusterer) clusterer).logDensityForInstance(inst);
                        //       temp = Utils.sum(dist);
                    }
                    instanceStats[cnum]++;
                } catch (Exception e) {
                    unclusteredInstances++;
                }
                i++;
            }

            /*
            // count the actual number of used clusters
            int count = 0;
            for (i = 0; i < cc; i++) {
            if (instanceStats[i] > 0) {
            count++;
            }
            }
            if (count > 0) {
            double[] tempStats = new double [count];
            count=0;
            for (i=0;i<cc;i++) {
            if (instanceStats[i] > 0) {
             tempStats[count++] = instanceStats[i];
            }
            }
            instanceStats = tempStats;
            cc = instanceStats.length;
            } */

            int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1);
            int numInstFieldWidth = (int) ((Math.log(i) / Math.log(10)) + 1);
            double sum = Utils.sum(instanceStats);
            loglk /= sum;
            text.append("Clustered Instances\n");

            for (i = 0; i < cc; i++) {
                if (instanceStats[i] > 0) {
                    text.append(Utils.doubleToString((double) i, clustFieldWidth, 0) + "      "
                            + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) + " ("
                            + Utils.doubleToString((instanceStats[i] / sum * 100.0), 3, 0) + "%)\n");
                }
            }
            if (unclusteredInstances > 0) {
                text.append("\nUnclustered Instances : " + unclusteredInstances);
            }

            if (clusterer instanceof DensityBasedClusterer) {
                text.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");
            }
        }

        return text.toString();
    }

    /**
     * Print the cluster assignments for either the training
     * or the testing data.
     *
     * @param clusterer the clusterer to use for cluster assignments
     * @param trainFileName the train file
     * @param testFileName an optional test file
     * @param attributesToOutput the attributes to print
     * @return a string containing the instance indexes and cluster assigns.
     * @throws Exception if cluster assignments can't be printed
     */
    private static String printClusterings(Clusterer clusterer, String trainFileName, String testFileName,
            Range attributesToOutput) throws Exception {

        StringBuffer text = new StringBuffer();
        int i = 0;
        int cnum;
        DataSource source = null;
        Instance inst;
        Instances structure;

        if (testFileName.length() != 0)
            source = new DataSource(testFileName);
        else
            source = new DataSource(trainFileName);

        structure = source.getStructure();
        while (source.hasMoreElements(structure)) {
            inst = source.nextElement(structure);
            try {
                cnum = clusterer.clusterInstance(inst);

                text.append(i + " " + cnum + " " + attributeValuesString(inst, attributesToOutput) + "\n");
            } catch (Exception e) {
                /*     throw  new Exception('\n' + "Unable to cluster instance\n" 
                 + e.getMessage()); */
                text.append(i + " Unclustered " + attributeValuesString(inst, attributesToOutput) + "\n");
            }
            i++;
        }

        return text.toString();
    }

    /**
     * Builds a string listing the attribute values in a specified range of indices,
     * separated by commas and enclosed in brackets.
     *
     * @param instance the instance to print the values from
     * @param attRange the range of the attributes to list
     * @return a string listing values of the attributes in the range
     */
    private static String attributeValuesString(Instance instance, Range attRange) {
        StringBuffer text = new StringBuffer();
        if (attRange != null) {
            boolean firstOutput = true;
            attRange.setUpper(instance.numAttributes() - 1);
            for (int i = 0; i < instance.numAttributes(); i++)
                if (attRange.isInRange(i)) {
                    if (firstOutput)
                        text.append("(");
                    else
                        text.append(",");
                    text.append(instance.toString(i));
                    firstOutput = false;
                }
            if (!firstOutput)
                text.append(")");
        }
        return text.toString();
    }

    /**
     * Make up the help string giving all the command line options
     *
     * @param clusterer the clusterer to include options for
     * @return a string detailing the valid command line options
     */
    private static String makeOptionString(Clusterer clusterer, boolean globalInfo) {
        StringBuffer optionsText = new StringBuffer("");
        // General options
        optionsText.append("\n\nGeneral options:\n\n");
        optionsText.append("-h or -help\n");
        optionsText.append("\tOutput help information.\n");
        optionsText.append("-synopsis or -info\n");
        optionsText.append("\tOutput synopsis for clusterer (use in conjunction " + " with -h)\n");
        optionsText.append("-t <name of training file>\n");
        optionsText.append("\tSets training file.\n");
        optionsText.append("-T <name of test file>\n");
        optionsText.append("\tSets test file.\n");
        optionsText.append("-l <name of input file>\n");
        optionsText.append("\tSets model input file.\n");
        optionsText.append("-d <name of output file>\n");
        optionsText.append("\tSets model output file.\n");
        optionsText.append("-p <attribute range>\n");
        optionsText.append("\tOutput predictions. Predictions are for " + "training file"
                + "\n\tif only training file is specified," + "\n\totherwise predictions are for the test file."
                + "\n\tThe range specifies attribute values to be output"
                + "\n\twith the predictions. Use '-p 0' for none.\n");
        optionsText.append("-x <number of folds>\n");
        optionsText.append("\tOnly Distribution Clusterers can be cross validated.\n");
        optionsText.append("-s <random number seed>\n");
        optionsText.append("\tSets the seed for randomizing the data in cross-validation\n");
        optionsText.append("-c <class index>\n");
        optionsText.append("\tSet class attribute. If supplied, class is ignored");
        optionsText.append("\n\tduring clustering but is used in a classes to");
        optionsText.append("\n\tclusters evaluation.\n");
        if (clusterer instanceof Drawable) {
            optionsText.append("-g <name of graph file>\n");
            optionsText.append("\tOutputs the graph representation of the clusterer to the file.\n");
        }

        // Get scheme-specific options
        if (clusterer instanceof OptionHandler) {
            optionsText.append("\nOptions specific to " + clusterer.getClass().getName() + ":\n\n");
            Enumeration enu = ((OptionHandler) clusterer).listOptions();

            while (enu.hasMoreElements()) {
                Option option = (Option) enu.nextElement();
                optionsText.append(option.synopsis() + '\n');
                optionsText.append(option.description() + "\n");
            }
        }

        // Get global information (if available)
        if (globalInfo) {
            try {
                String gi = getGlobalInfo(clusterer);
                optionsText.append(gi);
            } catch (Exception ex) {
                // quietly ignore
            }
        }

        return optionsText.toString();
    }

    /**
     * Return the global info (if it exists) for the supplied clusterer
     * 
     * @param clusterer the clusterer to get the global info for
     * @return the global info (synopsis) for the clusterer
     * @throws Exception if there is a problem reflecting on the clusterer
     */
    protected static String getGlobalInfo(Clusterer clusterer) throws Exception {
        BeanInfo bi = Introspector.getBeanInfo(clusterer.getClass());
        MethodDescriptor[] methods;
        methods = bi.getMethodDescriptors();
        Object[] args = {};
        String result = "\nSynopsis for " + clusterer.getClass().getName() + ":\n\n";

        for (int i = 0; i < methods.length; i++) {
            String name = methods[i].getDisplayName();
            Method meth = methods[i].getMethod();
            if (name.equals("globalInfo")) {
                String globalInfo = (String) (meth.invoke(clusterer, args));
                result += globalInfo;
                break;
            }
        }

        return result;
    }

    /**
     * Tests whether the current evaluation object is equal to another
     * evaluation object
     *
     * @param obj the object to compare against
     * @return true if the two objects are equal
     */
    public boolean equals(Object obj) {
        if ((obj == null) || !(obj.getClass().equals(this.getClass())))
            return false;

        ClusterEvaluationEX cmp = (ClusterEvaluationEX) obj;

        if ((m_classToCluster != null) != (cmp.m_classToCluster != null))
            return false;
        if (m_classToCluster != null) {
            for (int i = 0; i < m_classToCluster.length; i++) {
                if (m_classToCluster[i] != cmp.m_classToCluster[i])
                    return false;
            }
        }

        if ((m_clusterAssignments != null) != (cmp.m_clusterAssignments != null))
            return false;
        if (m_clusterAssignments != null) {
            for (int i = 0; i < m_clusterAssignments.length; i++) {
                if (m_clusterAssignments[i] != cmp.m_clusterAssignments[i])
                    return false;
            }
        }

        if (Double.isNaN(m_logL) != Double.isNaN(cmp.m_logL))
            return false;
        if (!Double.isNaN(m_logL)) {
            if (m_logL != cmp.m_logL)
                return false;
        }

        if (m_numClusters != cmp.m_numClusters)
            return false;

        // TODO: better comparison? via members?
        String clusteringResults1 = m_clusteringResults.toString().replaceAll("Elapsed time.*", "");
        String clusteringResults2 = cmp.m_clusteringResults.toString().replaceAll("Elapsed time.*", "");
        if (!clusteringResults1.equals(clusteringResults2))
            return false;

        return true;
    }

    /**
     * Returns the revision string.
     * 
     * @return      the revision
     */
    public String getRevision() {
        return RevisionUtils.extract("$Revision: 8034 $");
    }

    /**
     * Main method for testing this class.
     *
     * @param args the options
     */
    public static void main(String[] args) {
        try {
            if (args.length == 0) {
                throw new Exception("The first argument must be the name of a " + "clusterer");
            }

            String ClustererString = args[0];
            args[0] = "";
            Clusterer newClusterer = AbstractClusterer.forName(ClustererString, null);
            System.out.println(evaluateClusterer(newClusterer, args));
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }
}