adaptedClusteringAlgorithms.MyFarthestFirst.java Source code

Introduction

Here is the source code for adaptedClusteringAlgorithms.MyFarthestFirst.java
Source

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    FarthestFirst.java
 *    Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
 *
 */
package adaptedClusteringAlgorithms;

import weka.clusterers.RandomizableClusterer;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.DistanceFunction;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;

import adaptedSemanticMeasurers.CalculusDir;
import adaptedSemanticMeasurers.CalculusInd;
import adaptedSemanticMeasurers.ChEBIInd;
import adaptedSemanticMeasurers.GOChEBIDir;
import adaptedSemanticMeasurers.GOChEBIInd;
import adaptedSemanticMeasurers.GOInd;
import adaptedSemanticMeasurers.GODir;
import adaptedSemanticMeasurers.ChEBIDir;
import main.MyFirstClusterer;
import main.SESAME;

//--> WEKA's FarthestFirst adapted to use SML's Semantic Measurers as if they were Euclidean

/**
 <!-- globalinfo-start -->
 * Cluster data using the FarthestFirst algorithm.<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.<br/>
 * <br/>
 * Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.<br/>
 * <br/>
 * Notes:<br/>
 * - works as a fast simple approximate clusterer<br/>
 * - modelled after SimpleKMeans, might be a useful initializer for it
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;article{Hochbaum1985,
 *    author = {Hochbaum and Shmoys},
 *    journal = {Mathematics of Operations Research},
 *    number = {2},
 *    pages = {180-184},
 *    title = {A best possible heuristic for the k-center problem},
 *    volume = {10},
 *    year = {1985}
 * }
 * 
 * &#64;inproceedings{Dasgupta2002,
 *    author = {Sanjoy Dasgupta},
 *    booktitle = {15th Annual Conference on Computational Learning Theory},
 *    pages = {351-363},
 *    publisher = {Springer},
 *    title = {Performance Guarantees for Hierarchical Clustering},
 *    year = {2002}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -N &lt;num&gt;
 *  number of clusters. (default = 2).</pre>
 * 
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 1)</pre>
 *  
  * <pre>
 * -A &lt;classname and options&gt;
 *  Distance function to be used for instance comparison
 *  (default adaptedSemanticMeasurers.ChEBIDir)
 * </pre>
 * 
 <!-- options-end -->
 *
 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
 * @version $Revision: 5538 $
 * @see RandomizableClusterer
 */
public class MyFarthestFirst extends RandomizableClusterer implements TechnicalInformationHandler {

    //Todo: rewrite to be fully incremental
    //      cleanup, like deleting m_instances 

    /** for serialization */
    static final long serialVersionUID = 7499838100631329509L;

    /**
     * training instances, not necessary to keep, 
     * could be replaced by m_ClusterCentroids where needed for header info
     */
    protected Instances m_instances;

    /**
     * replace missing values in training instances
     */
    protected ReplaceMissingValues m_ReplaceMissingFilter;

    /**
     * number of clusters to generate
     */
    protected int m_NumClusters = 2;

    /**
     * holds the cluster centroids
     */
    protected Instances m_ClusterCentroids;

    /**
     * attribute min values
     */
    private double[] m_Min;

    /**
     * attribute max values
     */
    private double[] m_Max;

    // This is added to allow a distance calculation process where semantic measurers are used
    // To that the use of DistanceFunction interface must be made
    /** the distance function used. */
    protected DistanceFunction m_DistanceFunction = new ChEBIDir();

    /**
     * Returns a string describing this clusterer
     * @return a description of the evaluator suitable for
     * displaying in the explorer/experimenter gui
     */
    public String globalInfo() {
        return "Cluster data using the FarthestFirst algorithm.\n\n" + "For more information see:\n\n"
                + getTechnicalInformation().toString() + "\n\n" + "Notes:\n"
                + "- works as a fast simple approximate clusterer\n"
                + "- modelled after SimpleKMeans, might be a useful initializer for it";
    }

    /**
     * Returns an instance of a TechnicalInformation object, containing 
     * detailed information about the technical background of this class,
     * e.g., paper reference or book this class is based on.
     * 
     * @return the technical information about this class
     */
    public TechnicalInformation getTechnicalInformation() {
        TechnicalInformation result;
        TechnicalInformation additional;

        result = new TechnicalInformation(Type.ARTICLE);
        result.setValue(Field.AUTHOR, "Hochbaum and Shmoys");
        result.setValue(Field.YEAR, "1985");
        result.setValue(Field.TITLE, "A best possible heuristic for the k-center problem");
        result.setValue(Field.JOURNAL, "Mathematics of Operations Research");
        result.setValue(Field.VOLUME, "10");
        result.setValue(Field.NUMBER, "2");
        result.setValue(Field.PAGES, "180-184");

        additional = result.add(Type.INPROCEEDINGS);
        additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta");
        additional.setValue(Field.TITLE, "Performance Guarantees for Hierarchical Clustering");
        additional.setValue(Field.BOOKTITLE, "15th Annual Conference on Computational Learning Theory");
        additional.setValue(Field.YEAR, "2002");
        additional.setValue(Field.PAGES, "351-363");
        additional.setValue(Field.PUBLISHER, "Springer");

        return result;
    }

    /**
     * Returns default capabilities of the clusterer.
     *
     * @return      the capabilities of this clusterer
     */
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();
        result.enable(Capability.NO_CLASS);

        // attributes
        result.enable(Capability.NOMINAL_ATTRIBUTES);
        result.enable(Capability.NUMERIC_ATTRIBUTES);
        result.enable(Capability.DATE_ATTRIBUTES);
        result.enable(Capability.MISSING_VALUES);

        return result;
    }

    /**
     * Generates a clusterer. Has to initialize all fields of the clusterer
     * that are not being set via options.
     *
     * @param data set of instances serving as training data 
     * @throws Exception if the clusterer has not been 
     * generated successfully
     */
    public void buildClusterer(Instances data) throws Exception {

        if (!SESAME.SESAME_GUI)
            MyFirstClusterer.weka_gui = true;

        // can clusterer handle the data?
        getCapabilities().testWithFail(data);

        //long start = System.currentTimeMillis();

        m_ReplaceMissingFilter = new ReplaceMissingValues();
        // Missing values replacement is not required so this modification is made
        /*m_ReplaceMissingFilter.setInputFormat(data);
        m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);*/
        Instances m_instances = new Instances(data);

        // To use semantic measurers through DistanceFunction interface
        m_DistanceFunction.setInstances(m_instances);

        initMinMax(m_instances);

        m_ClusterCentroids = new Instances(m_instances, m_NumClusters);

        int n = m_instances.numInstances();
        Random r = new Random(getSeed());
        boolean[] selected = new boolean[n];
        double[] minDistance = new double[n];

        for (int i = 0; i < n; i++)
            minDistance[i] = Double.MAX_VALUE;

        int firstI = r.nextInt(n);
        m_ClusterCentroids.add(m_instances.instance(firstI));
        selected[firstI] = true;

        updateMinDistance(minDistance, selected, m_instances, m_instances.instance(firstI));

        if (m_NumClusters > n)
            m_NumClusters = n;

        for (int i = 1; i < m_NumClusters; i++) {
            int nextI = farthestAway(minDistance, selected);
            m_ClusterCentroids.add(m_instances.instance(nextI));
            selected[nextI] = true;
            updateMinDistance(minDistance, selected, m_instances, m_instances.instance(nextI));
        }

        m_instances = new Instances(m_instances, 0);
        //long end = System.currentTimeMillis();
        //System.out.println("Clustering Time = " + (end-start));

        // Save memory!!
        m_DistanceFunction.clean();

        if (!SESAME.SESAME_GUI)
            MyFirstClusterer.weka_gui = true;
    }

    protected void updateMinDistance(double[] minDistance, boolean[] selected, Instances data, Instance center) {
        for (int i = 0; i < selected.length; i++)
            if (!selected[i]) {
                // Another modification to use DistanceFunction
                // double d = distance(center,data.instance(i));
                double d = m_DistanceFunction.distance(center, data.instance(i));
                if (d < minDistance[i])
                    minDistance[i] = d;
            }
    }

    protected int farthestAway(double[] minDistance, boolean[] selected) {
        double maxDistance = -1.0;
        int maxI = -1;
        for (int i = 0; i < selected.length; i++)
            if (!selected[i])
                if (maxDistance < minDistance[i]) {
                    maxDistance = minDistance[i];
                    maxI = i;
                }
        return maxI;
    }

    protected void initMinMax(Instances data) {
        m_Min = new double[data.numAttributes()];
        m_Max = new double[data.numAttributes()];
        for (int i = 0; i < data.numAttributes(); i++) {
            m_Min[i] = m_Max[i] = Double.NaN;
        }

        for (int i = 0; i < data.numInstances(); i++) {
            updateMinMax(data.instance(i));
        }
    }

    /**
     * Updates the minimum and maximum values for all the attributes
     * based on a new instance.
     *
     * @param instance the new instance
     */
    private void updateMinMax(Instance instance) {

        for (int j = 0; j < instance.numAttributes(); j++) {
            if (Double.isNaN(m_Min[j])) {
                m_Min[j] = instance.value(j);
                m_Max[j] = instance.value(j);
            } else {
                if (instance.value(j) < m_Min[j]) {
                    m_Min[j] = instance.value(j);
                } else {
                    if (instance.value(j) > m_Max[j]) {
                        m_Max[j] = instance.value(j);
                    }
                }
            }
        }
    }

    /**
     * clusters an instance that has been through the filters
     *
     * @param instance the instance to assign a cluster to
     * @return a cluster number
     */
    protected int clusterProcessedInstance(Instance instance) {
        double minDist = Double.MAX_VALUE;
        int bestCluster = 0;
        for (int i = 0; i < m_NumClusters; i++) {
            // Another modification to use DistanceFunction
            //double dist = distance(instance, m_ClusterCentroids.instance(i));
            double dist = m_DistanceFunction.distance(instance, m_ClusterCentroids.instance(i));
            if (dist < minDist) {
                minDist = dist;
                bestCluster = i;
            }
        }
        return bestCluster;
    }

    /**
     * Classifies a given instance.
     *
     * @param instance the instance to be assigned to a cluster
     * @return the number of the assigned cluster as an integer
     * if the class is enumerated, otherwise the predicted value
     * @throws Exception if instance could not be classified
     * successfully
     */
    public int clusterInstance(Instance instance) throws Exception {
        Instance inst = null;
        // Missing values replacement is not required so this modification is made
        // m_ReplaceMissingFilter.input(instance);
        // m_ReplaceMissingFilter.batchFinished();
        // Instance inst = m_ReplaceMissingFilter.output();
        inst = instance;

        return clusterProcessedInstance(inst);
    }

    // The following piece of code is commented because DistanceFunction will be used
    /*  *//**
          * Calculates the distance between two instances
          *
          * @param first the first instance
          * @param second the second instance
          * @return the distance between the two given instances, between 0 and 1
          *//*          
             protected double distance(Instance first, Instance second) {  
                 
             double distance = 0;
             int firstI, secondI;
                 
             for (int p1 = 0, p2 = 0; 
             p1 < first.numValues() || p2 < second.numValues();) {
              if (p1 >= first.numValues()) {
             firstI = m_instances.numAttributes();
              } else {
             firstI = first.index(p1); 
              }
              if (p2 >= second.numValues()) {
             secondI = m_instances.numAttributes();
              } else {
             secondI = second.index(p2);
              }
              if (firstI == m_instances.classIndex()) {
             p1++; continue;
              } 
              if (secondI == m_instances.classIndex()) {
             p2++; continue;
              } 
              double diff;
              if (firstI == secondI) {
             diff = difference(firstI, 
                first.valueSparse(p1),
                second.valueSparse(p2));
             p1++; p2++;
              } else if (firstI > secondI) {
             diff = difference(secondI, 
                0, second.valueSparse(p2));
             p2++;
              } else {
             diff = difference(firstI, 
                first.valueSparse(p1), 0);
             p1++;
              }
              distance += diff * diff;
             }
                 
             return Math.sqrt(distance / m_instances.numAttributes());
             }*/

    // To use DistanceFunction the following Set and Get are necessary
    /**
     * Returns the tip text for this property.
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String distanceFunctionTipText() {
        return "The distance function to use for instances comparison "
                + "(default: adaptedSemanticMeasurers.ChEBIDir). ";
    }

    /**
     * returns the distance function currently in use.
     * 
     * @return the distance function
     */
    public DistanceFunction getDistanceFunction() {
        return m_DistanceFunction;
    }

    /**
     * sets the distance function to use for instance comparison.
     * 
     * @param df the new distance function to use
     * @throws Exception if instances cannot be processed
     */
    public void setDistanceFunction(DistanceFunction df) throws Exception {
        if (!(df instanceof ChEBIInd) && !(df instanceof ChEBIDir) && !(df instanceof GOInd)
                && !(df instanceof GODir) && !(df instanceof GOChEBIInd) && !(df instanceof GOChEBIDir)
                && !(df instanceof CalculusInd) && !(df instanceof CalculusDir)) {
            throw new Exception("MyFarthestFirst does not currently support that distance.");
        }
        m_DistanceFunction = df;
    }

    /**
     * Computes the difference between two given attribute
     * values.
     */
    protected double difference(int index, double val1, double val2) {

        switch (m_instances.attribute(index).type()) {
        case Attribute.NOMINAL:

            // If attribute is nominal
            if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2) || ((int) val1 != (int) val2)) {
                return 1;
            } else {
                return 0;
            }
        case Attribute.NUMERIC:

            // If attribute is numeric
            if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) {
                if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) {
                    return 1;
                } else {
                    double diff;
                    if (Instance.isMissingValue(val2)) {
                        diff = norm(val1, index);
                    } else {
                        diff = norm(val2, index);
                    }
                    if (diff < 0.5) {
                        diff = 1.0 - diff;
                    }
                    return diff;
                }
            } else {
                return norm(val1, index) - norm(val2, index);
            }
        default:
            return 0;
        }
    }

    /**
     * Normalizes a given value of a numeric attribute.
     *
     * @param x the value to be normalized
     * @param i the attribute's index
     * @return the normalized value
     */
    protected double norm(double x, int i) {

        if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i])) {
            return 0;
        } else {
            return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
        }
    }

    /**
     * Returns the number of clusters.
     *
     * @return the number of clusters generated for a training dataset.
     * @throws Exception if number of clusters could not be returned
     * successfully
     */
    public int numberOfClusters() throws Exception {
        return m_NumClusters;
    }

    /**
     * Returns an enumeration describing the available options.
     * 
     * @return an enumeration of all the available options.
     */
    public Enumeration listOptions() {
        Vector result = new Vector();

        result.addElement(new Option("\tnumber of clusters. (default = 2).", "N", 1, "-N <num>"));

        result.add(new Option("\tDistance function to use.\n" + "\t(default: adaptedSemanticMeasurers.ChEBIDir)",
                "A", 1, "-A <classname and options>"));

        Enumeration en = super.listOptions();
        while (en.hasMoreElements())
            result.addElement(en.nextElement());

        return result.elements();
    }

    /**
     * Returns the tip text for this property
     * @return tip text for this property suitable for
     * displaying in the explorer/experimenter gui
     */
    public String numClustersTipText() {
        return "set number of clusters";
    }

    /**
     * set the number of clusters to generate
     *
     * @param n the number of clusters to generate
     * @throws Exception if number of clusters is negative
     */
    public void setNumClusters(int n) throws Exception {
        if (n < 0) {
            throw new Exception("Number of clusters must be > 0");
        }
        m_NumClusters = n;
    }

    /**
     * gets the number of clusters to generate
     *
     * @return the number of clusters to generate
     */
    public int getNumClusters() {
        return m_NumClusters;
    }

    /**
     * Parses a given list of options. <p/>
     * 
     <!-- options-start -->
     * Valid options are: <p/>
     * 
     * <pre> -N &lt;num&gt;
     *  number of clusters. (default = 2).</pre>
     * 
     * <pre> -S &lt;num&gt;
     *  Random number seed.
     *  (default 1)</pre>
     *  
     * <pre>
     * -A &lt;classname and options&gt;
     *  Distance function to be used for instance comparison
     *  (adaptedSemanticMeasurers.ChEBIDir)
     * </pre>
     * 
     <!-- options-end -->
     *
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    @Override
    public void setOptions(String[] options) throws Exception {

        String optionString = Utils.getOption('N', options);

        if (optionString.length() != 0) {
            setNumClusters(Integer.parseInt(optionString));
        }

        String distFunctionClass = Utils.getOption('A', options);
        if (distFunctionClass.length() != 0) {
            String distFunctionClassSpec[] = Utils.splitOptions(distFunctionClass);
            if (distFunctionClassSpec.length == 0) {
                throw new Exception("Invalid DistanceFunction specification string.");
            }
            String className = distFunctionClassSpec[0];
            distFunctionClassSpec[0] = "";

            setDistanceFunction(
                    (DistanceFunction) Utils.forName(DistanceFunction.class, className, distFunctionClassSpec));
        } else {
            setDistanceFunction(new ChEBIDir());
        }

        super.setOptions(options);
    }

    /**
     * Gets the current settings of FarthestFirst
     *
     * @return an array of strings suitable for passing to setOptions()
     */
    public String[] getOptions() {
        int i;
        Vector result;
        String[] options;

        result = new Vector();

        result.add("-N");
        result.add("" + getNumClusters());

        result.add("-A");
        result.add(
                (m_DistanceFunction.getClass().getName() + " " + Utils.joinOptions(m_DistanceFunction.getOptions()))
                        .trim());

        options = super.getOptions();
        for (i = 0; i < options.length; i++)
            result.add(options[i]);

        return (String[]) result.toArray(new String[result.size()]);
    }

    /**
     * return a string describing this clusterer
     *
     * @return a description of the clusterer as a string
     */
    public String toString() {
        StringBuffer temp = new StringBuffer();

        temp.append("\n FarthestFirst\n==============\n");

        temp.append("\nCluster centroids:\n");
        for (int i = 0; i < m_NumClusters; i++) {
            temp.append("\nCluster " + i + "\n\t");
            for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
                if (m_ClusterCentroids.attribute(j).isNominal()) {
                    temp.append(" "
                            + m_ClusterCentroids.attribute(j).value((int) m_ClusterCentroids.instance(i).value(j)));
                } else {
                    temp.append(" " + m_ClusterCentroids.instance(i).value(j));
                }
            }
        }
        temp.append("\n\n");
        return temp.toString();
    }

    /**
     * Gets the the cluster centroids
     * 
     * @return the cluster centroids
     */
    public Instances getClusterCentroids() {
        return m_ClusterCentroids;
    }

    /**
     * Returns the revision string.
     * 
     * @return      the revision
     */
    public String getRevision() {
        return RevisionUtils.extract("$Revision: 5538 $");
    }

    /**
     * Main method for testing this class.
     *
     * @param argv should contain the following arguments: <p>
     * -t training file [-N number of clusters]
     */
    public static void main(String[] argv) {
        runClusterer(new MyFarthestFirst(), argv);
    }
}