no.ntnu.idi.dm.clustering.KMeans.java — source code

Java tutorial

Introduction

Below is the source code for no.ntnu.idi.dm.clustering.KMeans.java.

Source

package no.ntnu.idi.dm.clustering;

/*
 * Copyright 2004-2010 Information & Software Engineering Group (188/1)
 *                     Institute of Software Technology and Interactive Systems
 *                     Vienna University of Technology, Austria
 *                     
 *                     and myself
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Vector;

import no.ntnu.idi.dm.distancemetrics.EuclideanDistance;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.math3.analysis.function.Max;
import org.apache.commons.math3.random.JDKRandomGenerator;
import org.apache.commons.math3.random.RandomGenerator;

/**
 * Pretty much the classic K-Means clustering. Tried to keep it simple, though.
 *
 * <p>Usage: construct with the desired {@code k} and the data matrix (one row per
 * instance, one column per attribute), then call {@link #assignAndUpdate()} to
 * iterate until the cluster assignments converge.
 *
 * @author Robert Neumayer
 * @version $Id: KMeans.java 3921 2010-11-05 12:54:53Z mayer $
 */

public class KMeans {

    /** The data set: one row per instance, one column per attribute. */
    protected double[][] data;

    private int k; // the number of desired clusters
    private int numberOfInstances; // the number of data points
    private int numberOfAttributes; // the number of attributes (dimensions) each data point has

    // keep min, max values and the differences between them
    private double[] minValues, maxValues, differences;

    // this one will be used for looking up cluster assignments
    // which in turn will help to terminate the training process
    // as soon as we don't experience any changes in cluster assignments
    private Hashtable<Integer, Integer> instancesInClusters;

    /** The current clusters; may shrink when empty clusters are removed. */
    protected Cluster[] clusters;

    // TODO remove this after testing
    private static long RANDOM_SEED = 1234567;

    // Number of assignment changes seen in the previous training step; used to
    // detect oscillation (same number of changes two steps in a row).
    private int lastNumberOfUpdates = Integer.MAX_VALUE;

    /**
     * Default constructor (as much defaulting as possible). Uses random
     * initialisation on instances and Euclidean distance.
     *
     * @param k
     *            number of clusters
     * @param data
     *            the data set, one row per instance
     */
    public KMeans(int k, double[][] data) {
        this(k, data, new EuclideanDistance());
    }

    /**
     * Construct a new K-Means object.
     *
     * @param k
     *            number of clusters; must be in {@code [1, data.length]}
     * @param data
     *            the data set, one row per instance
     * @param distanceFunction
     *            the distance metric used for assignments
     * @throws IllegalArgumentException
     *             if the data set is empty or {@code k} is out of range
     */
    public KMeans(int k, double[][] data, EuclideanDistance distanceFunction) {
        if (data == null || data.length == 0) {
            throw new IllegalArgumentException("data set must contain at least one instance");
        }
        if (k < 1 || k > data.length) {
            // more clusters than instances would loop forever in the initialisation
            throw new IllegalArgumentException("k must be between 1 and " + data.length + ", got " + k);
        }
        this.k = k;
        this.numberOfAttributes = data[0].length;
        this.numberOfInstances = data.length;
        this.instancesInClusters = new Hashtable<Integer, Integer>();
        this.clusters = new Cluster[k];
        this.data = data;

        // initialise a couple of things
        initMinAndMaxValues();

        // init with random instances
        initClustersRandomlyOnInstances(distanceFunction);

        // this one is to do a first assignment of data points to clusters
        recalculateClusters();
    }

    /* getters and setters */

    public double[] getMinValues() {
        return minValues;
    }

    public double[] getMaxValues() {
        return maxValues;
    }

    public double[] getDifferences() {
        return differences;
    }

    public Cluster[] getClusters() {
        return clusters;
    }

    /**
     * @return Returns the data.
     */
    public double[][] getData() {
        return data;
    }

    /**
     * Update for as long as instances move between clusters. Training stops
     * when a step produces no assignment changes (or the same number of
     * changes as the previous step, to break oscillations). Empty clusters
     * are removed afterwards.
     */
    public void assignAndUpdate() {
        // train for as long as we have updates in the cluster assignments
        boolean update = true;
        while (update) {
            update = recalculateClusters();
        }
        removeEmptyClusters();
    }

    /**
     * Searches for clusters which have no instances assigned. These are then
     * removed. Also shrinks {@code k} to match so that loops over the cluster
     * count stay in bounds.
     */
    private void removeEmptyClusters() {
        Vector<Cluster> nonEmptyClusters = new Vector<Cluster>();
        for (Cluster cluster : clusters) {
            if (cluster.getNumberOfInstances() != 0) {
                nonEmptyClusters.add(cluster);
            }
        }
        clusters = nonEmptyClusters.toArray(new Cluster[nonEmptyClusters.size()]);
        // keep k consistent with the actual number of clusters
        k = clusters.length;
    }

    /**
     * A classic training step in the K-Means world: reassign every instance to
     * its closest cluster, then recompute the centroids.
     *
     * @return whether this step brought any changes or not. Note, this one also
     *         says no if there were as many changes as in the last step.
     */
    private boolean recalculateClusters() {

        boolean recalculate = false;
        int numUpdates = 0;

        for (int i = 0; i < this.data.length; i++) {
            int index = getIndexOfClosestCluster(this.data[i]);

            if (!clusters[index].getIndices().contains(i)) {
                // find the cluster currently holding instance i, if any
                // (during the very first assignment no cluster holds it yet)
                int oldCluster = -1;
                for (int j = 0; j < clusters.length; j++) {
                    if (clusters[j].getIndices().contains(i)) {
                        oldCluster = j;
                        break;
                    }
                }
                // Remove from old cluster (if assigned) and add to new cluster.
                if (oldCluster >= 0) {
                    clusters[oldCluster].removeInstanceIndex(i);
                }
                clusters[index].addIndex(i);

                numUpdates++;
                recalculate = true;
            }
        }

        calculateNewCentroids();

        // terminate as well when the number of changes did not change between
        // two consecutive steps (breaks oscillating assignments)
        if (numUpdates == this.lastNumberOfUpdates) {
            recalculate = false;
        }
        this.lastNumberOfUpdates = numUpdates;

        return recalculate;

    }

    /**
     * Batch calculation of all cluster centroids. Empty clusters get a
     * substitute centroid (the worst-fitting instance of the worst cluster)
     * before recalculation.
     */
    private void calculateNewCentroids() {
        for (int clusterIndex = 0; clusterIndex < clusters.length; clusterIndex++) {
            // go look for a new centroid if no instances are assigned to the
            // cluster
            if (clusters[clusterIndex].getNumberOfInstances() == 0) {
                double[] substitute = getSubstituteCentroid();
                if (substitute != null) {
                    clusters[clusterIndex].setCentroid(substitute);
                }
            }
            clusters[clusterIndex].calculateCentroid(data);
        }
    }

    /**
     * Get a new centroid for empty clusters. We therefore take the instance
     * with the largest SSE to the cluster centroid having the largest SSE. Get
     * the idea? Read slowly.
     *
     * @return a new centroid (rather: a clone thereof :)), or {@code null} if
     *         no suitable instance exists
     */
    private double[] getSubstituteCentroid() {
        double maxSSE = Double.NEGATIVE_INFINITY;
        int maxSSEIndex = -1;
        for (int clusterIndex = 0; clusterIndex < clusters.length; clusterIndex++) {
            clusters[clusterIndex].calculateCentroid(data);
            double currentSSE = clusters[clusterIndex].SSE(data);
            if (currentSSE > maxSSE) {
                maxSSE = currentSSE;
                maxSSEIndex = clusterIndex;
            }
        }

        int instanceIndex = clusters[maxSSEIndex].getInstanceIndexWithMaxSSE(data);
        if (instanceIndex == -1) {
            return null;
        }
        return data[instanceIndex].clone();
    }

    /**
     * Get the index of the closest cluster for the given instance index. Note
     * that in case of equally distant clusters we assign the first found
     * cluster. At the end of the day this means that the clusters with lower
     * indices will have a tendency to be larger. It hopefully won't have too
     * much impact, possibly a random assignment in case of equal weights would
     * make sense, however, this would require a couple of steps more in here.
     *
     * @param instance
     *            the data vector to be assigned
     * @return index of the closest cluster centre
     */
    private int getIndexOfClosestCluster(double[] instance) {
        int indexOfClosestCluster = 0;
        double smallestDistance = Double.POSITIVE_INFINITY;
        for (int clusterIndex = 0; clusterIndex < clusters.length; clusterIndex++) {
            double currentDistance = clusters[clusterIndex].getDistanceToCentroid(instance);
            if (currentDistance < smallestDistance) {
                smallestDistance = currentDistance;
                indexOfClosestCluster = clusterIndex;
            }
        }
        return indexOfClosestCluster;
    }

    /**
     * Take random points from the input data as centroids. Each centroid is
     * drawn from a distinct instance.
     *
     * @param distanceFunction
     *            the distance metric handed to each new {@link Cluster}
     */
    private void initClustersRandomlyOnInstances(EuclideanDistance distanceFunction) {
        // track chosen instance *indices*; comparing double[] references would
        // never detect duplicates since each draw is cloned
        ArrayList<Integer> usedIndices = new ArrayList<Integer>();
        RandomGenerator rg = new JDKRandomGenerator();
        rg.setSeed(RANDOM_SEED);
        // for each cluster
        for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
            // draw a random input; nextInt(bound) is exclusive of bound, so
            // data.length makes every instance (including the last) eligible
            int instanceIndex = rg.nextInt(data.length);
            while (usedIndices.contains(instanceIndex)) {
                instanceIndex = rg.nextInt(data.length);
            }
            usedIndices.add(instanceIndex);
            clusters[clusterIndex] = new Cluster(data[instanceIndex].clone(), distanceFunction);
        }
    }

    /**
     * Utility method to get the min, max, and diff values of the data set. This
     * is used for scaling the (random) values in the initialisation functions.
     */
    private void initMinAndMaxValues() {
        minValues = new double[numberOfAttributes];
        maxValues = new double[numberOfAttributes];
        differences = new double[numberOfAttributes];
        // for each attribute
        for (int j = 0; j < numberOfAttributes; j++) {
            // in each instance (i.e. each single value now :-))
            // NB: use infinities, not Double.MIN_VALUE (which is the smallest
            // *positive* double and breaks the max for all-negative attributes)
            minValues[j] = Double.POSITIVE_INFINITY;
            maxValues[j] = Double.NEGATIVE_INFINITY;
            for (double[] element : data) {
                if (element[j] < minValues[j]) {
                    minValues[j] = element[j];
                }
                if (element[j] > maxValues[j]) {
                    maxValues[j] = element[j];
                }
            }
            differences[j] = maxValues[j] - minValues[j];
        }
    }

    /**
     * Get the sum of the squared error for all clusters.
     *
     * @return SSE.
     */
    public double getSSE() {
        double sse = 0;

        for (int i = 0; i < clusters.length; i++) {
            sse += clusters[i].SSE(data);
        }

        return sse;
    }

    /**
     * Get the sum of squares between clusters.
     *
     * @return SSB.
     */
    public double getSSB() {
        double ssb = 0;
        double[] m = getM();

        for (int i = 0; i < clusters.length; i++) {
            ssb += clusters[i].SSB(m);
        }
        return ssb;
    }

    /**
     * Compute the mean of all cluster centroids.
     *
     * @return Center for all centroids.
     */
    public double[] getM() {

        // Coordinates for the m
        double[] m = new double[clusters[0].getCentroid().length];

        // Sum all coordinates for each cluster.
        for (int i = 0; i < clusters.length; i++) {
            for (int j = 0; j < clusters[i].getCentroid().length; j++) {
                m[j] += clusters[i].getCentroid()[j];
            }
        }

        // Get the mean: divide by the number of centroids summed, not by the
        // number of dimensions
        for (int i = 0; i < m.length; i++) {
            m[i] = m[i] / clusters.length;
        }

        return m;
    }

    /**
     * Get the sum of the squared error for single clusters.
     *
     * @return several SSEs, one per cluster.
     */
    public double[] getSSEs() {
        double[] sse = new double[clusters.length];
        for (int i = 0; i < clusters.length; i++) {
            sse[i] = clusters[i].SSE(this.data);
        }
        return sse;
    }

    /** Print index / SSE / size / centroid coordinates for every cluster. */
    public void printCentroids() {
        for (int i = 0; i < clusters.length; i++) {
            System.out.println(i + " / " + clusters[i].SSE(data) + " / " + clusters[i].getNumberOfInstances()
                    + " / " + ArrayUtils.toString(clusters[i].getCentroid()));
        }
    }

    /** Print index / SSE / size for every cluster (no coordinates). */
    public void printCentroidsShort() {
        for (int i = 0; i < clusters.length; i++) {
            System.out
                    .println("\t" + i + " / " + clusters[i].SSE(data) + " / " + clusters[i].getNumberOfInstances());
        }
    }

    /**
     * Compute the average silhouette coefficient over all assigned instances:
     * (b - a) / max(a, b), where a is the mean intra-cluster distance and b is
     * the mean distance to the nearest *other* cluster.
     *
     * @return the average silhouette value in [-1, 1]
     */
    public double getAverageSilhouetteValue() {
        double silhouette = 0;

        for (int i = 0; i < clusters.length; i++) {
            for (int j = 0; j < clusters[i].getNumberOfInstances(); j++) {

                // Distance from point j in cluster i to all other points in cluster i.
                double a = clusters[i].getAverageDistanceOfPoint(data, clusters[i].getIndices().get(j));
                // Sets b to the max value, because we want the min distance
                // (closest cluster to the point among the *other* clusters)
                double b = Double.MAX_VALUE;

                for (int l = 0; l < clusters.length; l++) {
                    // skip the point's own cluster (comparing against i, not k)
                    if (l != i) {
                        double tmpB = clusters[l].getAverageDistanceOfPointToAllPoints(data,
                                clusters[i].getIndices().get(j));
                        if (tmpB < b) {
                            b = tmpB;
                        }
                    }
                }
                silhouette += (b - a) / Math.max(a, b);
            }
        }
        silhouette /= data.length;

        return silhouette;
    }
}