Java tutorial
/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * FarthestFirst.java * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand * */ package adaptedClusteringAlgorithms; import weka.clusterers.RandomizableClusterer; import weka.core.Attribute; import weka.core.Capabilities; import weka.core.DistanceFunction; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.RevisionUtils; import weka.core.TechnicalInformation; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.core.Capabilities.Capability; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.filters.unsupervised.attribute.ReplaceMissingValues; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import adaptedSemanticMeasurers.CalculusDir; import adaptedSemanticMeasurers.CalculusInd; import adaptedSemanticMeasurers.ChEBIInd; import adaptedSemanticMeasurers.GOChEBIDir; import adaptedSemanticMeasurers.GOChEBIInd; import adaptedSemanticMeasurers.GOInd; import adaptedSemanticMeasurers.GODir; import adaptedSemanticMeasurers.ChEBIDir; import main.MyFirstClusterer; import main.SESAME; //--> WEKA's FarthestFirst adapted to use SML's Semantic Measurers as if they were Euclidean /** <!-- globalinfo-start --> * Cluster data using the FarthestFirst algorithm.<br/> * <br/> * For more information see:<br/> * <br/> * Hochbaum, Shmoys (1985). A best possible heuristic for the k-center problem. Mathematics of Operations Research. 10(2):180-184.<br/> * <br/> * Sanjoy Dasgupta: Performance Guarantees for Hierarchical Clustering. In: 15th Annual Conference on Computational Learning Theory, 351-363, 2002.<br/> * <br/> * Notes:<br/> * - works as a fast simple approximate clusterer<br/> * - modelled after SimpleKMeans, might be a useful initializer for it * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @article{Hochbaum1985, * author = {Hochbaum and Shmoys}, * journal = {Mathematics of Operations Research}, * number = {2}, * pages = {180-184}, * title = {A best possible heuristic for the k-center problem}, * volume = {10}, * year = {1985} * } * * @inproceedings{Dasgupta2002, * author = {Sanjoy Dasgupta}, * booktitle = {15th Annual Conference on Computational Learning Theory}, * pages = {351-363}, * publisher = {Springer}, * title = {Performance Guarantees for Hierarchical Clustering}, * year = {2002} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -N <num> * number of clusters. (default = 2).</pre> * * <pre> -S <num> * Random number seed. * (default 1)</pre> * * <pre> * -A <classname and options> * Distance function to be used for instance comparison * (default adaptedSemanticMeasurers.ChEBIDir) * </pre> * <!-- options-end --> * * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz) * @version $Revision: 5538 $ * @see RandomizableClusterer */ public class MyFarthestFirst extends RandomizableClusterer implements TechnicalInformationHandler { //Todo: rewrite to be fully incremental // cleanup, like deleting m_instances /** for serialization */ static final long serialVersionUID = 7499838100631329509L; /** * training instances, not necessary to keep, * could be replaced by m_ClusterCentroids where needed for header info */ protected Instances m_instances; /** * replace missing values in training instances */ protected ReplaceMissingValues m_ReplaceMissingFilter; /** * number of clusters to generate */ protected int m_NumClusters = 2; /** * holds the cluster centroids */ protected Instances m_ClusterCentroids; /** * attribute min values */ private double[] m_Min; /** * attribute max values */ private double[] m_Max; // This is added to allow a distance calculation process where semantic measurers are used // To that the use of DistanceFunction interface must be made /** the distance function used. */ protected DistanceFunction m_DistanceFunction = new ChEBIDir(); /** * Returns a string describing this clusterer * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Cluster data using the FarthestFirst algorithm.\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString() + "\n\n" + "Notes:\n" + "- works as a fast simple approximate clusterer\n" + "- modelled after SimpleKMeans, might be a useful initializer for it"; } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; TechnicalInformation additional; result = new TechnicalInformation(Type.ARTICLE); result.setValue(Field.AUTHOR, "Hochbaum and Shmoys"); result.setValue(Field.YEAR, "1985"); result.setValue(Field.TITLE, "A best possible heuristic for the k-center problem"); result.setValue(Field.JOURNAL, "Mathematics of Operations Research"); result.setValue(Field.VOLUME, "10"); result.setValue(Field.NUMBER, "2"); result.setValue(Field.PAGES, "180-184"); additional = result.add(Type.INPROCEEDINGS); additional.setValue(Field.AUTHOR, "Sanjoy Dasgupta"); additional.setValue(Field.TITLE, "Performance Guarantees for Hierarchical Clustering"); additional.setValue(Field.BOOKTITLE, "15th Annual Conference on Computational Learning Theory"); additional.setValue(Field.YEAR, "2002"); additional.setValue(Field.PAGES, "351-363"); additional.setValue(Field.PUBLISHER, "Springer"); return result; } /** * Returns default capabilities of the clusterer. * * @return the capabilities of this clusterer */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); result.enable(Capability.NO_CLASS); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); return result; } /** * Generates a clusterer. Has to initialize all fields of the clusterer * that are not being set via options. * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been * generated successfully */ public void buildClusterer(Instances data) throws Exception { if (!SESAME.SESAME_GUI) MyFirstClusterer.weka_gui = true; // can clusterer handle the data? getCapabilities().testWithFail(data); //long start = System.currentTimeMillis(); m_ReplaceMissingFilter = new ReplaceMissingValues(); // Missing values replacement is not required so this modification is made /*m_ReplaceMissingFilter.setInputFormat(data); m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);*/ Instances m_instances = new Instances(data); // To use semantic measurers through DistanceFunction interface m_DistanceFunction.setInstances(m_instances); initMinMax(m_instances); m_ClusterCentroids = new Instances(m_instances, m_NumClusters); int n = m_instances.numInstances(); Random r = new Random(getSeed()); boolean[] selected = new boolean[n]; double[] minDistance = new double[n]; for (int i = 0; i < n; i++) minDistance[i] = Double.MAX_VALUE; int firstI = r.nextInt(n); m_ClusterCentroids.add(m_instances.instance(firstI)); selected[firstI] = true; updateMinDistance(minDistance, selected, m_instances, m_instances.instance(firstI)); if (m_NumClusters > n) m_NumClusters = n; for (int i = 1; i < m_NumClusters; i++) { int nextI = farthestAway(minDistance, selected); m_ClusterCentroids.add(m_instances.instance(nextI)); selected[nextI] = true; updateMinDistance(minDistance, selected, m_instances, m_instances.instance(nextI)); } m_instances = new Instances(m_instances, 0); //long end = System.currentTimeMillis(); //System.out.println("Clustering Time = " + (end-start)); // Save memory!! m_DistanceFunction.clean(); if (!SESAME.SESAME_GUI) MyFirstClusterer.weka_gui = true; } protected void updateMinDistance(double[] minDistance, boolean[] selected, Instances data, Instance center) { for (int i = 0; i < selected.length; i++) if (!selected[i]) { // Another modification to use DistanceFunction // double d = distance(center,data.instance(i)); double d = m_DistanceFunction.distance(center, data.instance(i)); if (d < minDistance[i]) minDistance[i] = d; } } protected int farthestAway(double[] minDistance, boolean[] selected) { double maxDistance = -1.0; int maxI = -1; for (int i = 0; i < selected.length; i++) if (!selected[i]) if (maxDistance < minDistance[i]) { maxDistance = minDistance[i]; maxI = i; } return maxI; } protected void initMinMax(Instances data) { m_Min = new double[data.numAttributes()]; m_Max = new double[data.numAttributes()]; for (int i = 0; i < data.numAttributes(); i++) { m_Min[i] = m_Max[i] = Double.NaN; } for (int i = 0; i < data.numInstances(); i++) { updateMinMax(data.instance(i)); } } /** * Updates the minimum and maximum values for all the attributes * based on a new instance. * * @param instance the new instance */ private void updateMinMax(Instance instance) { for (int j = 0; j < instance.numAttributes(); j++) { if (Double.isNaN(m_Min[j])) { m_Min[j] = instance.value(j); m_Max[j] = instance.value(j); } else { if (instance.value(j) < m_Min[j]) { m_Min[j] = instance.value(j); } else { if (instance.value(j) > m_Max[j]) { m_Max[j] = instance.value(j); } } } } } /** * clusters an instance that has been through the filters * * @param instance the instance to assign a cluster to * @return a cluster number */ protected int clusterProcessedInstance(Instance instance) { double minDist = Double.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < m_NumClusters; i++) { // Another modification to use DistanceFunction //double dist = distance(instance, m_ClusterCentroids.instance(i)); double dist = m_DistanceFunction.distance(instance, m_ClusterCentroids.instance(i)); if (dist < minDist) { minDist = dist; bestCluster = i; } } return bestCluster; } /** * Classifies a given instance. * * @param instance the instance to be assigned to a cluster * @return the number of the assigned cluster as an integer * if the class is enumerated, otherwise the predicted value * @throws Exception if instance could not be classified * successfully */ public int clusterInstance(Instance instance) throws Exception { Instance inst = null; // Missing values replacement is not required so this modification is made // m_ReplaceMissingFilter.input(instance); // m_ReplaceMissingFilter.batchFinished(); // Instance inst = m_ReplaceMissingFilter.output(); inst = instance; return clusterProcessedInstance(inst); } // The following piece of code is commented because DistanceFunction will be used /* *//** * Calculates the distance between two instances * * @param first the first instance * @param second the second instance * @return the distance between the two given instances, between 0 and 1 *//* protected double distance(Instance first, Instance second) { double distance = 0; int firstI, secondI; for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues();) { if (p1 >= first.numValues()) { firstI = m_instances.numAttributes(); } else { firstI = first.index(p1); } if (p2 >= second.numValues()) { secondI = m_instances.numAttributes(); } else { secondI = second.index(p2); } if (firstI == m_instances.classIndex()) { p1++; continue; } if (secondI == m_instances.classIndex()) { p2++; continue; } double diff; if (firstI == secondI) { diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2)); p1++; p2++; } else if (firstI > secondI) { diff = difference(secondI, 0, second.valueSparse(p2)); p2++; } else { diff = difference(firstI, first.valueSparse(p1), 0); p1++; } distance += diff * diff; } return Math.sqrt(distance / m_instances.numAttributes()); }*/ // To use DistanceFunction the following Set and Get are necessary /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String distanceFunctionTipText() { return "The distance function to use for instances comparison " + "(default: adaptedSemanticMeasurers.ChEBIDir). "; } /** * returns the distance function currently in use. * * @return the distance function */ public DistanceFunction getDistanceFunction() { return m_DistanceFunction; } /** * sets the distance function to use for instance comparison. * * @param df the new distance function to use * @throws Exception if instances cannot be processed */ public void setDistanceFunction(DistanceFunction df) throws Exception { if (!(df instanceof ChEBIInd) && !(df instanceof ChEBIDir) && !(df instanceof GOInd) && !(df instanceof GODir) && !(df instanceof GOChEBIInd) && !(df instanceof GOChEBIDir) && !(df instanceof CalculusInd) && !(df instanceof CalculusDir)) { throw new Exception("MyFarthestFirst does not currently support that distance."); } m_DistanceFunction = df; } /** * Computes the difference between two given attribute * values. */ protected double difference(int index, double val1, double val2) { switch (m_instances.attribute(index).type()) { case Attribute.NOMINAL: // If attribute is nominal if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2) || ((int) val1 != (int) val2)) { return 1; } else { return 0; } case Attribute.NUMERIC: // If attribute is numeric if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) { if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) { return 1; } else { double diff; if (Instance.isMissingValue(val2)) { diff = norm(val1, index); } else { diff = norm(val2, index); } if (diff < 0.5) { diff = 1.0 - diff; } return diff; } } else { return norm(val1, index) - norm(val2, index); } default: return 0; } } /** * Normalizes a given value of a numeric attribute. * * @param x the value to be normalized * @param i the attribute's index * @return the normalized value */ protected double norm(double x, int i) { if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i])) { return 0; } else { return (x - m_Min[i]) / (m_Max[i] - m_Min[i]); } } /** * Returns the number of clusters. * * @return the number of clusters generated for a training dataset. * @throws Exception if number of clusters could not be returned * successfully */ public int numberOfClusters() throws Exception { return m_NumClusters; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector result = new Vector(); result.addElement(new Option("\tnumber of clusters. (default = 2).", "N", 1, "-N <num>")); result.add(new Option("\tDistance function to use.\n" + "\t(default: adaptedSemanticMeasurers.ChEBIDir)", "A", 1, "-A <classname and options>")); Enumeration en = super.listOptions(); while (en.hasMoreElements()) result.addElement(en.nextElement()); return result.elements(); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String numClustersTipText() { return "set number of clusters"; } /** * set the number of clusters to generate * * @param n the number of clusters to generate * @throws Exception if number of clusters is negative */ public void setNumClusters(int n) throws Exception { if (n < 0) { throw new Exception("Number of clusters must be > 0"); } m_NumClusters = n; } /** * gets the number of clusters to generate * * @return the number of clusters to generate */ public int getNumClusters() { return m_NumClusters; } /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -N <num> * number of clusters. (default = 2).</pre> * * <pre> -S <num> * Random number seed. * (default 1)</pre> * * <pre> * -A <classname and options> * Distance function to be used for instance comparison * (adaptedSemanticMeasurers.ChEBIDir) * </pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ @Override public void setOptions(String[] options) throws Exception { String optionString = Utils.getOption('N', options); if (optionString.length() != 0) { setNumClusters(Integer.parseInt(optionString)); } String distFunctionClass = Utils.getOption('A', options); if (distFunctionClass.length() != 0) { String distFunctionClassSpec[] = Utils.splitOptions(distFunctionClass); if (distFunctionClassSpec.length == 0) { throw new Exception("Invalid DistanceFunction specification string."); } String className = distFunctionClassSpec[0]; distFunctionClassSpec[0] = ""; setDistanceFunction( (DistanceFunction) Utils.forName(DistanceFunction.class, className, distFunctionClassSpec)); } else { setDistanceFunction(new ChEBIDir()); } super.setOptions(options); } /** * Gets the current settings of FarthestFirst * * @return an array of strings suitable for passing to setOptions() */ public String[] getOptions() { int i; Vector result; String[] options; result = new Vector(); result.add("-N"); result.add("" + getNumClusters()); result.add("-A"); result.add( (m_DistanceFunction.getClass().getName() + " " + Utils.joinOptions(m_DistanceFunction.getOptions())) .trim()); options = super.getOptions(); for (i = 0; i < options.length; i++) result.add(options[i]); return (String[]) result.toArray(new String[result.size()]); } /** * return a string describing this clusterer * * @return a description of the clusterer as a string */ public String toString() { StringBuffer temp = new StringBuffer(); temp.append("\n FarthestFirst\n==============\n"); temp.append("\nCluster centroids:\n"); for (int i = 0; i < m_NumClusters; i++) { temp.append("\nCluster " + i + "\n\t"); for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) { if (m_ClusterCentroids.attribute(j).isNominal()) { temp.append(" " + m_ClusterCentroids.attribute(j).value((int) m_ClusterCentroids.instance(i).value(j))); } else { temp.append(" " + m_ClusterCentroids.instance(i).value(j)); } } } temp.append("\n\n"); return temp.toString(); } /** * Gets the the cluster centroids * * @return the cluster centroids */ public Instances getClusterCentroids() { return m_ClusterCentroids; } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 5538 $"); } /** * Main method for testing this class. * * @param argv should contain the following arguments: <p> * -t training file [-N number of clusters] */ public static void main(String[] argv) { runClusterer(new MyFarthestFirst(), argv); } }