// Java tutorial
package categorization; /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * SpectralClusterer.java * Copyright (C) 2002 Luigi Dragone * */ import cern.colt.matrix.*; import cern.colt.matrix.linalg.*; import cern.colt.map.*; import cern.colt.list.*; import cern.jet.math.*; import utils.UtilsJS; import weka.core.*; import weka.clusterers.*; import java.util.*; /** * <p> * Spectral clusterer class. For more information see: * <ul> * <li>Shi, J., and J. Malik (1997) "Normalized Cuts and Image Segmentation", * in Proc. of IEEE Conf. on Comp. Vision and Pattern Recognition, Puerto Rico</li> * <li>Kannan, R., S. Vempala, and A. Vetta (2000) "On Clusterings - Good, Bad and * Spectral", Tech. Report, CS Dept., Yale University.</li> * <li>Ng, A. Y., M. I. Jordan, and Y. Weiss (2001) "On Spectral Clustering: * Analysis and an algorithm", in Advances in Neural Information Processing * Systems 14.</li> * </ul> * </p> * <p> * This implementation assumes that the data instances are points in an Euclidean * space. The algorithm is based on the concept of similarity between points instead * of distance. 
Given two points <var>x</var> and <var>y</var> and their * Euclidean distance <tt>d(x, y)</tt>, their similarity is defined as * <tt>exp(- d(x, y)^2 / (2 * sigma^2))</tt>, where <var>sigma</var> is a * scaling factor (its default value is 1.0).</p> * <p>There is a distance cut factor <var>r</var>, if the distance <tt>d(x, y)</tt> * between two points is greater than <var>r</var> then their similarity is set to 0. * This parameter combined with the use of sparse matrices can improve the * performances w.r.t. the memory.</p> * <p>To classify a new instance w.r.t. the partitions found this implementation * applies a naive min-distance algorithm that assigns the instance to the * cluster that contains the nearest point. Since the distance to similarity * function is bijective and monotone the nearest point is also the most similar * one.</p> * <p> * Valid options are: * <ul> * <li> * -A <0-1> <br> * Specifies the alpha star factor. The algorithm stops the recursive partitioning * when it does not find a cut that has a value below this factor.<br> * Use this argument to limit the number of clusters. * </li> * <li> * -S <positive number> <br> * Specifies the value of the scaling factor sigma. * </li> * <li> * -R <-1 or a positive number> <br> * Specifies the distance cut factor. -1 is equivalent to the positive infinity. * </li> * <li> * -M <br> * Requires the use of sparse representation of similarity matrices. * </li> * </ul> * <p>This implementation relies on the COLT numeric package for Java written by * Wolfgang Hoschek. For other information about COLT see its home * page at * <a href="http://nicewww.cern.ch/~hoschek/colt/index.htm">http://nicewww.cern.ch/~hoschek/colt/index.htm</a>.</p> * According to the license, the copyright notice is reported below:<br> * <pre> * Written by Wolfgang Hoschek. Check the Colt home page for more info. * Copyright © 1999 CERN - European Organization for Nuclear Research. 
* Permission to use, copy, modify, distribute and sell this software and * its documentation for any purpose is hereby granted without fee, * provided that the above copyright notice appear in all copies and that * both that copyright notice and this permission notice appear in * supporting documentation. CERN makes no representations about * the suitability of this software for any purpose. It is provided "as is" * without expressed or implied warranty. * </pre> * </p> * <p> * This software is issued under GNU General Public License.<br> * <pre> * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*</pre> * </p> * * @author Luigi Dragone (<a href="mailto:luigi@luigidragone.com">luigi@luigidragone.com</a>) * @version 1.0 * * @see <a href="http://nicewww.cern.ch/~hoschek/colt/index.htm">COLT</a> */ public class SpectralWEKA extends AbstractClusterer implements OptionHandler { /** * The points of the dataset */ protected DoubleMatrix2D v; /** * The class number of each point in the dataset */ protected int[] cluster; /** * The number of different clusters found */ protected int numOfClusters = 0; /** * The alpha star parameter value */ protected double alpha_star = 0.5; /** * The distance cut factor */ protected double r = -1; /** * The sigma scaling factor */ protected double sigma = 1.0; /** * The using sparse matrix flag */ protected boolean useSparseMatrix = false; protected static Vector options = new Vector(); protected Instances m_Sequences; ArrayList<String> seqList; protected double[][] sim_matrix; boolean partitionOnlyOnce = true; int numPartitions; /** * The static initializer sets up the options vector */ static { options.addElement(new Option("\tAlpha star. (default = 0.5).", "A", 1, "-A <0-1>")); options.addElement(new Option("\tSigma. (default = 1.0).", "S", 1, "-S <num>")); options.addElement(new Option( "\tR. All points that are far away more than this value have a zero similarity. (default = -1).", "R", 1, "-R <num>")); options.addElement(new Option("\tUse sparse matrix representation. (default = false).", "M", 0, "-M")); } /** * Returns the Euclidean distance between two points. It is used to compute * the similarity degree of these ones. * * @param x the first point * @param y the second point * @return the Euclidean distance between the points */ protected static double distnorm2(DoubleMatrix1D x, DoubleMatrix1D y) { DoubleMatrix1D z = x.copy(); z.assign(y, Functions.minus); return z.zDotProduct(z); } /** * Merges two sets of points represented as integer vectors. The sets are * not overlapped. 
* * @param a the first set of points * @param b the second set of points * @return the union of the two sets */ protected static int[] merge(int[] a, int[] b) { int[] v = new int[a.length + b.length]; System.arraycopy(a, 0, v, 0, a.length); System.arraycopy(b, 0, v, a.length, b.length); return v; } /** * Computes the association degree between two partitions of a graph.<br> * The association degree is defined as the sum of the weights of all the * edges between points of the two partitions. * * @param W the weight matrix of the graph * @param a the points of the first partition * @param b the points of the second partition * @return the association degree */ protected static double asso(DoubleMatrix2D W, int[] a, int[] b) { return W.viewSelection(a, b).zSum(); } /** * Returns the normalized association degree between two partitions of a graph. * * @param W the weight matrix of the graph * @param a the points of the first partition * @param b the points of the second partition * @return the normalized association degree */ protected static double Nasso(DoubleMatrix2D W, int[] a, int[] b) { int[] v = merge(a, b); return Nasso(W, a, b, v); } /** * Returns the normalized association degree between two partitions of a * graph w.r.t. a given subgraph. * * @param W the weight matrix of the graph * @param a the points of the first partition * @param b the points of the second partition * @param v the points of the subgraph * @return the normalized association degree */ protected static double Nasso(DoubleMatrix2D W, int[] a, int[] b, int[] v) { return asso(W, a, a) / asso(W, a, v) + asso(W, b, b) / asso(W, b, v); } /** * Returns the normalized dissimilarity degree (or cut) between two partitions * of a graph. 
* * @param W the weight matrix of the graph * @param a the points of the first partition * @param b the points of the second partition * @return the normalized cut */ protected static double Ncut(DoubleMatrix2D W, int[] a, int[] b) { return 2 - Nasso(W, a, b); } /** * Returns the normalized dissimilarity degree (or cut) between two partitions * of a graph w.r.t. a given subgraph. * * @param W the weight matrix of the graph * @param a the points of the first partition * @param b the points of the second partition * @param v the points of the subgraph. * @return the normalized cut */ protected static double Ncut(DoubleMatrix2D W, int[] a, int[] b, int[] v) { return 2 - Nasso(W, a, b, v); } /** * Returns the best cut of a graph w.r.t. the degree of dissimilarity between * points of different partitions and the degree of similarity between points * of the same partition. * * @param W the weight matrix of the graph * @return an array of two elements, each of these contains the points of a * partition */ protected static int[][] bestCut(DoubleMatrix2D W) { int n = W.columns(); // Builds the diagonal matrices D and D^(-1/2) (represented as their diagonals) DoubleMatrix1D d = DoubleFactory1D.dense.make(n); DoubleMatrix1D d_minus_1_2 = DoubleFactory1D.dense.make(n); for (int i = 0; i < n; i++) { double d_i = W.viewRow(i).zSum(); d.set(i, d_i); d_minus_1_2.set(i, 1 / Math.sqrt(d_i)); } DoubleMatrix2D D = DoubleFactory2D.sparse.diagonal(d); //System.out.println("DoubleMatrix2D :\n"+D.toString()); DoubleMatrix2D X = D.copy(); //System.out.println("DoubleMatrix2D copy :\n"+X.toString()); // X = D^(-1/2) * (D - W) * D^(-1/2) X.assign(W, Functions.minus); //System.out.println("DoubleMatrix2D X: (D-W) :\n"+X.toString()); for (int i = 0; i < n; i++) for (int j = 0; j < n; j++) X.set(i, j, X.get(i, j) * d_minus_1_2.get(i) * d_minus_1_2.get(j)); // Computes the eigenvalues and the eigenvectors of X EigenvalueDecomposition e = new EigenvalueDecomposition(X); DoubleMatrix1D lambda = 
e.getRealEigenvalues(); // Selects the eigenvector z_2 associated with the second smallest eigenvalue // Creates a map that contains the pairs <index, eigenvalue> AbstractIntDoubleMap map = new OpenIntDoubleHashMap(n); for (int i = 0; i < n; i++) map.put(i, Math.abs(lambda.get(i))); IntArrayList list = new IntArrayList(); // Sorts the map on the value map.keysSortedByValue(list); // Gets the index of the second smallest element int i_2 = list.get(1); // y_2 = D^(-1/2) * z_2 DoubleMatrix1D y_2 = e.getV().viewColumn(i_2).copy(); y_2.assign(d_minus_1_2, Functions.mult); // Creates a map that contains the pairs <i, y_2[i]> map.clear(); for (int i = 0; i < n; i++) map.put(i, y_2.get(i)); // Sorts the map on the value map.keysSortedByValue(list); // Search the element in the map previuosly ordered that minimizes the cut // of the partition double best_cut = Double.POSITIVE_INFINITY; int[][] partition = new int[2][]; // The array v contains all the elements of the graph ordered by their // projection on vector y_2 int[] v = list.elements(); // For each admissible splitting point i for (int i = 1; i < n; i++) { // The array a contains all the elements that have a projection on vector // y_2 less or equal to the one of i-th element // The array b contains the remaining elements int[] a = new int[i]; int[] b = new int[n - i]; System.arraycopy(v, 0, a, 0, i); System.arraycopy(v, i, b, 0, n - i); double cut = Ncut(W, a, b, v); if (cut < best_cut) { best_cut = cut; partition[0] = a; partition[1] = b; } } //System.out.println("Partition:"); //UtilsJS.printMatrix(partition); return partition; } /** * Splits recursively the points of the graph while the value of the * best cut found is less of a specified limit (the alpha star factor). 
* * @param W the weight matrix of the graph * @param alpha_star the alpha star factor * @return an array of sets of points (partitions) */ protected int[][] partition(DoubleMatrix2D W, double alpha_star) { numPartitions++; //System.out.println("!"); // If the graph contains only one point if (W.columns() == 1) { int[][] p = new int[1][1]; p[0][0] = 0; return p; // Otherwise } else { // Computes the best cut int[][] cut = bestCut(W); // Computes the value of the found cut double cutVal = Ncut(W, cut[0], cut[1], null); //System.out.println("cutVal = "+cutVal +"\tnumPartitions = "+numPartitions); // If the value is less than alpha star if (cutVal < alpha_star && numPartitions < 2) { // Recursively partitions the first one found ... DoubleMatrix2D W0 = W.viewSelection(cut[0], cut[0]); int[][] p0 = partition(W0, alpha_star); // ... and the second one DoubleMatrix2D W1 = W.viewSelection(cut[1], cut[1]); int[][] p1 = partition(W1, alpha_star); // Merges the partitions found in the previous recursive steps int[][] p = new int[p0.length + p1.length][]; for (int i = 0; i < p0.length; i++) { p[i] = new int[p0[i].length]; for (int j = 0; j < p0[i].length; j++) p[i][j] = cut[0][p0[i][j]]; } for (int i = 0; i < p1.length; i++) { p[i + p0.length] = new int[p1[i].length]; for (int j = 0; j < p1[i].length; j++) p[i + p0.length][j] = cut[1][p1[i][j]]; } return p; } else { // Otherwise returns the partitions found in current step // w/o recursive invocation int[][] p = new int[1][W.columns()]; for (int i = 0; i < p[0].length; i++) p[0][i] = i; return p; } } } /** * Returns the number of clusters found. * * @return the number of clusters */ public int numberOfClusters() throws java.lang.Exception { return numOfClusters; } /** * Classifies an instance w.r.t. the partitions found. It applies a naive * min-distance algorithm. 
* * @param instance the instance to classify * @return the cluster that contains the nearest point to the instance */ public int clusterInstance(Instance instance) throws java.lang.Exception { DoubleMatrix1D u = DoubleFactory1D.dense.make(instance.toDoubleArray()); double min_dist = Double.POSITIVE_INFINITY; int c = -1; for (int i = 0; i < v.rows(); i++) { double dist = distnorm2(u, v.viewRow(i)); if (dist < min_dist) { c = cluster[i]; min_dist = dist; } } return c; } public void buildClusterer(ArrayList<String> seqDB, double[][] sm) { seqList = seqDB; this.setSimMatrix(sm); Attribute seqString = new Attribute("sequence", (FastVector) null); FastVector attrInfo = new FastVector(); attrInfo.addElement(seqString); Instances data = new Instances("data", attrInfo, 0); for (int i = 0; i < seqList.size(); i++) { Instance currentInst = new Instance(1); currentInst.setDataset(data); currentInst.setValue(0, seqList.get(i)); data.add(currentInst); } try { buildClusterer(data); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * Generates a clusterer by the mean of spectral clustering algorithm. 
* * @param data set of instances serving as training data * @exception Exception if the clusterer has not been generated successfully */ public void buildClusterer(Instances data) throws java.lang.Exception { m_Sequences = new Instances(data); int n = data.numInstances(); int k = data.numAttributes(); DoubleMatrix2D w; if (useSparseMatrix) w = DoubleFactory2D.sparse.make(n, n); else w = DoubleFactory2D.dense.make(n, n); double[][] v1 = new double[n][]; for (int i = 0; i < n; i++) v1[i] = data.instance(i).toDoubleArray(); v = DoubleFactory2D.dense.make(v1); double sigma_sq = sigma * sigma; //Sets up similarity matrix for (int i = 0; i < n; i++) for (int j = i; j < n; j++) { /*double dist = distnorm2(v.viewRow(i), v.viewRow(j)); if((r == -1) || (dist < r)) { double sim = Math.exp(- (dist * dist) / (2 * sigma_sq)); w.set(i, j, sim); w.set(j, i, sim); }*/ /* String [] key = {data.instance(i).stringValue(0), data.instance(j).stringValue(0)}; System.out.println(key[0]); System.out.println(key[1]); System.out.println(simScoreMap.containsKey(key)); Double simValue = simScoreMap.get(key);*/ double sim = sim_matrix[i][j]; w.set(i, j, sim); w.set(j, i, sim); } //Partitions points int[][] p = partition(w, alpha_star); //Deploys results numOfClusters = p.length; cluster = new int[n]; for (int i = 0; i < p.length; i++) for (int j = 0; j < p[i].length; j++) cluster[p[i][j]] = i; //System.out.println("Final partition:"); // UtilsJS.printMatrix(p); // System.out.println("Cluster:\n"); // UtilsJS.printArray(cluster); this.numOfClusters = cluster[Utils.maxIndex(cluster)] + 1; // System.out.println("Num clusters:\t"+this.numOfClusters); } /** * Returns a string describing this clusterer * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Cluster data using spectral methods"; } /** * Returns an enumeration describing the available options. 
<p> * * @return an enumeration of all the available options **/ public Enumeration listOptions() { return options.elements(); } /** * Parses a given list of options. * * @param options the list of options as an array of strings * @exception Exception if an option is not supported **/ public void setOptions(String[] options) throws Exception { String optionString = Utils.getOption('A', options); if (optionString.length() != 0) setAlphaStar(Double.parseDouble(optionString)); optionString = Utils.getOption('S', options); if (optionString.length() != 0) setSigma(Double.parseDouble(optionString)); optionString = Utils.getOption('R', options); if (optionString.length() != 0) setR(Double.parseDouble(optionString)); setUseSparseMatrix(Utils.getFlag('M', options)); } /** * Gets the current settings of the options. * * @return an array of strings suitable for passing to setOptions() */ public String[] getOptions() { String[] options = new String[7]; int current = 0; options[current++] = "-A"; options[current++] = "" + Double.toString(getAlphaStar()); options[current++] = "-S"; options[current++] = "" + Double.toString(getSigma()); options[current++] = "-R"; options[current++] = "" + Double.toString(getR()); if (getUseSparseMatrix()) options[current++] = "-M"; while (current < options.length) options[current++] = ""; return options; } /** * Sets the new value of the alpha star factor. * * @param alpah_star the new value (0 < alpha_star < 1) * @exception Exception if alpha_star is not between 0 and 1 */ public void setAlphaStar(double alpha_star) throws Exception { if ((alpha_star > 0.000) && (alpha_star < 1.1)) this.alpha_star = alpha_star; else throw new Exception("alpha_star must be between 0 and 1"); } /** * Returns the current value of the alpha star factor. 
* * @return the alpha star factor */ public double getAlphaStar() { return alpha_star; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String alphaStarTipText() { return "set maximum allowable normalized cut value. The algorithm stops the recursive partitioning when it does not find a cut that has a value below this factor. Use this argument to limit the number of clusters."; } /** * Sets the new value of the sigma scaling factor. * * @param sigma the new value (sigma > 0) * @exception Exception if sigma is not strictly greater than 0 */ public void setSigma(double sigma) throws Exception { if (sigma > 0) this.sigma = sigma; else throw new Exception("sigma must be a positive number"); } /** * Returns the current value of the sigma factor. * * @return the sigma factor */ public double getSigma() { return sigma; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String sigmaTipText() { return "set the distance scaling factor. The similarity of two point x and y is defined as exp(- d(x, y) / sigma) where d(x, y) is the Euclidean distance between x and y."; } /** * Sets the new value of the r distance cut parameter. * * @param r the new value (r > 0 || r == -1) * @exception Exception if r is not -1 and r is not a positive number */ public void setR(double r) throws Exception { if ((r > 0) || (r == -1)) this.r = r; else throw new Exception("r must be -1 or a positive number"); } /** * Returns the current value of the r distance cur parameter. * * @return the r distance cut parameter */ public double getR() { return r; } /** * Returns the tip text for this property. 
* * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String rTipText() { return "set the maximum distance value, all points that are far away more than this value have a 0 similarity. Use this parameter to obtain a sparse similarity matrix (see -M)."; } /** * Sets the use of sparse representation for similarity matrix. * * @param useSparseMatrix true for sparse matrix representation */ public void setUseSparseMatrix(boolean useSparseMatrix) { this.useSparseMatrix = useSparseMatrix; } /** * Returns the status of using sparse matrix flag. * * @return the status of using sparse matrix flag */ public boolean getUseSparseMatrix() { return useSparseMatrix; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String useSparseMatrixTipText() { return "use sparse representation for similarity matrix. It can improve the memory efficiency"; } public void setSimMatrix(double[][] sm) { sim_matrix = sm; } public int getClusterSize(int clusterNum) { int count = 0; for (int i = 0; i < seqList.size(); i++) { if (cluster[i] == clusterNum) count++; } return count; } public ArrayList<String> getClusterIDs(int clusterNum) { ArrayList<String> clusterSeqs = new ArrayList<String>(); for (int i = 0; i < seqList.size(); i++) { if (cluster[i] == clusterNum) clusterSeqs.add(seqList.get(i)); } return clusterSeqs; } public int getClusterNumber(String objectID) { int datasetIndex = -1; for (int i = 0; i < m_Sequences.numInstances(); i++) { if (objectID.equals(m_Sequences.instance(i).stringValue(0))) datasetIndex = i; } return cluster[datasetIndex]; } /** * Constructor. **/ public SpectralWEKA() { numPartitions = 0; } }