Java tutorial: AbdoAgglomerativeClusterer, a custom agglomerative (hierarchical) clusterer for Weka
package classes;

import java.io.Serializable;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Vector;

import weka.clusterers.AbstractClusterer;
import weka.core.Capabilities;
import weka.core.CapabilitiesHandler;
import weka.core.DistanceFunction;
import weka.core.Drawable;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;

/**
 * A custom agglomerative (bottom-up) hierarchical clusterer, adapted from
 * Weka's HierarchicalClusterer. In addition to the standard link types it
 * tracks the cophenetic matrix and evaluates each clustering stage with the
 * silhouette coefficient.
 *
 * @author Eng.Abdo
 */
public class AbdoAgglomerativeClusterer extends AbstractClusterer
        implements OptionHandler, CapabilitiesHandler, Drawable {

    private static final long serialVersionUID = 1L;

    /** Whether the clusterer is run in debug mode. */
    protected boolean m_bDebug = false;

    /**
     * Whether the distance represents node height (if false) or branch length
     * (if true).
     */
    protected boolean m_bDistanceIsBranchLength = false;

    /** Training data. */
    Instances m_instances;

    /** Number of clusters desired in the clustering. */
    int m_nNumClusters = 1;

    public void setNumClusters(int nClusters) {
        m_nNumClusters = Math.max(1, nClusters);
    }

    public int getNumClusters() {
        return m_nNumClusters;
    }

    /** Distance function used for comparing members of a cluster. */
    protected DistanceFunction m_DistanceFunction = new EuclideanDistance();

    public DistanceFunction getDistanceFunction() {
        return m_DistanceFunction;
    }

    public void setDistanceFunction(DistanceFunction distanceFunction) {
        m_DistanceFunction = distanceFunction;
    }

    /**
     * Holder of clusters during the clustering process. Each item in this
     * array represents a cluster, labelled by its index; each cluster is a
     * vector of the indices of the instances it contains.
     */
    private Vector<Integer>[] nClusterID;

    /** Cophenetic distances between all pairs of instances. */
    private double[][] copheneticMatrix;

    public double[][] getCopheneticMatrix() {
        return copheneticMatrix;
    }

    public void setCopheneticMatrix(double[][] copheneticMatrix) {
        this.copheneticMatrix = copheneticMatrix;
    }

    /** Matrix of distances between all clusters. */
    private double[][] fDistance0;

    private double[][] similarities;

    public Vector<Integer>[] getnClusterID() {
        return nClusterID;
    }

    public void setnClusterID(Vector<Integer>[] nClusterID) {
        this.nClusterID = nClusterID;
    }

    public double[][] getfDistance0() {
        return fDistance0;
    }

    public void setfDistance0(double[][] fDistance0) {
        this.fDistance0 = fDistance0;
    }

    /**
     * Priority-queue entry used for efficient retrieval of the nearest
     * cluster to the current instance during calculation of the silhouette
     * coefficient.
     */
    class Candidate implements Serializable {

        int clusterIndex;
        double distance;

        public Candidate(int clusterIndex, double distance) {
            this.clusterIndex = clusterIndex;
            this.distance = distance;
        }
    }

    /** Comparator used by the silhouette priority queue. */
    class CandidateComparator implements Comparator<Candidate>, Serializable {

        @Override
        public int compare(Candidate o1, Candidate o2) {
            if (o1.distance < o2.distance) {
                return -1;
            } else if (o1.distance == o2.distance) {
                return 0;
            } else {
                return 1;
            }
        }
    }

    /**
     * Priority-queue entry used for efficient retrieval of the next pair of
     * clusters to merge.
     */
    class Tuple implements Serializable {

        double m_fDist;
        int m_iCluster1;
        int m_iCluster2;
        int m_nClusterSize1;
        int m_nClusterSize2;

        public Tuple(double d, int i, int j, int nSize1, int nSize2) {
            m_fDist = d;
            m_iCluster1 = i;
            m_iCluster2 = j;
            m_nClusterSize1 = nSize1;
            m_nClusterSize2 = nSize2;
        }
    }

    /** Comparator used by the merge priority queue. */
    class TupleComparator implements Comparator<Tuple>, Serializable {

        @Override
        public int compare(Tuple o1, Tuple o2) {
            if (o1.m_fDist < o2.m_fDist) {
                return -1;
            } else if (o1.m_fDist == o2.m_fDist) {
                return 0;
            }
            return 1;
        }
    }

    /** The various link types. */
    final static int SINGLE = 0;
    final static int COMPLETE = 1;
    final static int AVERAGE = 2;
    final static int MEAN = 3;
    final static int CENTROID = 4;
    final static int WARD = 5;
    final static int ADJCOMLPETE = 6; // (sic) spelling kept for option-string compatibility
    final static int NEIGHBOR_JOINING = 7;

    public static final Tag[] TAGS_LINK_TYPE = {
        new Tag(SINGLE, "SINGLE"),
        new Tag(COMPLETE, "COMPLETE"),
        new Tag(AVERAGE, "AVERAGE"),
        new Tag(MEAN, "MEAN"),
        new Tag(CENTROID, "CENTROID"),
        new Tag(WARD, "WARD"),
        new Tag(ADJCOMLPETE, "ADJCOMLPETE"),
        new Tag(NEIGHBOR_JOINING, "NEIGHBOR_JOINING")
    };

    /** Holds the link type used to calculate the distance between clusters. */
    int m_nLinkType = COMPLETE;

    boolean m_bPrintNewick = true;

    public boolean getPrintNewick() {
        return m_bPrintNewick;
    }

    public void setPrintNewick(boolean bPrintNewick) {
        m_bPrintNewick = bPrintNewick;
    }

    public double[][] getSimilarities() {
        return similarities;
    }

    public void setSimilarities(double[][] similarities) {
        this.similarities = similarities;
    }

    public void setLinkType(SelectedTag newLinkType) {
        if (newLinkType.getTags() == TAGS_LINK_TYPE) {
            m_nLinkType = newLinkType.getSelectedTag().getID();
        }
    }

    public SelectedTag getLinkType() {
        return new SelectedTag(m_nLinkType, TAGS_LINK_TYPE);
    }
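    /*
     * Note on the output format: the Node class below serializes the cluster
     * hierarchy in Newick notation, in which a tree is written as nested
     * parenthesized pairs with a branch length after each colon, e.g.
     * ((A:0.1,B:0.1):0.3,C:0.4). This is the format produced by toString(),
     * toString2() and graph().
     */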
    /** A node in the cluster hierarchy. */
    class Node implements Serializable {

        Node m_left;
        Node m_right;
        Node m_parent;
        int m_iLeftInstance;
        int m_iRightInstance;
        double m_fLeftLength = 0;
        double m_fRightLength = 0;
        double m_fHeight = 0;

        /** Newick representation, using the string value of attribute attIndex as leaf label. */
        public String toString(int attIndex) {
            NumberFormat nf = NumberFormat.getNumberInstance(new Locale("en", "US"));
            DecimalFormat myFormatter = (DecimalFormat) nf;
            myFormatter.applyPattern("#.#####");

            if (m_left == null) {
                if (m_right == null) {
                    return "(" + m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_instances.instance(m_iRightInstance).stringValue(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                } else {
                    return "(" + m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_right.toString(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                }
            } else {
                if (m_right == null) {
                    return "(" + m_left.toString(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_instances.instance(m_iRightInstance).stringValue(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                } else {
                    return "(" + m_left.toString(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_right.toString(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                }
            }
        }

        /** Newick representation, using the numeric value of attribute attIndex as leaf label. */
        public String toString2(int attIndex) {
            NumberFormat nf = NumberFormat.getNumberInstance(new Locale("en", "US"));
            DecimalFormat myFormatter = (DecimalFormat) nf;
            myFormatter.applyPattern("#.#####");

            if (m_left == null) {
                if (m_right == null) {
                    return "(" + m_instances.instance(m_iLeftInstance).value(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_instances.instance(m_iRightInstance).value(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                } else {
                    return "(" + m_instances.instance(m_iLeftInstance).value(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_right.toString2(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                }
            } else {
                if (m_right == null) {
                    return "(" + m_left.toString2(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_instances.instance(m_iRightInstance).value(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                } else {
                    return "(" + m_left.toString2(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_right.toString2(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                }
            }
        }

        void setHeight(double fHeight1, double fHeight2) {
            m_fHeight = fHeight1;
            if (m_left == null) {
                m_fLeftLength = fHeight1;
            } else {
                m_fLeftLength = fHeight1 - m_left.m_fHeight;
            }
            if (m_right == null) {
                m_fRightLength = fHeight2;
            } else {
                m_fRightLength = fHeight2 - m_right.m_fHeight;
            }
        }

        void setLength(double fLength1, double fLength2) {
            m_fLeftLength = fLength1;
            m_fRightLength = fLength2;
            m_fHeight = fLength1;
            if (m_left != null) {
                m_fHeight += m_left.m_fHeight;
            }
        }
    }

    Node[] m_clusters;
    int[] m_nClusterNr;

    public int[] getM_nClusterNr() {
        return m_nClusterNr;
    }

    public void setM_nClusterNr(int[] m_nClusterNr) {
        this.m_nClusterNr = m_nClusterNr;
    }

    @Override
    public void buildClusterer(Instances data) throws Exception {
        m_instances = data;
        int nInstances = m_instances.numInstances();
        if (nInstances == 0) {
            return;
        }
        m_DistanceFunction.setInstances(m_instances);

        // Use an array of integer vectors to store cluster membership,
        // starting with one cluster per instance.
        this.nClusterID = new Vector[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            nClusterID[i] = new Vector<Integer>();
            nClusterID[i].add(i);
        }

        int nClusters = data.numInstances();

        // Used for keeping track of the hierarchy.
        Node[] clusterNodes = new Node[nInstances];
        if (m_nLinkType == NEIGHBOR_JOINING) {
            neighborJoining(nClusters, nClusterID, clusterNodes);
        } else {
            doLinkClustering(nClusters, nClusterID, clusterNodes);
        }

        // Number the surviving clusters in the m_nClusterNr array
        // and collect the hierarchy.
        int iCurrent = 0;
        m_clusters = new Node[m_nNumClusters];
        m_nClusterNr = new int[nInstances];
        for (int i = 0; i < nInstances; i++) {
            if (nClusterID[i].size() > 0) {
                for (int j = 0; j < nClusterID[i].size(); j++) {
                    m_nClusterNr[nClusterID[i].elementAt(j)] = iCurrent;
                }
                m_clusters[iCurrent] = clusterNodes[i];
                iCurrent++;
            }
        }
    } // buildClusterer
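    /*
     * Background for neighborJoining() below: at every step the algorithm
     * joins the pair of active clusters (i, j) that minimizes
     *
     *     Q(i, j) = d(i, j) - r(i) - r(j)
     *
     * where d(i, j) is the current pairwise distance and r(i) is the
     * "separation" of cluster i, i.e. its summed distance to all active
     * clusters divided by (nClusters - 2). This is exactly the quantity
     * fVal = fRow[j] - fSep1 - fSep2 computed in the search loops.
     */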
    /**
     * Uses the neighbor-joining algorithm for clustering. This is roughly
     * based on the RapidNJ simple implementation and runs in O(n^3). More
     * efficient implementations exist, see RapidNJ (or my GPU
     * implementation :-)).
     *
     * @param nClusters
     * @param nClusterID
     * @param clusterNodes
     */
    void neighborJoining(int nClusters, Vector<Integer>[] nClusterID, Node[] clusterNodes) {
        int n = m_instances.numInstances();
        System.out.println("NeighborJoining");

        double[][] fDist = new double[nClusters][nClusters];
        for (int i = 0; i < nClusters; i++) {
            fDist[i][i] = 0;
            for (int j = i + 1; j < nClusters; j++) {
                fDist[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
                fDist[j][i] = fDist[i][j];
            }
        }

        double[] fSeparationSums = new double[n];
        double[] fSeparations = new double[n];
        int[] nNextActive = new int[n];

        // Calculate the initial separation rows.
        for (int i = 0; i < n; i++) {
            double fSum = 0;
            for (int j = 0; j < n; j++) {
                fSum += fDist[i][j];
            }
            fSeparationSums[i] = fSum;
            fSeparations[i] = fSum / (nClusters - 2);
            nNextActive[i] = i + 1;
        }

        while (nClusters > 2) {
            // Find the pair minimizing the joining criterion.
            int iMin1 = -1;
            int iMin2 = -1;
            double fMin = Double.MAX_VALUE;
            if (m_bDebug) {
                for (int i = 0; i < n; i++) {
                    if (nClusterID[i].size() > 0) {
                        double[] fRow = fDist[i];
                        double fSep1 = fSeparations[i];
                        for (int j = 0; j < n; j++) {
                            if (nClusterID[j].size() > 0 && i != j) {
                                double fSep2 = fSeparations[j];
                                double fVal = fRow[j] - fSep1 - fSep2;
                                if (fVal < fMin) {
                                    // new minimum
                                    iMin1 = i;
                                    iMin2 = j;
                                    fMin = fVal;
                                }
                            }
                        }
                    }
                }
            } else {
                int i = 0;
                while (i < n) {
                    double fSep1 = fSeparations[i];
                    double[] fRow = fDist[i];
                    int j = nNextActive[i];
                    while (j < n) {
                        double fSep2 = fSeparations[j];
                        double fVal = fRow[j] - fSep1 - fSep2;
                        if (fVal < fMin) {
                            // new minimum
                            iMin1 = i;
                            iMin2 = j;
                            fMin = fVal;
                        }
                        j = nNextActive[j];
                    }
                    i = nNextActive[i];
                }
            }

            // Record the distance.
            double fMinDistance = fDist[iMin1][iMin2];
            nClusters--;
            double fSep1 = fSeparations[iMin1];
            double fSep2 = fSeparations[iMin2];
            double fDist1 = (0.5 * fMinDistance) + (0.5 * (fSep1 - fSep2));
            double fDist2 = (0.5 * fMinDistance) + (0.5 * (fSep2 - fSep1));
            if (nClusters > 2) {
                // Update separations and distances.
                double fNewSeparationSum = 0;
                double fMutualDistance = fDist[iMin1][iMin2];
                double[] fRow1 = fDist[iMin1];
                double[] fRow2 = fDist[iMin2];
                for (int i = 0; i < n; i++) {
                    if (i == iMin1 || i == iMin2 || nClusterID[i].size() == 0) {
                        fRow1[i] = 0;
                    } else {
                        double fVal1 = fRow1[i];
                        double fVal2 = fRow2[i];
                        double fDistance = (fVal1 + fVal2 - fMutualDistance) / 2.0;
                        fNewSeparationSum += fDistance;
                        // Update the separation sum of cluster i.
                        fSeparationSums[i] += (fDistance - fVal1 - fVal2);
                        fSeparations[i] = fSeparationSums[i] / (nClusters - 2);
                        fRow1[i] = fDistance;
                        fDist[i][iMin1] = fDistance;
                    }
                }
                fSeparationSums[iMin1] = fNewSeparationSum;
                fSeparations[iMin1] = fNewSeparationSum / (nClusters - 2);
                fSeparationSums[iMin2] = 0;
                merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
                int iPrev = iMin2;
                // Since iMin1 < iMin2 we have nNextActive[0] >= 0, so the next loop should be safe.
                while (nClusterID[iPrev].size() == 0) {
                    iPrev--;
                }
                nNextActive[iPrev] = nNextActive[iMin2];
            } else {
                merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
                break;
            }
        }

        // Join the last two remaining clusters.
        for (int i = 0; i < n; i++) {
            if (nClusterID[i].size() > 0) {
                for (int j = i + 1; j < n; j++) {
                    if (nClusterID[j].size() > 0) {
                        double fDist1 = fDist[i][j];
                        if (nClusterID[i].size() == 1) {
                            merge(i, j, fDist1, 0, nClusterID, clusterNodes);
                        } else if (nClusterID[j].size() == 1) {
                            merge(i, j, 0, fDist1, nClusterID, clusterNodes);
                        } else {
                            merge(i, j, fDist1 / 2.0, fDist1 / 2.0, nClusterID, clusterNodes);
                        }
                        break;
                    }
                }
            }
        }
    } // neighborJoining
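    /*
     * Implementation note for doLinkClustering() below: the priority queue
     * holds one Tuple per candidate merge, tagged with the cluster sizes at
     * the time the tuple was created. Stale entries, i.e. tuples whose
     * recorded sizes no longer match the live clusters, are simply polled
     * and discarded. This avoids expensive deletions from the queue and
     * keeps the overall algorithm at O(n^2 log n).
     */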
    /**
     * Performs clustering using a link method. This implementation uses a
     * priority queue, resulting in an O(n^2 log(n)) algorithm.
     *
     * @param nClusters number of clusters
     * @param nClusterID
     * @param clusterNodes
     */
    void doLinkClustering(int nClusters, Vector<Integer>[] nClusterID, Node[] clusterNodes) {
        Map<Integer, Double> k_var_map = new HashMap<>();
        int nInstances = m_instances.numInstances();
        PriorityQueue<Tuple> queue =
                new PriorityQueue<Tuple>(nClusters * nClusters / 2, new TupleComparator());
        this.fDistance0 = new double[nClusters][nClusters];
        this.copheneticMatrix = new double[nClusters][nClusters];
        double[][] fClusterDistance = null;
        if (m_bDebug) {
            fClusterDistance = new double[nClusters][nClusters];
        }
        for (int i = 0; i < nClusters; i++) {
            fDistance0[i][i] = 0;
            for (int j = i + 1; j < nClusters; j++) {
                if (m_bDebug) {
                    System.out.println("i,j " + i + " ," + j);
                }
                fDistance0[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
                fDistance0[j][i] = fDistance0[i][j];
                queue.add(new Tuple(fDistance0[i][j], i, j, 1, 1));
                if (m_bDebug) {
                    fClusterDistance[i][j] = fDistance0[i][j];
                    fClusterDistance[j][i] = fDistance0[i][j];
                }
            }
        }
        while (nClusters > m_nNumClusters) {
            int iMin1 = -1;
            int iMin2 = -1;
            // Find the closest two clusters.
            if (m_bDebug) {
                /* Simple but inefficient implementation. */
                double fMinDistance = Double.MAX_VALUE;
                for (int i = 0; i < nInstances; i++) {
                    if (nClusterID[i].size() > 0) {
                        for (int j = i + 1; j < nInstances; j++) {
                            if (nClusterID[j].size() > 0) {
                                double fDist = fClusterDistance[i][j];
                                if (fDist < fMinDistance) {
                                    fMinDistance = fDist;
                                    iMin1 = i;
                                    iMin2 = j;
                                }
                            }
                        }
                    }
                }
                merge(iMin1, iMin2, fMinDistance, fMinDistance, nClusterID, clusterNodes);
            } else {
                // Use the priority queue to find the next best pair to cluster.
                Tuple t;
                do {
                    t = queue.poll();
                    // Skip stale entries whose recorded sizes no longer match
                    // the live clusters. (Assumes the queue still holds a
                    // valid pair while nClusters > m_nNumClusters.)
                } while (t != null && (nClusterID[t.m_iCluster1].size() != t.m_nClusterSize1
                        || nClusterID[t.m_iCluster2].size() != t.m_nClusterSize2));
                iMin1 = t.m_iCluster1;
                iMin2 = t.m_iCluster2;

                // Update the cophenetic matrix: the cophenetic distance of two
                // instances is the merge distance at which they first end up
                // in the same cluster.
                Iterator<Integer> iterator = nClusterID[iMin1].iterator();
                while (iterator.hasNext()) {
                    int i = iterator.next();
                    Iterator<Integer> iterator2 = nClusterID[iMin2].iterator();
                    while (iterator2.hasNext()) {
                        int j = iterator2.next();
                        if (copheneticMatrix[i][j] == 0.0) {
                            copheneticMatrix[i][j] = t.m_fDist;
                            copheneticMatrix[j][i] = t.m_fDist;
                        }
                    }
                }
                merge(iMin1, iMin2, t.m_fDist, t.m_fDist, nClusterID, clusterNodes);
            }

            // Update distances and the queue.
            for (int i = 0; i < nInstances; i++) {
                if (i != iMin1 && nClusterID[i].size() != 0) {
                    int i1 = Math.min(iMin1, i);
                    int i2 = Math.max(iMin1, i);
                    double fDistance = getDistance(fDistance0, nClusterID[i1], nClusterID[i2]);
                    if (m_bDebug) {
                        fClusterDistance[i1][i2] = fDistance;
                        fClusterDistance[i2][i1] = fDistance;
                    }
                    queue.add(new Tuple(fDistance, i1, i2, nClusterID[i1].size(), nClusterID[i2].size()));
                }
            }
            nClusters--;

            // Evaluate the overall quality (goodness) of the current clustering.
            int count = 0;
            double total_sil = 0.0;
            double totalVariance = 0.0;
            for (int x = 0; x < nClusterID.length; x++) {
                if (nClusterID[x].size() != 0) {
                    count += 1;
                    total_sil += calcSilhouette(nClusterID[x], x);
                    // totalVariance += calcWithinVariance(nClusterID[x]);
                }
            }
            System.out.println("\n Number of clusters at this stage is : " + count);
            k_var_map.put(count, (double) (total_sil / m_instances.numInstances()));
            // System.out.println(" Total variance is: " + totalVariance);
            // buildIdealizedSimilarityMatrix(nClusterID);
        }
        // ExcelUtility and Correlator are project-local helper classes in the
        // same package; note the machine-specific output path below.
        ExcelUtility.writeExperimentResult(k_var_map,
                "D:\\Thesis\\Experiments\\datasets\\overlapped\\dataset-3-\\studying-number-of-clusters\\Experiment-1\\exp3.xls");
        System.out.println("The CCCP is: " + new Correlator().correlate(fDistance0, copheneticMatrix));
    } // doLinkClustering

    /** Prints the upper triangle of the distance matrix. */
    private void printDistanceMatrix() {
        System.out.println("");
        for (int i = 0; i < m_instances.numInstances(); i++) {
            for (int j = i + 1; j < m_instances.numInstances(); j++) {
                System.out.print(fDistance0[i][j] + " ");
            }
            System.out.println("");
        }
    }

    /** Incomplete stub: intended to compute a session-based similarity between two instances. */
    private void sessionSimilarity(int i, int j) {
        Instance first = m_instances.instance(i);
        Instance second = m_instances.instance(j);
        double similarity = 0.0;
    }

    /**
     * Builds the idealized similarity matrix: entry (i, j) is 1 if instances
     * i and j belong to the same cluster, 0 otherwise.
     */
    private void buildIdealizedSimilarityMatrix(Vector<Integer>[] clusters) {
        int[][] idealizedMatrix = new int[m_instances.numInstances()][m_instances.numInstances()];
        for (int i = 0; i < clusters.length; i++) {
            if (clusters[i].size() > 0) {
                for (int j = 0; j < clusters[i].size(); j++) {
                    for (int k = j; k < clusters[i].size(); k++) {
                        idealizedMatrix[clusters[i].get(j)][clusters[i].get(k)] = 1;
                        idealizedMatrix[clusters[i].get(k)][clusters[i].get(j)] = 1;
                    }
                }
            }
        }
    } // buildIdealizedSimilarityMatrix
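    /*
     * Background for calcSilhouette() below: the silhouette of an instance i is
     *
     *     s(i) = (b(i) - a(i)) / max(a(i), b(i))
     *
     * where a(i) is the mean distance from i to the other members of its own
     * cluster and b(i) is the mean distance from i to the members of the
     * nearest other cluster (found here via a priority queue of Candidates).
     * s(i) ranges from -1 to 1; values near 1 indicate a well-placed instance.
     */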
    /** Calculates the summed silhouette coefficient of a given cluster. */
    private double calcSilhouette(Vector<Integer> cluster, int clusterIndex) {
        // The silhouette of a singleton cluster is 0.
        if (cluster.size() == 1) {
            return 0;
        }
        double silhouetteCoefficient = 0.0;
        for (int i = 0; i < cluster.size(); i++) {
            int current = cluster.get(i);
            double ai = 0.0;
            double si = 0.0;
            double bi = 0.0;
            // a(i): mean distance to the other members of the same cluster
            // (the self-distance contributes 0 and is excluded from the denominator).
            for (int j = 0; j < cluster.size(); j++) {
                int target = cluster.get(j);
                ai += fDistance0[target][current];
            }
            ai = (double) ai / (cluster.size() - 1);
            ai = (Double.isNaN(ai) ? 0 : ai);

            // b(i): mean distance to the nearest other cluster, found via a
            // priority queue of candidate clusters ordered by distance.
            PriorityQueue<Candidate> cq = new PriorityQueue<>(5, new CandidateComparator());
            for (int k = 0; k < nClusterID.length; k++) {
                if ((k != clusterIndex) && (nClusterID[k].size() != 0)) {
                    bi = 0.0;
                    for (int l = 0; l < nClusterID[k].size(); l++) {
                        int targetO = nClusterID[k].get(l);
                        bi += fDistance0[current][targetO];
                    }
                    bi = bi / nClusterID[k].size();
                    cq.add(new Candidate(k, bi));
                }
            }
            Candidate nearest = cq.poll();
            if (nearest == null) {
                nearest = new Candidate(0, 0);
            }
            // Silhouette coefficient of the current instance.
            si = (nearest.distance - ai) / Math.max(ai, nearest.distance);
            silhouetteCoefficient += si;
        }
        // Note: this returns the sum over the cluster, not the mean; the
        // caller divides by the total number of instances.
        return silhouetteCoefficient;
        // Alternatively, normalize by cluster size:
        // double result = silhouetteCoefficient / cluster.size();
        // return Double.isNaN(result) ? 0 : result;
    } // calcSilhouette

    void merge(int iMin1, int iMin2, double fDist1, double fDist2,
            Vector<Integer>[] nClusterID, Node[] clusterNodes) {
        if (m_bDebug) {
            System.err.println("Merging " + iMin1 + " " + iMin2 + " " + fDist1 + " " + fDist2);
        }
        if (iMin1 > iMin2) {
            int h = iMin1;
            iMin1 = iMin2;
            iMin2 = h;
            double f = fDist1;
            fDist1 = fDist2;
            fDist2 = f;
        }
        nClusterID[iMin1].addAll(nClusterID[iMin2]);
        nClusterID[iMin2].removeAllElements();

        // Track the hierarchy.
        Node node = new Node();
        if (clusterNodes[iMin1] == null) {
            node.m_iLeftInstance = iMin1;
        } else {
            node.m_left = clusterNodes[iMin1];
            clusterNodes[iMin1].m_parent = node;
        }
        if (clusterNodes[iMin2] == null) {
            node.m_iRightInstance = iMin2;
        } else {
            node.m_right = clusterNodes[iMin2];
            clusterNodes[iMin2].m_parent = node;
        }
        if (m_bDistanceIsBranchLength) {
            node.setLength(fDist1, fDist2);
        } else {
            node.setHeight(fDist1, fDist2);
        }
        clusterNodes[iMin1] = node;
    } // merge
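    /*
     * Background for the WARD link type used in getDistance0()/getDistance()
     * below: Ward's method scores a candidate merge by the increase in
     * within-cluster scatter it causes,
     *
     *     cost = n_m * ESS(merged) - n_1 * ESS(c1) - n_2 * ESS(c2)
     *
     * where ESS(c) is computed by calcESS() as the average distance of the
     * members of c to the cluster centroid, and n_* are the cluster sizes.
     */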
    /** Calculates the distance the first time, when setting up the distance matrix. */
    double getDistance0(Vector<Integer> cluster1, Vector<Integer> cluster2) {
        double fBestDist = Double.MAX_VALUE;
        switch (m_nLinkType) {
        case SINGLE:
        case NEIGHBOR_JOINING:
        case CENTROID:
        case COMPLETE:
        case ADJCOMLPETE:
        case AVERAGE:
        case MEAN:
            // Set up two instances for the distance function. At this point
            // every cluster is still a singleton, so these link types coincide.
            Instance instance1 = (Instance) m_instances.instance(cluster1.elementAt(0)).copy();
            Instance instance2 = (Instance) m_instances.instance(cluster2.elementAt(0)).copy();
            fBestDist = m_DistanceFunction.distance(instance1, instance2);
            break;
        case WARD: {
            // Finds the increase in error sum of squares (ESS) caused by
            // merging the two clusters, computed w.r.t. the cluster centroids.
            double ESS1 = calcESS(cluster1);
            double ESS2 = calcESS(cluster2);
            Vector<Integer> merged = new Vector<Integer>();
            merged.addAll(cluster1);
            merged.addAll(cluster2);
            double ESS = calcESS(merged);
            fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
        }
            break;
        }
        return fBestDist;
    } // getDistance0

    /**
     * Calculates the distance between two clusters.
     *
     * @param cluster1 list of indices of instances in the first cluster
     * @param cluster2 ditto for the second cluster
     * @return distance between clusters based on link type
     */
    double getDistance(double[][] fDistance, Vector<Integer> cluster1, Vector<Integer> cluster2) {
        double fBestDist = Double.MAX_VALUE;
        switch (m_nLinkType) {
        case SINGLE:
            // Single link (aka minimum link): the closest distance between
            // any item in cluster1 and any item in cluster2.
            fBestDist = Double.MAX_VALUE;
            for (int i = 0; i < cluster1.size(); i++) {
                int i1 = cluster1.elementAt(i);
                for (int j = 0; j < cluster2.size(); j++) {
                    int i2 = cluster2.elementAt(j);
                    double fDist = fDistance[i1][i2];
                    if (fBestDist > fDist) {
                        fBestDist = fDist;
                    }
                }
            }
            break;
        case COMPLETE:
        case ADJCOMLPETE:
            // Complete link (aka maximum link): the largest distance between
            // any item in cluster1 and any item in cluster2.
            fBestDist = 0;
            for (int i = 0; i < cluster1.size(); i++) {
                int i1 = cluster1.elementAt(i);
                for (int j = 0; j < cluster2.size(); j++) {
                    int i2 = cluster2.elementAt(j);
                    double fDist = fDistance[i1][i2];
                    if (fBestDist < fDist) {
                        fBestDist = fDist;
                    }
                }
            }
            if (m_nLinkType == COMPLETE) {
                break;
            }
            // Calculate the adjustment: the largest within-cluster distance.
            double fMaxDist = 0;
            for (int i = 0; i < cluster1.size(); i++) {
                int i1 = cluster1.elementAt(i);
                for (int j = i + 1; j < cluster1.size(); j++) {
                    int i2 = cluster1.elementAt(j);
                    double fDist = fDistance[i1][i2];
                    if (fMaxDist < fDist) {
                        fMaxDist = fDist;
                    }
                }
            }
            for (int i = 0; i < cluster2.size(); i++) {
                int i1 = cluster2.elementAt(i);
                for (int j = i + 1; j < cluster2.size(); j++) {
                    int i2 = cluster2.elementAt(j);
                    double fDist = fDistance[i1][i2];
                    if (fMaxDist < fDist) {
                        fMaxDist = fDist;
                    }
                }
            }
            fBestDist -= fMaxDist;
            break;
        case AVERAGE:
            // Average link: the mean distance between the elements of the two clusters.
            fBestDist = 0;
            for (int i = 0; i < cluster1.size(); i++) {
                int i1 = cluster1.elementAt(i);
                for (int j = 0; j < cluster2.size(); j++) {
                    int i2 = cluster2.elementAt(j);
                    fBestDist += fDistance[i1][i2];
                }
            }
            fBestDist /= (cluster1.size() * cluster2.size());
            break;
        case MEAN: {
            // Mean distance within the merged cluster (aka group-average
            // agglomerative clustering).
            Vector<Integer> merged = new Vector<Integer>();
            merged.addAll(cluster1);
            merged.addAll(cluster2);
            fBestDist = 0;
            for (int i = 0; i < merged.size(); i++) {
                int i1 = merged.elementAt(i);
                for (int j = i + 1; j < merged.size(); j++) {
                    int i2 = merged.elementAt(j);
                    fBestDist += fDistance[i1][i2];
                }
            }
            int n = merged.size();
            fBestDist /= (n * (n - 1.0) / 2.0);
        }
            break;
        case CENTROID:
            // Distance between the centroids of the two clusters.
            double[] fValues1 = new double[m_instances.numAttributes()];
            for (int i = 0; i < cluster1.size(); i++) {
                Instance instance = m_instances.instance(cluster1.elementAt(i));
                for (int j = 0; j < m_instances.numAttributes(); j++) {
                    fValues1[j] += instance.value(j);
                }
            }
            double[] fValues2 = new double[m_instances.numAttributes()];
            for (int i = 0; i < cluster2.size(); i++) {
                Instance instance = m_instances.instance(cluster2.elementAt(i));
                for (int j = 0; j < m_instances.numAttributes(); j++) {
                    fValues2[j] += instance.value(j);
                }
            }
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues1[j] /= cluster1.size();
                fValues2[j] /= cluster2.size();
            }
            // Set up two instances for the distance function.
            Instance instance1 = (Instance) m_instances.instance(0).copy();
            Instance instance2 = (Instance) m_instances.instance(0).copy();
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                instance1.setValue(j, fValues1[j]);
                instance2.setValue(j, fValues2[j]);
            }
            fBestDist = m_DistanceFunction.distance(instance1, instance2);
            break;
        case WARD: {
            // Increase in error sum of squares (ESS) caused by merging the clusters.
            double ESS1 = calcESS(cluster1);
            double ESS2 = calcESS(cluster2);
            Vector<Integer> merged = new Vector<Integer>();
            merged.addAll(cluster1);
            merged.addAll(cluster2);
            double ESS = calcESS(merged);
            fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
        }
            break;
        }
        return fBestDist;
    } // getDistance

    /**
     * Calculates the error of the instances w.r.t. their centroid: the
     * average distance (under the configured distance function) between each
     * member of the cluster and the cluster centroid.
     */
    double calcESS(Vector<Integer> cluster) {
        double[] fValues1 = new double[m_instances.numAttributes()];
        for (int i = 0; i < cluster.size(); i++) {
            Instance instance = m_instances.instance(cluster.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues1[j] += instance.value(j);
            }
        }
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            fValues1[j] /= cluster.size();
        }
        // Set up the centroid instance for the distance function.
        Instance centroid = (Instance) m_instances.instance(cluster.elementAt(0)).copy();
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            centroid.setValue(j, fValues1[j]);
        }
        double fESS = 0;
        for (int i = 0; i < cluster.size(); i++) {
            Instance instance = m_instances.instance(cluster.elementAt(i));
            fESS += m_DistanceFunction.distance(centroid, instance);
        }
        return fESS / cluster.size();
    } // calcESS

    /**
     * Calculates the within-cluster variance: the sum of squared differences
     * between the instances in the cluster and the cluster mean, divided by
     * the cluster size.
     *
     * @param cluster
     * @return
     */
    double calcWithinVariance(Vector<Integer> cluster) {
        double variance = 0.0;
        double[] fValues1 = new double[m_instances.numAttributes()];
        for (int i = 0; i < cluster.size(); i++) {
            Instance instance = m_instances.instance(cluster.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues1[j] += instance.value(j);
            }
        }
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            fValues1[j] /= cluster.size();
        }
        // Set up the centroid instance.
        Instance centroid = (Instance) m_instances.instance(cluster.elementAt(0)).copy();
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            centroid.setValue(j, fValues1[j]);
        }
        for (int i = 0; i < cluster.size(); i++) {
            double temp = 0;
            Instance instance = m_instances.instance(cluster.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                // Square each per-attribute difference, per the contract above.
                double diff = instance.value(j) - centroid.value(j);
                temp += diff * diff;
            }
            variance += temp;
        }
        return variance / cluster.size();
    }
    /**
     * Instances are assigned a cluster by finding the training instance with
     * the smallest distance to the instance to be clustered; the cluster of
     * that training instance is taken as the cluster index.
     */
    @Override
    public int clusterInstance(Instance instance) throws Exception {
        if (m_instances.numInstances() == 0) {
            return 0;
        }
        double fBestDist = Double.MAX_VALUE;
        int iBestInstance = -1;
        for (int i = 0; i < m_instances.numInstances(); i++) {
            double fDist = m_DistanceFunction.distance(instance, m_instances.instance(i));
            if (fDist < fBestDist) {
                fBestDist = fDist;
                iBestInstance = i;
            }
        }
        // m_nClusterNr maps each training instance to its cluster number in
        // the range 0 .. numberOfClusters()-1, which keeps the result
        // consistent with distributionForInstance(). (Searching nClusterID
        // for the matching slot would return an internal slot index instead,
        // which can exceed the number of clusters.)
        return m_nClusterNr[iBestInstance];
    }

    /**
     * Alternative assignment: returns the internal cluster slot (index into
     * nClusterID) whose members have the smallest average distance to the
     * given instance.
     */
    public int abdoClusterInstance(Instance instance) {
        double fBestDist = Double.MAX_VALUE;
        int iBestInstance = -1;
        for (int j = 0; j < nClusterID.length; j++) {
            if (nClusterID[j].size() > 0) {
                double avg = 0.0;
                for (int k = 0; k < nClusterID[j].size(); k++) {
                    avg += m_DistanceFunction.distance(instance,
                            m_instances.instance(nClusterID[j].get(k)));
                }
                avg = avg / nClusterID[j].size();
                if (avg < fBestDist) {
                    fBestDist = avg;
                    iBestInstance = j;
                }
            }
        }
        return iBestInstance;
    }

    /**
     * Creates a distribution with all clusters having zero probability,
     * except the cluster the instance is assigned to.
     */
    @Override
    public double[] distributionForInstance(Instance instance) throws Exception {
        if (numberOfClusters() == 0) {
            double[] p = new double[1];
            p[0] = 1;
            return p;
        }
        double[] p = new double[numberOfClusters()];
        p[clusterInstance(instance)] = 1.0;
        return p;
    }

    @Override
    public Capabilities getCapabilities() {
        Capabilities result = new Capabilities(this);
        result.disableAll();
        result.enable(Capabilities.Capability.NO_CLASS);
        // attributes
        result.enable(Capabilities.Capability.NOMINAL_ATTRIBUTES);
        result.enable(Capabilities.Capability.NUMERIC_ATTRIBUTES);
        result.enable(Capabilities.Capability.DATE_ATTRIBUTES);
        result.enable(Capabilities.Capability.MISSING_VALUES);
        result.enable(Capabilities.Capability.STRING_ATTRIBUTES);
        // other
        result.setMinimumNumberInstances(0);
        return result;
    }

    @Override
    public int numberOfClusters() throws Exception {
        return Math.min(m_nNumClusters, m_instances.numInstances());
    }

    /**
     * Returns an enumeration describing the available options.
     *
     * @return an enumeration of all the available options.
     */
    public Enumeration<Option> listOptions() {
        Vector<Option> newVector = new Vector<Option>(8);
        newVector.addElement(new Option(
                "\tIf set, clusterer is run in debug mode and\n"
                        + "\tmay output additional info to the console",
                "D", 0, "-D"));
        newVector.addElement(new Option(
                "\tIf set, distance is interpreted as branch length\n"
                        + "\totherwise it is node height.",
                "B", 0, "-B"));
        newVector.addElement(new Option("\tnumber of clusters", "N", 1, "-N <Nr Of Clusters>"));
        newVector.addElement(new Option(
                "\tFlag to indicate the cluster should be printed in Newick format.",
                "P", 0, "-P"));
        newVector.addElement(new Option(
                "Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor joining)",
                "L", 1,
                "-L [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMLPETE|NEIGHBOR_JOINING]"));
        newVector.add(new Option("\tDistance function to use.\n"
                + "\t(default: weka.core.EuclideanDistance)",
                "A", 1, "-A <classname and options>"));
        return newVector.elements();
    }
    /**
     * Parses a given list of options.
     * <p/>
     *
     * <!-- options-start -->
     * Valid options are:
     * <p/>
     *
     * <!-- options-end -->
     *
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {
        m_bPrintNewick = Utils.getFlag('P', options);

        String optionString = Utils.getOption('N', options);
        if (optionString.length() != 0) {
            setNumClusters(Integer.parseInt(optionString));
        } else {
            setNumClusters(2);
        }

        setDebug(Utils.getFlag('D', options));
        setDistanceIsBranchLength(Utils.getFlag('B', options));

        String sLinkType = Utils.getOption('L', options);
        if (sLinkType.compareTo("SINGLE") == 0) {
            setLinkType(new SelectedTag(SINGLE, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("COMPLETE") == 0) {
            setLinkType(new SelectedTag(COMPLETE, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("AVERAGE") == 0) {
            setLinkType(new SelectedTag(AVERAGE, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("MEAN") == 0) {
            setLinkType(new SelectedTag(MEAN, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("CENTROID") == 0) {
            setLinkType(new SelectedTag(CENTROID, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("WARD") == 0) {
            setLinkType(new SelectedTag(WARD, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("ADJCOMLPETE") == 0) {
            setLinkType(new SelectedTag(ADJCOMLPETE, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("NEIGHBOR_JOINING") == 0) {
            setLinkType(new SelectedTag(NEIGHBOR_JOINING, TAGS_LINK_TYPE));
        }

        String nnSearchClass = Utils.getOption('A', options);
        if (nnSearchClass.length() != 0) {
            String nnSearchClassSpec[] = Utils.splitOptions(nnSearchClass);
            if (nnSearchClassSpec.length == 0) {
                throw new Exception("Invalid DistanceFunction specification string.");
            }
            String className = nnSearchClassSpec[0];
            nnSearchClassSpec[0] = "";
            setDistanceFunction((DistanceFunction) Utils.forName(
                    DistanceFunction.class, className, nnSearchClassSpec));
        } else {
            setDistanceFunction(new EuclideanDistance());
        }

        Utils.checkForRemainingOptions(options);
    }
    /**
     * Gets the current settings of the clusterer.
     *
     * @return an array of strings suitable for passing to setOptions()
     */
    public String[] getOptions() {
        String[] options = new String[14];
        int current = 0;

        options[current++] = "-N";
        options[current++] = "" + getNumClusters();

        options[current++] = "-L";
        switch (m_nLinkType) {
        case (SINGLE):
            options[current++] = "SINGLE";
            break;
        case (COMPLETE):
            options[current++] = "COMPLETE";
            break;
        case (AVERAGE):
            options[current++] = "AVERAGE";
            break;
        case (MEAN):
            options[current++] = "MEAN";
            break;
        case (CENTROID):
            options[current++] = "CENTROID";
            break;
        case (WARD):
            options[current++] = "WARD";
            break;
        case (ADJCOMLPETE):
            options[current++] = "ADJCOMLPETE";
            break;
        case (NEIGHBOR_JOINING):
            options[current++] = "NEIGHBOR_JOINING";
            break;
        }
        if (m_bPrintNewick) {
            options[current++] = "-P";
        }
        if (getDebug()) {
            options[current++] = "-D";
        }
        if (getDistanceIsBranchLength()) {
            options[current++] = "-B";
        }
        options[current++] = "-A";
        options[current++] = (m_DistanceFunction.getClass().getName() + " "
                + Utils.joinOptions(m_DistanceFunction.getOptions())).trim();

        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }

    public String toString() {
        StringBuffer buf = new StringBuffer();
        int attIndex = m_instances.classIndex();
        if (attIndex < 0) {
            // Try to find a string attribute, or use the last attribute otherwise.
            attIndex = 0;
            while (attIndex < m_instances.numAttributes() - 1) {
                if (m_instances.attribute(attIndex).isString()) {
                    break;
                }
                attIndex++;
            }
        }
        try {
            if (m_bPrintNewick && (numberOfClusters() > 0)) {
                for (int i = 0; i < m_clusters.length; i++) {
                    if (m_clusters[i] != null) {
                        buf.append("Cluster " + i + "\n");
                        if (m_instances.attribute(attIndex).isString()) {
                            buf.append(m_clusters[i].toString(attIndex));
                        } else {
                            buf.append(m_clusters[i].toString2(attIndex));
                        }
                        buf.append("\n\n");
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return buf.toString();
    }

    /**
     * Set debugging mode.
     *
     * @param debug true if debug output should be printed
     */
    public void setDebug(boolean debug) {
        m_bDebug = debug;
    }

    /**
     * Get whether debugging is turned on.
     *
     * @return true if debugging output is on
     */
    public boolean getDebug() {
        return m_bDebug;
    }

    public boolean getDistanceIsBranchLength() {
        return m_bDistanceIsBranchLength;
    }

    public void setDistanceIsBranchLength(boolean bDistanceIsHeight) {
        m_bDistanceIsBranchLength = bDistanceIsHeight;
    }

    public String distanceIsBranchLengthTipText() {
        return "If set to false, the distance between clusters is interpreted "
                + "as the height of the node linking the clusters. This is appropriate for "
                + "example for single link clustering. However, for neighbor joining, the "
                + "distance is better interpreted as branch length. Set this flag to "
                + "get the latter interpretation.";
    }

    /**
     * Returns the tip text for this property.
     *
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String debugTipText() {
        return "If set to true, clusterer may output additional info to the console.";
    }

    /**
     * @return a string to describe the NumClusters
     */
    public String numClustersTipText() {
        return "Sets the number of clusters. "
                + "If a single hierarchy is desired, set this to 1.";
    }

    /**
     * @return a string to describe the print Newick flag
     */
    public String printNewickTipText() {
        return "Flag to indicate whether the cluster should be printed in Newick format."
                + " This can be useful for display in other programs. However, for large datasets"
                + " a lot of text may be produced, which may prove a nuisance when the Newick format"
                + " is not required.";
    }
    /**
     * @return a string to describe the distance function
     */
    public String distanceFunctionTipText() {
        return "Sets the distance function, which measures the distance between two individual "
                + "instances (or possibly the distance between an instance and the centroid of a cluster, "
                + "depending on the Link type).";
    }

    /**
     * @return a string to describe the Link type
     */
    public String linkTypeTipText() {
        return "Sets the method used to measure the distance between two clusters.\n"
                + "SINGLE:\n"
                + " find single link distance aka minimum link, which is the closest distance between"
                + " any item in cluster1 and any item in cluster2\n"
                + "COMPLETE:\n"
                + " find complete link distance aka maximum link, which is the largest distance between"
                + " any item in cluster1 and any item in cluster2\n"
                + "ADJCOMLPETE:\n"
                + " as COMPLETE, but with adjustment, which is the largest within cluster distance\n"
                + "AVERAGE:\n"
                + " finds average distance between the elements of the two clusters\n"
                + "MEAN:\n"
                + " calculates the mean distance of a merged cluster (aka group-average agglomerative clustering)\n"
                + "CENTROID:\n"
                + " finds the distance of the centroids of the clusters\n"
                + "WARD:\n"
                + " finds the increase in the error sum of squares (ESS) caused by merging the clusters,"
                + " where the ESS of a cluster is computed w.r.t. its centroid\n"
                + "NEIGHBOR_JOINING:\n"
                + " use neighbor joining algorithm.";
    }

    /**
     * This will return a string describing the clusterer.
     *
     * @return The string.
     */
    public String globalInfo() {
        return "Hierarchical clustering class.\n"
                + "Implements a number of classic agglomerative (i.e. bottom-up) "
                + "hierarchical clustering methods.";
    }

    public String graph() throws Exception {
        if (numberOfClusters() == 0) {
            return "Newick:(no,clusters)";
        }
        int attIndex = m_instances.classIndex();
        if (attIndex < 0) {
            // Try to find a string attribute, or use the last attribute otherwise.
            attIndex = 0;
            while (attIndex < m_instances.numAttributes() - 1) {
                if (m_instances.attribute(attIndex).isString()) {
                    break;
                }
                attIndex++;
            }
        }
        String sNewick;
        if (m_instances.attribute(attIndex).isString()) {
            sNewick = m_clusters[0].toString(attIndex);
        } else {
            sNewick = m_clusters[0].toString2(attIndex);
        }
        return "Newick:" + sNewick;
    }

    public int graphType() {
        return Drawable.Newick;
    }

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
        return RevisionUtils.extract("$Revision: 11330 $");
    }
} // class AbdoAgglomerativeClusterer
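Usage sketch: the snippet below shows one way the clusterer above might be driven. The demo class name and the ARFF path "data.arff" are illustrative placeholders, not part of the class above, and running it as-is also requires the project-local ExcelUtility and Correlator helpers on the classpath, since doLinkClustering writes an Excel report to a hard-coded path.

package classes;

import weka.core.Instances;
import weka.core.SelectedTag;
import weka.core.converters.ConverterUtils.DataSource;

/** Minimal, illustrative driver for AbdoAgglomerativeClusterer. */
public class ClustererDemo {

    public static void main(String[] args) throws Exception {
        // "data.arff" is a placeholder path to any ARFF dataset.
        Instances data = DataSource.read("data.arff");

        AbdoAgglomerativeClusterer clusterer = new AbdoAgglomerativeClusterer();
        clusterer.setNumClusters(3); // stop merging once 3 clusters remain
        clusterer.setLinkType(new SelectedTag(AbdoAgglomerativeClusterer.COMPLETE,
                AbdoAgglomerativeClusterer.TAGS_LINK_TYPE));
        clusterer.buildClusterer(data);

        // Assign each training instance to one of the resulting clusters.
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println("instance " + i + " -> cluster "
                    + clusterer.clusterInstance(data.instance(i)));
        }
    }
}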