Java tutorial: AbdoAgglomerativeClusterer, a custom agglomerative (hierarchical) clusterer for Weka
package classes;

import java.io.Serializable;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Vector;

import weka.clusterers.AbstractClusterer;
import weka.core.Capabilities;
import weka.core.CapabilitiesHandler;
import weka.core.DistanceFunction;
import weka.core.Drawable;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Tag;
import weka.core.Utils;

/**
 * A custom agglomerative (bottom-up) hierarchical clusterer, adapted from
 * Weka's HierarchicalClusterer. In addition to the standard link types it
 * tracks the cophenetic matrix and evaluates each clustering stage with the
 * silhouette coefficient.
 *
 * @author Eng.Abdo
 */
public class AbdoAgglomerativeClusterer extends AbstractClusterer
        implements OptionHandler, CapabilitiesHandler, Drawable {

    private static final long serialVersionUID = 1L;

    /** Whether the clusterer is run in debug mode. */
    protected boolean m_bDebug = false;

    /**
     * Whether the distance represents node height (if false) or branch length
     * (if true).
     */
    protected boolean m_bDistanceIsBranchLength = false;

    /** Training data. */
    Instances m_instances;

    /** Number of clusters desired in the clustering. */
    int m_nNumClusters = 1;

    public void setNumClusters(int nClusters) {
        m_nNumClusters = Math.max(1, nClusters);
    }

    public int getNumClusters() {
        return m_nNumClusters;
    }

    /** Distance function used for comparing members of a cluster. */
    protected DistanceFunction m_DistanceFunction = new EuclideanDistance();

    public DistanceFunction getDistanceFunction() {
        return m_DistanceFunction;
    }

    public void setDistanceFunction(DistanceFunction distanceFunction) {
        m_DistanceFunction = distanceFunction;
    }

    /**
     * Holder of clusters during the clustering process. Each item in this
     * array represents a cluster, labelled by its index; each cluster is a
     * vector of the indices of the instances it contains.
     */
    private Vector<Integer>[] nClusterID;

    /** Cophenetic distances between all pairs of instances. */
    private double[][] copheneticMatrix;

    public double[][] getCopheneticMatrix() {
        return copheneticMatrix;
    }

    public void setCopheneticMatrix(double[][] copheneticMatrix) {
        this.copheneticMatrix = copheneticMatrix;
    }

    /** Matrix of distances between all clusters. */
    private double[][] fDistance0;

    private double[][] similarities;

    public Vector<Integer>[] getnClusterID() {
        return nClusterID;
    }

    public void setnClusterID(Vector<Integer>[] nClusterID) {
        this.nClusterID = nClusterID;
    }

    public double[][] getfDistance0() {
        return fDistance0;
    }

    public void setfDistance0(double[][] fDistance0) {
        this.fDistance0 = fDistance0;
    }

    /**
     * Priority-queue entry used for efficient retrieval of the nearest
     * cluster to the current instance during calculation of the silhouette
     * coefficient.
     */
    class Candidate implements Serializable {

        int clusterIndex;
        double distance;

        public Candidate(int clusterIndex, double distance) {
            this.clusterIndex = clusterIndex;
            this.distance = distance;
        }
    }

    /** Comparator used by the silhouette priority queue. */
    class CandidateComparator implements Comparator<Candidate>, Serializable {

        @Override
        public int compare(Candidate o1, Candidate o2) {
            if (o1.distance < o2.distance) {
                return -1;
            } else if (o1.distance == o2.distance) {
                return 0;
            } else {
                return 1;
            }
        }
    }

    /**
     * Priority-queue entry used for efficient retrieval of the next pair of
     * clusters to merge.
     */
    class Tuple implements Serializable {

        double m_fDist;
        int m_iCluster1;
        int m_iCluster2;
        int m_nClusterSize1;
        int m_nClusterSize2;

        public Tuple(double d, int i, int j, int nSize1, int nSize2) {
            m_fDist = d;
            m_iCluster1 = i;
            m_iCluster2 = j;
            m_nClusterSize1 = nSize1;
            m_nClusterSize2 = nSize2;
        }
    }

    /** Comparator used by the merge priority queue. */
    class TupleComparator implements Comparator<Tuple>, Serializable {

        @Override
        public int compare(Tuple o1, Tuple o2) {
            if (o1.m_fDist < o2.m_fDist) {
                return -1;
            } else if (o1.m_fDist == o2.m_fDist) {
                return 0;
            }
            return 1;
        }
    }

    /** The various link types. */
    final static int SINGLE = 0;
    final static int COMPLETE = 1;
    final static int AVERAGE = 2;
    final static int MEAN = 3;
    final static int CENTROID = 4;
    final static int WARD = 5;
    final static int ADJCOMLPETE = 6; // (sic) spelling kept for option-string compatibility
    final static int NEIGHBOR_JOINING = 7;

    public static final Tag[] TAGS_LINK_TYPE = {
        new Tag(SINGLE, "SINGLE"),
        new Tag(COMPLETE, "COMPLETE"),
        new Tag(AVERAGE, "AVERAGE"),
        new Tag(MEAN, "MEAN"),
        new Tag(CENTROID, "CENTROID"),
        new Tag(WARD, "WARD"),
        new Tag(ADJCOMLPETE, "ADJCOMLPETE"),
        new Tag(NEIGHBOR_JOINING, "NEIGHBOR_JOINING")
    };

    /** Holds the link type used to calculate the distance between clusters. */
    int m_nLinkType = COMPLETE;

    boolean m_bPrintNewick = true;

    public boolean getPrintNewick() {
        return m_bPrintNewick;
    }

    public void setPrintNewick(boolean bPrintNewick) {
        m_bPrintNewick = bPrintNewick;
    }

    public double[][] getSimilarities() {
        return similarities;
    }

    public void setSimilarities(double[][] similarities) {
        this.similarities = similarities;
    }

    public void setLinkType(SelectedTag newLinkType) {
        if (newLinkType.getTags() == TAGS_LINK_TYPE) {
            m_nLinkType = newLinkType.getSelectedTag().getID();
        }
    }

    public SelectedTag getLinkType() {
        return new SelectedTag(m_nLinkType, TAGS_LINK_TYPE);
    }
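    /*
     * Note on the output format: the Node class below serializes the cluster
     * hierarchy in Newick notation, in which a tree is written as nested
     * parenthesized pairs with a branch length after each colon, e.g.
     * ((A:0.1,B:0.1):0.3,C:0.4). This is the format produced by toString(),
     * toString2() and graph().
     */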
    /** A node in the cluster hierarchy. */
    class Node implements Serializable {

        Node m_left;
        Node m_right;
        Node m_parent;
        int m_iLeftInstance;
        int m_iRightInstance;
        double m_fLeftLength = 0;
        double m_fRightLength = 0;
        double m_fHeight = 0;

        /** Newick representation, using the string value of attribute attIndex as leaf label. */
        public String toString(int attIndex) {
            NumberFormat nf = NumberFormat.getNumberInstance(new Locale("en", "US"));
            DecimalFormat myFormatter = (DecimalFormat) nf;
            myFormatter.applyPattern("#.#####");

            if (m_left == null) {
                if (m_right == null) {
                    return "(" + m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_instances.instance(m_iRightInstance).stringValue(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                } else {
                    return "(" + m_instances.instance(m_iLeftInstance).stringValue(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_right.toString(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                }
            } else {
                if (m_right == null) {
                    return "(" + m_left.toString(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_instances.instance(m_iRightInstance).stringValue(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                } else {
                    return "(" + m_left.toString(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_right.toString(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                }
            }
        }

        /** Newick representation, using the numeric value of attribute attIndex as leaf label. */
        public String toString2(int attIndex) {
            NumberFormat nf = NumberFormat.getNumberInstance(new Locale("en", "US"));
            DecimalFormat myFormatter = (DecimalFormat) nf;
            myFormatter.applyPattern("#.#####");

            if (m_left == null) {
                if (m_right == null) {
                    return "(" + m_instances.instance(m_iLeftInstance).value(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_instances.instance(m_iRightInstance).value(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                } else {
                    return "(" + m_instances.instance(m_iLeftInstance).value(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_right.toString2(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                }
            } else {
                if (m_right == null) {
                    return "(" + m_left.toString2(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_instances.instance(m_iRightInstance).value(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                } else {
                    return "(" + m_left.toString2(attIndex) + ":"
                            + myFormatter.format(m_fLeftLength) + ","
                            + m_right.toString2(attIndex) + ":"
                            + myFormatter.format(m_fRightLength) + ")";
                }
            }
        }

        void setHeight(double fHeight1, double fHeight2) {
            m_fHeight = fHeight1;
            if (m_left == null) {
                m_fLeftLength = fHeight1;
            } else {
                m_fLeftLength = fHeight1 - m_left.m_fHeight;
            }
            if (m_right == null) {
                m_fRightLength = fHeight2;
            } else {
                m_fRightLength = fHeight2 - m_right.m_fHeight;
            }
        }

        void setLength(double fLength1, double fLength2) {
            m_fLeftLength = fLength1;
            m_fRightLength = fLength2;
            m_fHeight = fLength1;
            if (m_left != null) {
                m_fHeight += m_left.m_fHeight;
            }
        }
    }

    Node[] m_clusters;
    int[] m_nClusterNr;

    public int[] getM_nClusterNr() {
        return m_nClusterNr;
    }

    public void setM_nClusterNr(int[] m_nClusterNr) {
        this.m_nClusterNr = m_nClusterNr;
    }

    @Override
    public void buildClusterer(Instances data) throws Exception {
        m_instances = data;
        int nInstances = m_instances.numInstances();
        if (nInstances == 0) {
            return;
        }
        m_DistanceFunction.setInstances(m_instances);

        // Use an array of integer vectors to store cluster membership,
        // starting with one cluster per instance.
        this.nClusterID = new Vector[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            nClusterID[i] = new Vector<Integer>();
            nClusterID[i].add(i);
        }

        int nClusters = data.numInstances();

        // Used for keeping track of the hierarchy.
        Node[] clusterNodes = new Node[nInstances];
        if (m_nLinkType == NEIGHBOR_JOINING) {
            neighborJoining(nClusters, nClusterID, clusterNodes);
        } else {
            doLinkClustering(nClusters, nClusterID, clusterNodes);
        }

        // Number the surviving clusters in the m_nClusterNr array
        // and collect the hierarchy.
        int iCurrent = 0;
        m_clusters = new Node[m_nNumClusters];
        m_nClusterNr = new int[nInstances];
        for (int i = 0; i < nInstances; i++) {
            if (nClusterID[i].size() > 0) {
                for (int j = 0; j < nClusterID[i].size(); j++) {
                    m_nClusterNr[nClusterID[i].elementAt(j)] = iCurrent;
                }
                m_clusters[iCurrent] = clusterNodes[i];
                iCurrent++;
            }
        }
    } // buildClusterer
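    /*
     * Background for neighborJoining() below: at every step the algorithm
     * joins the pair of active clusters (i, j) that minimizes
     *
     *     Q(i, j) = d(i, j) - r(i) - r(j)
     *
     * where d(i, j) is the current pairwise distance and r(i) is the
     * "separation" of cluster i, i.e. its summed distance to all active
     * clusters divided by (nClusters - 2). This is exactly the quantity
     * fVal = fRow[j] - fSep1 - fSep2 computed in the search loops.
     */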
    /**
     * Uses the neighbor-joining algorithm for clustering. This is roughly
     * based on the RapidNJ simple implementation and runs in O(n^3). More
     * efficient implementations exist, see RapidNJ (or my GPU
     * implementation :-)).
     *
     * @param nClusters
     * @param nClusterID
     * @param clusterNodes
     */
    void neighborJoining(int nClusters, Vector<Integer>[] nClusterID, Node[] clusterNodes) {
        int n = m_instances.numInstances();
        System.out.println("NeighborJoining");

        double[][] fDist = new double[nClusters][nClusters];
        for (int i = 0; i < nClusters; i++) {
            fDist[i][i] = 0;
            for (int j = i + 1; j < nClusters; j++) {
                fDist[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
                fDist[j][i] = fDist[i][j];
            }
        }

        double[] fSeparationSums = new double[n];
        double[] fSeparations = new double[n];
        int[] nNextActive = new int[n];

        // Calculate the initial separation rows.
        for (int i = 0; i < n; i++) {
            double fSum = 0;
            for (int j = 0; j < n; j++) {
                fSum += fDist[i][j];
            }
            fSeparationSums[i] = fSum;
            fSeparations[i] = fSum / (nClusters - 2);
            nNextActive[i] = i + 1;
        }

        while (nClusters > 2) {
            // Find the pair minimizing the joining criterion.
            int iMin1 = -1;
            int iMin2 = -1;
            double fMin = Double.MAX_VALUE;
            if (m_bDebug) {
                for (int i = 0; i < n; i++) {
                    if (nClusterID[i].size() > 0) {
                        double[] fRow = fDist[i];
                        double fSep1 = fSeparations[i];
                        for (int j = 0; j < n; j++) {
                            if (nClusterID[j].size() > 0 && i != j) {
                                double fSep2 = fSeparations[j];
                                double fVal = fRow[j] - fSep1 - fSep2;
                                if (fVal < fMin) {
                                    // new minimum
                                    iMin1 = i;
                                    iMin2 = j;
                                    fMin = fVal;
                                }
                            }
                        }
                    }
                }
            } else {
                int i = 0;
                while (i < n) {
                    double fSep1 = fSeparations[i];
                    double[] fRow = fDist[i];
                    int j = nNextActive[i];
                    while (j < n) {
                        double fSep2 = fSeparations[j];
                        double fVal = fRow[j] - fSep1 - fSep2;
                        if (fVal < fMin) {
                            // new minimum
                            iMin1 = i;
                            iMin2 = j;
                            fMin = fVal;
                        }
                        j = nNextActive[j];
                    }
                    i = nNextActive[i];
                }
            }

            // Record the distance.
            double fMinDistance = fDist[iMin1][iMin2];
            nClusters--;
            double fSep1 = fSeparations[iMin1];
            double fSep2 = fSeparations[iMin2];
            double fDist1 = (0.5 * fMinDistance) + (0.5 * (fSep1 - fSep2));
            double fDist2 = (0.5 * fMinDistance) + (0.5 * (fSep2 - fSep1));
            if (nClusters > 2) {
                // Update separations and distances.
                double fNewSeparationSum = 0;
                double fMutualDistance = fDist[iMin1][iMin2];
                double[] fRow1 = fDist[iMin1];
                double[] fRow2 = fDist[iMin2];
                for (int i = 0; i < n; i++) {
                    if (i == iMin1 || i == iMin2 || nClusterID[i].size() == 0) {
                        fRow1[i] = 0;
                    } else {
                        double fVal1 = fRow1[i];
                        double fVal2 = fRow2[i];
                        double fDistance = (fVal1 + fVal2 - fMutualDistance) / 2.0;
                        fNewSeparationSum += fDistance;
                        // Update the separation sum of cluster i.
                        fSeparationSums[i] += (fDistance - fVal1 - fVal2);
                        fSeparations[i] = fSeparationSums[i] / (nClusters - 2);
                        fRow1[i] = fDistance;
                        fDist[i][iMin1] = fDistance;
                    }
                }
                fSeparationSums[iMin1] = fNewSeparationSum;
                fSeparations[iMin1] = fNewSeparationSum / (nClusters - 2);
                fSeparationSums[iMin2] = 0;
                merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
                int iPrev = iMin2;
                // Since iMin1 < iMin2 we have nNextActive[0] >= 0, so the next loop should be safe.
                while (nClusterID[iPrev].size() == 0) {
                    iPrev--;
                }
                nNextActive[iPrev] = nNextActive[iMin2];
            } else {
                merge(iMin1, iMin2, fDist1, fDist2, nClusterID, clusterNodes);
                break;
            }
        }

        // Join the last two remaining clusters.
        for (int i = 0; i < n; i++) {
            if (nClusterID[i].size() > 0) {
                for (int j = i + 1; j < n; j++) {
                    if (nClusterID[j].size() > 0) {
                        double fDist1 = fDist[i][j];
                        if (nClusterID[i].size() == 1) {
                            merge(i, j, fDist1, 0, nClusterID, clusterNodes);
                        } else if (nClusterID[j].size() == 1) {
                            merge(i, j, 0, fDist1, nClusterID, clusterNodes);
                        } else {
                            merge(i, j, fDist1 / 2.0, fDist1 / 2.0, nClusterID, clusterNodes);
                        }
                        break;
                    }
                }
            }
        }
    } // neighborJoining
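    /*
     * Implementation note for doLinkClustering() below: the priority queue
     * holds one Tuple per candidate merge, tagged with the cluster sizes at
     * the time the tuple was created. Stale entries, i.e. tuples whose
     * recorded sizes no longer match the live clusters, are simply polled
     * and discarded. This avoids expensive deletions from the queue and
     * keeps the overall algorithm at O(n^2 log n).
     */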
    /**
     * Performs clustering using a link method. This implementation uses a
     * priority queue, resulting in an O(n^2 log(n)) algorithm.
     *
     * @param nClusters number of clusters
     * @param nClusterID
     * @param clusterNodes
     */
    void doLinkClustering(int nClusters, Vector<Integer>[] nClusterID, Node[] clusterNodes) {
        Map<Integer, Double> k_var_map = new HashMap<>();
        int nInstances = m_instances.numInstances();
        PriorityQueue<Tuple> queue =
                new PriorityQueue<Tuple>(nClusters * nClusters / 2, new TupleComparator());
        this.fDistance0 = new double[nClusters][nClusters];
        this.copheneticMatrix = new double[nClusters][nClusters];
        double[][] fClusterDistance = null;
        if (m_bDebug) {
            fClusterDistance = new double[nClusters][nClusters];
        }
        for (int i = 0; i < nClusters; i++) {
            fDistance0[i][i] = 0;
            for (int j = i + 1; j < nClusters; j++) {
                if (m_bDebug) {
                    System.out.println("i,j " + i + " ," + j);
                }
                fDistance0[i][j] = getDistance0(nClusterID[i], nClusterID[j]);
                fDistance0[j][i] = fDistance0[i][j];
                queue.add(new Tuple(fDistance0[i][j], i, j, 1, 1));
                if (m_bDebug) {
                    fClusterDistance[i][j] = fDistance0[i][j];
                    fClusterDistance[j][i] = fDistance0[i][j];
                }
            }
        }
        while (nClusters > m_nNumClusters) {
            int iMin1 = -1;
            int iMin2 = -1;
            // Find the closest two clusters.
            if (m_bDebug) {
                /* Simple but inefficient implementation. */
                double fMinDistance = Double.MAX_VALUE;
                for (int i = 0; i < nInstances; i++) {
                    if (nClusterID[i].size() > 0) {
                        for (int j = i + 1; j < nInstances; j++) {
                            if (nClusterID[j].size() > 0) {
                                double fDist = fClusterDistance[i][j];
                                if (fDist < fMinDistance) {
                                    fMinDistance = fDist;
                                    iMin1 = i;
                                    iMin2 = j;
                                }
                            }
                        }
                    }
                }
                merge(iMin1, iMin2, fMinDistance, fMinDistance, nClusterID, clusterNodes);
            } else {
                // Use the priority queue to find the next best pair to cluster.
                Tuple t;
                do {
                    t = queue.poll();
                    // Skip stale entries whose recorded sizes no longer match
                    // the live clusters. (Assumes the queue still holds a
                    // valid pair while nClusters > m_nNumClusters.)
                } while (t != null && (nClusterID[t.m_iCluster1].size() != t.m_nClusterSize1
                        || nClusterID[t.m_iCluster2].size() != t.m_nClusterSize2));
                iMin1 = t.m_iCluster1;
                iMin2 = t.m_iCluster2;

                // Update the cophenetic matrix: the cophenetic distance of two
                // instances is the merge distance at which they first end up
                // in the same cluster.
                Iterator<Integer> iterator = nClusterID[iMin1].iterator();
                while (iterator.hasNext()) {
                    int i = iterator.next();
                    Iterator<Integer> iterator2 = nClusterID[iMin2].iterator();
                    while (iterator2.hasNext()) {
                        int j = iterator2.next();
                        if (copheneticMatrix[i][j] == 0.0) {
                            copheneticMatrix[i][j] = t.m_fDist;
                            copheneticMatrix[j][i] = t.m_fDist;
                        }
                    }
                }
                merge(iMin1, iMin2, t.m_fDist, t.m_fDist, nClusterID, clusterNodes);
            }

            // Update distances and the queue.
            for (int i = 0; i < nInstances; i++) {
                if (i != iMin1 && nClusterID[i].size() != 0) {
                    int i1 = Math.min(iMin1, i);
                    int i2 = Math.max(iMin1, i);
                    double fDistance = getDistance(fDistance0, nClusterID[i1], nClusterID[i2]);
                    if (m_bDebug) {
                        fClusterDistance[i1][i2] = fDistance;
                        fClusterDistance[i2][i1] = fDistance;
                    }
                    queue.add(new Tuple(fDistance, i1, i2, nClusterID[i1].size(), nClusterID[i2].size()));
                }
            }
            nClusters--;

            // Evaluate the overall quality (goodness) of the current clustering.
            int count = 0;
            double total_sil = 0.0;
            double totalVariance = 0.0;
            for (int x = 0; x < nClusterID.length; x++) {
                if (nClusterID[x].size() != 0) {
                    count += 1;
                    total_sil += calcSilhouette(nClusterID[x], x);
                    // totalVariance += calcWithinVariance(nClusterID[x]);
                }
            }
            System.out.println("\n Number of clusters at this stage is : " + count);
            k_var_map.put(count, (double) (total_sil / m_instances.numInstances()));
            // System.out.println(" Total variance is: " + totalVariance);
            // buildIdealizedSimilarityMatrix(nClusterID);
        }
        // ExcelUtility and Correlator are project-local helper classes in the
        // same package; note the machine-specific output path below.
        ExcelUtility.writeExperimentResult(k_var_map,
                "D:\\Thesis\\Experiments\\datasets\\overlapped\\dataset-3-\\studying-number-of-clusters\\Experiment-1\\exp3.xls");
        System.out.println("The CCCP is: " + new Correlator().correlate(fDistance0, copheneticMatrix));
    } // doLinkClustering

    /** Prints the upper triangle of the distance matrix. */
    private void printDistanceMatrix() {
        System.out.println("");
        for (int i = 0; i < m_instances.numInstances(); i++) {
            for (int j = i + 1; j < m_instances.numInstances(); j++) {
                System.out.print(fDistance0[i][j] + " ");
            }
            System.out.println("");
        }
    }

    /** Incomplete stub: intended to compute a session-based similarity between two instances. */
    private void sessionSimilarity(int i, int j) {
        Instance first = m_instances.instance(i);
        Instance second = m_instances.instance(j);
        double similarity = 0.0;
    }

    /**
     * Builds the idealized similarity matrix: entry (i, j) is 1 if instances
     * i and j belong to the same cluster, 0 otherwise.
     */
    private void buildIdealizedSimilarityMatrix(Vector<Integer>[] clusters) {
        int[][] idealizedMatrix = new int[m_instances.numInstances()][m_instances.numInstances()];
        for (int i = 0; i < clusters.length; i++) {
            if (clusters[i].size() > 0) {
                for (int j = 0; j < clusters[i].size(); j++) {
                    for (int k = j; k < clusters[i].size(); k++) {
                        idealizedMatrix[clusters[i].get(j)][clusters[i].get(k)] = 1;
                        idealizedMatrix[clusters[i].get(k)][clusters[i].get(j)] = 1;
                    }
                }
            }
        }
    } // buildIdealizedSimilarityMatrix
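    /*
     * Background for calcSilhouette() below: the silhouette of an instance i is
     *
     *     s(i) = (b(i) - a(i)) / max(a(i), b(i))
     *
     * where a(i) is the mean distance from i to the other members of its own
     * cluster and b(i) is the mean distance from i to the members of the
     * nearest other cluster (found here via a priority queue of Candidates).
     * s(i) ranges from -1 to 1; values near 1 indicate a well-placed instance.
     */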
    /** Calculates the summed silhouette coefficient of a given cluster. */
    private double calcSilhouette(Vector<Integer> cluster, int clusterIndex) {
        // The silhouette of a singleton cluster is 0.
        if (cluster.size() == 1) {
            return 0;
        }
        double silhouetteCoefficient = 0.0;
        for (int i = 0; i < cluster.size(); i++) {
            int current = cluster.get(i);
            double ai = 0.0;
            double si = 0.0;
            double bi = 0.0;
            // a(i): mean distance to the other members of the same cluster
            // (the self-distance contributes 0 and is excluded from the denominator).
            for (int j = 0; j < cluster.size(); j++) {
                int target = cluster.get(j);
                ai += fDistance0[target][current];
            }
            ai = (double) ai / (cluster.size() - 1);
            ai = (Double.isNaN(ai) ? 0 : ai);

            // b(i): mean distance to the nearest other cluster, found via a
            // priority queue of candidate clusters ordered by distance.
            PriorityQueue<Candidate> cq = new PriorityQueue<>(5, new CandidateComparator());
            for (int k = 0; k < nClusterID.length; k++) {
                if ((k != clusterIndex) && (nClusterID[k].size() != 0)) {
                    bi = 0.0;
                    for (int l = 0; l < nClusterID[k].size(); l++) {
                        int targetO = nClusterID[k].get(l);
                        bi += fDistance0[current][targetO];
                    }
                    bi = bi / nClusterID[k].size();
                    cq.add(new Candidate(k, bi));
                }
            }
            Candidate nearest = cq.poll();
            if (nearest == null) {
                nearest = new Candidate(0, 0);
            }
            // Silhouette coefficient of the current instance.
            si = (nearest.distance - ai) / Math.max(ai, nearest.distance);
            silhouetteCoefficient += si;
        }
        // Note: this returns the sum over the cluster, not the mean; the
        // caller divides by the total number of instances.
        return silhouetteCoefficient;
        // Alternatively, normalize by cluster size:
        // double result = silhouetteCoefficient / cluster.size();
        // return Double.isNaN(result) ? 0 : result;
    } // calcSilhouette

    void merge(int iMin1, int iMin2, double fDist1, double fDist2,
            Vector<Integer>[] nClusterID, Node[] clusterNodes) {
        if (m_bDebug) {
            System.err.println("Merging " + iMin1 + " " + iMin2 + " " + fDist1 + " " + fDist2);
        }
        if (iMin1 > iMin2) {
            int h = iMin1;
            iMin1 = iMin2;
            iMin2 = h;
            double f = fDist1;
            fDist1 = fDist2;
            fDist2 = f;
        }
        nClusterID[iMin1].addAll(nClusterID[iMin2]);
        nClusterID[iMin2].removeAllElements();

        // Track the hierarchy.
        Node node = new Node();
        if (clusterNodes[iMin1] == null) {
            node.m_iLeftInstance = iMin1;
        } else {
            node.m_left = clusterNodes[iMin1];
            clusterNodes[iMin1].m_parent = node;
        }
        if (clusterNodes[iMin2] == null) {
            node.m_iRightInstance = iMin2;
        } else {
            node.m_right = clusterNodes[iMin2];
            clusterNodes[iMin2].m_parent = node;
        }
        if (m_bDistanceIsBranchLength) {
            node.setLength(fDist1, fDist2);
        } else {
            node.setHeight(fDist1, fDist2);
        }
        clusterNodes[iMin1] = node;
    } // merge
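    /*
     * Background for the WARD link type used in getDistance0()/getDistance()
     * below: Ward's method scores a candidate merge by the increase in
     * within-cluster scatter it causes,
     *
     *     cost = n_m * ESS(merged) - n_1 * ESS(c1) - n_2 * ESS(c2)
     *
     * where ESS(c) is computed by calcESS() as the average distance of the
     * members of c to the cluster centroid, and n_* are the cluster sizes.
     */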
    /** Calculates the distance the first time, when setting up the distance matrix. */
    double getDistance0(Vector<Integer> cluster1, Vector<Integer> cluster2) {
        double fBestDist = Double.MAX_VALUE;
        switch (m_nLinkType) {
        case SINGLE:
        case NEIGHBOR_JOINING:
        case CENTROID:
        case COMPLETE:
        case ADJCOMLPETE:
        case AVERAGE:
        case MEAN:
            // Set up two instances for the distance function. At this point
            // every cluster is still a singleton, so these link types coincide.
            Instance instance1 = (Instance) m_instances.instance(cluster1.elementAt(0)).copy();
            Instance instance2 = (Instance) m_instances.instance(cluster2.elementAt(0)).copy();
            fBestDist = m_DistanceFunction.distance(instance1, instance2);
            break;
        case WARD: {
            // Finds the increase in error sum of squares (ESS) caused by
            // merging the two clusters, computed w.r.t. the cluster centroids.
            double ESS1 = calcESS(cluster1);
            double ESS2 = calcESS(cluster2);
            Vector<Integer> merged = new Vector<Integer>();
            merged.addAll(cluster1);
            merged.addAll(cluster2);
            double ESS = calcESS(merged);
            fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
        }
            break;
        }
        return fBestDist;
    } // getDistance0

    /**
     * Calculates the distance between two clusters.
     *
     * @param cluster1 list of indices of instances in the first cluster
     * @param cluster2 ditto for the second cluster
     * @return distance between clusters based on link type
     */
    double getDistance(double[][] fDistance, Vector<Integer> cluster1, Vector<Integer> cluster2) {
        double fBestDist = Double.MAX_VALUE;
        switch (m_nLinkType) {
        case SINGLE:
            // Single link (aka minimum link): the closest distance between
            // any item in cluster1 and any item in cluster2.
            fBestDist = Double.MAX_VALUE;
            for (int i = 0; i < cluster1.size(); i++) {
                int i1 = cluster1.elementAt(i);
                for (int j = 0; j < cluster2.size(); j++) {
                    int i2 = cluster2.elementAt(j);
                    double fDist = fDistance[i1][i2];
                    if (fBestDist > fDist) {
                        fBestDist = fDist;
                    }
                }
            }
            break;
        case COMPLETE:
        case ADJCOMLPETE:
            // Complete link (aka maximum link): the largest distance between
            // any item in cluster1 and any item in cluster2.
            fBestDist = 0;
            for (int i = 0; i < cluster1.size(); i++) {
                int i1 = cluster1.elementAt(i);
                for (int j = 0; j < cluster2.size(); j++) {
                    int i2 = cluster2.elementAt(j);
                    double fDist = fDistance[i1][i2];
                    if (fBestDist < fDist) {
                        fBestDist = fDist;
                    }
                }
            }
            if (m_nLinkType == COMPLETE) {
                break;
            }
            // Calculate the adjustment: the largest within-cluster distance.
            double fMaxDist = 0;
            for (int i = 0; i < cluster1.size(); i++) {
                int i1 = cluster1.elementAt(i);
                for (int j = i + 1; j < cluster1.size(); j++) {
                    int i2 = cluster1.elementAt(j);
                    double fDist = fDistance[i1][i2];
                    if (fMaxDist < fDist) {
                        fMaxDist = fDist;
                    }
                }
            }
            for (int i = 0; i < cluster2.size(); i++) {
                int i1 = cluster2.elementAt(i);
                for (int j = i + 1; j < cluster2.size(); j++) {
                    int i2 = cluster2.elementAt(j);
                    double fDist = fDistance[i1][i2];
                    if (fMaxDist < fDist) {
                        fMaxDist = fDist;
                    }
                }
            }
            fBestDist -= fMaxDist;
            break;
        case AVERAGE:
            // Average link: the mean distance between the elements of the two clusters.
            fBestDist = 0;
            for (int i = 0; i < cluster1.size(); i++) {
                int i1 = cluster1.elementAt(i);
                for (int j = 0; j < cluster2.size(); j++) {
                    int i2 = cluster2.elementAt(j);
                    fBestDist += fDistance[i1][i2];
                }
            }
            fBestDist /= (cluster1.size() * cluster2.size());
            break;
        case MEAN: {
            // Mean distance within the merged cluster (aka group-average
            // agglomerative clustering).
            Vector<Integer> merged = new Vector<Integer>();
            merged.addAll(cluster1);
            merged.addAll(cluster2);
            fBestDist = 0;
            for (int i = 0; i < merged.size(); i++) {
                int i1 = merged.elementAt(i);
                for (int j = i + 1; j < merged.size(); j++) {
                    int i2 = merged.elementAt(j);
                    fBestDist += fDistance[i1][i2];
                }
            }
            int n = merged.size();
            fBestDist /= (n * (n - 1.0) / 2.0);
        }
            break;
        case CENTROID:
            // Distance between the centroids of the two clusters.
            double[] fValues1 = new double[m_instances.numAttributes()];
            for (int i = 0; i < cluster1.size(); i++) {
                Instance instance = m_instances.instance(cluster1.elementAt(i));
                for (int j = 0; j < m_instances.numAttributes(); j++) {
                    fValues1[j] += instance.value(j);
                }
            }
            double[] fValues2 = new double[m_instances.numAttributes()];
            for (int i = 0; i < cluster2.size(); i++) {
                Instance instance = m_instances.instance(cluster2.elementAt(i));
                for (int j = 0; j < m_instances.numAttributes(); j++) {
                    fValues2[j] += instance.value(j);
                }
            }
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues1[j] /= cluster1.size();
                fValues2[j] /= cluster2.size();
            }
            // Set up two instances for the distance function.
            Instance instance1 = (Instance) m_instances.instance(0).copy();
            Instance instance2 = (Instance) m_instances.instance(0).copy();
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                instance1.setValue(j, fValues1[j]);
                instance2.setValue(j, fValues2[j]);
            }
            fBestDist = m_DistanceFunction.distance(instance1, instance2);
            break;
        case WARD: {
            // Increase in error sum of squares (ESS) caused by merging the clusters.
            double ESS1 = calcESS(cluster1);
            double ESS2 = calcESS(cluster2);
            Vector<Integer> merged = new Vector<Integer>();
            merged.addAll(cluster1);
            merged.addAll(cluster2);
            double ESS = calcESS(merged);
            fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
        }
            break;
        }
        return fBestDist;
    } // getDistance

    /**
     * Calculates the error of the instances w.r.t. their centroid: the
     * average distance (under the configured distance function) between each
     * member of the cluster and the cluster centroid.
     */
    double calcESS(Vector<Integer> cluster) {
        double[] fValues1 = new double[m_instances.numAttributes()];
        for (int i = 0; i < cluster.size(); i++) {
            Instance instance = m_instances.instance(cluster.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues1[j] += instance.value(j);
            }
        }
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            fValues1[j] /= cluster.size();
        }
        // Set up the centroid instance for the distance function.
        Instance centroid = (Instance) m_instances.instance(cluster.elementAt(0)).copy();
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            centroid.setValue(j, fValues1[j]);
        }
        double fESS = 0;
        for (int i = 0; i < cluster.size(); i++) {
            Instance instance = m_instances.instance(cluster.elementAt(i));
            fESS += m_DistanceFunction.distance(centroid, instance);
        }
        return fESS / cluster.size();
    } // calcESS

    /**
     * Calculates the within-cluster variance: the sum of squared differences
     * between the instances in the cluster and the cluster mean, divided by
     * the cluster size.
     *
     * @param cluster
     * @return
     */
    double calcWithinVariance(Vector<Integer> cluster) {
        double variance = 0.0;
        double[] fValues1 = new double[m_instances.numAttributes()];
        for (int i = 0; i < cluster.size(); i++) {
            Instance instance = m_instances.instance(cluster.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues1[j] += instance.value(j);
            }
        }
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            fValues1[j] /= cluster.size();
        }
        // Set up the centroid instance.
        Instance centroid = (Instance) m_instances.instance(cluster.elementAt(0)).copy();
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            centroid.setValue(j, fValues1[j]);
        }
        for (int i = 0; i < cluster.size(); i++) {
            double temp = 0;
            Instance instance = m_instances.instance(cluster.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                // Square each per-attribute difference, per the contract above.
                double diff = instance.value(j) - centroid.value(j);
                temp += diff * diff;
            }
            variance += temp;
        }
        return variance / cluster.size();
    }
    /**
     * Instances are assigned a cluster by finding the training instance with
     * the smallest distance to the instance to be clustered; the cluster of
     * that training instance is taken as the cluster index.
     */
    @Override
    public int clusterInstance(Instance instance) throws Exception {
        if (m_instances.numInstances() == 0) {
            return 0;
        }
        double fBestDist = Double.MAX_VALUE;
        int iBestInstance = -1;
        for (int i = 0; i < m_instances.numInstances(); i++) {
            double fDist = m_DistanceFunction.distance(instance, m_instances.instance(i));
            if (fDist < fBestDist) {
                fBestDist = fDist;
                iBestInstance = i;
            }
        }
        // m_nClusterNr maps each training instance to its cluster number in
        // the range 0 .. numberOfClusters()-1, which keeps the result
        // consistent with distributionForInstance(). (Searching nClusterID
        // for the matching slot would return an internal slot index instead,
        // which can exceed the number of clusters.)
        return m_nClusterNr[iBestInstance];
    }

    /**
     * Alternative assignment: returns the internal cluster slot (index into
     * nClusterID) whose members have the smallest average distance to the
     * given instance.
     */
    public int abdoClusterInstance(Instance instance) {
        double fBestDist = Double.MAX_VALUE;
        int iBestInstance = -1;
        for (int j = 0; j < nClusterID.length; j++) {
            if (nClusterID[j].size() > 0) {
                double avg = 0.0;
                for (int k = 0; k < nClusterID[j].size(); k++) {
                    avg += m_DistanceFunction.distance(instance,
                            m_instances.instance(nClusterID[j].get(k)));
                }
                avg = avg / nClusterID[j].size();
                if (avg < fBestDist) {
                    fBestDist = avg;
                    iBestInstance = j;
                }
            }
        }
        return iBestInstance;
    }

    /**
     * Creates a distribution with all clusters having zero probability,
     * except the cluster the instance is assigned to.
     */
    @Override
    public double[] distributionForInstance(Instance instance) throws Exception {
        if (numberOfClusters() == 0) {
            double[] p = new double[1];
            p[0] = 1;
            return p;
        }
        double[] p = new double[numberOfClusters()];
        p[clusterInstance(instance)] = 1.0;
        return p;
    }

    @Override
    public Capabilities getCapabilities() {
        Capabilities result = new Capabilities(this);
        result.disableAll();
        result.enable(Capabilities.Capability.NO_CLASS);
        // attributes
        result.enable(Capabilities.Capability.NOMINAL_ATTRIBUTES);
        result.enable(Capabilities.Capability.NUMERIC_ATTRIBUTES);
        result.enable(Capabilities.Capability.DATE_ATTRIBUTES);
        result.enable(Capabilities.Capability.MISSING_VALUES);
        result.enable(Capabilities.Capability.STRING_ATTRIBUTES);
        // other
        result.setMinimumNumberInstances(0);
        return result;
    }

    @Override
    public int numberOfClusters() throws Exception {
        return Math.min(m_nNumClusters, m_instances.numInstances());
    }

    /**
     * Returns an enumeration describing the available options.
     *
     * @return an enumeration of all the available options.
     */
    public Enumeration<Option> listOptions() {
        Vector<Option> newVector = new Vector<Option>(8);
        newVector.addElement(new Option(
                "\tIf set, clusterer is run in debug mode and\n"
                        + "\tmay output additional info to the console",
                "D", 0, "-D"));
        newVector.addElement(new Option(
                "\tIf set, distance is interpreted as branch length\n"
                        + "\totherwise it is node height.",
                "B", 0, "-B"));
        newVector.addElement(new Option("\tnumber of clusters", "N", 1, "-N <Nr Of Clusters>"));
        newVector.addElement(new Option(
                "\tFlag to indicate the cluster should be printed in Newick format.",
                "P", 0, "-P"));
        newVector.addElement(new Option(
                "Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor joining)",
                "L", 1,
                "-L [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMLPETE|NEIGHBOR_JOINING]"));
        newVector.add(new Option("\tDistance function to use.\n"
                + "\t(default: weka.core.EuclideanDistance)",
                "A", 1, "-A <classname and options>"));
        return newVector.elements();
    }
    /**
     * Parses a given list of options.
     * <p/>
     *
     * <!-- options-start -->
     * Valid options are:
     * <p/>
     *
     * <!-- options-end -->
     *
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    public void setOptions(String[] options) throws Exception {
        m_bPrintNewick = Utils.getFlag('P', options);

        String optionString = Utils.getOption('N', options);
        if (optionString.length() != 0) {
            setNumClusters(Integer.parseInt(optionString));
        } else {
            setNumClusters(2);
        }

        setDebug(Utils.getFlag('D', options));
        setDistanceIsBranchLength(Utils.getFlag('B', options));

        String sLinkType = Utils.getOption('L', options);
        if (sLinkType.compareTo("SINGLE") == 0) {
            setLinkType(new SelectedTag(SINGLE, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("COMPLETE") == 0) {
            setLinkType(new SelectedTag(COMPLETE, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("AVERAGE") == 0) {
            setLinkType(new SelectedTag(AVERAGE, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("MEAN") == 0) {
            setLinkType(new SelectedTag(MEAN, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("CENTROID") == 0) {
            setLinkType(new SelectedTag(CENTROID, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("WARD") == 0) {
            setLinkType(new SelectedTag(WARD, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("ADJCOMLPETE") == 0) {
            setLinkType(new SelectedTag(ADJCOMLPETE, TAGS_LINK_TYPE));
        }
        if (sLinkType.compareTo("NEIGHBOR_JOINING") == 0) {
            setLinkType(new SelectedTag(NEIGHBOR_JOINING, TAGS_LINK_TYPE));
        }

        String nnSearchClass = Utils.getOption('A', options);
        if (nnSearchClass.length() != 0) {
            String nnSearchClassSpec[] = Utils.splitOptions(nnSearchClass);
            if (nnSearchClassSpec.length == 0) {
                throw new Exception("Invalid DistanceFunction specification string.");
            }
            String className = nnSearchClassSpec[0];
            nnSearchClassSpec[0] = "";
            setDistanceFunction((DistanceFunction) Utils.forName(
                    DistanceFunction.class, className, nnSearchClassSpec));
        } else {
            setDistanceFunction(new EuclideanDistance());
        }

        Utils.checkForRemainingOptions(options);
    }
    /**
     * Gets the current settings of the clusterer.
     *
     * @return an array of strings suitable for passing to setOptions()
     */
    public String[] getOptions() {
        String[] options = new String[14];
        int current = 0;

        options[current++] = "-N";
        options[current++] = "" + getNumClusters();

        options[current++] = "-L";
        switch (m_nLinkType) {
        case (SINGLE):
            options[current++] = "SINGLE";
            break;
        case (COMPLETE):
            options[current++] = "COMPLETE";
            break;
        case (AVERAGE):
            options[current++] = "AVERAGE";
            break;
        case (MEAN):
            options[current++] = "MEAN";
            break;
        case (CENTROID):
            options[current++] = "CENTROID";
            break;
        case (WARD):
            options[current++] = "WARD";
            break;
        case (ADJCOMLPETE):
            options[current++] = "ADJCOMLPETE";
            break;
        case (NEIGHBOR_JOINING):
            options[current++] = "NEIGHBOR_JOINING";
            break;
        }
        if (m_bPrintNewick) {
            options[current++] = "-P";
        }
        if (getDebug()) {
            options[current++] = "-D";
        }
        if (getDistanceIsBranchLength()) {
            options[current++] = "-B";
        }
        options[current++] = "-A";
        options[current++] = (m_DistanceFunction.getClass().getName() + " "
                + Utils.joinOptions(m_DistanceFunction.getOptions())).trim();

        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }

    public String toString() {
        StringBuffer buf = new StringBuffer();
        int attIndex = m_instances.classIndex();
        if (attIndex < 0) {
            // Try to find a string attribute, or use the last attribute otherwise.
            attIndex = 0;
            while (attIndex < m_instances.numAttributes() - 1) {
                if (m_instances.attribute(attIndex).isString()) {
                    break;
                }
                attIndex++;
            }
        }
        try {
            if (m_bPrintNewick && (numberOfClusters() > 0)) {
                for (int i = 0; i < m_clusters.length; i++) {
                    if (m_clusters[i] != null) {
                        buf.append("Cluster " + i + "\n");
                        if (m_instances.attribute(attIndex).isString()) {
                            buf.append(m_clusters[i].toString(attIndex));
                        } else {
                            buf.append(m_clusters[i].toString2(attIndex));
                        }
                        buf.append("\n\n");
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return buf.toString();
    }

    /**
     * Set debugging mode.
     *
     * @param debug true if debug output should be printed
     */
    public void setDebug(boolean debug) {
        m_bDebug = debug;
    }

    /**
     * Get whether debugging is turned on.
     *
     * @return true if debugging output is on
     */
    public boolean getDebug() {
        return m_bDebug;
    }

    public boolean getDistanceIsBranchLength() {
        return m_bDistanceIsBranchLength;
    }

    public void setDistanceIsBranchLength(boolean bDistanceIsHeight) {
        m_bDistanceIsBranchLength = bDistanceIsHeight;
    }

    public String distanceIsBranchLengthTipText() {
        return "If set to false, the distance between clusters is interpreted "
                + "as the height of the node linking the clusters. This is appropriate for "
                + "example for single link clustering. However, for neighbor joining, the "
                + "distance is better interpreted as branch length. Set this flag to "
                + "get the latter interpretation.";
    }

    /**
     * Returns the tip text for this property.
     *
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String debugTipText() {
        return "If set to true, clusterer may output additional info to the console.";
    }

    /**
     * @return a string to describe the NumClusters
     */
    public String numClustersTipText() {
        return "Sets the number of clusters. "
                + "If a single hierarchy is desired, set this to 1.";
    }

    /**
     * @return a string to describe the print Newick flag
     */
    public String printNewickTipText() {
        return "Flag to indicate whether the cluster should be printed in Newick format."
                + " This can be useful for display in other programs. However, for large datasets"
                + " a lot of text may be produced, which may prove a nuisance when the Newick format"
                + " is not required.";
    }
    /**
     * @return a string to describe the distance function
     */
    public String distanceFunctionTipText() {
        return "Sets the distance function, which measures the distance between two individual "
                + "instances (or possibly the distance between an instance and the centroid of a cluster, "
                + "depending on the Link type).";
    }

    /**
     * @return a string to describe the Link type
     */
    public String linkTypeTipText() {
        return "Sets the method used to measure the distance between two clusters.\n"
                + "SINGLE:\n"
                + " find single link distance aka minimum link, which is the closest distance between"
                + " any item in cluster1 and any item in cluster2\n"
                + "COMPLETE:\n"
                + " find complete link distance aka maximum link, which is the largest distance between"
                + " any item in cluster1 and any item in cluster2\n"
                + "ADJCOMLPETE:\n"
                + " as COMPLETE, but with adjustment, which is the largest within cluster distance\n"
                + "AVERAGE:\n"
                + " finds average distance between the elements of the two clusters\n"
                + "MEAN:\n"
                + " calculates the mean distance of a merged cluster (aka group-average agglomerative clustering)\n"
                + "CENTROID:\n"
                + " finds the distance of the centroids of the clusters\n"
                + "WARD:\n"
                + " finds the increase in the error sum of squares (ESS) caused by merging the clusters,"
                + " where the ESS of a cluster is computed w.r.t. its centroid\n"
                + "NEIGHBOR_JOINING:\n"
                + " use neighbor joining algorithm.";
    }

    /**
     * This will return a string describing the clusterer.
     *
     * @return The string.
     */
    public String globalInfo() {
        return "Hierarchical clustering class.\n"
                + "Implements a number of classic agglomerative (i.e. bottom-up) "
                + "hierarchical clustering methods.";
    }

    public String graph() throws Exception {
        if (numberOfClusters() == 0) {
            return "Newick:(no,clusters)";
        }
        int attIndex = m_instances.classIndex();
        if (attIndex < 0) {
            // Try to find a string attribute, or use the last attribute otherwise.
            attIndex = 0;
            while (attIndex < m_instances.numAttributes() - 1) {
                if (m_instances.attribute(attIndex).isString()) {
                    break;
                }
                attIndex++;
            }
        }
        String sNewick;
        if (m_instances.attribute(attIndex).isString()) {
            sNewick = m_clusters[0].toString(attIndex);
        } else {
            sNewick = m_clusters[0].toString2(attIndex);
        }
        return "Newick:" + sNewick;
    }

    public int graphType() {
        return Drawable.Newick;
    }

    /**
     * Returns the revision string.
     *
     * @return the revision
     */
    public String getRevision() {
        return RevisionUtils.extract("$Revision: 11330 $");
    }
} // class AbdoAgglomerativeClusterer
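Usage sketch: the snippet below shows one way the clusterer above might be driven. The demo class name and the ARFF path "data.arff" are illustrative placeholders, not part of the class above, and running it as-is also requires the project-local ExcelUtility and Correlator helpers on the classpath, since doLinkClustering writes an Excel report to a hard-coded path.

package classes;

import weka.core.Instances;
import weka.core.SelectedTag;
import weka.core.converters.ConverterUtils.DataSource;

/** Minimal, illustrative driver for AbdoAgglomerativeClusterer. */
public class ClustererDemo {

    public static void main(String[] args) throws Exception {
        // "data.arff" is a placeholder path to any ARFF dataset.
        Instances data = DataSource.read("data.arff");

        AbdoAgglomerativeClusterer clusterer = new AbdoAgglomerativeClusterer();
        clusterer.setNumClusters(3); // stop merging once 3 clusters remain
        clusterer.setLinkType(new SelectedTag(AbdoAgglomerativeClusterer.COMPLETE,
                AbdoAgglomerativeClusterer.TAGS_LINK_TYPE));
        clusterer.buildClusterer(data);

        // Assign each training instance to one of the resulting clusters.
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println("instance " + i + " -> cluster "
                    + clusterer.clusterInstance(data.instance(i)));
        }
    }
}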