Source code

Java tutorial


Here is the source code for lu.lippmann.cdb.lab.beta.util.WekaUtil2.java


/**
 * Copyright 2014-2016 LIST (Luxembourg Institute of Science and Technology), all right reserved.
 * Authorship : Olivier PARISOT, Yoanne DIDRY
 * Licensed under GNU General Public License version 3
 */
package lu.lippmann.cdb.lab.beta.util;

import java.util.*;

import lu.lippmann.cdb.lab.beta.*;
import lu.lippmann.cdb.lab.beta.shih.TupleSI;
import weka.clusterers.*;
import weka.core.*;

/**
 * Static clustering helpers built on top of the Weka API.
 *
 * @author Olivier PARISOT, Yoanne DIDRY
 */
public final class WekaUtil2 {

    /** Private constructor: this class only exposes static helpers and is not instantiated. */
    private WekaUtil2() {
    }

     * @param newInstances
     * @param K
     * @return
     * @throws Exception
    public static double[] doKMeans(final Instances newInstances, final int K) throws Exception {
        final SimpleKMeans clusterer = new SimpleKMeans();
                Utils.splitOptions("-N " + K + " -R first-last -I 500 -S 10 -A weka.core.EuclideanDistance"));


        final ClusterEvaluation eval = new ClusterEvaluation();

        double[] ass = eval.getClusterAssignments();
        return ass;

     * @param newInstances
     * @param K
     * @return
     * @throws Exception
    public static List<IndexedInstance> doHAC(final Instances instances, final int K) throws Exception {
        final HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(Utils.splitOptions("-N " + K + " -L MEAN -P -A weka.core.EuclideanDistance"));
        return computeClusters(clusterer, instances);

     * Generate the centroid coordinates based 
     * on it's  members (objects assigned to the cluster of the centroid) and the distance 
     * function being used.
     * @return the centroid
    public static MixedCentroid computeMixedCentroid(final boolean preserveOrder,
            final NormalizableDistance distanceFunction, final Instances numericInstances,
            final Instances originalInstances, final int clusterIndex) {
        final int numInstances = numericInstances.numInstances();
        final int numAttributes = numericInstances.numAttributes();

        final Map<TupleSI, Integer> addedAttr = new HashMap<TupleSI, Integer>();

        if (numInstances == 1) {
            Instance uniqueNumInstance = numericInstances.firstInstance();
            Instance uniqueMixInstance = originalInstances.firstInstance();
            double[] centroid = uniqueNumInstance.toDoubleArray();
            for (int i = 0; i < uniqueMixInstance.numAttributes(); i++) {
                if (!uniqueMixInstance.attribute(i).isNumeric()) {
                    final String catVal = uniqueMixInstance.attribute(i).value((int) uniqueMixInstance.value(i));
                    addedAttr.put(new TupleSI(catVal, i), 1);
            return new MixedCentroid(clusterIndex, centroid, addedAttr);

        final double[] vals = new double[numAttributes];

        //used only for Manhattan Distance
        Instances sortedMembers = null;
        int middle = 0;
        boolean dataIsEven = false;

        final boolean isManhattanDist = (distanceFunction instanceof ManhattanDistance);
        final boolean isEuclideanDist = (distanceFunction instanceof EuclideanDistance);

        if (isManhattanDist) {
            middle = (numInstances - 1) / 2;
            dataIsEven = ((numInstances % 2) == 0);
            if (preserveOrder) {
                sortedMembers = numericInstances;
            } else {
                sortedMembers = new Instances(numericInstances);

        for (int j = 0; j < numAttributes; j++) {
            //in case of Euclidian distance the centroid is the mean point
            //in case of Manhattan distance the centroid is the median point
            //in both cases, if the attribute is nominal, the centroid is the mode            
            if (isEuclideanDist) {
                vals[j] = numericInstances.meanOrMode(j);

                for (int i = 0; i < numInstances; i++) {
                    if (!originalInstances.attribute(j).isNumeric()) {
                        final Instance instance = originalInstances.instance(i);
                        final String catVal = instance.attribute(j).value((int) instance.value(j));
                        //Initialize map
                        final TupleSI key = new TupleSI(catVal, j);
                        if (!addedAttr.containsKey(key))
                            addedAttr.put(key, 0);
                        addedAttr.put(key, addedAttr.get(key) + 1);
            } else if (isManhattanDist) {
                sortedMembers.kthSmallestValue(j, middle + 1);
                vals[j] = sortedMembers.instance(middle).value(j);
                if (dataIsEven) {
                    sortedMembers.kthSmallestValue(j, middle + 2);
                    vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
            } else {
                throw new IllegalStateException("Not handled distance ...");

        return new MixedCentroid(clusterIndex, vals, addedAttr);

     * @param instances
     * @param instance
    public static void removeFromInstances(Instances instances, Instance instance) {
        InstanceComparator cp = new InstanceComparator();
        for (int i = 0; i < instances.numInstances(); i++) {
            Instance cinstance = instances.instance(i);
            if (, instance) == 0) {

     * @param wekaClusterer
     * @param instances
     * @return
     * @throws Exception
    public static List<IndexedInstance> computeClusters(final Clusterer wekaClusterer, final Instances instances)
            throws Exception {
        final Instances ii = new Instances(instances);


        final ClusterEvaluation eval = new ClusterEvaluation();

        final int clustersCount = eval.getNumClusters();
        final List<IndexedInstance> clustersList = new ArrayList<IndexedInstance>(clustersCount);

        //Initialize instances
        for (int k = 0; k < clustersCount; k++) {
            clustersList.add(new IndexedInstance(new Instances(instances, 0), new HashMap<Integer, Integer>()));

        final double[] ass = eval.getClusterAssignments();
        if (ass.length != ii.numInstances())
            throw new IllegalStateException();
        for (int i = 0; i < ass.length; i++) {
            IndexedInstance idxi = clustersList.get((int) ass[i]);
            int pos = idxi.getInstances().size() - 1;
            idxi.getMapOrigIndex().put(pos, i);

        return clustersList;
