List of usage examples for weka.classifiers.rules DecisionTableHashKey DecisionTableHashKey
public DecisionTableHashKey(Instance t, int numAtts, boolean ignoreClass) throws Exception
From source file:adaptedClusteringAlgorithms.MySimpleKMeans.java
License:Open Source License
/** * Generates a clusterer. Has to initialize all fields of the clusterer that * are not being set via options./*from w w w. j a va2 s . c o m*/ * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been generated successfully */ @Override public void buildClusterer(Instances data) throws Exception { if (!SESAME.SESAME_GUI) MyFirstClusterer.weka_gui = true; // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_FullMissingCounts = new int[instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = new double[instances.numAttributes()]; } m_FullNominalCounts = new int[instances.numAttributes()][0]; m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false); for (int i = 0; i < instances.numAttributes(); i++) { m_FullMissingCounts[i] = instances.attributeStats(i).missingCount; if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(instances.variance(i)); } if (m_FullMissingCounts[i] == instances.numInstances()) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts; if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common // value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) { m_Assignments = clusterAssignments; } m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) { initInstances = new Instances(instances); } else { initInstances = instances; } for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } m_NumClusters = m_ClusterCentroids.numInstances(); // removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()]; while (!converged) { emptyClusterCount = 0; m_Iterations++; converged = true; for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, true); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true); } } if (m_Iterations == m_MaxIterations) { converged = true; } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index] = tempI[k]; for (i = 0; i < tempI[k].numAttributes(); i++) { m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i]; } index++; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (!converged) { m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new int[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = new double[instances.numAttributes()]; for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(tempI[i].variance(j)); } else { vals2[j] = Instance.missingValue(); } } m_ClusterStdDevs.add(new Instance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].numInstances(); } // Save memory!! m_DistanceFunction.clean(); if (!SESAME.SESAME_GUI) MyFirstClusterer.weka_gui = true; }
From source file:br.ufrn.ia.core.clustering.SimpleKMeansIaProject.java
License:Open Source License
public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0;// w w w .j a va2 s . c o m m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_FullMissingCounts = new int[instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = new double[instances.numAttributes()]; } m_FullNominalCounts = new int[instances.numAttributes()][0]; m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false); for (int i = 0; i < instances.numAttributes(); i++) { m_FullMissingCounts[i] = instances.attributeStats(i).missingCount; if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(instances.variance(i)); } if (m_FullMissingCounts[i] == instances.numInstances()) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing // as mean } } else { m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts; if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most // common value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) m_Assignments = clusterAssignments; m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) initInstances = new Instances(instances); else initInstances = instances; for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } m_NumClusters = m_ClusterCentroids.numInstances(); // removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()]; while (!converged) { emptyClusterCount = 0; m_Iterations++; converged = true; for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, true); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true); } } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index++] = tempI[k]; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (m_Iterations == m_MaxIterations) converged = true; if (!converged) { m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new int[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = new double[instances.numAttributes()]; for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(tempI[i].variance(j)); } else { vals2[j] = Utils.missingValue(); } } m_ClusterStdDevs.add(new DenseInstance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].numInstances(); } }
From source file:clusterer.SimpleKMeansWithSilhouette.java
License:Open Source License
/** * Generates a clusterer. Has to initialize all fields of the clusterer that * are not being set via options./* w w w .j av a 2 s. c o m*/ * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been generated successfully */ @Override public void buildClusterer(Instances data) throws Exception { m_canopyClusters = null; // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][]; m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = instances.variances(); } m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false); m_FullMissingCounts = m_ClusterMissingCounts[0]; m_FullNominalCounts = m_ClusterNominalCounts[0]; double sumOfWeights = instances.sumOfWeights(); for (int i = 0; i < instances.numAttributes(); i++) { if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]); } if (m_FullMissingCounts[i] == sumOfWeights) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common // value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) { m_Assignments = clusterAssignments; } m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap<DecisionTableHashKey, Integer> initC = new HashMap<DecisionTableHashKey, Integer>(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) { initInstances = new Instances(instances); } else { initInstances = instances; } if (m_speedUpDistanceCompWithCanopies) { m_canopyClusters = new Canopy(); m_canopyClusters.setNumClusters(m_NumClusters); m_canopyClusters.setSeed(getSeed()); m_canopyClusters.setT2(getCanopyT2()); m_canopyClusters.setT1(getCanopyT1()); m_canopyClusters.setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory()); m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate()); m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity()); m_canopyClusters.setDebug(getDebug()); m_canopyClusters.buildClusterer(initInstances); // System.err.println(m_canopyClusters); m_centroidCanopyAssignments = new ArrayList<long[]>(); m_dataPointCanopyAssignments = new ArrayList<long[]>(); } if (m_initializationMethod == KMEANS_PLUS_PLUS) { kMeansPlusPlusInit(initInstances); m_initialStartPoints = new Instances(m_ClusterCentroids); } else if (m_initializationMethod == CANOPY) { canopyInit(initInstances); m_initialStartPoints = new Instances(m_canopyClusters.getCanopies()); } else if (m_initializationMethod == FARTHEST_FIRST) { farthestFirstInit(initInstances); m_initialStartPoints = new Instances(m_ClusterCentroids); } else { // random for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } m_initialStartPoints = new Instances(m_ClusterCentroids); } if (m_speedUpDistanceCompWithCanopies) { // assign canopies to training data for (int i = 0; i < instances.numInstances(); i++) { m_dataPointCanopyAssignments.add(m_canopyClusters.assignCanopies(instances.instance(i))); } } m_NumClusters = m_ClusterCentroids.numInstances(); // removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()]; startExecutorPool(); while (!converged) { if (m_speedUpDistanceCompWithCanopies) { // re-assign canopies to the current cluster centers m_centroidCanopyAssignments.clear(); for (int kk = 0; kk < m_ClusterCentroids.numInstances(); kk++) { m_centroidCanopyAssignments .add(m_canopyClusters.assignCanopies(m_ClusterCentroids.instance(kk))); } } emptyClusterCount = 0; m_Iterations++; converged = true; if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) { for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, false, true, m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments.get(i) : null); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } } else { converged = launchAssignToClusters(instances, clusterAssignments); } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) { for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true, true); } } } else { emptyClusterCount = launchMoveCentroids(tempI); } if (m_Iterations == m_MaxIterations) { converged = true; } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index] = tempI[k]; for (i = 0; i < tempI[k].numAttributes(); i++) { m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i]; } index++; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (!converged) { m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0]; } } // calculate errors if (!m_FastDistanceCalc) { for (i = 0; i < instances.numInstances(); i++) { clusterProcessedInstance(instances.instance(i), true, false, null); } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new double[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = tempI[i].variances(); for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(vals2[j]); } else { vals2[j] = Utils.missingValue(); } } m_ClusterStdDevs.add(new DenseInstance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].sumOfWeights(); } m_executorPool.shutdown(); // save memory! m_DistanceFunction.clean(); // Calculate Silhouette Coefficient SilCoeff = new double[instances.numInstances()]; AvgSilCoeff = 0; for (int z = 0; z < instances.numInstances(); z++) { double[] distance = new double[m_NumClusters]; Arrays.fill(distance, 0.0); //Sum for (int y = 0; y < instances.numInstances(); y++) { distance[clusterAssignments[y]] += m_DistanceFunction.distance(instances.get(z), instances.get(y)); } //Average for (int x = 0; x < m_NumClusters; x++) { distance[x] = distance[x] / m_ClusterSizes[x]; } double a = distance[clusterAssignments[z]]; distance[clusterAssignments[z]] = Double.MAX_VALUE; Arrays.sort(distance); double b = distance[0]; SilCoeff[z] = (b - a) / Math.max(a, b); AvgSilCoeff += SilCoeff[z]; } AvgSilCoeff = AvgSilCoeff / instances.numInstances(); //System.out.println("AvgSilCoeff: " + AvgSilCoeff); }
From source file:clusterer.SimpleKMeansWithSilhouette.java
License:Open Source License
/** * Initialize using the k-means++ method * // w w w.j a v a 2 s .com * @param data the training data * @throws Exception if a problem occurs */ protected void kMeansPlusPlusInit(Instances data) throws Exception { Random randomO = new Random(getSeed()); HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>(); // choose initial center uniformly at random int index = randomO.nextInt(data.numInstances()); m_ClusterCentroids.add(data.instance(index)); DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true); initC.put(hk, null); int iteration = 0; int remainingInstances = data.numInstances() - 1; if (m_NumClusters > 1) { // proceed with selecting the rest // distances to the initial randomly chose center double[] distances = new double[data.numInstances()]; double[] cumProbs = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { distances[i] = m_DistanceFunction.distance(data.instance(i), m_ClusterCentroids.instance(iteration)); } // now choose the remaining cluster centers for (int i = 1; i < m_NumClusters; i++) { // distances converted to probabilities double[] weights = new double[data.numInstances()]; System.arraycopy(distances, 0, weights, 0, distances.length); Utils.normalize(weights); double sumOfProbs = 0; for (int k = 0; k < data.numInstances(); k++) { sumOfProbs += weights[k]; cumProbs[k] = sumOfProbs; } cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no // rounding issues // choose a random instance double prob = randomO.nextDouble(); for (int k = 0; k < cumProbs.length; k++) { if (prob < cumProbs[k]) { Instance candidateCenter = data.instance(k); hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true); if (!initC.containsKey(hk)) { initC.put(hk, null); m_ClusterCentroids.add(candidateCenter); } else { // we shouldn't get here because any instance that is a duplicate // of // an already chosen cluster center should have zero distance (and // hence // zero probability of getting chosen) to that center. System.err.println("We shouldn't get here...."); } remainingInstances--; break; } } iteration++; if (remainingInstances == 0) { break; } // prepare to choose the next cluster center. // check distances against the new cluster center to see if it is closer for (int k = 0; k < data.numInstances(); k++) { if (distances[k] > 0) { double newDist = m_DistanceFunction.distance(data.instance(k), m_ClusterCentroids.instance(iteration)); if (newDist < distances[k]) { distances[k] = newDist; } } } } } }
From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java
License:Open Source License
/** * Generates a clusterer. Has to initialize all fields of the clusterer * that are not being set via options./*from w w w . jav a 2 s . c o m*/ * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been * generated successfully */ public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_FullMissingCounts = new int[instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = new double[instances.numAttributes()]; } m_FullNominalCounts = new int[instances.numAttributes()][0]; m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false); for (int i = 0; i < instances.numAttributes(); i++) { m_FullMissingCounts[i] = instances.attributeStats(i).missingCount; if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(instances.variance(i)); } if (m_FullMissingCounts[i] == instances.numInstances()) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts; if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) m_Assignments = clusterAssignments; m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) initInstances = new Instances(instances); else initInstances = instances; if (m_initializeWithKMeansPlusPlus) { kMeansPlusPlusInit(initInstances); } else { for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } } m_NumClusters = m_ClusterCentroids.numInstances(); //removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()]; while (!converged) { emptyClusterCount = 0; m_Iterations++; converged = true; for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, false, true); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true); } } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index++] = tempI[k]; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (m_Iterations == m_MaxIterations) converged = true; if (!converged) { m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; } } // calculate errors if (!m_FastDistanceCalc) { for (i = 0; i < instances.numInstances(); i++) { clusterProcessedInstance(instances.instance(i), true, false); } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new int[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = new double[instances.numAttributes()]; for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(tempI[i].variance(j)); } else { vals2[j] = Utils.missingValue(); } } m_ClusterStdDevs.add(new DenseInstance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].numInstances(); } }
From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java
License:Open Source License
protected void kMeansPlusPlusInit(Instances data) throws Exception { Random randomO = new Random(getSeed()); HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>(); // choose initial center uniformly at random int index = randomO.nextInt(data.numInstances()); m_ClusterCentroids.add(data.instance(index)); DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true); initC.put(hk, null);//www .j a v a2s . com int iteration = 0; int remainingInstances = data.numInstances() - 1; if (m_NumClusters > 1) { // proceed with selecting the rest // distances to the initial randomly chose center double[] distances = new double[data.numInstances()]; double[] cumProbs = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { distances[i] = m_DistanceFunction.distance(data.instance(i), m_ClusterCentroids.instance(iteration)); } // now choose the remaining cluster centers for (int i = 1; i < m_NumClusters; i++) { // distances converted to probabilities double[] weights = new double[data.numInstances()]; System.arraycopy(distances, 0, weights, 0, distances.length); Utils.normalize(weights); double sumOfProbs = 0; for (int k = 0; k < data.numInstances(); k++) { sumOfProbs += weights[k]; cumProbs[k] = sumOfProbs; } cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no rounding issues // choose a random instance double prob = randomO.nextDouble(); for (int k = 0; k < cumProbs.length; k++) { if (prob < cumProbs[k]) { Instance candidateCenter = data.instance(k); hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true); if (!initC.containsKey(hk)) { initC.put(hk, null); m_ClusterCentroids.add(candidateCenter); } else { // we shouldn't get here because any instance that is a duplicate of // an already chosen cluster center should have zero distance (and hence // zero probability of getting chosen) to that center. System.err.println("We shouldn't get here...."); } remainingInstances--; break; } } iteration++; if (remainingInstances == 0) { break; } // prepare to choose the next cluster center. // check distances against the new cluster center to see if it is closer for (int k = 0; k < data.numInstances(); k++) { if (distances[k] > 0) { double newDist = m_DistanceFunction.distance(data.instance(k), m_ClusterCentroids.instance(iteration)); if (newDist < distances[k]) { distances[k] = newDist; } } } } } }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/** * Generates a clusterer. Has to initialize all fields of the clusterer that * are not being set via options.//from w w w . j a v a2s .c o m * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been generated successfully */ @Override public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_FullMissingCounts = new int[instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = new double[instances.numAttributes()]; } m_FullNominalCounts = new int[instances.numAttributes()][0]; m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false); for (int i = 0; i < instances.numAttributes(); i++) { m_FullMissingCounts[i] = instances.attributeStats(i).missingCount; if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(instances.variance(i)); } if (m_FullMissingCounts[i] == instances.numInstances()) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts; if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common // value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) { m_Assignments = clusterAssignments; } m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) { initInstances = new Instances(instances); } else { initInstances = instances; } for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } m_NumClusters = m_ClusterCentroids.numInstances(); // removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()]; while (!converged) { emptyClusterCount = 0; m_Iterations++; converged = true; for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, true); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true); } } if (m_Iterations == m_MaxIterations) { converged = true; } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index] = tempI[k]; for (i = 0; i < tempI[k].numAttributes(); i++) { m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i]; } index++; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (!converged) { m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new int[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = new double[instances.numAttributes()]; for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(tempI[i].variance(j)); } else { vals2[j] = Instance.missingValue(); } } m_ClusterStdDevs.add(new Instance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].numInstances(); } // Save memory!! m_DistanceFunction.clean(); }
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
/** * Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via * options.//from www .j a v a 2 s.c o m * * @param data * set of instances serving as training data * @throws Exception * if the clusterer has not been generated successfully */ @Override public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_FullMissingCounts = new int[instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = new double[instances.numAttributes()]; } m_FullNominalCounts = new int[instances.numAttributes()][0]; m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false, false); for (int i = 0; i < instances.numAttributes(); i++) { m_FullMissingCounts[i] = instances.attributeStats(i).missingCount; if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(instances.variance(i)); } if (m_FullMissingCounts[i] == instances.numInstances()) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts; if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common // value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) m_Assignments = clusterAssignments; m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) initInstances = new Instances(instances); else initInstances = instances; if (m_initializeWithKMeansPlusPlus) { kMeansPlusPlusInit(initInstances); } else { for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } } m_NumClusters = m_ClusterCentroids.numInstances(); // removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()]; startExecutorPool(); long start = System.currentTimeMillis(); while (!converged) { emptyClusterCount = 0; m_Iterations++; converged = true; System.out.print(new Date() + ": " + "Iter " + m_Iterations + " "); if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) { for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, true, true); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } } else { converged = launchAssignToClusters(instances, clusterAssignments); } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) { for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true, true); } } } else { emptyClusterCount = launchMoveCentroids(tempI); } if (m_Iterations == m_MaxIterations) converged = true; if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index++] = tempI[k]; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (!converged) { m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; } System.out.println("Sum of within cluster distances: " + Utils.sum(m_squaredErrors)); // reset erros to zero m_squaredErrors = new double[m_NumClusters]; } long end = System.currentTimeMillis(); System.out.println("\nClustering completed in " + (end - start) + " ms and converged in " + m_Iterations + " iterations"); // calculate errors if (!m_FastDistanceCalc) { for (i = 0; i < instances.numInstances(); i++) { clusterProcessedInstance(instances.instance(i), true, false); } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new int[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = new double[instances.numAttributes()]; for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(tempI[i].variance(j)); } else { vals2[j] = Utils.missingValue(); } } m_ClusterStdDevs.add(new DenseInstance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].numInstances(); } m_executorPool.shutdown(); }
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
protected void kMeansPlusPlusInit(Instances data) throws Exception { Random randomO = new Random(getSeed()); HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>(); // choose initial center uniformly at random int index = randomO.nextInt(data.numInstances()); m_ClusterCentroids.add(data.instance(index)); DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true); initC.put(hk, null);/*from w w w . j a va 2 s. c o m*/ int iteration = 0; int remainingInstances = data.numInstances() - 1; if (m_NumClusters > 1) { // proceed with selecting the rest // distances to the initial randomly chose center double[] distances = new double[data.numInstances()]; double[] cumProbs = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { distances[i] = m_DistanceFunction.distance(data.instance(i), m_ClusterCentroids.instance(iteration)); } // now choose the remaining cluster centers for (int i = 1; i < m_NumClusters; i++) { // distances converted to probabilities double[] weights = new double[data.numInstances()]; System.arraycopy(distances, 0, weights, 0, distances.length); Utils.normalize(weights); double sumOfProbs = 0; for (int k = 0; k < data.numInstances(); k++) { sumOfProbs += weights[k]; cumProbs[k] = sumOfProbs; } cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no // rounding issues // choose a random instance double prob = randomO.nextDouble(); for (int k = 0; k < cumProbs.length; k++) { if (prob < cumProbs[k]) { Instance candidateCenter = data.instance(k); hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true); if (!initC.containsKey(hk)) { initC.put(hk, null); m_ClusterCentroids.add(candidateCenter); } else { // we shouldn't get here because any instance that is a duplicate // of // an already chosen cluster center should have zero distance (and // hence // zero probability of getting chosen) to that center. System.err.println("We shouldn't get here...."); } remainingInstances--; break; } } iteration++; if (remainingInstances == 0) { break; } // prepare to choose the next cluster center. // check distances against the new cluster center to see if it is closer for (int k = 0; k < data.numInstances(); k++) { if (distances[k] > 0) { double newDist = m_DistanceFunction.distance(data.instance(k), m_ClusterCentroids.instance(iteration)); if (newDist < distances[k]) { distances[k] = newDist; } } } } } }
From source file:mulan.classifier.meta.ConstrainedKMeans.java
License:Open Source License
/** * Generates a clusterer. Has to initialize all fields of the clusterer * that are not being set via options./* w w w . j a v a 2s. c o m*/ * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been * generated successfully */ public void buildClusterer(Instances data) throws Exception { for (int i = 0; i < m_NumClusters; i++) { bucket[i] = new ArrayList<bucketInstance>(); } // calculate bucket size bucketSize = (int) Math.ceil(data.numInstances() / (double) m_NumClusters); //System.out.print("bucketSize = " + bucketSize + "\n"); // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); m_Min = new double[instances.numAttributes()]; m_Max = new double[instances.numAttributes()]; for (int i = 0; i < instances.numAttributes(); i++) { m_Min[i] = m_Max[i] = Double.NaN; } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; for (int i = 0; i < instances.numInstances(); i++) { updateMinMax(instances.instance(i)); } Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; for (int j = instances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(instances.instance(instIndex), instances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(instances.instance(instIndex)); initC.put(hk, null); } instances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } m_NumClusters = m_ClusterCentroids.numInstances(); int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; while (!converged) { // reset buckets for (int j = 0; j < m_NumClusters; j++) { bucket[j] = new ArrayList<bucketInstance>(); } emptyClusterCount = 0; m_Iterations++; //System.out.println(">>Iterations: "+m_Iterations); converged = true; for (i = 0; i < instances.numInstances(); i++) { //System.out.println("processing instance: " + i); Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, true); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } if (m_Iterations > maxIterations) { converged = true; } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } for (i = 0; i < m_NumClusters; i++) { double[] vals = new double[instances.numAttributes()]; if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { for (int j = 0; j < instances.numAttributes(); j++) { vals[j] = tempI[i].meanOrMode(j); m_ClusterNominalCounts[i][j] = tempI[i].attributeStats(j).nominalCounts; } m_ClusterCentroids.add(new DenseInstance(1.0, vals)); } //System.out.println("centroid: " + i + " " + m_ClusterCentroids.instance(i).toString()); } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; tempI = new Instances[m_NumClusters]; } if (!converged) { m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; } } // reset buckets for (int j = 0; j < m_NumClusters; j++) { bucket[j] = new ArrayList<bucketInstance>(); } m_ClusterStdDevs = new Instances(instances, m_NumClusters); m_ClusterSizes = new int[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { double[] vals2 = new double[instances.numAttributes()]; for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(tempI[i].variance(j)); } else { vals2[j] = Utils.missingValue(); } } m_ClusterStdDevs.add(new DenseInstance(1.0, vals2)); m_ClusterSizes[i] = tempI[i].numInstances(); } }