List of usage examples for weka.core Instances kthSmallestValue
public double kthSmallestValue(int attIndex, int k)
From source file:br.ufrn.ia.core.clustering.SimpleKMeansIaProject.java
License:Open Source License
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; // used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; if (m_DistanceFunction instanceof ManhattanDistance) { middle = (members.numInstances() - 1) / 2; dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members;//from ww w.j a v a 2s .com } else { sortedMembers = new Instances(members); } } for (int j = 0; j < members.numAttributes(); j++) { // in case of Euclidian distance the centroid is the mean point // in case of Manhattan distance the centroid is the median point // in both cases, if the attribute is nominal, the centroid is the // mode if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) { vals[j] = members.meanOrMode(j); } else if (m_DistanceFunction instanceof ManhattanDistance) { // singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { sortedMembers.kthSmallestValue(j, middle + 1); vals[j] = sortedMembers.instance(middle).value(j); if (dataIsEven) { sortedMembers.kthSmallestValue(j, middle + 2); vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2; } } } if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Utils.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Utils.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) m_ClusterCentroids.add(new DenseInstance(1.0, vals)); return vals; }
From source file:clusterer.SimpleKMeansWithSilhouette.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * /* w ww . j a v a 2s . c om*/ * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @param addToCentroidInstances true if the method is to add the computed * coordinates to the Instances holding the centroids * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo, boolean addToCentroidInstances) { double[] vals = new double[members.numAttributes()]; double[][] nominalDists = new double[members.numAttributes()][]; double[] weightMissing = new double[members.numAttributes()]; double[] weightNonMissing = new double[members.numAttributes()]; // Quickly calculate some relevant statistics for (int j = 0; j < members.numAttributes(); j++) { if (members.attribute(j).isNominal()) { nominalDists[j] = new double[members.attribute(j).numValues()]; } } for (Instance inst : members) { for (int j = 0; j < members.numAttributes(); j++) { if (inst.isMissing(j)) { weightMissing[j] += inst.weight(); } else { weightNonMissing[j] += inst.weight(); if (members.attribute(j).isNumeric()) { vals[j] += inst.weight() * inst.value(j); // Will be overwritten in Manhattan case } else { nominalDists[j][(int) inst.value(j)] += inst.weight(); } } } } for (int j = 0; j < members.numAttributes(); j++) { if (members.attribute(j).isNumeric()) { if (weightNonMissing[j] > 0) { vals[j] /= weightNonMissing[j]; } else { vals[j] = Utils.missingValue(); } } else { double max = -Double.MAX_VALUE; double maxIndex = -1; for (int i = 0; i < nominalDists[j].length; i++) { if (nominalDists[j][i] > max) { max = nominalDists[j][i]; maxIndex = i; } if (max < weightMissing[j]) { vals[j] = Utils.missingValue(); } else { vals[j] = maxIndex; } } } } if (m_DistanceFunction instanceof ManhattanDistance) { // Need to replace means by medians Instances sortedMembers = null; int middle = (members.numInstances() - 1) / 2; boolean dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } for (int j = 0; j < members.numAttributes(); j++) { if ((weightNonMissing[j] > 0) && members.attribute(j).isNumeric()) { // singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { vals[j] = sortedMembers.kthSmallestValue(j, middle + 1); if (dataIsEven) { vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2; } } } } } if (updateClusterInfo) { for (int j = 0; j < members.numAttributes(); j++) { m_ClusterMissingCounts[centroidIndex][j] = weightMissing[j]; m_ClusterNominalCounts[centroidIndex][j] = nominalDists[j]; } } if (addToCentroidInstances) { m_ClusterCentroids.add(new DenseInstance(1.0, vals)); } return vals; }
From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid coordinates based * on it's members (objects assigned to the cluster of the centroid) and the distance * function being used./*from w ww .j ava 2s. co m*/ * @param centroidIndex index of the centroid which the coordinates will be computed * @param members the objects that are assigned to the cluster of this centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; //used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; if (m_DistanceFunction instanceof ManhattanDistance) { middle = (members.numInstances() - 1) / 2; dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } } for (int j = 0; j < members.numAttributes(); j++) { //in case of Euclidian distance the centroid is the mean point //in case of Manhattan distance the centroid is the median point //in both cases, if the attribute is nominal, the centroid is the mode if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) { vals[j] = members.meanOrMode(j); } else if (m_DistanceFunction instanceof ManhattanDistance) { //singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { sortedMembers.kthSmallestValue(j, middle + 1); vals[j] = sortedMembers.instance(middle).value(j); if (dataIsEven) { sortedMembers.kthSmallestValue(j, middle + 2); vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2; } } } if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Utils.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Utils.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) m_ClusterCentroids.add(new DenseInstance(1.0, vals)); return vals; }
From source file:de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java
License:Apache License
/** * <p>/*from www. j a va2 s .co m*/ * Applies the CLAMI processor to the data. The test data is also required, in order to * guarantee a consistent metric set. * </p> * * @param testdata * test data; the data is not modified, only metrics are dropped * @param data * data to which the CLAMI processor is applied */ public void applyCLAMI(Instances testdata, Instances data) { // first determine medians double[] medians = new double[data.numAttributes()]; // get medians for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1); } } // now determine cluster number for each instance double[] clusterNumber = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { int countHighValues = 0; Instance currentInstance = data.get(i); for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { if (currentInstance.value(j) > medians[j]) { countHighValues++; } } } clusterNumber[i] = countHighValues; } // determine median of cluster number Median m = new Median(); double medianClusterNumber = m.evaluate(clusterNumber); // now we filter the metrics int[] numMetricViolations = new int[data.numAttributes()]; for (int j = 0; j < data.numAttributes(); j++) { int currentViolations = 0; for (int i = 0; i < data.numInstances(); i++) { Instance currentInstance = data.get(i); if (j != data.classIndex()) { if (clusterNumber[i] > medianClusterNumber) { // "buggy" if (currentInstance.value(j) <= medians[j]) { currentViolations++; } } else { // "not buggy" if (currentInstance.value(j) > medians[j]) { currentViolations++; } } } } numMetricViolations[j] = currentViolations; } SortedSet<Integer> distinctViolationCounts = new TreeSet<>(); for (int currentViolations : numMetricViolations) { distinctViolationCounts.add(currentViolations); } Iterator<Integer> violationCountInterator = distinctViolationCounts.iterator(); int violationCutoff = violationCountInterator.next(); // now we filter the data; // this is first tried with the metrics with fewest violations. if no buggy/bugfree // instances remain, this is repeated with the next metrics with second fewest violations, // and so on. // this part is a bit unclear from the description in the paper, but I confirmed with the // author that this is how they implemented it boolean[] cleanInstances = new boolean[data.numInstances()]; int numCleanBuggyInstances = 0; int numCleanBugfreeInstances = 0; do { violationCutoff = violationCountInterator.next(); cleanInstances = new boolean[data.numInstances()]; numCleanBuggyInstances = 0; numCleanBugfreeInstances = 0; for (int i = 0; i < data.numInstances(); i++) { int currentViolations = 0; Instance currentInstance = data.get(i); for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) { if (clusterNumber[i] > medianClusterNumber) { // "buggy" if (currentInstance.value(j) <= medians[j]) { currentViolations++; } } else { // "not buggy" if (currentInstance.value(j) > medians[j]) { currentViolations++; } } } } if (currentViolations == 0) { cleanInstances[i] = true; if (clusterNumber[i] > medianClusterNumber) { numCleanBuggyInstances++; } else { numCleanBugfreeInstances++; } } else { cleanInstances[i] = false; } } } while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0); // output some interesting information to provide insights into the CLAMI model Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: "); for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) { Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]); } } // finally modify the instances // drop the metrics (also from the testdata) for (int j = data.numAttributes() - 1; j >= 0; j--) { if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) { data.deleteAttributeAt(j); testdata.deleteAttributeAt(j); } } // drop the unclean instances for (int i = data.numInstances() - 1; i >= 0; i--) { if (!cleanInstances[i]) { data.delete(i); } else { // set the classification if (clusterNumber[i] > medianClusterNumber) { data.get(i).setClassValue(1.0d); } else { data.get(i).setClassValue(0.0d); } } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.CLAProcessor.java
License:Apache License
/** * Applies the CLA processor the the data. * //from ww w . j av a 2 s.co m * @param data * data to which the processor is applied */ public void applyCLA(Instances data) { // first determine medians double[] medians = new double[data.numAttributes()]; // get medians for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1); } } // now determine cluster number for each instance double[] clusterNumber = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { int countHighValues = 0; Instance currentInstance = data.get(i); for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { if (currentInstance.value(j) > medians[j]) { countHighValues++; } } } clusterNumber[i] = countHighValues; } // determine median of cluster number Median m = new Median(); double medianClusterNumber = m.evaluate(Arrays.stream(clusterNumber).distinct().toArray()); // finally modify the instances // drop the unclean instances for (int i = data.numInstances() - 1; i >= 0; i--) { // set the classification if (clusterNumber[i] > medianClusterNumber) { data.get(i).setClassValue(1.0d); } else { data.get(i).setClassValue(0.0d); } } }
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/** * <p>/* w w w.jav a 2s . c o m*/ * Gets an array with the ranges from the data for a given attribute * </p> * * @param data * the data * @param j * index of the attribute * @return the ranges for the attribute */ private double[] getRanges(Instances data, int j) { double[] values = new double[numRanges + 1]; for (int k = 0; k < numRanges; k++) { values[k] = data.kthSmallestValue(j, (int) (data.size() * (k + 1.0) / numRanges)); } values[numRanges] = data.attributeStats(j).numericStats.max; return values; }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * /*from ww w .j a v a 2 s . c o m*/ * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; // used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; if (m_DistanceFunction instanceof ManhattanDistance || m_DistanceFunction instanceof CustomPairWiseDistance) { middle = (members.numInstances() - 1) / 2; dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } } for (int j = 0; j < members.numAttributes(); j++) { // in case of Euclidian distance the centroid is the mean point // in case of Manhattan distance the centroid is the median point // in both cases, if the attribute is nominal, the centroid is the mode if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) { vals[j] = members.meanOrMode(j); } else if (m_DistanceFunction instanceof ManhattanDistance || m_DistanceFunction instanceof CustomPairWiseDistance) { // singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { vals[j] = sortedMembers.kthSmallestValue(j, middle + 1); if (dataIsEven) { vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2; } } } if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Instance.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Instance.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) { m_ClusterCentroids.add(new Instance(1.0, vals)); } return vals; }
From source file:elh.eus.absa.CLI.java
License:Open Source License
/** * Main access to the train-atc functionalities. Train ATC using a double one vs. all classifier * (E and A) for E#A aspect categories/*from w ww .j a v a2 s . c om*/ * @throws Exception */ public final void trainATC2(final InputStream inputStream) throws IOException { // load training parameters file String paramFile = parsedArguments.getString("params"); String testFile = parsedArguments.getString("testset"); String paramFile2 = parsedArguments.getString("params2"); String corpusFormat = parsedArguments.getString("corpusFormat"); //String validation = parsedArguments.getString("validation"); String lang = parsedArguments.getString("language"); //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum")); //boolean printPreds = parsedArguments.getBoolean("printPreds"); boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences"); boolean onlyTest = parsedArguments.getBoolean("testOnly"); double threshold = 0.5; double threshold2 = 0.5; String modelsPath = "/home/inaki/elixa-atp/ovsaModels"; CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang); Features atcTrain = new Features(reader, paramFile, "3"); Instances traindata = atcTrain.loadInstances(true, "atc"); if (onlyTest) { if (FileUtilsElh.checkFile(testFile)) { System.err.println("read from test file"); reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat, nullSentenceOpinions, lang); atcTrain.setCorpus(reader); traindata = atcTrain.loadInstances(true, "atc"); } } //setting class attribute (entCat|attCat|entAttCat|polarityCat) //HashMap<String, Integer> opInst = atcTrain.getOpinInst(); //WekaWrapper classifyAtts; WekaWrapper onevsall; try { //classify.printMultilabelPredictions(classify.multiLabelPrediction()); */ //onevsall Instances entdata = new Instances(traindata); entdata.deleteAttributeAt(entdata.attribute("attCat").index()); entdata.deleteAttributeAt(entdata.attribute("entAttCat").index()); entdata.setClassIndex(entdata.attribute("entCat").index()); onevsall = new WekaWrapper(entdata, true); if (!onlyTest) { onevsall.trainOneVsAll(modelsPath, paramFile + "entCat"); System.out.println("trainATC: one vs all models ready"); } onevsall.setTestdata(entdata); HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entCat"); System.out.println("trainATC: one vs all predictions ready"); HashMap<Integer, String> instOps = new HashMap<Integer, String>(); for (String oId : atcTrain.getOpinInst().keySet()) { instOps.put(atcTrain.getOpinInst().get(oId), oId); } atcTrain = new Features(reader, paramFile2, "3"); entdata = atcTrain.loadInstances(true, "attTrain2_data"); entdata.deleteAttributeAt(entdata.attribute("entAttCat").index()); //entdata.setClassIndex(entdata.attribute("entCat").index()); Attribute insAtt = entdata.attribute("instanceId"); double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1); System.err.println("last instance has index: " + maxInstId); for (int ins = 0; ins < entdata.numInstances(); ins++) { System.err.println("ins" + ins); int i = (int) entdata.instance(ins).value(insAtt); Instance currentInst = entdata.instance(ins); //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i)); String sId = reader.getOpinion(instOps.get(i)).getsId(); String oId = instOps.get(i); reader.removeSentenceOpinions(sId); int oSubId = 0; for (String cl : ovsaRes.get(i).keySet()) { //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl)); if (ovsaRes.get(i).get(cl) > threshold) { //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl)); // for the first one update the instances if (oSubId >= 1) { Instance newIns = new SparseInstance(currentInst); newIns.setDataset(entdata); entdata.add(newIns); newIns.setValue(insAtt, maxInstId + oSubId); newIns.setClassValue(cl); instOps.put((int) maxInstId + oSubId, oId); } // if the are more create new instances else { currentInst.setClassValue(cl); //create and add opinion to the structure // trgt, offsetFrom, offsetTo, polarity, cat, sId); //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId); //reader.addOpinion(op); } oSubId++; } } //finished updating instances data } entdata.setClass(entdata.attribute("attCat")); onevsall = new WekaWrapper(entdata, true); /** * Bigarren sailkatzailea * * */ if (!onlyTest) { onevsall.trainOneVsAll(modelsPath, paramFile + "attCat"); System.out.println("trainATC: one vs all attcat models ready"); } ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat"); insAtt = entdata.attribute("instanceId"); maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues()); System.err.println("last instance has index: " + maxInstId); for (int ins = 0; ins < entdata.numInstances(); ins++) { System.err.println("ins: " + ins); int i = (int) entdata.instance(ins).value(insAtt); Instance currentInst = entdata.instance(ins); //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i)); String sId = reader.getOpinion(instOps.get(i)).getsId(); String oId = instOps.get(i); reader.removeSentenceOpinions(sId); int oSubId = 0; for (String cl : ovsaRes.get(i).keySet()) { //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl)); if (ovsaRes.get(i).get(cl) > threshold2) { ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl)); if (ovsaRes.get(i).get(cl) > threshold) { //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl)); // for the first one update the instances if (oSubId >= 1) { String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl; //create and add opinion to the structure // trgt, offsetFrom, offsetTo, polarity, cat, sId); Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId); reader.addOpinion(op); } // if the are more create new instances else { String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl; //create and add opinion to the structure // trgt, offsetFrom, offsetTo, polarity, cat, sId); reader.removeOpinion(oId); Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId); reader.addOpinion(op); } oSubId++; } } //finished updating instances data } } reader.print2Semeval2015format(paramFile + "entAttCat.xml"); } catch (Exception e) { e.printStackTrace(); } //traindata.setClass(traindata.attribute("entAttCat")); System.err.println("DONE CLI train-atc2 (oneVsAll)"); }
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid coordinates based on it's members * (objects assigned to the cluster of the centroid) and the distance function being used. * /*from w w w.java2 s . co m*/ * @param centroidIndex * index of the centroid which the coordinates will be computed * @param members * the objects that are assigned to the cluster of this centroid * @param updateClusterInfo * if the method is supposed to update the m_Cluster arrays * @param addToCentroidInstances * true if the method is to add the computed coordinates to the Instances holding the centroids * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo, boolean addToCentroidInstances) { double[] vals = new double[members.numAttributes()]; // used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; if (m_DistanceFunction instanceof ManhattanDistance) { middle = (members.numInstances() - 1) / 2; dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } } for (int j = 0; j < members.numAttributes(); j++) { // in case of Euclidian distance the centroid is the mean point // in case of Manhattan distance the centroid is the median point // in both cases, if the attribute is nominal, the centroid is the mode if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) { vals[j] = members.meanOrMode(j); } else if (m_DistanceFunction instanceof ManhattanDistance) { // singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { vals[j] = sortedMembers.kthSmallestValue(j, middle + 1); if (dataIsEven) { vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2; } } } if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Utils.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Utils.missingValue(); // mark mean as missing } } } } if (addToCentroidInstances) { m_ClusterCentroids.add(new DenseInstance(1.0, vals)); } return vals; }
From source file:lu.lippmann.cdb.lab.beta.util.WekaUtil2.java
License:Open Source License
/** * Generate the centroid coordinates based * on it's members (objects assigned to the cluster of the centroid) and the distance * function being used.//from ww w. j a v a 2 s .c o m * @return the centroid */ public static MixedCentroid computeMixedCentroid(final boolean preserveOrder, final NormalizableDistance distanceFunction, final Instances numericInstances, final Instances originalInstances, final int clusterIndex) { final int numInstances = numericInstances.numInstances(); final int numAttributes = numericInstances.numAttributes(); final Map<TupleSI, Integer> addedAttr = new HashMap<TupleSI, Integer>(); if (numInstances == 1) { Instance uniqueNumInstance = numericInstances.firstInstance(); Instance uniqueMixInstance = originalInstances.firstInstance(); double[] centroid = uniqueNumInstance.toDoubleArray(); for (int i = 0; i < uniqueMixInstance.numAttributes(); i++) { if (!uniqueMixInstance.attribute(i).isNumeric()) { final String catVal = uniqueMixInstance.attribute(i).value((int) uniqueMixInstance.value(i)); addedAttr.put(new TupleSI(catVal, i), 1); } } return new MixedCentroid(clusterIndex, centroid, addedAttr); } final double[] vals = new double[numAttributes]; //used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; final boolean isManhattanDist = (distanceFunction instanceof ManhattanDistance); final boolean isEuclideanDist = (distanceFunction instanceof EuclideanDistance); if (isManhattanDist) { middle = (numInstances - 1) / 2; dataIsEven = ((numInstances % 2) == 0); if (preserveOrder) { sortedMembers = numericInstances; } else { sortedMembers = new Instances(numericInstances); } } for (int j = 0; j < numAttributes; j++) { //in case of Euclidian distance the centroid is the mean point //in case of Manhattan distance the centroid is the median point //in both cases, if the attribute is nominal, the centroid is the mode if (isEuclideanDist) { vals[j] = numericInstances.meanOrMode(j); for (int i = 0; i < numInstances; i++) { if (!originalInstances.attribute(j).isNumeric()) { final Instance instance = originalInstances.instance(i); final String catVal = instance.attribute(j).value((int) instance.value(j)); //Initialize map final TupleSI key = new TupleSI(catVal, j); if (!addedAttr.containsKey(key)) addedAttr.put(key, 0); addedAttr.put(key, addedAttr.get(key) + 1); } } } else if (isManhattanDist) { sortedMembers.kthSmallestValue(j, middle + 1); vals[j] = sortedMembers.instance(middle).value(j); if (dataIsEven) { sortedMembers.kthSmallestValue(j, middle + 2); vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2; } } else { throw new IllegalStateException("Not handled distance ..."); } } return new MixedCentroid(clusterIndex, vals, addedAttr); }