List of usage examples for weka.core Instances numInstances
public int numInstances()
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
/**
 * The number of instances is a multiple of the number of folds.
 * From a set of instances, it returns a chunk whose length is instances.numInstances() / numberOfFolds,
 * with index = index. Index starts at zero.
 *
 * In essence, this is used when splitting a partition of instances into a train and a test set.
 * One chunk is the test set and the rest is the train set.
 * We provide the index for the test set and the rest automatically becomes the train set;
 * see splitPartitionIntoFolds.
 *
 *  _____
 * |_6_| index = 0
 * |_6_| 1
 * |_6_| 2
 * |_6_| 3
 * |_6_| 4
 * |_6_| 5
 * |_6_| 6
 * |_6_| 7
 * |_6_| 8
 * |_6_| 9
 */
public static Instances getPartitionSegment(Instances instances, int index, int numberOfFolds) {
    if (instances.numInstances() % numberOfFolds != 0) {
        System.out.println("Number of instances not a multiple of " + numberOfFolds);
        return null;
    }
    int numberOfInstancesToGet = instances.numInstances() / numberOfFolds;
    Instances segment = new Instances(instances, numberOfInstancesToGet);
    for (int i = index * numberOfInstancesToGet; i < (index + 1) * numberOfInstancesToGet; i++) {
        segment.add(instances.instance(i));
    }
    return segment;
}
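A minimal sketch of how getPartitionSegment might be driven. The toy dataset built here (a single numeric attribute, 60 rows, 10 folds) is an assumption for illustration, and it presumes the InstancesUtility class shown above is on the classpath:

import java.util.ArrayList;
import gr.auth.ee.lcs.utilities.InstancesUtility;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class GetPartitionSegmentDemo {
    public static void main(String[] args) {
        // Build a toy dataset of 60 instances with one numeric attribute (illustrative data).
        ArrayList<Attribute> attrs = new ArrayList<Attribute>();
        attrs.add(new Attribute("x"));
        Instances data = new Instances("toy", attrs, 60);
        for (int i = 0; i < 60; i++) {
            data.add(new DenseInstance(1.0, new double[] { i }));
        }
        // 60 instances split into 10 folds: each segment holds data.numInstances() / 10 = 6 rows.
        Instances testFold = InstancesUtility.getPartitionSegment(data, 0, 10);
        System.out.println(testFold.numInstances() + " of " + data.numInstances() + " instances in segment 0");
    }
}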
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
/**
 * Splits the .arff input dataset into |number-of-distinct-label-combinations| Instances,
 * which are stored in the partitions[] array.
 * Called by initializePopulation() as a preparatory step to clustering.
 *
 * @throws Exception
 */
public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs, final String filename)
        throws Exception {

    // Open .arff
    final Instances set = InstancesUtility.openInstance(filename);
    if (set.classIndex() < 0) {
        set.setClassIndex(set.numAttributes() - 1);
    }
    //set.randomize(new Random());

    int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1);

    // the partitions vector holds the indices
    String stringsArray[] = new String[lcs.instances.length];
    int indicesArray[] = new int[lcs.instances.length];

    // convert each instance's labelset into a string and store it in the stringsArray array
    for (int i = 0; i < set.numInstances(); i++) {
        stringsArray[i] = "";
        indicesArray[i] = i;
        for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
            stringsArray[i] += (int) set.instance(i).value(j);
        }
    }

    // contains the indicesVector(s)
    Vector<Vector> mothershipVector = new Vector<Vector>();

    String baseString = "";
    for (int i = 0; i < set.numInstances(); i++) {
        baseString = stringsArray[i];
        if (baseString.equals(""))
            continue;
        Vector<Integer> indicesVector = new Vector<Integer>();
        for (int j = 0; j < set.numInstances(); j++) {
            if (baseString.equals(stringsArray[j])) {
                stringsArray[j] = "";
                indicesVector.add(j);
            }
        }
        mothershipVector.add(indicesVector);
    }

    Instances[] partitions = new Instances[mothershipVector.size()];
    for (int i = 0; i < mothershipVector.size(); i++) {
        partitions[i] = new Instances(set, mothershipVector.elementAt(i).size());
        for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) {
            Instance instanceToAdd = set.instance((Integer) mothershipVector.elementAt(i).elementAt(j));
            partitions[i].add(instanceToAdd);
        }
    }
    /*
     * Up to here, the partitions array has been formed. It contains the dataset split by label combinations.
     * It holds both the attributes and the labels, but for clustering the input should only be the attributes,
     * so we need to delete the labels. This is taken care of by initializePopulation().
     */
    return partitions;
}
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs, final Instances trainSet)
        throws Exception {

    // Open .arff
    final Instances set = trainSet;
    if (set.classIndex() < 0) {
        set.setClassIndex(set.numAttributes() - 1);
    }
    //set.randomize(new Random());

    int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1);

    // the partitions vector holds the indices
    String stringsArray[] = new String[trainSet.numInstances()];
    int indicesArray[] = new int[trainSet.numInstances()];

    // convert each instance's labelset into a string and store it in the stringsArray array
    for (int i = 0; i < set.numInstances(); i++) {
        stringsArray[i] = "";
        indicesArray[i] = i;
        for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
            stringsArray[i] += (int) set.instance(i).value(j);
        }
    }

    // contains the indicesVector(s)
    Vector<Vector> mothershipVector = new Vector<Vector>();

    String baseString = "";
    for (int i = 0; i < set.numInstances(); i++) {
        baseString = stringsArray[i];
        if (baseString.equals(""))
            continue;
        Vector<Integer> indicesVector = new Vector<Integer>();
        for (int j = 0; j < set.numInstances(); j++) {
            if (baseString.equals(stringsArray[j])) {
                stringsArray[j] = "";
                indicesVector.add(j);
            }
        }
        mothershipVector.add(indicesVector);
    }

    Instances[] partitions = new Instances[mothershipVector.size()];
    for (int i = 0; i < mothershipVector.size(); i++) {
        partitions[i] = new Instances(set, mothershipVector.elementAt(i).size());
        for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) {
            Instance instanceToAdd = set.instance((Integer) mothershipVector.elementAt(i).elementAt(j));
            partitions[i].add(instanceToAdd);
        }
    }
    /*
     * Up to here, the partitions array has been formed. It contains the dataset split by label combinations.
     * It holds both the attributes and the labels, but for clustering the input should only be the attributes,
     * so we need to delete the labels. This is taken care of by initializePopulation().
     */
    return partitions;
}
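The two overloads above depend on the LCS framework (AbstractLearningClassifierSystem, SettingsLoader). Below is a dependency-free sketch of the same grouping idea — bucketing instances by their label-combination string using only the Weka API. The file name and the assumption that the last four attributes are labels are illustrative, not from the original source:

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Map;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class LabelCombinationGrouping {
    public static void main(String[] args) throws Exception {
        Instances set = DataSource.read("multilabel.arff"); // hypothetical multi-label ARFF
        int numberOfLabels = 4;                              // assumed: labels are the last 4 attributes

        // Group instance indices by their label-combination string, as partitionInstances() does.
        Map<String, ArrayList<Integer>> groups = new LinkedHashMap<String, ArrayList<Integer>>();
        for (int i = 0; i < set.numInstances(); i++) {
            StringBuilder key = new StringBuilder();
            for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
                key.append((int) set.instance(i).value(j));
            }
            groups.computeIfAbsent(key.toString(), k -> new ArrayList<Integer>()).add(i);
        }

        // One Instances partition per distinct label combination.
        Instances[] partitions = new Instances[groups.size()];
        int p = 0;
        for (ArrayList<Integer> indices : groups.values()) {
            partitions[p] = new Instances(set, indices.size());
            for (int idx : indices) {
                partitions[p].add(set.instance(idx));
            }
            p++;
        }
        System.out.println(partitions.length + " label combinations in " + set.numInstances() + " instances");
    }
}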
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
public static void splitDatasetIntoFolds(final AbstractLearningClassifierSystem lcs, final Instances dataset,
        final int numberOfFolds) throws Exception {

    Instances[] partitions = InstancesUtility.partitionInstances(lcs, dataset);

    testInstances.setSize(partitions.length);
    trainInstances.setSize(partitions.length);

    int lowerBound = (int) Math.floor((double) dataset.numInstances() / (double) numberOfFolds);
    int upperBound = (int) Math.ceil((double) dataset.numInstances() / (double) numberOfFolds);
    // we demand lowerBound <= numberOfTestInstancesPerFold[i] <= upperBound
    int[] numberOfTestInstancesPerFold = new int[numberOfFolds];

    /*
     * Let X partitions have partitions[i].numInstances() > numberOfFolds.
     * Then the vectors testInstances and trainInstances, after the call of splitPartitionIntoFolds(),
     * will hold X arrays, meaning X elements.
     */
    Vector<Integer> vectorOfPartitionIndices = new Vector<Integer>();

    for (int i = 0; i < partitions.length; i++) {
        if (partitions[i].numInstances() > numberOfFolds) {
            InstancesUtility.splitPartitionIntoFolds(partitions[i], numberOfFolds, i);
            vectorOfPartitionIndices.add(i);
        } else {
            Instances[] emptyArrayTest = new Instances[numberOfFolds];
            Instances[] emptyArrayTrain = new Instances[numberOfFolds];
            for (int j = 0; j < numberOfFolds; j++) {
                emptyArrayTest[j] = new Instances(partitions[0], partitions[i].numInstances());
                emptyArrayTrain[j] = new Instances(partitions[0], partitions[i].numInstances());
            }
            // placeholders
            InstancesUtility.testInstances.add(i, emptyArrayTest);
            InstancesUtility.trainInstances.add(i, emptyArrayTrain);
        }
    }

    /*
     * At this point all partitions with numInstances > numFolds have successfully been split.
     * What is left is splitting the leftovers: first from the above partitions and second from the ones
     * that originally had numInstances < numFolds.
     */
    for (int i = 0; i < numberOfFolds; i++) {
        int instancesSum = 0;
        for (int j = 0; j < vectorOfPartitionIndices.size(); j++) {
            instancesSum += InstancesUtility.testInstances.elementAt(vectorOfPartitionIndices.elementAt(j))[i]
                    .numInstances();
        }
        // initial number of instances in test set per fold
        numberOfTestInstancesPerFold[i] = instancesSum;
    }

    /*
     * i = 0 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 1 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 2 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 3 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 4 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 5 |_1|_1|_1|_1|_1|_1|_1|_1|_1|_1|
     * i = 6 |_3|_3|_3|_3|_3|_3|_3|_3|_3|_3|
     * i = 7 |_6|_6|_6|_6|_6|_6|_6|_6|_6|_6|
     */
    for (int i = 0; i < partitions.length; i++) {
        int numberOfLeftoverInstances = partitions[i].numInstances() % numberOfFolds; // e.g. 64 % 10 = 4
        Instances leftoverInstances = new Instances(partitions[i], numberOfLeftoverInstances);

        if (numberOfLeftoverInstances > 0) {
            /*
             * Starting from the end. Anyhow, they are the last {numberOfLeftoverInstances} instances in each
             * partition that splitPartitionIntoFolds() has been called on.
             */
            for (int k = partitions[i].numInstances() - 1; k >= partitions[i].numInstances()
                    - numberOfLeftoverInstances; k--) {
                leftoverInstances.add(partitions[i].instance(k));
            }

            /*
             * For each partition, randomize the folds. Leftover instances will be placed in the first
             * {numberOfLeftoverInstances} folds, which are already randomly distributed. If the first folds
             * were not randomly distributed, there would be an uneven distribution, meaning that the first
             * ones would hold instances of the first partition and so on.
             */
            ArrayList<Integer> folds = new ArrayList<Integer>();
            for (int k = 0; k < numberOfFolds; k++) {
                folds.add(k);
            }
            Collections.shuffle(folds);

            int j = 0;
            while (leftoverInstances.numInstances() > 0) {
                int foldIndex = folds.get(j);
                if (numberOfTestInstancesPerFold[foldIndex] < upperBound) {
                    Instance toBeAdded = leftoverInstances.instance(0);
                    // place the first leftover instance in a test set
                    testInstances.elementAt(i)[foldIndex].add(toBeAdded);
                    numberOfTestInstancesPerFold[foldIndex]++;
                    // the instance placed in the test set for the current fold needs to be put in the train
                    // set for all the other folds, except of course for the current one
                    for (int k = 0; k < numberOfFolds; k++) {
                        if (k != foldIndex) {
                            trainInstances.elementAt(i)[k].add(toBeAdded);
                        }
                    }
                    // remove the instance placed in the test set
                    leftoverInstances.delete(0);
                }
                j++;
                // if j hits the roof, reset it; there may still exist folds that have not reached
                // their upper limit, and they must not be abandoned
                if (j == numberOfFolds)
                    j = 0;
            }
        }
    }
}
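For comparison, stock Weka can produce folds directly with Instances.trainCV()/testCV(). A short sketch (the ARFF file name is hypothetical) that uses numInstances() to verify the per-fold sizes:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class FoldSizeCheck {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("dataset.arff"); // hypothetical file name
        data.setClassIndex(data.numAttributes() - 1);
        int numberOfFolds = 10;
        for (int fold = 0; fold < numberOfFolds; fold++) {
            Instances train = data.trainCV(numberOfFolds, fold);
            Instances test = data.testCV(numberOfFolds, fold);
            // Every instance ends up in exactly one of the two sets for a given fold.
            System.out.println("fold " + fold + ": train=" + train.numInstances()
                    + " test=" + test.numInstances()
                    + " total=" + (train.numInstances() + test.numInstances()));
        }
    }
}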
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
/**
 * Splits a partition (a collection of instances that belong to the same label combination) into train and
 * test sets, leaving leftover instances. It presupposes that partition.numInstances() > numberOfFolds.
 *
 * Leftover instances should be distributed in a way that each test set holds
 *
 * floor(totalNumInstances / numberOfFolds) <= testSetNumInstances <= ceil(totalNumInstances / numberOfFolds)
 */
public static void splitPartitionIntoFolds(Instances partition, int numberOfFolds, int partitionIndex) {

    int numberOfTestInstancesPerFold = partition.numInstances() / numberOfFolds; // e.g. 64 / 10 = 6
    int numberOfLeftoverInstances = partition.numInstances() % numberOfFolds; // e.g. 64 % 10 = 4
    int numberOfTrainInstancesPerFold = partition.numInstances() - numberOfTestInstancesPerFold
            - numberOfLeftoverInstances; // e.g. 64 - 6 - 4 = 54

    Instances[] testArrayPerPartition = new Instances[numberOfFolds];
    Instances[] trainArrayPerPartition = new Instances[numberOfFolds];

    Instances bulk = new Instances(partition, partition.numInstances() - numberOfLeftoverInstances);
    /*
     * E.g. I will split 64 total instances into 6 for testing, 54 for training, and the rest (4) will be
     * leftovers. 6 + 54 = 60 ~ 10.
     * The first 60 instances will be temporarily placed in the bulk set.
     */
    for (int i = 0; i < partition.numInstances() - numberOfLeftoverInstances; i++) {
        bulk.add(partition.instance(i));
    }

    for (int i = 0; i < numberOfFolds; i++) {
        testArrayPerPartition[i] = InstancesUtility.getPartitionSegment(bulk, i, numberOfFolds);
        trainArrayPerPartition[i] = new Instances(bulk, numberOfFolds);
        for (int j = 0; j < numberOfFolds; j++) {
            if (j != i) {
                for (int k = 0; k < numberOfTestInstancesPerFold; k++) {
                    Instance kthInstance = InstancesUtility.getPartitionSegment(bulk, j, numberOfFolds)
                            .instance(k);
                    trainArrayPerPartition[i].add(kthInstance);
                }
            }
        }
    }

    /*
     * In total, there will be partitions.length additions.
     * Place each array in its respective place, depending on the partition index.
     */
    InstancesUtility.testInstances.add(partitionIndex, testArrayPerPartition);
    InstancesUtility.trainInstances.add(partitionIndex, trainArrayPerPartition);
}
From source file:gr.demokritos.iit.cpgislanddetection.CpGIslandDetection.java
License:Apache License
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, ParseException, Exception {

    // String sFileNameArgs = args[0];
    // String[] fileNames = null;

    // Read file
    //IGenomicSequenceFileReader reader = new SequenceListFileReader();

    // String seq = "GCTCTTGACTTTCAGACTTCCTGAAAACAACGTTCTGGTAAGGACAAGGGTT";
    //
    // CpGIslandIdentification iClass = new CpGIslandIdentification();
    // boolean b = iClass.identify(seq);
    // System.out.println("This sequence is a CpG island: " + b);

    // SequenceListFileReader s = new SequenceListFileReader();
    // ArrayList<BaseSequence> alRes = new ArrayList<>();
    //
    // alRes = s.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
    // for (int i = 0; i < alRes.size(); i++)
    //     System.out.println("alRes = " + i + alRes.get(i));

    // VectorAnalyzer vA = new VectorAnalyzer();
    // List<Vector<Integer>> listVector = new ArrayList<>();
    // Vector<Vector<Integer>> list =
    // listVector = vA.analyze(alRes);
    // for (int i = 0; i < listVector.size(); i++)
    //     System.out.println(i + " " + listVector.get(i));

    // IGenomicSequenceFileReader reader = new FASTAFileReader();

    // If no input file has been given
    /*
    if (args.length == 0) { // Use default
        fileNames[0] = "C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt";
        fileNames[1] = "C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt";
        fileNames[2] = "C:\\Users\\Xenia\\Desktop\\files\\newsamples.txt";
    } else { // else use the provided one
        fileNames = sFileNameArgs.split(";");
    }
    */

    //-----------------VECTOR ANALYSIS STARTS HERE--------------------------------------

    // read sequences from txt files
    SequenceListFileReader reader = new SequenceListFileReader();
    ArrayList<BaseSequence> lSeqs1 = new ArrayList<>();
    ArrayList<BaseSequence> lSeqs2 = new ArrayList<>();
    lSeqs1 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
    lSeqs2 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt");

    // create vectors for every sequence
    List<Vector<Integer>> listVectorForPositiveSamples = new ArrayList<>();
    List<Vector<Integer>> listVectorForNegativeSamples = new ArrayList<>();
    VectorAnalyzer v = new VectorAnalyzer();
    listVectorForPositiveSamples = v.analyze(lSeqs1);
    listVectorForNegativeSamples = v.analyze(lSeqs2);

    // create ARFF files for positive and negative samples
    FileCreatorARFF fc = new FileCreatorARFF();
    Instances positiveInstances = fc.createARFF(listVectorForPositiveSamples, "yes");
    Instances negativeInstances = fc.createARFF(listVectorForNegativeSamples, "no");
    //System.out.println(positiveInstances);

    // build and train classifier
    // setting class attribute
    positiveInstances.setClassIndex(positiveInstances.numAttributes() - 1);
    negativeInstances.setClassIndex(negativeInstances.numAttributes() - 1);

    // train NaiveBayes
    NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
    nb.buildClassifier(positiveInstances);
    nb.buildClassifier(negativeInstances);
    Instance current;
    for (int i = 0; i < positiveInstances.numInstances(); i++) {
        current = positiveInstances.instance(i);
        nb.updateClassifier(current);
    }

    // Test the model
    Evaluation eTest = new Evaluation(positiveInstances);
    Instances isTestingSet = fc.createARFF(listVectorForNegativeSamples, "?");
    isTestingSet.setClassIndex(isTestingSet.numAttributes() - 1);
    eTest.evaluateModel(nb, isTestingSet);

    //------------------VECTOR ANALYSIS ENDS HERE---------------------------------------

    //----------------------------HMM CLASSIFIER STARTS HERE----------------------------------
    // Init classifier
    /*
    ISequenceClassifier<List<ObservationDiscrete<HmmSequence.Packet>>> classifier = new HmmClassifier();
    */
    // WARNING: Remember to change when you have normal data!!!
    // Obfuscation in negative training file?
    // final boolean bObfuscateNeg = true;
    // FASTAObfuscatorReader r = new FASTAObfuscatorReader();

    // for each file do the same work: train
    // for (int i = 0; i < 3; i++) {
    // Read the sequences
    // If obfuscation is on and we are dealing with the negative training file
    /*
    if ((i == 2) && (bObfuscateNeg)) {
        //FASTAObfuscatorReader r = new FASTAObfuscatorReader();
        lSeqs = r.getSequencesFromFile(fileNames[i]);
        fileNames[1] = "Not" + fileNames[1]; // Update to indicate different class
    } else {
        // else read normally
        lSeqs = reader.getSequencesFromFile(fileNames[i]);
    }
    System.out.println("lSeqs size=" + lSeqs.size());
    */

    // Create HMM sequences
    /*
    ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
    List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);

    // Train classifier with the observations
    classifier.train(lHmmSeqs, new File(fileNames[i]).getName());
    }

    // Classify the test file
    // First: Read the sequences
    lSeqs = r.getSequencesFromFile(fileNames[2]);
    //System.out.println("file name= " + fileNames[2]);

    // Then: Create HMM sequences
    ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
    List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);
    */
    //-------------------------------HMM CLASSIFIER ENDS HERE-----------------------------------------

    /*
    //----------------------------HMM EVALUATION STARTS-----------------------------------------------
    //System.out.println("size of lHmmSeqs=" + lHmmSeqs.size());
    String str = null;
    String[] savedResults = new String[lHmmSeqs.size()];

    // create a 2x2 array to store successes and failures for each class
    int[][] matrix = new int[2][2];
    int successForCpG = 0, failForCpG = 0, successForNotCpG = 0, failForNotCpG = 0;

    // Init identifier
    // CpGIslandIdentification identifier = new CpGIslandIdentification();
    CpGIslandIdentification identifier = new CpGIslandIdentificationByList("CpG_hg18.fa");

    for (int i = 0; i < lHmmSeqs.size(); i++) {
        // DEBUG
        System.err.print(".");
        if (i % 10 == 0)
            System.err.println();
        ////////
        str = classifier.classify(lHmmSeqs.get(i));
        // System.out.println("i=" + i);
        System.out.println("Determined class:" + str);
        // savedResults[i] = str;
        // call a function that checks whether the sequence satisfies the CpG criteria
        if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[0])) {
            // Success for CpG class
            successForCpG++;
            System.out.println("successForCpG" + successForCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[1])) {
            // fail for CpG class
            failForCpG++;
            System.out.println("failForCpG" + failForCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[1])) {
            //System.out.println(i);
            // Success for Not CpG class
            successForNotCpG++;
            System.out.println("successForNotCpG" + successForNotCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[0])) {
            // fail for Not CpG class
            failForNotCpG++;
            System.out.println("failForNotCpG" + failForNotCpG);
        }
    }

    // Evaluation: calculation of classification rate and accuracy
    double totalAccuracy = (successForNotCpG + successForCpG)
            / (successForCpG + failForCpG + failForNotCpG + successForNotCpG);

    // misclassification rate for CpG class
    double rate1 = (failForCpG + successForCpG) != 0 ? failForCpG / (failForCpG + successForCpG) : 0.0;
    // misclassification rate for Not CpG class
    double rate2 = (failForNotCpG + successForNotCpG) != 0
            ? failForNotCpG / (failForNotCpG + successForNotCpG) : 0.0;
    System.out.println(totalAccuracy + " " + rate1 + " " + rate2);

    NGramGraphClassifier nGramGraphClassifier = new NGramGraphClassifier();
    List<List<DocumentNGramGraph>> representation;
    NGramGraphAnalyzer myAnalyst = new NGramGraphAnalyzer();
    representation = myAnalyst.analyze(lSeqs);
    for (int i = 0; i < representation.size(); i++)
        nGramGraphClassifier.classify(representation.get(i));
    */
}
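The active part of this example follows Weka's standard incremental-learning pattern: build the updateable classifier on a header, then feed instances one by one up to numInstances(). A self-contained sketch of that pattern; the ARFF file names are assumptions:

import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class IncrementalNaiveBayes {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("train.arff"); // hypothetical files
        Instances test = DataSource.read("test.arff");
        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);

        // buildClassifier() only fixes the header; instances are fed one by one afterwards.
        NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
        nb.buildClassifier(new Instances(train, 0));
        for (int i = 0; i < train.numInstances(); i++) {
            nb.updateClassifier(train.instance(i));
        }

        Evaluation eval = new Evaluation(train);
        eval.evaluateModel(nb, test);
        System.out.println(eval.toSummaryString());
    }
}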
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via options.
 *
 * @param data
 *            set of instances serving as training data
 * @throws Exception
 *             if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder)
        m_Assignments = clusterAssignments;

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder)
        initInstances = new Instances(instances);
    else
        initInstances = instances;

    if (m_initializeWithKMeansPlusPlus) {
        kMeansPlusPlusInit(initInstances);
    } else {
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);
            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    startExecutorPool();

    long start = System.currentTimeMillis();
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        System.out.print(new Date() + ": " + "Iter " + m_Iterations + " ");

        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < instances.numInstances(); i++) {
                Instance toCluster = instances.instance(i);
                int newC = clusterProcessedInstance(toCluster, true, true);
                if (newC != clusterAssignments[i]) {
                    converged = false;
                }
                clusterAssignments[i] = newC;
            }
        } else {
            converged = launchAssignToClusters(instances, clusterAssignments);
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < m_NumClusters; i++) {
                if (tempI[i].numInstances() == 0) {
                    // empty cluster
                    emptyClusterCount++;
                } else {
                    moveCentroid(i, tempI[i], true, true);
                }
            }
        } else {
            emptyClusterCount = launchMoveCentroids(tempI);
        }

        if (m_Iterations == m_MaxIterations)
            converged = true;

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index++] = tempI[k];
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
        System.out.println("Sum of within cluster distances: " + Utils.sum(m_squaredErrors));
        // reset errors to zero
        m_squaredErrors = new double[m_NumClusters];
    }
    long end = System.currentTimeMillis();
    System.out.println("\nClustering completed in " + (end - start) + " ms and converged in " + m_Iterations
            + " iterations");

    // calculate errors
    if (!m_FastDistanceCalc) {
        for (i = 0; i < instances.numInstances(); i++) {
            clusterProcessedInstance(instances.instance(i), true, false);
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Utils.missingValue();
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }

    m_executorPool.shutdown();
}
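This buildClusterer() is a modified copy of Weka's SimpleKMeans with extra console output. A minimal sketch of driving the stock SimpleKMeans the same way — the feature file, cluster count, and the choice to preserve instance order are assumptions:

import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class KMeansDemo {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("features.arff"); // hypothetical file
        SimpleKMeans km = new SimpleKMeans();
        km.setNumClusters(8);
        km.setPreserveInstancesOrder(true);
        km.buildClusterer(data); // clusterers expect no class attribute to be set
        // numInstances() bounds the assignment loop, exactly as in the code above.
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println(i + " -> cluster " + km.clusterInstance(data.instance(i)));
        }
        System.out.println("Centroids:\n" + km.getClusterCentroids());
    }
}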
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
protected void kMeansPlusPlusInit(Instances data) throws Exception {
    Random randomO = new Random(getSeed());
    HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>();

    // choose initial center uniformly at random
    int index = randomO.nextInt(data.numInstances());
    m_ClusterCentroids.add(data.instance(index));
    DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true);
    initC.put(hk, null);

    int iteration = 0;
    int remainingInstances = data.numInstances() - 1;
    if (m_NumClusters > 1) {
        // proceed with selecting the rest

        // distances to the initial randomly chosen center
        double[] distances = new double[data.numInstances()];
        double[] cumProbs = new double[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            distances[i] = m_DistanceFunction.distance(data.instance(i), m_ClusterCentroids.instance(iteration));
        }

        // now choose the remaining cluster centers
        for (int i = 1; i < m_NumClusters; i++) {
            // distances converted to probabilities
            double[] weights = new double[data.numInstances()];
            System.arraycopy(distances, 0, weights, 0, distances.length);
            Utils.normalize(weights);

            double sumOfProbs = 0;
            for (int k = 0; k < data.numInstances(); k++) {
                sumOfProbs += weights[k];
                cumProbs[k] = sumOfProbs;
            }
            cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no rounding issues

            // choose a random instance
            double prob = randomO.nextDouble();
            for (int k = 0; k < cumProbs.length; k++) {
                if (prob < cumProbs[k]) {
                    Instance candidateCenter = data.instance(k);
                    hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true);
                    if (!initC.containsKey(hk)) {
                        initC.put(hk, null);
                        m_ClusterCentroids.add(candidateCenter);
                    } else {
                        // we shouldn't get here because any instance that is a duplicate of
                        // an already chosen cluster center should have zero distance (and hence
                        // zero probability of getting chosen) to that center.
                        System.err.println("We shouldn't get here....");
                    }
                    remainingInstances--;
                    break;
                }
            }
            iteration++;

            if (remainingInstances == 0) {
                break;
            }

            // prepare to choose the next cluster center.
            // check distances against the new cluster center to see if it is closer
            for (int k = 0; k < data.numInstances(); k++) {
                if (distances[k] > 0) {
                    double newDist = m_DistanceFunction.distance(data.instance(k),
                            m_ClusterCentroids.instance(iteration));
                    if (newDist < distances[k]) {
                        distances[k] = newDist;
                    }
                }
            }
        }
    }
}
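The core of kMeansPlusPlusInit() is distance-proportional sampling via a cumulative-probability array. A tiny plain-Java illustration of just that sampling step, with made-up distances:

import java.util.Random;

public class WeightedPick {
    // Pick an index with probability proportional to its weight (here, a distance),
    // using the same cumulative-probability trick as kMeansPlusPlusInit above.
    static int pick(double[] distances, Random rnd) {
        double total = 0;
        for (double d : distances) total += d;
        double[] cumProbs = new double[distances.length];
        double sum = 0;
        for (int k = 0; k < distances.length; k++) {
            sum += distances[k] / total;
            cumProbs[k] = sum;
        }
        cumProbs[distances.length - 1] = 1.0; // guard against rounding issues
        double prob = rnd.nextDouble();
        for (int k = 0; k < cumProbs.length; k++) {
            if (prob < cumProbs[k]) return k;
        }
        return cumProbs.length - 1;
    }

    public static void main(String[] args) {
        double[] distances = { 0.0, 2.0, 6.0, 2.0 }; // index 2 is picked about 60% of the time
        System.out.println(pick(distances, new Random(42)));
    }
}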
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
/**
 * Move the centroid to its new coordinates. Generate the centroid coordinates based on its members
 * (objects assigned to the cluster of the centroid) and the distance function being used.
 *
 * @param centroidIndex
 *            index of the centroid whose coordinates will be computed
 * @param members
 *            the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo
 *            if the method is supposed to update the m_Cluster arrays
 * @param addToCentroidInstances
 *            true if the method is to add the computed coordinates to the Instances holding the centroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo,
        boolean addToCentroidInstances) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {
        // in case of Euclidean distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                if (dataIsEven) {
                    vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (addToCentroidInstances) {
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    }
    return vals;
}
From source file:gr.uoc.nlp.opinion.analysis.suggestion.AnalyzeSuggestions.java
/**
 * @param classifier
 * @param unclassified
 * @return
 */
public Instances classify(Classifier classifier, Instances unclassified) {

    unclassified.setClassIndex(unclassified.numAttributes() - 1);

    // new set which will contain the classified instances
    Instances classified = new Instances(unclassified);

    double clsLabel;
    try {
        for (int i = 0; i < unclassified.numInstances(); i++) {
            // for each unclassified instance, classify it
            clsLabel = classifier.classifyInstance(unclassified.instance(i));
            // append the result to the final set
            classified.instance(i).setClassValue(clsLabel);
        }
    } catch (Exception ex) {
        Logger.getLogger(AnalyzeArguments.class.getName()).log(Level.SEVERE, null, ex);
    }
    return classified;
}
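A hedged sketch of how a helper like classify() is typically used: train on a labeled ARFF, then assign class values to an unlabeled copy. The J48 classifier and the file names are assumptions, not taken from AnalyzeSuggestions:

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class LabelUnclassified {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("labeled.arff");       // hypothetical file
        Instances unlabeled = DataSource.read("unlabeled.arff"); // same header, '?' class values
        train.setClassIndex(train.numAttributes() - 1);

        J48 tree = new J48();
        tree.buildClassifier(train);

        // Same pattern as classify() above: iterate up to numInstances() and set each class value.
        unlabeled.setClassIndex(unlabeled.numAttributes() - 1);
        Instances labeled = new Instances(unlabeled);
        for (int i = 0; i < unlabeled.numInstances(); i++) {
            double clsLabel = tree.classifyInstance(unlabeled.instance(i));
            labeled.instance(i).setClassValue(clsLabel);
        }
        System.out.println(labeled);
    }
}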