List of usage examples for weka.core Instances delete
public void delete(int index)
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
/**
 * Splits {@code dataset} into {@code numberOfFolds} cross-validation folds, stratified by
 * partition: the dataset is first divided via {@code InstancesUtility.partitionInstances}
 * (presumably one partition per label/class combination — confirm against that helper),
 * each sufficiently large partition is spread over the folds by
 * {@code splitPartitionIntoFolds}, and the leftover instances (partition size modulo
 * numberOfFolds) are then handed to randomly chosen folds without letting any fold's test
 * set exceed ceil(numInstances / numberOfFolds). Results accumulate in the static
 * {@code testInstances} / {@code trainInstances} vectors (one Instances[numberOfFolds]
 * array per partition).
 *
 * @param lcs           the LCS whose representation drives the partitioning
 * @param dataset       the dataset to split
 * @param numberOfFolds the number of folds to produce
 * @throws Exception propagated from the partitioning helpers
 */
public static void splitDatasetIntoFolds(final AbstractLearningClassifierSystem lcs, final Instances dataset,
        final int numberOfFolds) throws Exception {
    Instances[] partitions = InstancesUtility.partitionInstances(lcs, dataset);
    // One test/train array (indexed by fold) will be stored per partition.
    testInstances.setSize(partitions.length);
    trainInstances.setSize(partitions.length);

    // Per-fold test-set size must end up in [lowerBound, upperBound].
    // (lowerBound is documentation of the invariant; only upperBound is enforced below.)
    int lowerBound = (int) Math.floor((double) dataset.numInstances() / (double) numberOfFolds);
    int upperBound = (int) Math.ceil((double) dataset.numInstances() / (double) numberOfFolds);
    int[] numberOfTestInstancesPerFold = new int[numberOfFolds];

    /*
     * Let X partitions have partitions[i].numInstances() > numberOfFolds. After the calls
     * to splitPartitionIntoFolds() below, testInstances and trainInstances hold X filled
     * arrays; vectorOfPartitionIndices records which partition indices those are.
     */
    Vector<Integer> vectorOfPartitionIndices = new Vector<Integer>();
    for (int i = 0; i < partitions.length; i++) {
        if (partitions[i].numInstances() > numberOfFolds) {
            InstancesUtility.splitPartitionIntoFolds(partitions[i], numberOfFolds, i);
            vectorOfPartitionIndices.add(i);
        } else {
            // Partition too small to stratify: register empty per-fold placeholders.
            // NOTE(review): the header template is taken from partitions[0], not
            // partitions[i] — likely fine because all partitions share the dataset's
            // structure, but verify.
            Instances[] emptyArrayTest = new Instances[numberOfFolds];
            Instances[] emptyArrayTrain = new Instances[numberOfFolds];
            for (int j = 0; j < numberOfFolds; j++) {
                emptyArrayTest[j] = new Instances(partitions[0], partitions[i].numInstances());
                emptyArrayTrain[j] = new Instances(partitions[0], partitions[i].numInstances());
            }
            // placeholders
            InstancesUtility.testInstances.add(i, emptyArrayTest);
            InstancesUtility.trainInstances.add(i, emptyArrayTrain);
        }
    }

    /*
     * At this point all partitions with numInstances > numberOfFolds have been split.
     * What is left is distributing the leftovers: 1st the remainders of the partitions
     * above, and 2nd the partitions that originally had numInstances <= numberOfFolds.
     * First compute the current number of test instances already assigned to each fold.
     */
    for (int i = 0; i < numberOfFolds; i++) {
        int instancesSum = 0;
        for (int j = 0; j < vectorOfPartitionIndices.size(); j++) {
            instancesSum += InstancesUtility.testInstances.elementAt(vectorOfPartitionIndices.elementAt(j))[i]
                    .numInstances();
        }
        // initial number of instances in the test set per fold
        numberOfTestInstancesPerFold[i] = instancesSum;
    }

    for (int i = 0; i < partitions.length; i++) {
        int numberOfLeftoverInstances = partitions[i].numInstances() % numberOfFolds; // e.g. 64 % 10 = 4
        Instances leftoverInstances = new Instances(partitions[i], numberOfLeftoverInstances);
        if (numberOfLeftoverInstances > 0) {
            /*
             * Collect the leftovers starting from the end: they are the last
             * {numberOfLeftoverInstances} instances of each partition that
             * splitPartitionIntoFolds() was called on.
             */
            for (int k = partitions[i].numInstances() - 1; k >= partitions[i].numInstances()
                    - numberOfLeftoverInstances; k--) {
                leftoverInstances.add(partitions[i].instance(k));
            }

            /*
             * Randomize the fold order per partition. Leftovers land in the first
             * {numberOfLeftoverInstances} folds of this shuffled order; without the
             * shuffle the early folds would systematically receive the leftovers of
             * every partition, skewing the distribution.
             */
            ArrayList<Integer> folds = new ArrayList<Integer>();
            for (int k = 0; k < numberOfFolds; k++) {
                folds.add(k);
            }
            Collections.shuffle(folds);

            int j = 0;
            while (leftoverInstances.numInstances() > 0) {
                int foldIndex = folds.get(j);
                // Only folds that have not yet hit the upper bound may take an instance.
                if (numberOfTestInstancesPerFold[foldIndex] < upperBound) {
                    Instance toBeAdded = leftoverInstances.instance(0);
                    // Place the first leftover instance in this fold's test set.
                    testInstances.elementAt(i)[foldIndex].add(toBeAdded);
                    numberOfTestInstancesPerFold[foldIndex]++;
                    // An instance in fold foldIndex's test set belongs to the train set
                    // of every other fold.
                    for (int k = 0; k < numberOfFolds; k++) {
                        if (k != foldIndex) {
                            trainInstances.elementAt(i)[k].add(toBeAdded);
                        }
                    }
                    // Remove the instance now that it has been placed.
                    leftoverInstances.delete(0);
                }
                j++;
                // Wrap around: some folds may still be below the upper bound even after
                // a full pass, so keep cycling until all leftovers are placed.
                if (j == numberOfFolds)
                    j = 0;
            }
        }
    }
}
From source file:j48.GraftSplit.java
License:Open Source License
/** * deletes the cases in data that belong to leaf pointed to by * the test (i.e. the subset of interest). this is useful so * the instances belonging to that leaf aren't passed down the * other branch./*from w w w . j a v a 2 s . com*/ * * @param data the instances to delete from */ public void deleteGraftedCases(Instances data) { int subOfInterest = subsetOfInterest(); for (int x = 0; x < data.numInstances(); x++) { if (whichSubset(data.instance(x)) == subOfInterest) { data.delete(x--); } } }
From source file:machinelearningproject.RFTree.java
@Override public Tree buildTree(Instances instances) throws Exception { Tree tree = new Tree(); ArrayList<String> availableAttributes = new ArrayList(); int largestInfoGainAttrIdx = -1; double largestInfoGainAttrValue = 0.0; //choose random fraction int numAttr = instances.numAttributes(); int k = (int) round(sqrt(numAttr)); ArrayList<Integer> randomIdx = randomFraction(numAttr); for (int idx = 0; idx < k; idx++) { if (idx != instances.classIndex()) { availableAttributes.add(instances.attribute(idx).name()); }/*from w ww. j a v a2 s .c o m*/ } if (instances.numInstances() == 0) { return null; } else if (calculateClassEntropy(instances) == 0.0) { // all examples have the sama classification tree.attributeName = instances.get(0).stringValue(instances.classIndex()); } else if (availableAttributes.isEmpty()) { // mode classification tree.attributeName = getModeClass(instances, instances.classIndex()); } else { for (int idx = 0; idx < instances.numAttributes(); idx++) { if (idx != instances.classIndex()) { double attrInfoGain = calculateInformationGain(instances, idx, instances.classIndex()); if (largestInfoGainAttrValue < attrInfoGain) { largestInfoGainAttrIdx = idx; largestInfoGainAttrValue = attrInfoGain; } } } if (largestInfoGainAttrIdx != -1) { tree.attributeName = instances.attribute(largestInfoGainAttrIdx).name(); ArrayList<String> attrValues = new ArrayList(); for (int i = 0; i < instances.numInstances(); i++) { Instance instance = instances.get(i); String attrValue = instance.stringValue(largestInfoGainAttrIdx); if (attrValues.isEmpty() || !attrValues.contains(attrValue)) { attrValues.add(attrValue); } } for (String attrValue : attrValues) { Node node = new Node(attrValue); Instances copyInstances = new Instances(instances); copyInstances.setClassIndex(instances.classIndex()); int i = 0; while (i < copyInstances.numInstances()) { Instance instance = copyInstances.get(i); // reducing examples if (!instance.stringValue(largestInfoGainAttrIdx).equals(attrValue)) { 
copyInstances.delete(i); i--; } i++; } copyInstances.deleteAttributeAt(largestInfoGainAttrIdx); node.subTree = buildTree(copyInstances); tree.nodes.add(node); } } } return tree; }
From source file:machinelearningproject.Tree.java
public Tree buildTree(Instances instances) throws Exception { Tree tree = new Tree(); ArrayList<String> availableAttributes = new ArrayList(); int largestInfoGainAttrIdx = -1; double largestInfoGainAttrValue = 0.0; for (int idx = 0; idx < instances.numAttributes(); idx++) { if (idx != instances.classIndex()) { availableAttributes.add(instances.attribute(idx).name()); }/* w w w . ja v a 2 s . com*/ } if (instances.numInstances() == 0) { return null; } else if (calculateClassEntropy(instances) == 0.0) { // all examples have the sama classification tree.attributeName = instances.get(0).stringValue(instances.classIndex()); } else if (availableAttributes.isEmpty()) { // mode classification tree.attributeName = getModeClass(instances, instances.classIndex()); } else { for (int idx = 0; idx < instances.numAttributes(); idx++) { if (idx != instances.classIndex()) { double attrInfoGain = calculateInformationGain(instances, idx, instances.classIndex()); if (largestInfoGainAttrValue < attrInfoGain) { largestInfoGainAttrIdx = idx; largestInfoGainAttrValue = attrInfoGain; } } } if (largestInfoGainAttrIdx != -1) { tree.attributeName = instances.attribute(largestInfoGainAttrIdx).name(); ArrayList<String> attrValues = new ArrayList(); for (int i = 0; i < instances.numInstances(); i++) { Instance instance = instances.get(i); String attrValue = instance.stringValue(largestInfoGainAttrIdx); if (attrValues.isEmpty() || !attrValues.contains(attrValue)) { attrValues.add(attrValue); } } for (String attrValue : attrValues) { Node node = new Node(attrValue); Instances copyInstances = new Instances(instances); copyInstances.setClassIndex(instances.classIndex()); int i = 0; while (i < copyInstances.numInstances()) { Instance instance = copyInstances.get(i); // reducing examples if (!instance.stringValue(largestInfoGainAttrIdx).equals(attrValue)) { copyInstances.delete(i); i--; } i++; } copyInstances.deleteAttributeAt(largestInfoGainAttrIdx); node.subTree = buildTree(copyInstances); 
tree.nodes.add(node); } } } return tree; }
From source file:meka.core.SuperLabelUtils.java
License:Open Source License
/** * Super Label Transformation - transform dataset D into a dataset with <code>k</code> multi-class target attributes. * Use the NSR/PS-style pruning and recomposition, according to partition 'indices', and pruning values 'p' and 'n'. * @see PSUtils.PSTransformation/*from w w w . j a v a 2 s. c om*/ * @param indices m by k: m super variables, each relating to k original variables * @param D either multi-label or multi-target dataset * @param p pruning value * @param n subset relpacement value * @return a multi-target dataset */ public static Instances SLTransformation(Instances D, int indices[][], int p, int n) { int L = D.classIndex(); int K = indices.length; ArrayList<String> values[] = new ArrayList[K]; HashMap<String, Integer> counts[] = new HashMap[K]; // create D_ Instances D_ = new Instances(D); // clear D_ // F.removeLabels(D_,L); for (int j = 0; j < L; j++) { D_.deleteAttributeAt(0); } // create atts for (int j = 0; j < K; j++) { int att[] = indices[j]; //int values[] = new int[2]; //getValues(indices,D,p); counts[j] = getCounts(D, att, p); Set<String> vals = counts[j].keySet(); //getValues(D,att,p); values[j] = new ArrayList(vals); D_.insertAttributeAt(new Attribute(encodeClass(att), new ArrayList(vals)), j); } // copy over values ArrayList<Integer> deleteList = new ArrayList<Integer>(); for (int i = 0; i < D.numInstances(); i++) { Instance x = D.instance(i); for (int j = 0; j < K; j++) { String y = encodeValue(x, indices[j]); try { D_.instance(i).setValue(j, y); // y = } catch (Exception e) { // value not allowed deleteList.add(i); // mark it for deletion String y_close[] = getTopNSubsets(y, counts[j], n); // get N subsets for (int m = 0; m < y_close.length; m++) { //System.out.println("add "+y_close[m]+" "+counts[j]); Instance x_copy = (Instance) D_.instance(i).copy(); x_copy.setValue(j, y_close[m]); x_copy.setWeight(1.0 / y_close.length); D_.add(x_copy); } } } } // clean up Collections.sort(deleteList, Collections.reverseOrder()); 
//System.out.println("Deleting "+deleteList.size()+" defunct instances."); for (int i : deleteList) { D_.delete(i); } // set class D_.setClassIndex(K); // done! return D_; }
From source file:meka.filters.multilabel.SuperNodeFilter.java
License:Open Source License
/** * Merge Labels - Make a new 'D', with labels made into superlabels, according to partition 'indices', and pruning values 'p' and 'n'. * @param D assume attributes in D labeled by original index * @return Instances with attributes at j and k moved to position L as (j,k), with classIndex = L-1 *//*from w ww . j av a 2 s . co m*/ public static Instances mergeLabels(Instances D, int indices[][], int p, int n) { int L = D.classIndex(); int K = indices.length; ArrayList<String> values[] = new ArrayList[K]; HashMap<String, Integer> counts[] = new HashMap[K]; // create D_ Instances D_ = new Instances(D); // clear D_ for (int j = 0; j < L; j++) { D_.deleteAttributeAt(0); } // create atts for (int j = 0; j < K; j++) { int att[] = indices[j]; //int values[] = new int[2]; //getValues(indices,D,p); counts[j] = getCounts(D, att, p); Set<String> vals = counts[j].keySet(); //getValues(D,att,p); values[j] = new ArrayList(vals); D_.insertAttributeAt(new Attribute(encodeClass(att), new ArrayList(vals)), j); } // copy over values ArrayList<Integer> deleteList = new ArrayList<Integer>(); for (int i = 0; i < D.numInstances(); i++) { Instance x = D.instance(i); for (int j = 0; j < K; j++) { String y = encodeValue(x, indices[j]); try { D_.instance(i).setValue(j, y); // y = } catch (Exception e) { // value not allowed deleteList.add(i); // mark it for deletion String y_close[] = NSR.getTopNSubsets(y, counts[j], n); // get N subsets for (int m = 0; m < y_close.length; m++) { //System.out.println("add "+y_close[m]+" "+counts[j]); Instance x_copy = (Instance) D_.instance(i).copy(); x_copy.setValue(j, y_close[m]); x_copy.setWeight(1.0 / y_close.length); D_.add(x_copy); } } } } // clean up Collections.sort(deleteList, Collections.reverseOrder()); //System.out.println("Deleting "+deleteList.size()+" defunct instances."); for (int i : deleteList) { D_.delete(i); } // set class D_.setClassIndex(K); // done! D = null; return D_; }
From source file:moa.tud.ke.patching.InstanceStore.java
public void cleanBatch(int index, int size) { Instances inst = getBatch(index); System.out.println("Size Batch: " + inst.size()); while (inst.size() > size) { inst.delete(0); }/* ww w . j a va2 s. c o m*/ System.out.println("Size Batch: " + inst.size()); }
From source file:mulan.classifier.meta.HMC.java
License:Open Source License
private void buildRec(HMCNode node, Instances data) throws InvalidDataFormatException, Exception { String metaLabel = node.getName(); //debug("Preparing node data"); Set<String> childrenLabels = new HashSet<String>(); Set<String> currentlyAvailableLabels = new HashSet<String>(); if (metaLabel.equals("root")) { for (LabelNode child : originalMetaData.getRootLabels()) { childrenLabels.add(child.getName()); }//w w w . ja v a 2 s .c om currentlyAvailableLabels = originalMetaData.getLabelNames(); } else { LabelNode labelNode = originalMetaData.getLabelNode(metaLabel); for (LabelNode child : labelNode.getChildren()) { childrenLabels.add(child.getName()); } currentlyAvailableLabels = labelNode.getDescendantLabels(); } // delete non-children labels Set<String> labelsToDelete = new HashSet(currentlyAvailableLabels); labelsToDelete.removeAll(childrenLabels); //===================================================== // System.out.println("Children: " + Arrays.toString(childrenLabels.toArray())); // System.out.println("Labels to delete:" + Arrays.toString(labelsToDelete.toArray())); //====================================================== int[] indicesToDelete = new int[labelsToDelete.size()]; int counter1 = 0; for (String label : labelsToDelete) { indicesToDelete[counter1] = data.attribute(label).index(); counter1++; } Remove filter1 = new Remove(); filter1.setAttributeIndicesArray(indicesToDelete); filter1.setInputFormat(data); Instances nodeInstances = Filter.useFilter(data, filter1); // System.out.println() // create meta data LabelsMetaDataImpl nodeMetaData = new LabelsMetaDataImpl(); for (String label : childrenLabels) { nodeMetaData.addRootNode(new LabelNodeImpl(label)); } // create multi-label instance MultiLabelInstances nodeData = new MultiLabelInstances(nodeInstances, nodeMetaData); //================================================== // System.out.println("Building model"); //============================================ node.build(nodeData); 
//============================================ System.out.println("spark #instances:" + nodeInstances.numInstances()); //============================================ TotalUsedTrainInsts += nodeInstances.numInstances(); NoNodes++; //============================================ // System.out.println("spark:#nodes: "+ root); //============================================ for (String childLabel : childrenLabels) { LabelNode childNode = originalMetaData.getLabelNode(childLabel); if (!childNode.hasChildren()) { continue; } //================================= // System.out.println("Preparing child data"); //============================================ // remove instances where child is 0 int childMetaLabelIndex = data.attribute(childLabel).index(); Instances childData = new Instances(data); for (int i = 0; i < childData.numInstances(); i++) { if (childData.instance(i).stringValue(childMetaLabelIndex).equals("0")) { childData.delete(i); // While deleting an instance from the trainSet, i must reduced too i--; } } // delete non-descendant labels Set<String> descendantLabels = childNode.getDescendantLabels(); Set<String> labelsToDelete2 = new HashSet(currentlyAvailableLabels); labelsToDelete2.removeAll(descendantLabels); //System.out.println("Labels to delete:" + Arrays.toString(labelsToDelete2.toArray())); int[] indicesToDelete2 = new int[labelsToDelete2.size()]; int counter2 = 0; for (String label : labelsToDelete2) { indicesToDelete2[counter2] = childData.attribute(label).index(); counter2++; } Remove filter2 = new Remove(); filter2.setAttributeIndicesArray(indicesToDelete2); filter2.setInputFormat(childData); childData = Filter.useFilter(childData, filter2); MultiLabelLearner mll = baseLearner.makeCopy(); HMCNode child = new HMCNode(childLabel, mll); node.addChild(child); buildRec(child, childData); } }
From source file:mulan.classifier.meta.HMC.java
License:Open Source License
/** * Deletes the unnecessary instances, the instances that have value 0 on * given attribute.//from w w w . ja v a 2 s. c om * * @param trainSet the trainSet on which the deletion will be applied * @param attrIndex the index of the attribute that the deletion is based */ protected void deleteInstances(Instances trainSet, int attrIndex) { for (int i = 0; i < trainSet.numInstances(); i++) { if (trainSet.instance(i).stringValue(attrIndex).equals("0")) { trainSet.delete(i); // While deleting an instance from the trainSet, i must reduced too i--; } } }
From source file:mulan.data.IterativeStratification.java
License:Open Source License
/**
 * Distributes the working set over {@code splitRatio.length} folds using iterative
 * stratification: labels are processed from rarest to most frequent, and each
 * instance annotated with the current label goes to the fold that most "desires"
 * it (highest remaining desired frequency for that label, then highest remaining
 * desired instance count, ties broken randomly). Instances with no label at all
 * are distributed last to balance the desired fold sizes.
 *
 * @param workingSet             the instances to distribute (consumed during the process)
 * @param random                 source of randomness for tie-breaking
 * @param splitRatio             desired fraction of instances per fold; its length defines the fold count
 * @param numLabels              number of labels in the dataset
 * @param labelIndices           attribute indices of the labels
 * @param totalNumberOfInstances total instances in the original dataset
 * @return one Instances object per fold
 */
private Instances[] foldsCreation(Instances workingSet, Random random, double[] splitRatio, int numLabels,
        int[] labelIndices, int totalNumberOfInstances) {
    int numFolds = splitRatio.length;
    // The instances on the final folds.
    Instances[] instancesOnSplits = new Instances[numFolds];
    // Initialize the folds (empty, same header as the working set).
    for (int fold = 0; fold < numFolds; fold++) {
        instancesOnSplits[fold] = new Instances(workingSet, 0);
    }

    // ***** First part of the algorithm (LINES 1-9 of the paper's pseudocode) *****
    // LINE 7: frequency = number of examples per label in the current data set.
    int[] frequenciesOnDataset = new int[numLabels];
    frequenciesOnDataset = calculatingTheFrequencies(workingSet, numLabels, labelIndices);

    // LINES 2-3 and 8-9: desired per-fold counts. Columns 0..numLabels-1 hold the
    // desired frequency per label; the LAST column holds the desired number of
    // instances for the fold. These values are decremented every time an instance
    // is placed into a fold.
    double[][] desiredSplit = new double[numFolds][numLabels + 1];
    desiredSplit = calculatingTheDesiredSplits(frequenciesOnDataset, splitRatio, numLabels, totalNumberOfInstances);

    // ***** Second part of the algorithm (LINES 10-34) *****
    // LINES 11-14: the rarest label — [0] holds its index, [1] its example count.
    int[] smallestFreqLabel = new int[2];
    smallestFreqLabel = takingTheSmallestIndexAndNumberInVector(frequenciesOnDataset, totalNumberOfInstances);

    // Fold into which the current instance will be inserted.
    int splitToBeInserted;
    // Instances annotated with the current (rarest) label.
    Instances filteredInstancesForLabel;
    Instance filteredInstance;
    boolean[] trueLabels = new boolean[numLabels];

    for (int lab = 0; lab < numLabels; lab++) {
        // Extract the instances annotated with label smallestFreqLabel[0];
        // temp[0] = those instances (LINE 13), temp[1] = the remaining working set.
        Instances[] temp = new Instances[2];
        temp = takeTheInstancesOfTheLabel(workingSet, numLabels, labelIndices, smallestFreqLabel);
        filteredInstancesForLabel = temp[0];
        workingSet = temp[1];

        // possibleSplits[0] = number of suitable folds; the rest are their indices.
        int[] possibleSplits;

        // Distribute the filtered instances: first priority is the fold with the
        // highest desired frequency for this label, second is the fold with the
        // highest desired instance count; remaining ties are broken randomly.
        for (int instancesOfTheLab = 0; instancesOfTheLab < filteredInstancesForLabel
                .numInstances(); instancesOfTheLab++) {
            filteredInstance = filteredInstancesForLabel.instance(instancesOfTheLab);
            trueLabels = getTrueLabels(filteredInstance, numLabels, labelIndices);

            // LINES 20-27: candidate folds according to the two priorities above.
            possibleSplits = findThePossibleSpit(desiredSplit, smallestFreqLabel[0], numFolds);

            // Pick the fold; break ties randomly when more than one is possible.
            if (possibleSplits[0] != 1) {
                splitToBeInserted = possibleSplits[random.nextInt(possibleSplits[0]) + 1];
            } else {
                splitToBeInserted = possibleSplits[1];
            }

            // LINE 28: place the instance in the chosen fold.
            instancesOnSplits[splitToBeInserted].add(filteredInstance);

            // LINES 30-32: update the fold's remaining desired statistics.
            desiredSplit[splitToBeInserted] = updateDesiredSplitStatistics(desiredSplit[splitToBeInserted],
                    trueLabels);
        }

        // Recompute frequencies and the rarest label for the next iteration.
        frequenciesOnDataset = calculatingTheFrequencies(workingSet, numLabels, labelIndices);
        smallestFreqLabel = takingTheSmallestIndexAndNumberInVector(frequenciesOnDataset, totalNumberOfInstances);
    }

    // Special case: examples annotated with no label at all (e.g. the mediamill
    // data set). They are distributed so as to balance the desired number of
    // examples at each fold.
    Instance noAnnotatedInstances;
    int[] possibleSplitsNoAnnotated = new int[numFolds];
    while (workingSet.numInstances() != 0) {
        possibleSplitsNoAnnotated = returnPossibleSplitsForNotAnnotated(desiredSplit);
        noAnnotatedInstances = workingSet.instance(0);
        if (possibleSplitsNoAnnotated[0] != 1) {
            splitToBeInserted = possibleSplitsNoAnnotated[random.nextInt(possibleSplitsNoAnnotated[0]) + 1];
        } else {
            splitToBeInserted = possibleSplitsNoAnnotated[1];
        }
        // Place the instance in the chosen fold.
        instancesOnSplits[splitToBeInserted].add(noAnnotatedInstances);
        // Decrement the fold's remaining desired instance count (last column).
        desiredSplit[splitToBeInserted][desiredSplit[splitToBeInserted].length
                - 1] = desiredSplit[splitToBeInserted][desiredSplit[splitToBeInserted].length - 1] - 1;
        // Remove the instance from the working set.
        workingSet.delete(0);
    }
    return instancesOnSplits;
}