List of usage examples for weka.core.Instances.delete()
public void delete()
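All of the examples below follow the same basic idiom: copy an existing dataset so that its header (attribute definitions and class index) is preserved, then call delete() to remove every instance, leaving an empty dataset that new instances can be added to. The following minimal sketch illustrates that idiom on its own; the file name "data.arff" and the class-value filter are placeholders for this illustration, not taken from the examples below.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class EmptyCopyExample {
    public static void main(String[] args) throws Exception {
        // Load any dataset; "data.arff" is a placeholder path.
        Instances data = DataSource.read("data.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // Copy the dataset, then delete() all instances: the copy keeps the
        // attribute definitions (header) but contains no rows.
        Instances emptyCopy = new Instances(data);
        emptyCopy.delete();

        // Selectively add instances back, e.g. only those with class value 0.0.
        for (int i = 0; i < data.numInstances(); i++) {
            if (data.instance(i).classValue() == 0.0) {
                emptyCopy.add(data.instance(i));
            }
        }

        System.out.println("Original: " + data.numInstances()
                + " instances, filtered copy: " + emptyCopy.numInstances());
    }
}

Note that the two-argument constructor new Instances(data, 0) produces the same header-only copy directly, without first copying and then deleting the instances.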
From source file:de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>();

    // Sort dataset (StateID is connected to the date of commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the
     * hash map (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is already in the hash map
     * (this means it did not have a bug before): if so, add it to the new dataset and remove
     * it from the hash map, so that new changes from "nonBug" -> "bug" for this file can be
     * found.
     *
     * If the instance has a bug and is not in the hash map (this means the file has a bug at
     * its first occurrence, or this file only has bugs and no instance without a bug), then
     * (if it is not in the array list above) add it to the new dataset. This way it is
     * possible to get the first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        } else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        } else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        } else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) {
            if (!firstOccurenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurenceArtifactNames.add(artifactName);
            }
        }
    }

    // If we have a file that never had a bug (i.e. it is NOT in the newly created dataset,
    // but it is in the hash map from above), add it to the new dataset
    double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesinNewDataSet.length; i++) {
            if (artifactNamesinNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    final double probDefect = data.numInstances() / (double) counts[1];

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                } else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0)
                        / (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            powerEntity[i] *= powerAttributes[i][j];
        }
    }

    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file:entity.DifficultyResamplingManager.java
License:Open Source License
/**
 * Called by generateResampledSubdataset.
 *
 * @param originalDataset
 * @param subdatasetDimensions
 * @return
 */
private Instances generateResampledSubdataset(Instances originalDataset,
        SubdatasetDimensions subdatasetDimensions) {

    // creates an empty dataset
    Instances resampledSubdataset = new Instances(originalDataset);
    resampledSubdataset.delete();

    // randomize dataset instances order
    originalDataset.randomize(RandomizationManager.randomGenerator);

    // calc number of positives to insert
    int positivesToInsert = subdatasetDimensions.getP();
    if (verbose)
        System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] positivesToInsert = "
                + positivesToInsert);

    // calc number of negatives to insert
    int negativesToInsert = subdatasetDimensions.getN();

    // iterates over the original dataset instances
    for (int i = 0; i < originalDataset.numInstances(); i++) {
        // if instance is positive and more are needed in the new dataset, inserts into new dataset
        if ((positivesToInsert > 0) && (originalDataset.instance(i).stringValue(originalDataset.classIndex())
                .equals(Settings.buggyLabel))) {
            resampledSubdataset.add(originalDataset.instance(i));
            positivesToInsert--;
        }

        // if instance is negative and more are needed in the new dataset, inserts into new dataset
        if ((negativesToInsert > 0) && (originalDataset.instance(i).stringValue(originalDataset.classIndex())
                .equals(Settings.nonbuggyLabel))) {
            resampledSubdataset.add(originalDataset.instance(i));
            negativesToInsert--;
        }
    }

    if (verbose)
        System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] resampling finished: "
                + this.printDatasetInfo(resampledSubdataset));

    return resampledSubdataset;
}
From source file:ergasia2pkg.ML_RUS.java
/**
 * Method to perform undersampling on the initial dataset. The method
 * removes instances from the dataset according to the algorithm proposed in
 * the paper, utilising the Mean Imbalance Ratio measure.
 *
 * @param mlData MultiLabelInstances object, holds a set of multilabel instances
 * @return MultiLabelInstances object containing the initial instances minus
 *         the instances removed by undersampling
 * @throws Exception
 */
@Override
public MultiLabelInstances transformInstances(MultiLabelInstances mlData) throws Exception {

    // Initialise the label counters
    labelCount(mlData);

    // Clone the dataset into a new object
    MultiLabelInstances mlDataClone = mlData.clone();

    // Clone a new set to contain all the instances that will be returned
    Instances mlDataReturned = mlData.clone().getDataSet();
    mlDataReturned.delete();

    // Calculate the number of samples to remove
    int samplesToDelete = (int) (mlData.getNumInstances() / (100 * P));
    int remainingLabels;

    // Declare two lists of lists, a minority bag and a majority bag. The minBags
    // will contain lists (bags) of instances having labels with an imbalance ratio
    // higher than the mean imbalance ratio. These will be set aside and not tampered
    // with in any way. The majBags will contain lists of instances having labels with
    // an imbalance ratio lower than or equal to the mean imbalance ratio. These
    // instances will be the candidates for deletion.
    List<List<Instance>> minBags = new ArrayList<>();
    List<List<Instance>> majBags = new ArrayList<>();

    // Get an array with the indices of all the labels
    int L[] = mlDataClone.getLabelIndices();

    // Calculate the dataset's mean imbalance ratio
    double meanIR = meanImbalanceRatio(mlDataClone);

    String labelName;
    int i = 0, m = 0, x, labelCounter = 0;

    // Declare a boolean array which follows the labelset L and determines
    // whether or not a label's instances should be considered for undersampling.
    // Initialise all its values to true.
    boolean included[] = new boolean[L.length];
    for (int k = 0; k < L.length; k++) {
        included[k] = true;
    }

    Random rand = new Random();

    // Perform the following operation for each label.
    // Note that labels are represented by their integer index, which is then
    // transformed to its string name. This was done to avoid problems and
    // exceptions thrown by methods required below.
    for (int label : L) {
        // Get the label name from the current instance, based on the label index
        labelName = mlDataClone.getDataSet().attribute(label).name();

        if (imbalanceRatioPerLabel(mlDataClone, labelName) > meanIR) {
            // If the imbalance ratio of the label is greater than the mean
            // imbalance ratio of the dataset, add a minBag corresponding to
            // the specific label.
            minBags.add(new ArrayList<Instance>());

            // Add all instances containing this label to the minBag we just created
            for (int l = 0; l < mlDataClone.getNumInstances(); l++) {
                if (mlDataClone.getDataSet().get(l).value(label) == 1.0) {
                    minBags.get(i).add(mlDataClone.getDataSet().get(l));
                    // Remove the instance from the cloned dataset
                    mlDataClone.getDataSet().delete(l);
                }
            }

            // Set the included flag to false, so that the label is not added to the majBags
            included[labelCounter] = false;
            i++;
        }
        labelCounter++;
    }

    // For every label again
    for (int label : L) {
        // Add a new majBag (one for each label)
        majBags.add(new ArrayList<Instance>());

        // Add all the instances having this label to the majBag. Note that
        // this operation takes place on the cloned dataset, which now contains
        // only the instances not having minority labels
        for (int l = 0; l < mlDataClone.getNumInstances(); l++) {
            if (mlDataClone.getDataSet().get(l).value(label) == 1.0) {
                majBags.get(m).add(mlDataClone.getDataSet().get(l));
            }
        }
        m++;
    }

    remainingLabels = L.length - minBags.size();

    // While we haven't deleted all the samples yet and we still have labels to delete
    while (samplesToDelete > 0 && remainingLabels > 0) {
        // For each of the INITIAL labels (not only the ones in the cloned dataset)
        for (int j = 0; j < mlData.getNumLabels(); j++) {
            if (included[j]) {
                // If it is to be included (meaning it is a majority label), check
                // whether this bag contains instances. If it doesn't, decrease the
                // numbers and go to the next iteration.
                if (majBags.get(j).size() == 0) {
                    included[j] = false;
                    remainingLabels--;
                    continue;
                }

                // Get a random instance from the bag
                x = rand.nextInt(majBags.get(j).size());

                // Based on the instance and the index, get its label
                labelName = majBags.get(j).get(x).attribute(L[j]).name();

                // Remove the instance from the bag
                majBags.get(j).remove(x);

                // If the imbalance ratio of the label has increased beyond the
                // acceptable limit of the mean imbalance ratio, remove this
                // majBag from future candidates
                if (imbalanceRatioPerLabel(mlDataClone, labelName) >= meanIR) {
                    included[j] = false;
                    remainingLabels--;
                }
                samplesToDelete--;
            }
        }
    }

    // Add the contents of the minBags and the majBags to an empty dataset and return it
    for (List<Instance> list : minBags) {
        for (Instance inst : list) {
            mlDataReturned.add(inst);
        }
    }
    for (List<Instance> list : majBags) {
        for (Instance inst : list) {
            mlDataReturned.add(inst);
        }
    }

    return new MultiLabelInstances(mlDataReturned, mlData.getLabelsMetaData());
}
From source file:es.upm.dit.gsi.barmas.dataset.utils.DatasetSplitter.java
License:Open Source License
/**
 * @param folds
 * @param minAgents
 * @param maxAgents
 * @param originalDatasetPath
 * @param outputDir
 * @param scenario
 * @param logger
 */
public void splitDataset(int folds, int minAgents, int maxAgents, String originalDatasetPath,
        String outputDir, String scenario, Logger logger) {

    int ratioint = (int) ((1 / (double) folds) * 100);
    double roundedratio = ((double) ratioint) / 100;

    // Look for essentials
    List<String[]> essentials = this.getEssentials(originalDatasetPath, logger);

    for (int fold = 0; fold < folds; fold++) {
        String outputDirWithRatio = outputDir + "/" + roundedratio + "testRatio/iteration-" + fold;
        File dir = new File(outputDirWithRatio);
        if (!dir.exists() || !dir.isDirectory()) {
            dir.mkdirs();
        }

        logger.finer("--> splitDataset()");
        logger.fine("Creating experiment.info...");

        try {
            Instances originalData = this.getDataFromCSV(originalDatasetPath);

            originalData.randomize(new Random());
            originalData.stratify(folds);

            // TestDataSet
            Instances testData = originalData.testCV(folds, fold);
            CSVSaver saver = new CSVSaver();
            ArffSaver arffsaver = new ArffSaver();
            File file = new File(outputDirWithRatio + File.separator + "test-dataset.csv");
            if (!file.exists()) {
                saver.resetOptions();
                saver.setInstances(testData);
                saver.setFile(file);
                saver.writeBatch();
            }

            file = new File(outputDirWithRatio + File.separator + "test-dataset.arff");
            if (!file.exists()) {
                arffsaver.resetOptions();
                arffsaver.setInstances(testData);
                arffsaver.setFile(file);
                arffsaver.writeBatch();
            }

            // BayesCentralDataset
            Instances trainData = originalData.trainCV(folds, fold);
            file = new File(outputDirWithRatio + File.separator + "bayes-central-dataset.csv");
            if (!file.exists()) {
                saver.resetOptions();
                saver.setInstances(trainData);
                saver.setFile(file);
                saver.writeBatch();
                this.copyFileUsingApacheCommonsIO(file,
                        new File(outputDirWithRatio + File.separator + "bayes-central-dataset-noEssentials.csv"),
                        logger);
                CsvWriter w = new CsvWriter(new FileWriter(file, true), ',');
                for (String[] essential : essentials) {
                    w.writeRecord(essential);
                }
                w.close();
            }

            file = new File(outputDirWithRatio + File.separator + "bayes-central-dataset.arff");
            if (!file.exists()) {
                arffsaver.resetOptions();
                arffsaver.setInstances(trainData);
                arffsaver.setFile(file);
                arffsaver.writeBatch();
                this.copyFileUsingApacheCommonsIO(file,
                        new File(outputDirWithRatio + File.separator + "bayes-central-dataset-noEssentials.arff"),
                        logger);
                CsvWriter w = new CsvWriter(new FileWriter(file, true), ',');
                for (String[] essential : essentials) {
                    w.writeRecord(essential);
                }
                w.close();
            }

            // Agent datasets
            CsvReader csvreader = new CsvReader(new FileReader(new File(originalDatasetPath)));
            csvreader.readHeaders();
            String[] headers = csvreader.getHeaders();
            csvreader.close();

            for (int agents = minAgents; agents <= maxAgents; agents++) {
                this.createExperimentInfoFile(folds, agents, originalDatasetPath, outputDirWithRatio,
                        scenario, logger);
                HashMap<String, CsvWriter> writers = new HashMap<String, CsvWriter>();
                String agentsDatasetsDir = outputDirWithRatio + File.separator + agents + "agents";
                HashMap<String, CsvWriter> arffWriters = new HashMap<String, CsvWriter>();
                File f = new File(agentsDatasetsDir);
                if (!f.isDirectory()) {
                    f.mkdirs();
                }

                Instances copy = new Instances(trainData);
                copy.delete();

                for (int i = 0; i < agents; i++) {
                    String fileName = agentsDatasetsDir + File.separator + "agent-" + i + "-dataset.csv";
                    file = new File(fileName);
                    if (!file.exists()) {
                        CsvWriter writer = new CsvWriter(new FileWriter(fileName), ',');
                        writer.writeRecord(headers);
                        writers.put("AGENT" + i, writer);
                    }
                    fileName = agentsDatasetsDir + File.separator + "agent-" + i + "-dataset.arff";
                    file = new File(fileName);
                    if (!file.exists()) {
                        arffsaver.resetOptions();
                        arffsaver.setInstances(copy);
                        arffsaver.setFile(new File(fileName));
                        arffsaver.writeBatch();
                        CsvWriter arffwriter = new CsvWriter(new FileWriter(fileName, true), ',');
                        arffWriters.put("AGENT" + i, arffwriter);
                    }
                    logger.fine("AGENT" + i + " dataset created in csv and arff formats.");
                }

                // Append essentials to all
                for (String[] essential : essentials) {
                    for (CsvWriter wr : writers.values()) {
                        wr.writeRecord(essential);
                    }
                    for (CsvWriter arffwr : arffWriters.values()) {
                        arffwr.writeRecord(essential);
                    }
                }

                int agentCounter = 0;
                for (int j = 0; j < trainData.numInstances(); j++) {
                    Instance instance = trainData.instance(j);
                    CsvWriter writer = writers.get("AGENT" + agentCounter);
                    CsvWriter arffwriter = arffWriters.get("AGENT" + agentCounter);
                    String[] row = new String[instance.numAttributes()];
                    for (int a = 0; a < instance.numAttributes(); a++) {
                        row[a] = instance.stringValue(a);
                    }
                    if (writer != null) {
                        writer.writeRecord(row);
                    }
                    if (arffwriter != null) {
                        arffwriter.writeRecord(row);
                    }
                    agentCounter++;
                    if (agentCounter == agents) {
                        agentCounter = 0;
                    }
                }

                for (CsvWriter wr : writers.values()) {
                    wr.close();
                }
                for (CsvWriter arffwr : arffWriters.values()) {
                    arffwr.close();
                }
            }
        } catch (Exception e) {
            logger.severe("Exception while splitting dataset. ->");
            logger.severe(e.getMessage());
            System.exit(1);
        }
        logger.finest("Dataset for fold " + fold + " created.");
    }
    logger.finer("<-- splitDataset()");
}
From source file:gov.va.chir.tagline.TagLineTrainer.java
License:Open Source License
public TagLineModel getTagLineModel() {
    final Instances header = new Instances(instances);
    header.delete();

    tagLineModel.setHeader(header);
    tagLineModel.setFeatures(extractor.getFeatures());

    return tagLineModel;
}
From source file:linqs.gaia.model.oc.ncc.WekaClassifier.java
License:Open Source License
@Override
public void learn(Iterable<? extends Decorable> trainitems, String targetschemaid,
        String targetfeatureid, List<String> featureids) {
    try {
        this.targetschemaid = targetschemaid;
        this.targetfeatureid = targetfeatureid;
        this.featureids = new LinkedList<String>(featureids);

        LinkedHashSet<String> uniquefids = new LinkedHashSet<String>(featureids);
        if (uniquefids.size() != featureids.size()) {
            Log.WARN("Duplicate feature ids found in set of features: " + featureids);
            this.featureids = new ArrayList<String>(uniquefids);
        }

        if (this.featureids.contains(this.targetfeatureid)) {
            throw new InvalidStateException(
                    "Cannot include target feature as a dependency feature: " + this.targetfeatureid);
        }

        Log.DEBUG("Features Used: " + ListUtils.list2string(featureids, ","));

        // Added for weka. Will only be used for training.
        // Target will not be used as a feature itself.
        this.featureids.add(this.targetfeatureid);

        String wcclass = WekaClassifier.DEFAULT_WEKA_CLASSIFIER;
        if (this.hasParameter("wekaclassifier")) {
            wcclass = this.getStringParameter("wekaclassifier");
        }

        String wekaparams = WekaClassifier.NO_PARAMS;
        if (this.hasParameter("wekaparams")) {
            wekaparams = this.getStringParameter("wekaparams");
        }

        boolean printwekamodel = this.hasParameter("printwekamodel", "yes");

        // Support generation of class based cost matrix
        if (this.hasParameter("costbyclass", "yes")) {
            fclasscount = new KeyedCount<String>();
        }

        // Weka instances
        int numinstances = IteratorUtils.numIterable(trainitems);
        Instances traininstances = this.gaia2weka(trainitems.iterator(), numinstances, false);

        // Handle class based cost matrix
        if (fclasscount != null) {
            if (wekaparams.equals(WekaClassifier.NO_PARAMS)) {
                wekaparams = "";
            } else {
                wekaparams += ",";
            }
            wekaparams += "-cost-matrix," + this.getCostMatrix();
        }

        // Set GAIA parameters and initialize classifier
        String params[] = null;
        if (!wekaparams.equals(WekaClassifier.NO_PARAMS)) {
            Log.DEBUG("Using wekaparams: " + wekaparams);
            params = wekaparams.split(",");
        }

        wekaclassifier = Classifier.forName(wcclass, params);

        // Train classifier
        if (this.hasParameter("wekatrainfile")) {
            String savefile = this.getStringParameter("wekatrainfile");
            this.saveWekaInstances(savefile, traininstances);
        }

        Log.DEBUG("Weka building classifier");
        SimpleTimer st = new SimpleTimer();
        st.start();
        wekaclassifier.buildClassifier(traininstances);
        Log.DEBUG("Weka done building classifier: (" + st.timeLapse(true) + ")");

        // Print Weka Model, if requested
        if (printwekamodel) {
            Log.INFO("Learned Weka Model:\n" + this.wekaclassifier);
        }

        // Print attributes
        if (Log.SHOWDEBUG) {
            String features = null;
            for (int f = 0; f < traininstances.numAttributes(); f++) {
                if (features == null) {
                    features = "";
                } else {
                    features += ",";
                }
                features += traininstances.attribute(f).name();
            }
            String options[] = wekaclassifier.getOptions();
            Log.DEBUG("Weka Options: " + ArrayUtils.array2String(options, ","));
        }

        // Clear instances once training is complete
        traininstances.delete();
    } catch (RuntimeException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:meka.classifiers.multilabel.Maniac.java
License:Open Source License
@Override
public Instance transformInstance(Instance x) throws Exception {
    Instances tmpInst = new Instances(x.dataset());
    tmpInst.delete();
    tmpInst.add(x);

    Instances features = this.extractPart(tmpInst, false);

    Instances pseudoLabels = new Instances(this.compressedTemplateInst);
    Instance tmpin = pseudoLabels.instance(0);
    pseudoLabels.delete();

    pseudoLabels.add(tmpin);
    for (int i = 0; i < pseudoLabels.classIndex(); i++) {
        pseudoLabels.instance(0).setMissing(i);
    }

    Instances newDataSet = Instances.mergeInstances(pseudoLabels, features);
    newDataSet.setClassIndex(pseudoLabels.numAttributes());

    return newDataSet.instance(0);
}
From source file:meka.classifiers.multilabel.MLCBMaD.java
License:Open Source License
@Override
public Instance transformInstance(Instance x) throws Exception {
    Instances tmpInst = new Instances(x.dataset());
    tmpInst.delete();
    tmpInst.add(x);

    Instances features = this.extractPart(tmpInst, false);

    Instances pseudoLabels = new Instances(this.compressedMatrix);
    Instance tmpin = pseudoLabels.instance(0);
    pseudoLabels.delete();

    pseudoLabels.add(tmpin);
    for (int i = 0; i < pseudoLabels.classIndex(); i++) {
        pseudoLabels.instance(0).setMissing(i);
    }

    Instances newDataSet = Instances.mergeInstances(pseudoLabels, features);
    newDataSet.setClassIndex(this.size);

    return newDataSet.instance(0);
}
From source file:meka.classifiers.multilabel.PLST.java
License:Open Source License
/**
 * Transforms the instance in the prediction process before it is given to the internal
 * multi-label or multi-target classifier. The instance is passed with the original set of
 * labels; these must be replaced with the transformed labels (attributes) so that the
 * internal classifier can predict them.
 *
 * @param x The instance to transform. Consists of features and labels.
 * @return The transformed instance. Consists of features and transformed labels.
 */
@Override
public Instance transformInstance(Instance x) throws Exception {
    Instances tmpInst = new Instances(x.dataset());
    tmpInst.delete();
    tmpInst.add(x);

    Instances features = this.extractPart(tmpInst, false);

    Instances labels = new Instances(this.m_PatternInstances);
    labels.add(new DenseInstance(labels.numAttributes()));

    Instances result = Instances.mergeInstances(labels, features);
    result.setClassIndex(labels.numAttributes());

    return result.instance(0);
}