List of usage examples for weka.core Instances attribute
public Attribute attribute(String name)

Returns an attribute given its name. If there is more than one attribute with the same name, the first one is returned; returns null if no attribute with the given name can be found.
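A minimal usage sketch of this method (the ARFF path and attribute name are placeholders, not taken from the examples below):

    import weka.core.Attribute;
    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;

    public class AttributeLookupExample {
        public static void main(String[] args) throws Exception {
            // load a dataset; "data.arff" is a placeholder path
            Instances data = DataSource.read("data.arff");
            data.setClassIndex(data.numAttributes() - 1);

            // look up an attribute by name; returns null if no such attribute exists
            Attribute att = data.attribute("loc"); // placeholder attribute name
            if (att != null) {
                System.out.println(att.name() + " has index " + att.index());
            }
        }
    }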
From source file: de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java
License: Apache License
/**
 * <p>
 * Applies the CLAMI processor to the data. The test data is also required in order to
 * guarantee a consistent metric set.
 * </p>
 *
 * @param testdata
 *            test data; the data is not modified, only metrics are dropped
 * @param data
 *            data to which the CLAMI processor is applied
 */
public void applyCLAMI(Instances testdata, Instances data) {
    // first determine the median of each metric
    double[] medians = new double[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }

    // now determine the cluster number for each instance:
    // the number of metrics whose value is above the median
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex() && currentInstance.value(j) > medians[j]) {
                countHighValues++;
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine the median of the cluster numbers
    Median m = new Median();
    double medianClusterNumber = m.evaluate(clusterNumber);

    // now count the violations of each metric
    int[] numMetricViolations = new int[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        int currentViolations = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance currentInstance = data.get(i);
            if (j != data.classIndex()) {
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy" cluster
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                }
                else {
                    // "not buggy" cluster
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
        }
        numMetricViolations[j] = currentViolations;
    }

    SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
    for (int currentViolations : numMetricViolations) {
        distinctViolationCounts.add(currentViolations);
    }
    Iterator<Integer> violationCountIterator = distinctViolationCounts.iterator();

    // now we filter the data;
    // this is first tried with the metrics with the fewest violations. If no buggy/bug-free
    // instances remain, this is repeated with the metrics with the second-fewest violations,
    // and so on.
    // this part is a bit unclear from the description in the paper, but I confirmed with the
    // author that this is how they implemented it
    // (note: the original listing advanced the iterator once before the loop, which skipped
    // the fewest-violations metrics; that extra call is removed here so the loop starts with
    // the smallest violation count, as the comment above describes)
    int violationCutoff;
    boolean[] cleanInstances;
    int numCleanBuggyInstances;
    int numCleanBugfreeInstances;
    do {
        violationCutoff = violationCountIterator.next();
        cleanInstances = new boolean[data.numInstances()];
        numCleanBuggyInstances = 0;
        numCleanBugfreeInstances = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            int currentViolations = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                    if (clusterNumber[i] > medianClusterNumber) {
                        // "buggy" cluster
                        if (currentInstance.value(j) <= medians[j]) {
                            currentViolations++;
                        }
                    }
                    else {
                        // "not buggy" cluster
                        if (currentInstance.value(j) > medians[j]) {
                            currentViolations++;
                        }
                    }
                }
            }
            if (currentViolations == 0) {
                cleanInstances[i] = true;
                if (clusterNumber[i] > medianClusterNumber) {
                    numCleanBuggyInstances++;
                }
                else {
                    numCleanBugfreeInstances++;
                }
            }
            else {
                cleanInstances[i] = false;
            }
        }
    }
    while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);

    // output some interesting information to provide insights into the CLAMI model
    Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
            Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
        }
    }

    // finally modify the instances:
    // drop the filtered metrics (also from the test data)
    for (int j = data.numAttributes() - 1; j >= 0; j--) {
        if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
            data.deleteAttributeAt(j);
            testdata.deleteAttributeAt(j);
        }
    }
    // drop the unclean instances and set the CLAMI classification for the rest
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (!cleanInstances[i]) {
            data.delete(i);
        }
        else {
            data.get(i).setClassValue(clusterNumber[i] > medianClusterNumber ? 1.0d : 0.0d);
        }
    }
}
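A hedged usage sketch of the processor (direct instantiation and the ARFF file names are assumptions; in CrossPare, processors are normally wired up through the experiment configuration):

    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;

    // hypothetical ARFF files with identical metrics and a binary class as last attribute
    Instances testdata = DataSource.read("test.arff");
    Instances traindata = DataSource.read("train.arff");
    testdata.setClassIndex(testdata.numAttributes() - 1);
    traindata.setClassIndex(traindata.numAttributes() - 1);

    CLAMIProcessor processor = new CLAMIProcessor(); // assumption: no-arg constructor
    // relabels and filters traindata in place; drops the same metrics from testdata
    processor.applyCLAMI(testdata, traindata);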
From source file: de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License: Apache License
/**
 * <p>
 * Applies MORPH to a single instance.
 * </p>
 *
 * @param instance
 *            instance that is morphed
 * @param data
 *            data based on which the instance is morphed
 */
public void morphInstance(Instance instance, Instances data) {
    Instance nearestUnlikeNeighbor = getNearestUnlikeNeighbor(instance, data);
    if (nearestUnlikeNeighbor == null) {
        throw new RuntimeException(
            "could not find nearest unlike neighbor within the data: " + data.relationName());
    }
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
            // random morphing factor between alpha and beta
            double randVal = rand.nextDouble() * (beta - alpha) + alpha;
            instance.setValue(j, instance.value(j)
                + randVal * (instance.value(j) - nearestUnlikeNeighbor.value(j)));
        }
    }
}
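A hedged usage sketch of the morphing step (the no-argument constructor is an assumption; alpha, beta, and the random source are fields of MORPH whose configuration is not shown in this listing):

    // morph every instance of a labeled training set in place
    MORPH morph = new MORPH(); // assumption: default construction is possible
    for (int i = 0; i < traindata.numInstances(); i++) {
        // each numeric value is shifted relative to its nearest unlike neighbor
        morph.morphInstance(traindata.get(i), traindata);
    }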
From source file: de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License: Apache License
/**
 * <p>
 * Determines the nearest unlike neighbor of an instance.
 * </p>
 *
 * @param instance
 *            instance to which the nearest unlike neighbor is determined
 * @param data
 *            data where the nearest unlike neighbor is determined from
 * @return nearest unlike instance
 */
public Instance getNearestUnlikeNeighbor(Instance instance, Instances data) {
    Instance nearestUnlikeNeighbor = null;

    double[] instanceVector = new double[data.numAttributes() - 1];
    int tmp = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
            // note: the original listing did not increment tmp here, leaving all but the
            // first vector component at zero; fixed to match the parallel loop below
            instanceVector[tmp++] = instance.value(j);
        }
    }

    double minDistance = Double.MAX_VALUE;
    for (int i = 0; i < data.numInstances(); i++) {
        if (instance.classValue() != data.instance(i).classValue()) {
            double[] otherVector = new double[data.numAttributes() - 1];
            tmp = 0;
            for (int j = 0; j < data.numAttributes(); j++) {
                if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
                    otherVector[tmp++] = data.instance(i).value(j);
                }
            }
            // compute the Euclidean distance once per candidate
            double distance = MathArrays.distance(instanceVector, otherVector);
            if (distance < minDistance) {
                minDistance = distance;
                nearestUnlikeNeighbor = data.instance(i);
            }
        }
    }
    return nearestUnlikeNeighbor;
}
From source file: de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java
License: Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    int indexOfConfidenceAttribute = -1;

    // find the index of the named confidence attribute to filter for
    for (int i = 0; i < traindata.numAttributes(); i++) {
        if (traindata.attribute(i).name().equals(nominalAttributeName)) {
            indexOfConfidenceAttribute = i;
        }
    }

    // if it was not found, return
    if (indexOfConfidenceAttribute == -1) {
        return;
    }

    // find the indices of the nominal values
    Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute);
    ArrayList<Object> nominalValuesOfConfidenceAttribute =
        Collections.list(confidenceAttribute.enumerateValues());
    ArrayList<Double> indexOfNominalAttributeValues = new ArrayList<Double>();

    for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) {
        for (String attributeValue : nominalAttributeValues) {
            if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) {
                indexOfNominalAttributeValues.add((double) k);
            }
        }
    }

    // go through all instances and delete those where the nominal attribute
    // has the value of one of the parameters
    for (int j = traindata.numInstances() - 1; j >= 0; j--) {
        Instance wekaInstance = traindata.get(j);
        if (indexOfNominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) {
            traindata.delete(j);
        }
    }
}
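The nominal-value lookup above can also be written against the Attribute API directly. A minimal sketch (the attribute name "severity" is hypothetical):

    // resolve the attribute by name instead of scanning all indices
    Attribute att = traindata.attribute("severity"); // hypothetical attribute name
    if (att != null && att.isNominal()) {
        for (int k = 0; k < att.numValues(); k++) {
            // att.value(k) is the label whose internal index k is the double
            // that Instance.value() stores for that label
            System.out.println(k + " -> " + att.value(k));
        }
    }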
From source file: de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License: Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // this is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>();

    // sort the dataset (StateID is connected to the date of the commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the
     * hashmap (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is already in the hashmap
     * (this means: it did not have a bug before). If so, add it to the new dataset and
     * remove it from the hashmap, so that new changes from "nonBug" -> "bug" for this file
     * can be found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug
     * at its first occurrence, or this file only has bugs and no instance without a bug),
     * then add it to the new dataset unless it is already in the array list above. This way
     * it is possible to get the first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);
        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0) {
            // the two original branches for this case were identical; keep the
            // latest bug-free instance per artifact
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 1.0 && artifactNames.containsKey(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        }
        else if (newBugLabel == 1.0 && !firstOccurenceArtifactNames.contains(artifactName)) {
            traindata.add(wekaInstance);
            firstOccurenceArtifactNames.add(artifactName);
        }
    }

    // if we have a file that never had a bug (i.e., it is NOT in the newly created dataset,
    // but it is in the hashmap from above), add it to the new dataset
    double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesinNewDataSet.length; i++) {
            if (artifactNamesinNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
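The sort call above is plain Weka API: Instances.sort(Attribute) sorts the dataset in place, ascending by the given attribute. A minimal sketch (the attribute name is taken from the example above):

    Instances copy = new Instances(traindata);              // work on a copy, as above
    Attribute stateId = copy.attribute("Artifact.Target.StateID");
    if (stateId != null) {                                  // attribute(String) returns null if absent
        copy.sort(stateId);                                 // ascending by StateID, i.e. commit order
    }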
From source file: de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java
License: Apache License
private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet)
    throws Exception
{
    // count in how many J48 decision trees each attribute occurs
    Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1];
    IntStream.range(0, counts.length).forEach(val -> counts[val] = 0);
    for (Instances traindata : traindataSet) {
        J48 decisionTree = new J48();
        decisionTree.buildClassifier(traindata);
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                if (decisionTree.toString().contains(traindata.attribute(j).name())) {
                    counts[k] = counts[k] + 1;
                }
                k++;
            }
        }
    }
    int[] topkIndex = new int[counts.length];
    IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);
    SortUtils.quicksort(counts, topkIndex, true);

    // get the CFS attribute sets for each training set
    List<Set<Integer>> cfsSets = new LinkedList<>();
    for (Instances traindata : traindataSet) {
        boolean selectionSuccessful = false;
        boolean secondAttempt = false;
        Instances traindataCopy = null;
        do {
            try {
                // the original listing duplicated this block for the first and second
                // attempt; merged here, since the only difference is the dataset used
                Instances currentTraindata = secondAttempt ? traindataCopy : traindata;
                AttributeSelection attsel = new AttributeSelection();
                CfsSubsetEval eval = new CfsSubsetEval();
                GreedyStepwise search = new GreedyStepwise();
                search.setSearchBackwards(true);
                attsel.setEvaluator(eval);
                attsel.setSearch(search);
                attsel.SelectAttributes(currentTraindata);
                Set<Integer> cfsSet = new HashSet<>();
                for (int attr : attsel.selectedAttributes()) {
                    cfsSet.add(attr);
                }
                cfsSets.add(cfsSet);
                selectionSuccessful = true;
            }
            catch (IllegalArgumentException e) {
                String regex = "A nominal attribute \\((.*)\\) cannot have duplicate labels.*";
                Pattern p = Pattern.compile(regex);
                Matcher m = p.matcher(e.getMessage());
                if (!m.find()) {
                    // cannot treat the problem, rethrow the exception
                    throw e;
                }
                String attributeName = m.group(1);
                int attrIndex = traindata.attribute(attributeName).index();
                if (secondAttempt) {
                    traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex);
                }
                else {
                    traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex);
                }
                Console.traceln(Level.FINE,
                                "upscaled attribute " + attributeName + "; restarting training");
                secondAttempt = true;
                continue;
            }
        }
        while (!selectionSuccessful); // dummy loop for internal continue
    }

    // determine how well the top-k attributes cover the CFS sets, for each k
    double[] coverages = new double[topkIndex.length];
    for (Set<Integer> cfsSet : cfsSets) {
        Set<Integer> topkSet = new HashSet<>();
        for (int k = 0; k < topkIndex.length; k++) {
            topkSet.add(topkIndex[k]);
            coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size());
        }
    }
    double bestCoverageValue = Double.MIN_VALUE;
    int bestCoverageIndex = 0;
    for (int i = 0; i < coverages.length; i++) {
        if (coverages[i] > bestCoverageValue) {
            bestCoverageValue = coverages[i];
            bestCoverageIndex = i;
        }
    }

    // build the matrix of absolute Spearman correlations between the top attributes
    SpearmansCorrelation corr = new SpearmansCorrelation();
    double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex];
    for (Instances traindata : traindataSet) {
        double[][] vectors = new double[bestCoverageIndex][traindata.size()];
        for (int i = 0; i < traindata.size(); i++) {
            for (int j = 0; j < bestCoverageIndex; j++) {
                vectors[j][i] = traindata.get(i).value(topkIndex[j]);
            }
        }
        for (int j = 0; j < bestCoverageIndex; j++) {
            for (int k = j + 1; k < bestCoverageIndex; k++) {
                correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k]));
            }
        }
    }

    Set<Integer> topkSetIndexSet = new TreeSet<>();
    // j < 30 ensures that the computational time does not explode, since the power set is
    // 2^n in complexity
    for (int j = 0; j < bestCoverageIndex && j < 30; j++) {
        topkSetIndexSet.add(j);
    }
    Set<Set<Integer>> allCombinations = Sets.powerSet(topkSetIndexSet);
    double bestOptCoverage = Double.MIN_VALUE;
    Set<Integer> opttopkSetIndexSet = null;
    for (Set<Integer> combination : allCombinations) {
        if (isUncorrelated(correlationMatrix, combination)) {
            double currentCoverage = 0.0;
            Set<Integer> topkCombination = new TreeSet<>();
            for (Integer index : combination) {
                topkCombination.add(topkIndex[index]);
            }
            for (Set<Integer> cfsSet : cfsSets) {
                currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size());
            }
            if (currentCoverage > bestOptCoverage) {
                bestOptCoverage = currentCoverage;
                opttopkSetIndexSet = combination;
            }
        }
    }
    Set<Integer> opttopkIndex = new TreeSet<>();
    for (Integer index : opttopkSetIndexSet) {
        opttopkIndex.add(topkIndex[index]);
    }
    Console.traceln(Level.FINE, "selected the following metrics:");
    for (Integer index : opttopkIndex) {
        Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name());
    }

    // finally remove the attributes that were not selected
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex() && !opttopkIndex.contains(j)) {
            testdata.deleteAttributeAt(j);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(j);
            }
        }
    }
}
From source file: de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    // note: the original listing computed numInstances / counts[1], which is always >= 1
    // and cannot be a probability; inverted here
    final double probDefect = counts[1] / (double) data.numInstances();

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                }
                else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) /
                    (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            // note: skip the class attribute; its power value is never set, so the
            // original listing multiplied every product by zero
            if (j != data.classIndex()) {
                powerEntity[i] *= powerAttributes[i][j];
            }
        }
    }

    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
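A hedged usage sketch (applyCLIFF is protected, so calling code would live inside CLIFF or a subclass; that the percentage field is configurable beforehand is an assumption based on its use above):

    // inside de.ugoe.cs.cpdp.dataselection.CLIFF or a subclass;
    // assumption: this.percentage was configured, e.g. to 0.10 to retain 10%
    Instances filtered = applyCLIFF(traindata);
    System.out.println("kept " + filtered.numInstances() + " of "
        + traindata.numInstances() + " instances");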
From source file: de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java
License: Apache License
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final Instances data = characteristicInstances(testdata, traindataSet);

    final ArrayList<String> attVals = new ArrayList<String>();
    attVals.add("same");
    attVals.add("more");
    attVals.add("less");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = 0; j < data.numAttributes(); j++) {
        atts.add(new Attribute(data.attribute(j).name(), attVals));
    }
    atts.add(new Attribute("score"));
    Instances similarityData = new Instances("similarity", atts, 0);
    similarityData.setClassIndex(similarityData.numAttributes() - 1);

    try {
        Classifier classifier = new J48();
        for (int i = 0; i < traindataSet.size(); i++) {
            classifier.buildClassifier(traindataSet.get(i));
            for (int j = 0; j < traindataSet.size(); j++) {
                if (i != j) {
                    // data.get(0) is the test data, data.get(i + 1) the i-th training set
                    double[] similarity = new double[data.numAttributes() + 1];
                    for (int k = 0; k < data.numAttributes(); k++) {
                        if (0.9 * data.get(i + 1).value(k) > data.get(j + 1).value(k)) {
                            similarity[k] = 2.0; // nominal label "less"
                        }
                        else if (1.1 * data.get(i + 1).value(k) < data.get(j + 1).value(k)) {
                            similarity[k] = 1.0; // nominal label "more"
                        }
                        else {
                            similarity[k] = 0.0; // nominal label "same"
                        }
                    }

                    Evaluation eval = new Evaluation(traindataSet.get(j));
                    eval.evaluateModel(classifier, traindataSet.get(j));
                    similarity[data.numAttributes()] = eval.fMeasure(1);
                    similarityData.add(new DenseInstance(1.0, similarity));
                }
            }
        }

        REPTree repTree = new REPTree();
        if (repTree.getNumFolds() > similarityData.size()) {
            repTree.setNumFolds(similarityData.size());
        }
        // note: this unconditional call overrides the guard above; kept as in the original
        repTree.setNumFolds(2);
        repTree.buildClassifier(similarityData);

        Instances testTrainSimilarity = new Instances(similarityData);
        testTrainSimilarity.clear();
        for (int i = 0; i < traindataSet.size(); i++) {
            double[] similarity = new double[data.numAttributes() + 1];
            for (int k = 0; k < data.numAttributes(); k++) {
                if (0.9 * data.get(0).value(k) > data.get(i + 1).value(k)) {
                    similarity[k] = 2.0;
                }
                else if (1.1 * data.get(0).value(k) < data.get(i + 1).value(k)) {
                    similarity[k] = 1.0;
                }
                else {
                    similarity[k] = 0.0;
                }
            }
            testTrainSimilarity.add(new DenseInstance(1.0, similarity));
        }

        int bestScoringProductIndex = -1;
        double maxScore = Double.MIN_VALUE;
        for (int i = 0; i < traindataSet.size(); i++) {
            double score = repTree.classifyInstance(testTrainSimilarity.get(i));
            if (score > maxScore) {
                maxScore = score;
                bestScoringProductIndex = i;
            }
        }
        Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex);
        traindataSet.clear();
        traindataSet.add(bestScoringProduct);
    }
    catch (Exception e) {
        Console.printerr("failure during DecisionTreeSelection: " + e.getMessage());
        throw new RuntimeException(e);
    }
}
From source file: de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License: Apache License
/**
 * Returns test and training data with only the project context factors which were chosen in
 * the configuration. This is later used for clustering.
 *
 * @param testdata
 * @param traindataSet
 * @return
 */
protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // set up weka Instances for clustering
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();

    // we only want the project context factors
    for (String pcf : this.project_context_factors) {
        atts.add(new Attribute(pcf));
    }

    // set up the data
    final Instances data = new Instances("project_context_factors", atts, 0);
    double[] instanceValues = new double[atts.size()];

    // only project context factors + only one instance per project needed
    int i = 0;
    for (String pcf : this.project_context_factors) {
        instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
        i++;
    }
    data.add(new DenseInstance(1.0, instanceValues));

    // now for the projects of the training set
    for (Instances traindata : traindataSet) {
        // without this reallocation, the same values would end up here every time?!
        instanceValues = new double[atts.size()];
        i = 0;
        for (String pcf : this.project_context_factors) {
            instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
            i++;
        }
        data.add(new DenseInstance(1.0, instanceValues));
    }

    return data;
}
From source file: de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License: Apache License
/**
 * Delete projects where the project context does not match the training project.
 *
 * @param testdata
 * @param traindataSet
 * @param attribute
 */
protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet,
                                  String attribute)
{
    Set<Instances> remove = new HashSet<Instances>();
    for (Instances traindata : traindataSet) {
        if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
            .firstInstance().value(testdata.attribute(attribute)))
        {
            remove.add(traindata);
        }
    }

    // now delete the projects from the set
    for (Instances i : remove) {
        traindataSet.remove(i);
    }
}
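Note that Instance.value(Attribute) returns the internal index of a nominal value, so the comparison above matches label indices, not label strings. A minimal sketch of the string-based equivalent (the attribute name "host" is hypothetical):

    // compare the actual label strings instead of the internal double indices
    Attribute testAtt = testdata.attribute("host");   // hypothetical context factor
    Attribute trainAtt = traindata.attribute("host");
    String testLabel = testdata.firstInstance().stringValue(testAtt);
    String trainLabel = traindata.firstInstance().stringValue(trainAtt);
    boolean sameContext = testLabel.equals(trainLabel);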