List of usage examples for weka.core Instances instance
public Instance instance(int index)
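The method returns the Instance stored at the given zero-based position of the dataset. Before the project examples below, here is a minimal sketch of the typical access pattern; the class name, attribute names, and values are illustrative only and are not taken from any of the source files listed afterwards.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class InstanceAccessExample {

    public static void main(String[] args) {
        // build a tiny in-memory dataset with two numeric attributes (hypothetical names)
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("metricA"));
        atts.add(new Attribute("metricB"));
        Instances data = new Instances("example", atts, 0);
        data.add(new DenseInstance(1.0, new double[] { 1.0, 2.0 }));
        data.add(new DenseInstance(1.0, new double[] { 3.0, 4.0 }));

        // access each row by index via instance(int) and read or modify its values
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            double sum = inst.value(0) + inst.value(1);
            inst.setValue(0, sum); // writes through to the dataset, as in the examples below
        }
    }
}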
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/**
 * <p>
 * Min-Max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param data
 *            data that is normalized
 */
public static void minMax(Instances data) {
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            double min = data.attributeStats(j).numericStats.min;
            double max = data.attributeStats(j).numericStats.max;
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                double newValue = (inst.value(j) - min) / (max - min);
                inst.setValue(j, newValue);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/**
 * <p>
 * Internal helper function that applies the z-score transformation.
 * </p>
 */
private static void applyZScore(Instances data, double[] mean, double[] std) {
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (data.classIndex() != j) {
                // z-score is (x - mean) / std; the parentheses are required, otherwise
                // only mean[j] / std[j] would be subtracted
                instance.setValue(j, (instance.value(j) - mean[j]) / std[j]);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>();

    // Sort dataset (StateID is connected to the date of commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the hashmap
     * (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is already in the hashmap (this
     * means: it did not have a bug before). If so, add it to a new dataset and remove it from the
     * hashmap, so that new changes from "nonBug" -> "bug" for this file can be found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug at its
     * first occurrence, or this file only has bugs and no instance without a bug), then (if it is
     * not in the array list above) add it to the new dataset. This way it is possible to get the
     * first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        }
        else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) {
            if (!firstOccurenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurenceArtifactNames.add(artifactName);
            }
        }
    }

    // If we have a file that never had a bug (i.e., it is NOT in the newly created dataset,
    // but it is in the HashMap from above), add it to the new dataset
    double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesinNewDataSet.length; i++) {
            if (artifactNamesinNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License:Apache License
/**
 * <p>
 * Applies TCA to the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applyTCA(Instances testdata, Instances traindata) {
    final int sizeTest = testdata.numInstances();
    final int sizeTrain = traindata.numInstances();
    final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata);
    final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in the paper
    final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the paper
    final double mu = 1.0; // default from the MATLAB implementation
    final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu);
    PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain);

    Console.traceln(Level.FINEST,
                    "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")");
    final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix)
        .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix)
        .multiplyRight(centerMatrix).multiplyRight(kernelMatrix);
    Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem");

    General eigenvalueDecomposition = new JamaEigenvalue.General();
    eigenvalueDecomposition.compute(optimizationProblem);
    Console.traceln(Level.FINEST, "eigenvalue problem solved");

    Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues();
    System.out.println(eigenvaluesArray.length);
    final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length];
    final int[] index = new int[(int) eigenvaluesArray.length];

    // create kernel transformation matrix from eigenvectors
    for (int i = 0; i < eigenvaluesArray.length; i++) {
        eigenvalues[i] = eigenvaluesArray.doubleValue(i);
        index[i] = i;
    }
    SortUtils.quicksort(eigenvalues, index);

    final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight(eigenvalueDecomposition
        .getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension)));

    // update testdata and traindata
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex()) {
            testdata.deleteAttributeAt(j);
            traindata.deleteAttributeAt(j);
        }
    }
    for (int j = 0; j < reducedDimension; j++) {
        testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
    }
    for (int i = 0; i < sizeTrain; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j));
        }
    }
    for (int i = 0; i < sizeTest; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j));
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    final double probDefect = data.numInstances() / (double) counts[1];

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                }
                else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) /
                    (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            powerEntity[i] *= powerAttributes[i][j];
        }
    }
    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/**
 * <p>
 * Gets the probabilities of a positive prediction for each range for a given attribute
 * </p>
 *
 * @param data
 *            the data
 * @param j
 *            index of the attribute
 * @param ranges
 *            the ranges
 * @return probabilities for each range
 */
private double[] getRangeProbabilities(Instances data, int j, double[] ranges) {
    double[] probDefectRange = new double[numRanges];
    int[] countRange = new int[numRanges];
    int[] countDefect = new int[numRanges];
    for (int i = 0; i < data.numInstances(); i++) {
        int range = determineRange(ranges, data.instance(i).value(j));
        countRange[range]++;
        if (data.instance(i).classValue() == 1) {
            countDefect[range]++;
        }
    }
    for (int k = 0; k < numRanges; k++) {
        probDefectRange[k] = ((double) countDefect[k]) / countRange[k];
    }
    return probDefectRange;
}
From source file:de.ugoe.cs.cpdp.dataselection.NeighborhoodFilter.java
License:Apache License
/**
 * <p>
 * Applies the relevancy filter after Ryu et al.
 * </p>
 *
 * @param testdata
 *            test data
 * @param traindata
 *            training data
 * @return filtered training data
 */
private Instances applyNeighborhoodFilter(Instances testdata, Instances traindata) {
    TreeSet<Integer> selectedInstances = new TreeSet<>();
    for (int i = 0; i < testdata.size(); i++) {
        double minHam = Double.MAX_VALUE;
        for (int j = 0; j < traindata.size(); j++) {
            double distance = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distance < minHam) {
                minHam = distance;
            }
        }
        for (int j = 0; j < traindata.size(); j++) {
            double distance = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distance <= minHam) {
                selectedInstances.add(j);
            }
        }
    }
    Instances selectedTraindata = new Instances(testdata);
    selectedTraindata.clear();
    for (Integer index : selectedInstances) {
        selectedTraindata.add(traindata.instance(index));
    }
    return selectedTraindata;
}
From source file:de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License:Apache License
/**
 * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context
 * factors. The project context factors are first normalized and then used for clustering. They
 * can be configured in the configuration param.
 *
 * @param testdata
 * @param traindataSet
 */
protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // now do the clustering; normalizedCharacteristicInstances calls getContextFactors
    final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
    final Instance targetInstance = data.instance(0);
    final List<Instance> candidateInstances = new LinkedList<Instance>();
    for (int i = 1; i < data.numInstances(); i++) {
        candidateInstances.add(data.instance(i));
    }

    // cluster and select
    try {
        final EM emeans = new EM();
        boolean onlyTarget = true;
        int targetCluster;
        int maxNumClusters = candidateInstances.size();
        do { // while(onlyTarget)
            emeans.setMaximumNumberOfClusters(maxNumClusters);
            emeans.buildClusterer(data);

            targetCluster = emeans.clusterInstance(targetInstance);

            // check if cluster only contains target project
            for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
                onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
            }
            maxNumClusters = emeans.numberOfClusters() - 1;

            // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
        }
        while (onlyTarget);

        Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
        Console.traceln(Level.INFO, "instances before clustering: " + traindataSet.size());
        int numRemoved = 0;
        for (int i = 0; i < candidateInstances.size(); i++) {
            if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
                traindataSet.remove(i - numRemoved++);
            }
        }
        Console.traceln(Level.INFO, "instances after clustering: " + traindataSet.size());
    }
    catch (Exception e) {
        throw new RuntimeException("error applying setwise EM clustering training data selection", e);
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License:Apache License
/**
 * Returns test- and training data with only the project context factors which were chosen in
 * the configuration. This is later used for clustering.
 *
 * @param testdata
 * @param traindataSet
 * @return
 */
protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // setup weka Instances for clustering
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();

    // we only want the project context factors
    for (String pcf : this.project_context_factors) {
        atts.add(new Attribute(pcf));
    }

    // set up the data
    final Instances data = new Instances("project_context_factors", atts, 0);
    double[] instanceValues = new double[atts.size()];

    // only project context factors + only one instance per project needed
    int i = 0;
    for (String pcf : this.project_context_factors) {
        instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
        // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
        i++;
    }
    data.add(new DenseInstance(1.0, instanceValues));

    // now for the projects of the training set
    for (Instances traindata : traindataSet) {
        instanceValues = new double[atts.size()]; // without this, the same values would be reused every time
        i = 0;
        for (String pcf : this.project_context_factors) {
            instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
            // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
            i++;
        }
        data.add(new DenseInstance(1.0, instanceValues));
    }

    return data;
}
From source file:de.ugoe.cs.cpdp.execution.AbstractCrossProjectExperiment.java
License:Apache License
/**
 * Helper method that combines a set of Weka {@link Instances} sets into a single
 * {@link Instances} set.
 *
 * @param traindataSet
 *            set of {@link Instances} to be combined
 * @return single {@link Instances} set
 */
public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) {
    Instances traindataFull = null;
    for (Instances traindata : traindataSet) {
        if (traindataFull == null) {
            traindataFull = new Instances(traindata);
        }
        else {
            for (int i = 0; i < traindata.numInstances(); i++) {
                traindataFull.add(traindata.instance(i));
            }
        }
    }
    return traindataFull;
}