List of usage examples for weka.core.Instances.numInstances()
public int numInstances()
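The examples below all use numInstances() as the upper bound of an index-based loop over the rows of an Instances object. The following minimal, self-contained sketch illustrates that pattern; the file name "iris.arff" is only a placeholder and any ARFF file readable by Weka works the same way.

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NumInstancesDemo {
    public static void main(String[] args) throws Exception {
        // load an ARFF file; the path is a placeholder
        Instances data = DataSource.read("iris.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // numInstances() returns the number of rows currently in the data set
        System.out.println("number of instances: " + data.numInstances());

        // the pattern used throughout the examples below: index-based iteration
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            // ... process inst ...
        }
    }
}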
From source file: de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License: Apache License
/**
 * <p>
 * Min-Max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param data
 *            data that is normalized
 */
public static void minMax(Instances data) {
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            double min = data.attributeStats(j).numericStats.min;
            double max = data.attributeStats(j).numericStats.max;
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                double newValue = (inst.value(j) - min) / (max - min);
                inst.setValue(j, newValue);
            }
        }
    }
}
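A minimal usage sketch for minMax, assuming NormalizationUtil is on the classpath and the snippet runs inside a method that may throw Exception; the ARFF path is a placeholder:

Instances data = DataSource.read("project.arff"); // placeholder path
data.setClassIndex(data.numAttributes() - 1);
NormalizationUtil.minMax(data); // scales every non-class attribute to [0, 1] in place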
From source file: de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License: Apache License
/**
 * <p>
 * Internal helper function that applies the z-score transformation to all non-class attributes.
 * </p>
 */
private static void applyZScore(Instances data, double[] mean, double[] std) {
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (data.classIndex() != j) {
                // z-score: subtract the mean first, then divide by the standard deviation
                instance.setValue(j, (instance.value(j) - mean[j]) / std[j]);
            }
        }
    }
}
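The helper expects one mean and one standard deviation per attribute. A minimal sketch of how such arrays could be computed with plain Weka calls (the array names are illustrative and not part of NormalizationUtil's API):

double[] mean = new double[data.numAttributes()];
double[] std = new double[data.numAttributes()];
for (int j = 0; j < data.numAttributes(); j++) {
    if (j != data.classIndex()) {
        // weka.experiment.Stats provides mean and standard deviation per numeric attribute
        mean[j] = data.attributeStats(j).numericStats.mean;
        std[j] = data.attributeStats(j).numericStats.stdDev;
    }
}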
From source file: de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License: Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>();

    // Sort the data set (StateID is connected to the date of commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the hashmap
     * (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is already in the hashmap (this
     * means: it did not have a bug before). If so, add it to the new data set and remove it from
     * the hashmap, so that further changes from "nonBug" -> "bug" for this file can be found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug at its
     * first occurrence, or this file only has bugs and no instance without a bug), then (if it is
     * not in the array list above) add it to the new data set. This way it is possible to get the
     * first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        }
        else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) {
            if (!firstOccurenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurenceArtifactNames.add(artifactName);
            }
        }
    }

    // If a file never had a bug (i.e., it is NOT in the newly created data set,
    // but it is in the hashmap from above), add it to the new data set
    double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);
    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesinNewDataSet.length; i++) {
            if (artifactNamesinNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }
    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
From source file: de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License: Apache License
/**
 * <p>
 * Applies TCA to the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applyTCA(Instances testdata, Instances traindata) {
    final int sizeTest = testdata.numInstances();
    final int sizeTrain = traindata.numInstances();
    final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata);
    final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in the paper
    final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the paper
    final double mu = 1.0; // default from the MATLAB implementation
    final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu);
    PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain);

    Console.traceln(Level.FINEST,
                    "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")");
    final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix)
        .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix)
        .multiplyRight(centerMatrix).multiplyRight(kernelMatrix);
    Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem");

    General eigenvalueDecomposition = new JamaEigenvalue.General();
    eigenvalueDecomposition.compute(optimizationProblem);
    Console.traceln(Level.FINEST, "eigenvalue problem solved");

    Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues();
    System.out.println(eigenvaluesArray.length);
    final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length];
    final int[] index = new int[(int) eigenvaluesArray.length];
    // create kernel transformation matrix from eigenvectors
    for (int i = 0; i < eigenvaluesArray.length; i++) {
        eigenvalues[i] = eigenvaluesArray.doubleValue(i);
        index[i] = i;
    }
    SortUtils.quicksort(eigenvalues, index);

    final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight(
        eigenvalueDecomposition.getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension)));

    // update testdata and traindata
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex()) {
            testdata.deleteAttributeAt(j);
            traindata.deleteAttributeAt(j);
        }
    }
    for (int j = 0; j < reducedDimension; j++) {
        testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
    }
    for (int i = 0; i < sizeTrain; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j));
        }
    }
    for (int i = 0; i < sizeTest; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j));
        }
    }
}
From source file: de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License: Apache License
/**
 * <p>
 * Creates the kernel matrix of the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 * @return kernel matrix
 */
private PrimitiveMatrix buildKernel(Instances testdata, Instances traindata) {
    final int kernelDim = traindata.numInstances() + testdata.numInstances();

    Builder<PrimitiveMatrix> kernelBuilder = PrimitiveMatrix.getBuilder(kernelDim, kernelDim);
    // build upper left quadrant (source, source)
    for (int i = 0; i < traindata.numInstances(); i++) {
        for (int j = 0; j < traindata.numInstances(); j++) {
            kernelBuilder.set(i, j, linearKernel(traindata.get(i), traindata.get(j)));
        }
    }
    // build upper right quadrant (source, target)
    for (int i = 0; i < traindata.numInstances(); i++) {
        for (int j = 0; j < testdata.numInstances(); j++) {
            kernelBuilder.set(i, j + traindata.numInstances(),
                              linearKernel(traindata.get(i), testdata.get(j)));
        }
    }
    // build lower left quadrant (target, source)
    for (int i = 0; i < testdata.numInstances(); i++) {
        for (int j = 0; j < traindata.numInstances(); j++) {
            kernelBuilder.set(i + traindata.numInstances(), j,
                              linearKernel(testdata.get(i), traindata.get(j)));
        }
    }
    // build lower right quadrant (target, target)
    for (int i = 0; i < testdata.numInstances(); i++) {
        for (int j = 0; j < testdata.numInstances(); j++) {
            kernelBuilder.set(i + traindata.numInstances(), j + traindata.numInstances(),
                              linearKernel(testdata.get(i), testdata.get(j)));
        }
    }
    return kernelBuilder.build();
}
From source file: de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    // probability that an instance is defective: defective count divided by the total count
    final double probDefect = counts[1] / (double) data.numInstances();

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                }
                else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) /
                    (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            // skip the class attribute, whose column of the power matrix is never filled
            if (data.attribute(j) != data.classAttribute()) {
                powerEntity[i] *= powerAttributes[i][j];
            }
        }
    }
    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file: de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License
/**
 * <p>
 * Gets the probabilities of a positive prediction for each range for a given attribute.
 * </p>
 *
 * @param data
 *            the data
 * @param j
 *            index of the attribute
 * @param ranges
 *            the ranges
 * @return probabilities for each range
 */
private double[] getRangeProbabilities(Instances data, int j, double[] ranges) {
    double[] probDefectRange = new double[numRanges];
    int[] countRange = new int[numRanges];
    int[] countDefect = new int[numRanges];
    for (int i = 0; i < data.numInstances(); i++) {
        int range = determineRange(ranges, data.instance(i).value(j));
        countRange[range]++;
        if (data.instance(i).classValue() == 1) {
            countDefect[range]++;
        }
    }
    for (int k = 0; k < numRanges; k++) {
        probDefectRange[k] = ((double) countDefect[k]) / countRange[k];
    }
    return probDefectRange;
}
From source file: de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License: Apache License
/**
 * Uses the Weka EM clustering algorithm to cluster the projects by their project context
 * factors. The project context factors are first normalized and then used for clustering. They
 * can be configured in the configuration param.
 *
 * @param testdata
 * @param traindataSet
 */
protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // now do the clustering; normalizedCharacteristicInstances calls getContextFactors internally
    final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
    final Instance targetInstance = data.instance(0);
    final List<Instance> candidateInstances = new LinkedList<Instance>();
    for (int i = 1; i < data.numInstances(); i++) {
        candidateInstances.add(data.instance(i));
    }

    // cluster and select
    try {
        final EM emeans = new EM();
        boolean onlyTarget = true;
        int targetCluster;
        int maxNumClusters = candidateInstances.size();
        do { // while(onlyTarget)
            emeans.setMaximumNumberOfClusters(maxNumClusters);
            emeans.buildClusterer(data);

            targetCluster = emeans.clusterInstance(targetInstance);

            // check whether the cluster contains only the target project
            for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
                onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
            }
            maxNumClusters = emeans.numberOfClusters() - 1;
            // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
        }
        while (onlyTarget);

        Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
        Console.traceln(Level.INFO, "instances before clustering: " + traindataSet.size());
        int numRemoved = 0;
        for (int i = 0; i < candidateInstances.size(); i++) {
            if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
                traindataSet.remove(i - numRemoved++);
            }
        }
        Console.traceln(Level.INFO, "instances after clustering: " + traindataSet.size());
    }
    catch (Exception e) {
        throw new RuntimeException("error applying setwise EM clustering training data selection", e);
    }
}
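The selection above relies on Weka's EM clusterer. A minimal, stand-alone sketch of the calls involved (buildClusterer, clusterInstance, numberOfClusters), assuming a data set of context factors without a class index and a method that may throw Exception; the ARFF path is a placeholder:

Instances contexts = DataSource.read("contexts.arff"); // placeholder path, no class index set
EM em = new EM(); // weka.clusterers.EM
em.buildClusterer(contexts);
System.out.println("number of clusters: " + em.numberOfClusters());
int clusterOfFirst = em.clusterInstance(contexts.instance(0)); // cluster of the first instance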
From source file: de.ugoe.cs.cpdp.execution.AbstractCrossProjectExperiment.java
License: Apache License
/**
 * Helper method that combines a set of Weka {@link Instances} sets into a single
 * {@link Instances} set.
 *
 * @param traindataSet
 *            set of {@link Instances} to be combined
 * @return single {@link Instances} set
 */
public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) {
    Instances traindataFull = null;
    for (Instances traindata : traindataSet) {
        if (traindataFull == null) {
            traindataFull = new Instances(traindata);
        }
        else {
            for (int i = 0; i < traindata.numInstances(); i++) {
                traindataFull.add(traindata.instance(i));
            }
        }
    }
    return traindataFull;
}
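A brief usage sketch for makeSingleTrainingSet, assuming Apache commons-collections4 provides the SetUniqueList used by the surrounding code and that both ARFF files (placeholder paths) share the same attribute structure:

SetUniqueList<Instances> traindataSet = SetUniqueList.setUniqueList(new LinkedList<Instances>());
traindataSet.add(DataSource.read("projectA.arff")); // placeholder path
traindataSet.add(DataSource.read("projectB.arff")); // placeholder path

Instances traindataFull = AbstractCrossProjectExperiment.makeSingleTrainingSet(traindataSet);
System.out.println("combined size: " + traindataFull.numInstances());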
From source file: de.ugoe.cs.cpdp.util.WekaUtils.java
License: Apache License
/**
 * <p>
 * Calculates the distributional characteristics of the distances the instances within a data
 * set have to each other.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @return characteristics
 */
public static DistChar datasetDistance(Instances data) {
    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    double max = Double.MIN_VALUE;
    int numCmp = 0;
    int l = 0;
    double[] inst1 = new double[data.numAttributes() - 1];
    double[] inst2 = new double[data.numAttributes() - 1];
    EuclideanDistance euclideanDistance = new EuclideanDistance();
    for (int i = 0; i < data.numInstances(); i++) {
        l = 0;
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != data.classIndex()) {
                // copy all non-class attribute values; l tracks the position in the array
                inst1[l++] = data.instance(i).value(k);
            }
        }
        for (int j = 0; j < data.numInstances(); j++) {
            if (j != i) {
                l = 0;
                for (int k = 0; k < data.numAttributes(); k++) {
                    if (k != data.classIndex()) {
                        inst2[l++] = data.instance(j).value(k);
                    }
                }
                distance = euclideanDistance.compute(inst1, inst2);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }
    double mean = sumAll / numCmp;
    double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
    return new DistChar(mean, std, min, max, data.numInstances());
}
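A short usage sketch for datasetDistance; the exact package and field names of the returned DistChar value object are not shown above, so it is treated as an opaque result here:

Instances data = DataSource.read("project.arff"); // placeholder path
data.setClassIndex(data.numAttributes() - 1);
DistChar characteristics = WekaUtils.datasetDistance(data); // mean/std/min/max of pairwise distances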