List of usage examples for weka.core Instances numAttributes
public int numAttributes()
From source file:de.tudarmstadt.ukp.similarity.experiments.coling2012.util.Evaluator.java
License:Open Source License
public static void runClassifierCV(WekaClassifier wekaClassifier, Dataset dataset) throws Exception { // Set parameters int folds = 10; Classifier baseClassifier = getClassifier(wekaClassifier); // Set up the random number generator long seed = new Date().getTime(); Random random = new Random(seed); // Add IDs to the instances AddID.main(new String[] { "-i", MODELS_DIR + "/" + dataset.toString() + ".arff", "-o", MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff" }); Instances data = DataSource.read(MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff"); data.setClassIndex(data.numAttributes() - 1); // Instantiate the Remove filter Remove removeIDFilter = new Remove(); removeIDFilter.setAttributeIndices("first"); // Randomize the data data.randomize(random);/*from w w w. ja v a 2 s. com*/ // Perform cross-validation Instances predictedData = null; Evaluation eval = new Evaluation(data); for (int n = 0; n < folds; n++) { Instances train = data.trainCV(folds, n, random); Instances test = data.testCV(folds, n); // Apply log filter // Filter logFilter = new LogFilter(); // logFilter.setInputFormat(train); // train = Filter.useFilter(train, logFilter); // logFilter.setInputFormat(test); // test = Filter.useFilter(test, logFilter); // Copy the classifier Classifier classifier = AbstractClassifier.makeCopy(baseClassifier); // Instantiate the FilteredClassifier FilteredClassifier filteredClassifier = new FilteredClassifier(); filteredClassifier.setFilter(removeIDFilter); filteredClassifier.setClassifier(classifier); // Build the classifier filteredClassifier.buildClassifier(train); // Evaluate eval.evaluateModel(filteredClassifier, test); // Add predictions AddClassification filter = new AddClassification(); filter.setClassifier(filteredClassifier); filter.setOutputClassification(true); filter.setOutputDistribution(false); filter.setOutputErrorFlag(true); filter.setInputFormat(train); Filter.useFilter(train, filter); // trains the classifier Instances pred = 
Filter.useFilter(test, filter); // performs predictions on test set if (predictedData == null) predictedData = new Instances(pred, 0); for (int j = 0; j < pred.numInstances(); j++) predictedData.add(pred.instance(j)); } // Prepare output classification String[] scores = new String[predictedData.numInstances()]; for (Instance predInst : predictedData) { int id = new Double(predInst.value(predInst.attribute(0))).intValue() - 1; int valueIdx = predictedData.numAttributes() - 2; String value = predInst.stringValue(predInst.attribute(valueIdx)); scores[id] = value; } // Output StringBuilder sb = new StringBuilder(); for (String score : scores) sb.append(score.toString() + LF); FileUtils.writeStringToFile( new File(OUTPUT_DIR + "/" + dataset.toString() + "/" + wekaClassifier.toString() + "/output.csv"), sb.toString()); }
From source file:de.ugoe.cs.cpdp.dataprocessing.AttributeNonRemoval.java
License:Apache License
/** * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, * org.apache.commons.collections4.list.SetUniqueList) */// ww w . j a va 2 s. c om @Override public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { for (String attributeName : attributeNames) { for (int i = 0; i < testdata.numAttributes(); i++) { if (!attributeName.equals(testdata.attribute(i).name())) { testdata.deleteAttributeAt(i); for (Instances traindata : traindataSet) { traindata.deleteAttributeAt(i); } } } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.AttributeNonRemoval.java
License:Apache License
/**
 * Keeps only the attributes whose names are listed in {@code attributeNames}; every other
 * attribute is deleted from both the test data and the training data at the same index.
 *
 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)
 */
@Override
public void apply(Instances testdata, Instances traindata) {
    // Walk from the last attribute down to the first so deletions never shift pending indices.
    int index = testdata.numAttributes();
    while (--index >= 0) {
        boolean keep = attributeNames.contains(testdata.attribute(index).name());
        if (keep) {
            continue;
        }
        testdata.deleteAttributeAt(index);
        traindata.deleteAttributeAt(index);
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java
License:Apache License
/**
 * <p>
 * Applies the CLAMI processor to the data. The test data is also required, in order to
 * guarantee a consistent metric set.
 * </p>
 * <p>
 * Steps performed on {@code data}: (1) a median is determined per non-class attribute, (2)
 * each instance gets a "cluster number" counting its attribute values above the median, (3)
 * instances with a cluster number above the median cluster number are treated as "buggy", all
 * others as "not buggy", (4) per attribute the number of instances contradicting that label is
 * counted, (5) attributes sharing one violation count are selected, all other non-class
 * attributes are deleted from {@code data} and {@code testdata}, and (6) instances that violate
 * a selected attribute are deleted, the remaining ones get class value 1.0 (buggy) or 0.0.
 * </p>
 *
 * @param testdata
 *            test data; the data is not modified, only metrics are dropped
 * @param data
 *            data to which the CLAMI processor is applied
 */
public void applyCLAMI(Instances testdata, Instances data) {
    // first determine medians
    double[] medians = new double[data.numAttributes()];
    // get medians; the entry at the class index is left at 0
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }
    // now determine cluster number for each instance
    // (number of non-class attributes whose value is strictly above the attribute's median)
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                if (currentInstance.value(j) > medians[j]) {
                    countHighValues++;
                }
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine median of cluster number
    Median m = new Median();
    double medianClusterNumber = m.evaluate(clusterNumber);

    // now we filter the metrics
    // count, per attribute, how many instances contradict their cluster-based label;
    // the entry at the class index is never incremented and therefore stays 0
    int[] numMetricViolations = new int[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        int currentViolations = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance currentInstance = data.get(i);
            if (j != data.classIndex()) {
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy"
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                } else {
                    // "not buggy"
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
        }
        numMetricViolations[j] = currentViolations;
    }

    // distinct violation counts in ascending order
    SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
    for (int currentViolations : numMetricViolations) {
        distinctViolationCounts.add(currentViolations);
    }
    Iterator<Integer> violationCountInterator = distinctViolationCounts.iterator();
    // NOTE(review): this first value is overwritten at the top of the do-loop before it is
    // used, so the smallest distinct violation count never serves as cutoff. With a valid
    // class index that smallest count is 0 (the class attribute's own entry), which is
    // presumably why it is skipped - but metrics with genuinely 0 violations are skipped with
    // it. Confirm the intent.
    int violationCutoff = violationCountInterator.next();
    // now we filter the data;
    // this is first tried with the metrics with fewest violations. if no buggy/bugfree
    // instances remain, this is repeated with the next metrics with second fewest violations,
    // and so on.
    // this part is a bit unclear from the description in the paper, but I confirmed with the
    // author that this is how they implemented it
    boolean[] cleanInstances = new boolean[data.numInstances()];
    int numCleanBuggyInstances = 0;
    int numCleanBugfreeInstances = 0;
    do {
        // NOTE(review): next() throws NoSuchElementException once all distinct counts are
        // used up, i.e. when no cutoff leaves both a clean buggy and a clean bug-free instance
        violationCutoff = violationCountInterator.next();
        cleanInstances = new boolean[data.numInstances()];
        numCleanBuggyInstances = 0;
        numCleanBugfreeInstances = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            // violations of this instance, restricted to the metrics selected by the cutoff
            int currentViolations = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                    if (clusterNumber[i] > medianClusterNumber) {
                        // "buggy"
                        if (currentInstance.value(j) <= medians[j]) {
                            currentViolations++;
                        }
                    } else {
                        // "not buggy"
                        if (currentInstance.value(j) > medians[j]) {
                            currentViolations++;
                        }
                    }
                }
            }
            if (currentViolations == 0) {
                cleanInstances[i] = true;
                if (clusterNumber[i] > medianClusterNumber) {
                    numCleanBuggyInstances++;
                } else {
                    numCleanBugfreeInstances++;
                }
            } else {
                cleanInstances[i] = false;
            }
        }
    } while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);

    // output some interesting information to provide insights into the CLAMI model
    Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
            Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
        }
    }

    // finally modify the instances
    // drop the metrics (also from the testdata); iterating backwards keeps the indices of the
    // attributes still to be visited stable while deleting
    for (int j = data.numAttributes() - 1; j >= 0; j--) {
        if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
            data.deleteAttributeAt(j);
            testdata.deleteAttributeAt(j);
        }
    }

    // drop the unclean instances; iterating backwards keeps index i aligned with
    // cleanInstances and clusterNumber while deleting
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (!cleanInstances[i]) {
            data.delete(i);
        } else {
            // set the classification
            if (clusterNumber[i] > medianClusterNumber) {
                data.get(i).setClassValue(1.0d);
            } else {
                data.get(i).setClassValue(0.0d);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.CLAProcessor.java
License:Apache License
/** * Applies the CLA processor the the data. * //from w w w . ja v a 2s .com * @param data * data to which the processor is applied */ public void applyCLA(Instances data) { // first determine medians double[] medians = new double[data.numAttributes()]; // get medians for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1); } } // now determine cluster number for each instance double[] clusterNumber = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { int countHighValues = 0; Instance currentInstance = data.get(i); for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { if (currentInstance.value(j) > medians[j]) { countHighValues++; } } } clusterNumber[i] = countHighValues; } // determine median of cluster number Median m = new Median(); double medianClusterNumber = m.evaluate(Arrays.stream(clusterNumber).distinct().toArray()); // finally modify the instances // drop the unclean instances for (int i = data.numInstances() - 1; i >= 0; i--) { // set the classification if (clusterNumber[i] > medianClusterNumber) { data.get(i).setClassValue(1.0d); } else { data.get(i).setClassValue(0.0d); } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License:Apache License
/**
 * <p>
 * Applies MORPH to a single instance: every numeric non-class attribute value is moved by a
 * random fraction, drawn from the interval spanned by {@code alpha} and {@code beta}, of its
 * difference to the nearest unlike neighbor's value.
 * </p>
 *
 * @param instance
 *            instance that is morphed (modified in place)
 * @param data
 *            data based on which the instance is morphed
 * @throws RuntimeException
 *             if no nearest unlike neighbor can be found within the data
 */
public void morphInstance(Instance instance, Instances data) {
    final Instance neighbor = getNearestUnlikeNeighbor(instance, data);
    if (neighbor == null) {
        throw new RuntimeException(
                "could not find nearest unlike neighbor within the data: " + data.relationName());
    }

    for (int att = 0; att < data.numAttributes(); att++) {
        // only numeric attributes other than the class attribute are morphed
        boolean isClassAttribute = data.attribute(att) == data.classAttribute();
        if (isClassAttribute || !data.attribute(att).isNumeric()) {
            continue;
        }
        double factor = rand.nextDouble() * (beta - alpha) + alpha;
        double current = instance.value(att);
        double shift = factor * (current - neighbor.value(att));
        instance.setValue(att, current + shift);
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License:Apache License
/**
 * <p>
 * Determines the nearest unlike neighbor of an instance, i.e. the instance in {@code data}
 * with a different class value that has the smallest distance (as computed by
 * {@code MathArrays.distance}) to {@code instance} over the numeric non-class attributes.
 * </p>
 *
 * @param instance
 *            instance to which the nearest unlike neighbor is determined
 * @param data
 *            data where the nearest unlike neighbor is determined from
 * @return nearest unlike instance, or {@code null} if {@code data} contains no instance with a
 *         different class value
 */
public Instance getNearestUnlikeNeighbor(Instance instance, Instances data) {
    // The query vector is built with the same helper as the candidate vectors so that both
    // always use the same attribute-to-slot mapping.
    double[] instanceVector = numericFeatureVector(instance, data);

    Instance nearestUnlikeNeighbor = null;
    double minDistance = Double.MAX_VALUE;
    for (int i = 0; i < data.numInstances(); i++) {
        Instance candidate = data.instance(i);
        if (instance.classValue() != candidate.classValue()) {
            double distance =
                MathArrays.distance(instanceVector, numericFeatureVector(candidate, data));
            if (distance < minDistance) {
                minDistance = distance;
                nearestUnlikeNeighbor = candidate;
            }
        }
    }
    return nearestUnlikeNeighbor;
}

/**
 * Copies the values of all numeric non-class attributes of an instance into consecutive slots
 * of a vector of length {@code data.numAttributes() - 1}. Slots left over because of skipped
 * non-numeric attributes stay 0 in every vector and thus do not contribute to distances.
 */
private double[] numericFeatureVector(Instance instance, Instances data) {
    double[] vector = new double[data.numAttributes() - 1];
    int slot = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
            vector[slot++] = instance.value(j);
        }
    }
    return vector;
}
From source file:de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java
License:Apache License
@Override public void apply(Instances testdata, Instances traindata) { int indexOfConfidenceAttribute = -1; // Find index of the named confidence attribute to filter for for (int i = 0; i < traindata.numAttributes(); i++) { if (traindata.attribute(i).name().equals(nominalAttributeName)) { indexOfConfidenceAttribute = i; }//from ww w . j ava 2 s .c o m } // if it was not found return if (indexOfConfidenceAttribute == -1) { return; } // Find index of nominal values Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute); ArrayList<Object> nominalValuesOfConfidenceAttribute = Collections .list(confidenceAttribute.enumerateValues()); ArrayList<Double> indexOfnominalAttributeValues = new ArrayList<Double>(); for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) { for (String attributeValue : nominalAttributeValues) { if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) { indexOfnominalAttributeValues.add((double) k); } } } // Go through all instances and check if nominal attribute equals for (int j = traindata.numInstances() - 1; j >= 0; j--) { Instance wekaInstance = traindata.get(j); // delete all instances where nominal attribute has the value of one of the parameter if (indexOfnominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) { traindata.delete(j); } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/** * <p>/*from w ww . j av a2 s . c o m*/ * Min-Max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning * by Nam et al.). * </p> * * @param data * data that is normalized */ public static void minMax(Instances data) { for (int j = 0; j < data.numAttributes(); j++) { if (data.classIndex() != j) { double min = data.attributeStats(j).numericStats.min; double max = data.attributeStats(j).numericStats.max; for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); double newValue = (inst.value(j) - min) / (max - min); inst.setValue(j, newValue); } } } }
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/** * <p>/*www. jav a2 s.c o m*/ * Z-Score normalization (N2 in Transfer Defect Learning by Nam et al.). * </p> * * @param data * data that is normalized */ public static void zScore(Instances data) { final double[] mean = new double[data.numAttributes()]; final double[] std = new double[data.numAttributes()]; // get means and stddevs of data for (int j = 0; j < data.numAttributes(); j++) { if (data.classIndex() != j) { mean[j] = data.meanOrMode(j); std[j] = Math.sqrt(data.variance(j)); } } applyZScore(data, mean, std); }