List of usage examples for weka.core Instances numAttributes
public int numAttributes()
From source file:de.ugoe.cs.cpdp.dataselection.MahalanobisOutlierRemoval.java
License:Apache License
/** * <p>/*from ww w .j av a 2 s .c o m*/ * removes all instances, whose Mahalanobi distance to the mean of the data is greater than * epsilon. * </p> * * @param data * data where the outliers are removed */ private void applyMahalanobisDistancesRemoval(Instances data) { RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1); for (int i = 0; i < data.size(); i++) { values.setRow(i, WekaUtils.instanceValues(data.get(i))); } RealMatrix inverseCovariance; try { inverseCovariance = new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver() .getInverse(); } catch (SingularMatrixException e) { Console.traceln(Level.WARNING, "could not perform Mahalanobis outlier removal due to singular covariance matrix"); return; } // create mean vector double[] meanValues = new double[data.numAttributes() - 1]; int k = 0; for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { meanValues[k] = data.attributeStats(j).numericStats.mean; k++; } } for (int i = data.size() - 1; i >= 0; i--) { double distance = mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)), meanValues); if (distance > epsilon) { data.remove(i); } } }
From source file:de.ugoe.cs.cpdp.dataselection.SynonymOutlierRemoval.java
License:Apache License
/**
 * <p>
 * Applies the synonym outlier removal.
 * </p>
 * <p>
 * For every instance and every non-class attribute the smallest absolute difference to any
 * other instance is determined, together with the smallest such difference over all instances
 * (per attribute). An instance is kept if, for at least one attribute, its own smallest
 * difference equals that overall minimum; all other instances are deleted.
 * </p>
 *
 * @param traindata
 *            data from which the outliers are removed (modified in place)
 */
public void applySynonymRemoval(Instances traindata) {
    final int numFeatures = traindata.numAttributes() - 1;
    double[][] minDistance = new double[traindata.size()][numFeatures];
    double[] minDistanceAttribute = new double[numFeatures];
    double distance;

    for (int j = 0; j < minDistanceAttribute.length; j++) {
        minDistanceAttribute[j] = Double.MAX_VALUE;
    }

    // Every instance has to be compared with every other one. The loop previously started at
    // the last index, which left all other rows of minDistance at their default of 0.0 and
    // therefore never removed anything.
    for (int i1 = 0; i1 < traindata.size(); i1++) {
        int k = 0; // column in the distance arrays; skips the class attribute
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                minDistance[i1][k] = Double.MAX_VALUE;
                for (int i2 = 0; i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j));
                        if (distance < minDistance[i1][k]) {
                            minDistance[i1][k] = distance;
                        }
                        if (distance < minDistanceAttribute[k]) {
                            minDistanceAttribute[k] = distance;
                        }
                    }
                }
                k++;
            }
        }
    }

    // iterate backwards so deletions do not shift the indices that are still to be checked
    for (int i = traindata.size() - 1; i >= 0; i--) {
        boolean hasClosest = false;
        // the distance arrays have one column less than the data has attributes
        for (int j = 0; !hasClosest && j < minDistanceAttribute.length; j++) {
            hasClosest = minDistance[i][j] <= minDistanceAttribute[j];
        }
        if (!hasClosest) {
            traindata.delete(i);
        }
    }
}
From source file:de.ugoe.cs.cpdp.loader.AUDIChangeLoader.java
License:Apache License
public Instances load(File file, String dummy) { final String[] lines; try {// www .j a v a 2s . co m lines = FileTools.getLinesFromFile(file.getAbsolutePath()); } catch (IOException e) { throw new RuntimeException(e); } // information about bugs are in another file String path = file.getAbsolutePath(); path = path.substring(0, path.length() - 14) + "repro.csv"; final String[] linesBug; try { linesBug = FileTools.getLinesFromFile(path); } catch (IOException e) { throw new RuntimeException(e); } // configure Instances final ArrayList<Attribute> atts = new ArrayList<Attribute>(); String[] lineSplit = lines[0].split(";"); // ignore first three/four and last two columns int offset; if (lineSplit[3].equals("project_rev")) { offset = 4; } else { offset = 3; } for (int j = 0; j < lineSplit.length - (offset + 2); j++) { atts.add(new Attribute(lineSplit[j + offset])); } final ArrayList<String> classAttVals = new ArrayList<String>(); classAttVals.add("0"); classAttVals.add("1"); final Attribute classAtt = new Attribute("bug", classAttVals); atts.add(classAtt); final Instances data = new Instances(file.getName(), atts, 0); data.setClass(classAtt); // fetch data for (int i = 1; i < lines.length; i++) { boolean validInstance = true; lineSplit = lines[i].split(";"); String[] lineSplitBug = linesBug[i].split(";"); double[] values = new double[data.numAttributes()]; for (int j = 0; validInstance && j < values.length - 1; j++) { if (lineSplit[j + offset].trim().isEmpty()) { validInstance = false; } else { values[j] = Double.parseDouble(lineSplit[j + offset].trim()); } } if (offset == 3) { values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1; } else { values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1; } if (validInstance) { data.add(new DenseInstance(1.0, values)); } else { System.out.println("instance " + i + " is invalid"); } } return data; }
From source file:de.ugoe.cs.cpdp.loader.AUDIDataLoader.java
License:Apache License
@Override public Instances load(File file) { final String[] lines; try {/* w w w. jav a 2 s. c o m*/ lines = FileTools.getLinesFromFile(file.getAbsolutePath()); } catch (IOException e) { throw new RuntimeException(e); } // information about bugs are in another file String path = file.getAbsolutePath(); path = path.substring(0, path.length() - 14) + "repro.csv"; final String[] linesBug; try { linesBug = FileTools.getLinesFromFile(path); } catch (IOException e) { throw new RuntimeException(e); } // configure Instances final ArrayList<Attribute> atts = new ArrayList<Attribute>(); String[] lineSplit = lines[0].split(";"); // ignore first three/four and last two columns int offset; if (lineSplit[3].equals("project_rev")) { offset = 4; } else { offset = 3; } for (int j = 0; j < lineSplit.length - (offset + 2); j++) { atts.add(new Attribute(lineSplit[j + offset])); } final ArrayList<String> classAttVals = new ArrayList<String>(); classAttVals.add("0"); classAttVals.add("1"); final Attribute classAtt = new Attribute("bug", classAttVals); atts.add(classAtt); final Instances data = new Instances(file.getName(), atts, 0); data.setClass(classAtt); // fetch data for (int i = 1; i < lines.length; i++) { boolean validInstance = true; lineSplit = lines[i].split(";"); String[] lineSplitBug = linesBug[i].split(";"); double[] values = new double[data.numAttributes()]; for (int j = 0; validInstance && j < values.length - 1; j++) { if (lineSplit[j + offset].trim().isEmpty()) { validInstance = false; } else { values[j] = Double.parseDouble(lineSplit[j + offset].trim()); } } if (offset == 3) { values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1; } else { values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1; } if (validInstance) { data.add(new DenseInstance(1.0, values)); } else { System.out.println("instance " + i + " is invalid"); } } return data; }
From source file:de.ugoe.cs.cpdp.loader.DecentDataLoader.java
License:Apache License
/** * Creates a WekaInstance from an ARFFX Model Instance * /* w w w. j av a 2 s. c o m*/ * @param dataSet * WekaInstance dataset, where the arffx model instances should be added to * @param i * arffx model instance */ private void createWekaInstance(Instances dataSet, Instance i) { double[] values = new double[dataSet.numAttributes()]; int j = 0; for (Value value : i.getValues()) { String dataValue = value.getContent(); String attributeName = value.getOfAttribute().getName(); if (attributeFilter.contains(attributeName)) { continue; } // Is value a LABEL.* attribute? if (isLabel(attributeName)) { values[j] = dataSet.attribute(j).indexOfValue(dataValue); } else if (isConfidenceLabel(attributeName)) { // Is value a CONFIDENCE.* attribute? values[j] = dataSet.attribute(j).indexOfValue(dataValue); } else if (attributeName.equals("Artifact.Name")) { // Is it the name of the artifact? artifactNames.add(dataValue); values[j] = getIndexOfArtifactName(dataValue); } else { // Is it a numeric value? values[j] = Double.parseDouble(dataValue); } j++; } DenseInstance inst = new DenseInstance(1.0, values); dataSet.add(inst); }
From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java
License:Apache License
/**
 * Loads the file metrics of a project and merges them with the network metrics and bug
 * information found in the same directory ("&lt;project&gt;_network_metrics.csv" and
 * "&lt;project&gt;_bugs_per_file.csv", where the project name is the part of the metrics file
 * name before the first underscore).
 *
 * @param fileMetricsFile
 *            CSV file with the file-level metrics
 * @return the merged data with the nominal class attribute "bug" as last attribute, or
 *         {@code null} if one of the files could not be read
 */
@Override
public Instances load(File fileMetricsFile) {
    // first determine all files
    String path = fileMetricsFile.getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");

    Instances metricsData = null;
    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values)
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isNominal()) {
                replaceWithNumericAttribute(networkData, j);
            }
        }
        // fix string attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isString()) {
                replaceWithNumericAttribute(networkData, j);
            }
        }

        // map the file name (first attribute of the metrics data) to its instance index
        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }

        // merge with network data; network rows are matched via their second attribute
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex)
                        .setValue(attributeIndex, networkData.instance(i).value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);

        // remove the "eigenvector" attribute if present
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                    // stop here: the attribute list has just been modified and the
                    // looked-up attribute cannot occur a second time
                    break;
                }
            }
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    }
    catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}

/**
 * Replaces the attribute at the given position with a numeric attribute of the same name.
 * Each value is parsed from its string representation; missing values and values that are
 * not numbers become 0.0.
 *
 * @param data
 *            data set whose attribute is replaced (modified in place)
 * @param attributeIndex
 *            position of the attribute to replace
 */
private void replaceWithNumericAttribute(Instances data, int attributeIndex) {
    String attributeName = data.attribute(attributeIndex).name();

    // get temporary values; entries not assigned below keep the default of 0.0
    double[] tmpVals = new double[data.size()];
    for (int i = 0; i < data.size(); i++) {
        Instance inst = data.instance(i);
        if (!inst.isMissing(attributeIndex)) {
            try {
                tmpVals[i] = Double.parseDouble(inst.stringValue(attributeIndex));
            }
            catch (NumberFormatException e) {
                // not a number, using 0.0
                tmpVals[i] = 0.0;
            }
        }
    }

    // replace attribute
    data.deleteAttributeAt(attributeIndex);
    data.insertAttributeAt(new Attribute(attributeName), attributeIndex);
    for (int i = 0; i < data.size(); i++) {
        data.instance(i).setValue(attributeIndex, tmpVals[i]);
    }
}
From source file:de.ugoe.cs.cpdp.training.MetricMatchingTraining.java
License:Apache License
/** * We need the test data instances to do a metric matching, so in this special case we get this * data before evaluation./*from w w w . jav a 2 s.c o m*/ */ @Override public void apply(SetUniqueList<Instances> traindataSet, Instances testdata) { // reset these for each run this.mm = null; this.classifier = null; double score = 0; // matching score to select the best matching training data from the set int num = 0; int biggest_num = 0; MetricMatch tmp; for (Instances traindata : traindataSet) { num++; tmp = new MetricMatch(traindata, testdata); // metric selection may create error, continue to next training set try { tmp.attributeSelection(); tmp.matchAttributes(this.method, this.threshold); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } // we only select the training data from our set with the most matching attributes if (tmp.getScore() > score && tmp.attributes.size() > 0) { score = tmp.getScore(); this.mm = tmp; biggest_num = num; } } // if we have found a matching instance we use it, log information about the match for // additional eval later Instances ilist = null; if (this.mm != null) { ilist = this.mm.getMatchedTrain(); Console.traceln(Level.INFO, "[MATCH FOUND] match: [" + biggest_num + "], score: [" + score + "], instances: [" + ilist.size() + "], attributes: [" + this.mm.attributes.size() + "], ilist attrs: [" + ilist.numAttributes() + "]"); for (Map.Entry<Integer, Integer> attmatch : this.mm.attributes.entrySet()) { Console.traceln(Level.INFO, "[MATCHED ATTRIBUTE] source attribute: [" + this.mm.train.attribute(attmatch.getKey()).name() + "], target attribute: [" + this.mm.test.attribute(attmatch.getValue()).name() + "]"); } } else { Console.traceln(Level.INFO, "[NO MATCH FOUND]"); } // if we have a match we build the MetricMatchingClassifier, if not we fall back to FixClass // Classifier try { if (this.mm != null) { this.classifier = new MetricMatchingClassifier(); this.classifier.buildClassifier(ilist); 
((MetricMatchingClassifier) this.classifier).setMetricMatching(this.mm); } else { this.classifier = new FixClass(); this.classifier.buildClassifier(ilist); // this is null, but the FixClass Classifier // does not use it anyway } } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e); } }
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/** * <p>/*from w w w . j a va 2 s.c om*/ * Calculates the distributional characteristics of the distances the instances within a data * set have to each other. * </p> * * @param data * data for which the instances are characterized * @return characteristics */ public static DistChar datasetDistance(Instances data) { double distance; double sumAll = 0.0; double sumAllQ = 0.0; double min = Double.MAX_VALUE; double max = Double.MIN_VALUE; int numCmp = 0; int l = 0; double[] inst1 = new double[data.numAttributes() - 1]; double[] inst2 = new double[data.numAttributes() - 1]; EuclideanDistance euclideanDistance = new EuclideanDistance(); for (int i = 0; i < data.numInstances(); i++) { l = 0; for (int k = 0; k < data.numAttributes(); k++) { if (k != data.classIndex()) { inst1[l] = data.instance(i).value(k); } } for (int j = 0; j < data.numInstances(); j++) { if (j != i) { l = 0; for (int k = 0; k < data.numAttributes(); k++) { if (k != data.classIndex()) { inst2[l] = data.instance(j).value(k); } } distance = euclideanDistance.compute(inst1, inst2); sumAll += distance; sumAllQ += distance * distance; numCmp++; if (distance < min) { min = distance; } if (distance > max) { max = distance; } } } } double mean = sumAll / numCmp; double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1))); return new DistChar(mean, std, min, max, data.numInstances()); }
From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java
License:Apache License
/**
 * Creates an Instances object for the specified List of Features.
 * <br>
 * Extracts the Instance objects from a source file and suppresses all features but the ones
 * specified. The last two attributes of the file (ACR-System + correct/false predictions) are
 * always kept, and the last attribute of the result is set as class attribute.
 *
 * @param fileName File to the training results in ARFF format.
 * @param features List of {@link AbstractFeatureExtractor}s which are currently being tested.
 * @return Instances object consisting of the desired attribute structure.
 * @throws Exception If the ARFF file couldn't be read, an exception is thrown.
 */
public Instances createInstances(String fileName, List<AbstractFeatureExtractor> features)
    throws Exception {
    final Instances train;
    BufferedReader reader = new BufferedReader(new FileReader(fileName));
    try {
        train = new Instances(reader);
    }
    finally {
        // release the file handle even if parsing fails
        reader.close();
    }

    ArrayList<Attribute> newAttributes = new ArrayList<Attribute>();
    for (int i = 0; i < train.numAttributes(); i++) {
        for (AbstractFeatureExtractor feature : features) {
            if (train.attribute(i).name().equals(feature.getName())) {
                newAttributes.add(train.attribute(i));
                // one match is enough; without leaving the loop an attribute would be
                // added once per extractor carrying the same name
                break;
            }
        }
    }

    /*
     * add the last two features (ACR-System + correct/false predictions) as those
     * are no features gathered by a FeatureExtractor.
     */
    newAttributes.add(train.attribute(train.numAttributes() - 2));
    newAttributes.add(train.attribute(train.numAttributes() - 1));

    Instances trainCopy = copyInstances(train, newAttributes);
    trainCopy.setClassIndex(trainCopy.numAttributes() - 1);
    return trainCopy;
}
From source file:de.uniheidelberg.cl.swp.mlprocess.WEKARunner.java
License:Apache License
/**
 * Evaluates our classifier with a test set.
 * <br>
 * Not used yet: the test set is read and its last attribute is set as class attribute, but
 * the evaluation itself is currently initialized from the training data only.
 *
 * @param testArff ARFF file to evaluate against.
 * @throws Exception If the ARFF file couldn't be read or the evaluation couldn't be
 *         initialized.
 */
public void buildEvaluation(String testArff) throws Exception {
    BufferedReader reader = new BufferedReader(new FileReader(testArff));
    try {
        Instances evalIns = new Instances(reader);
        evalIns.setClassIndex(evalIns.numAttributes() - 1);
    }
    finally {
        // release the file handle even if parsing fails
        reader.close();
    }

    evaluation = new Evaluation(train);
}