List of usage examples for weka.core.Instances.add
@Override public boolean add(Instance instance)
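All of the examples below follow the same basic pattern: declare the attributes, create an empty Instances dataset with that header, and append rows via add(). A minimal, self-contained sketch (the dataset and attribute names are made up for illustration):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class InstancesAddExample {
    public static void main(String[] args) {
        // header: two numeric attributes
        ArrayList<Attribute> atts = new ArrayList<>();
        atts.add(new Attribute("width"));
        atts.add(new Attribute("height"));

        // empty dataset with initial capacity 0
        Instances data = new Instances("example", atts, 0);

        // add() appends a row with instance weight 1.0 and returns true
        data.add(new DenseInstance(1.0, new double[] { 1.5, 2.0 }));
        System.out.println(data.numInstances()); // 1
    }
}

Per the Weka javadoc, add() copies the instance before appending it, so later changes to the original Instance object do not affect the dataset.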
From source file:de.ugoe.cs.cpdp.dataselection.NeighborhoodFilter.java
License:Apache License
/**
 * <p>
 * Applies the relevancy filter after Ryu et al.
 * </p>
 *
 * @param testdata
 *            test data
 * @param traindata
 *            training data
 * @return filtered training data
 */
private Instances applyNeighborhoodFilter(Instances testdata, Instances traindata) {
    TreeSet<Integer> selectedInstances = new TreeSet<>();
    for (int i = 0; i < testdata.size(); i++) {
        // first pass: find the minimal Hamming distance to any training instance
        double minHam = Double.MAX_VALUE;
        for (int j = 0; j < traindata.size(); j++) {
            double distance = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distance < minHam) {
                minHam = distance;
            }
        }
        // second pass: select all training instances at that minimal distance
        for (int j = 0; j < traindata.size(); j++) {
            double distance = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distance <= minHam) {
                selectedInstances.add(j);
            }
        }
    }
    // copy the header of the test data, then fill it with the selected training instances
    Instances selectedTraindata = new Instances(testdata);
    selectedTraindata.clear();
    for (Integer index : selectedInstances) {
        selectedTraindata.add(traindata.instance(index));
    }
    return selectedTraindata;
}
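WekaUtils.hammingDistance is a project-internal helper that is not part of this listing. A plausible sketch, assuming the distance simply counts the attributes on which two instances differ (the real implementation may, for example, skip the class attribute):

public static double hammingDistance(weka.core.Instance a, weka.core.Instance b) {
    // count the attributes with differing values
    double distance = 0.0;
    for (int i = 0; i < a.numAttributes(); i++) {
        if (a.value(i) != b.value(i)) {
            distance += 1.0;
        }
    }
    return distance;
}

Note that the second inner loop above selects every training instance whose distance equals the minimum, because distance <= minHam holds exactly for the minimal distance found in the first pass.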
From source file:de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License:Apache License
/**
 * Returns test and training data with only the project context factors that were chosen in
 * the configuration. This is later used for clustering.
 *
 * @param testdata
 *            test data
 * @param traindataSet
 *            set of training data
 * @return data containing only the project context factors
 */
protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // set up weka Instances for clustering
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();

    // we only want the project context factors
    for (String pcf : this.project_context_factors) {
        atts.add(new Attribute(pcf));
    }

    // set up the data
    final Instances data = new Instances("project_context_factors", atts, 0);
    double[] instanceValues = new double[atts.size()];

    // only project context factors + only one instance per project needed
    int i = 0;
    for (String pcf : this.project_context_factors) {
        instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
        // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
        i++;
    }
    data.add(new DenseInstance(1.0, instanceValues));

    // now for the projects of the training set
    for (Instances traindata : traindataSet) {
        instanceValues = new double[atts.size()]; // without this, the same values end up here every time?!
        i = 0;
        for (String pcf : this.project_context_factors) {
            instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
            // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
            i++;
        }
        data.add(new DenseInstance(1.0, instanceValues));
    }
    return data;
}
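Given the class name, the returned dataset is presumably fed to an EM clusterer, with instance 0 representing the test project. A minimal usage sketch (the method and variable names around the call are assumptions; only getContextFactors comes from the source, and the SetUniqueList import assumes the commons-collections4 variant):

import org.apache.commons.collections4.list.SetUniqueList;
import weka.clusterers.EM;
import weka.core.Instances;

void clusterContexts(Instances testdata, SetUniqueList<Instances> traindataSet) throws Exception {
    Instances contextData = getContextFactors(testdata, traindataSet);
    EM em = new EM();
    em.buildClusterer(contextData);
    // instance 0 is the test project; training projects in the same
    // cluster would be candidates for the training set
    int testCluster = em.clusterInstance(contextData.instance(0));
    System.out.println("test project is in cluster " + testCluster);
}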
From source file:de.ugoe.cs.cpdp.execution.AbstractCrossProjectExperiment.java
License:Apache License
/**
 * Helper method that combines a set of Weka {@link Instances} sets into a single
 * {@link Instances} set.
 *
 * @param traindataSet
 *            set of {@link Instances} to be combined
 * @return single {@link Instances} set
 */
public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) {
    Instances traindataFull = null;
    for (Instances traindata : traindataSet) {
        if (traindataFull == null) {
            // copy header and contents of the first data set
            traindataFull = new Instances(traindata);
        } else {
            // append the instances of all further data sets
            for (int i = 0; i < traindata.numInstances(); i++) {
                traindataFull.add(traindata.instance(i));
            }
        }
    }
    return traindataFull;
}
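SetUniqueList comes from Apache Commons Collections. A minimal call sketch (assuming the commons-collections4 variant; projectA and projectB stand for Instances loaded elsewhere, and the method implicitly assumes all data sets share the same attribute structure):

import java.util.ArrayList;
import org.apache.commons.collections4.list.SetUniqueList;
import weka.core.Instances;

SetUniqueList<Instances> traindataSet = SetUniqueList.setUniqueList(new ArrayList<Instances>());
traindataSet.add(projectA);
traindataSet.add(projectB);
Instances combined = AbstractCrossProjectExperiment.makeSingleTrainingSet(traindataSet);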
From source file:de.ugoe.cs.cpdp.loader.AUDIChangeLoader.java
License:Apache License
@Override
public Instances load(File file) {
    final String[] lines;
    String[] lineSplit;
    String[] lineSplitBug;

    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // information about bugs is in another file
    String path = file.getAbsolutePath();
    path = path.substring(0, path.length() - 14) + "repro.csv";
    final String[] linesBug;
    try {
        linesBug = FileTools.getLinesFromFile(path);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // find the columns with the SVN revision and the bug count
    int revisionIndex = -1;
    int bugIndex = -1;
    lineSplitBug = linesBug[0].split(";");
    for (int j = 0; j < lineSplitBug.length; j++) {
        if (lineSplitBug[j].equals("svnrev")) {
            revisionIndex = j;
        }
        if (lineSplitBug[j].equals("num_bugs_trace")) {
            bugIndex = j;
        }
    }
    if (revisionIndex < 0) {
        throw new RuntimeException("could not find SVN revisions");
    }
    if (bugIndex < 0) {
        throw new RuntimeException("could not find bug information");
    }

    // find the column range that contains the metrics
    int metricsStartIndex = -1;
    int metricsEndIndex = -1;
    lineSplit = lines[0].split(";");
    for (int j = 0; j < lineSplit.length; j++) {
        if (lineSplit[j].equals("lm_LOC")) {
            metricsStartIndex = j;
        }
        if (lineSplit[j].equals("h_E")) {
            metricsEndIndex = j;
        }
    }
    if (metricsStartIndex < 0) {
        throw new RuntimeException("could not find first metric, i.e., lm_LOC");
    }
    if (metricsEndIndex < 0) {
        throw new RuntimeException("could not find last metric, i.e., h_E");
    }
    int numMetrics = metricsEndIndex - metricsStartIndex + 1;

    // create sets of all filenames and revisions
    SortedMap<EntityRevisionPair, Integer> entityRevisionPairs = new TreeMap<>();
    for (int i = 1; i < linesBug.length; i++) {
        lineSplitBug = linesBug[i].split(";");
        entityRevisionPairs.put(
                new EntityRevisionPair(lineSplitBug[0], Integer.parseInt(lineSplitBug[revisionIndex])), i);
    }

    // prepare weka instances: one delta and one absolute attribute per metric
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    lineSplit = lines[0].split(";");
    for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
        atts.add(new Attribute(lineSplit[j] + "_delta"));
    }
    for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
        atts.add(new Attribute(lineSplit[j] + "_abs"));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);
    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // create data
    String lastFile = null;
    double[] lastValues = null;
    int lastNumBugs = 0;
    for (Entry<EntityRevisionPair, Integer> entry : entityRevisionPairs.entrySet()) {
        try {
            // first get the metric values and the bug count
            lineSplit = lines[entry.getValue()].split(";");
            lineSplitBug = linesBug[entry.getValue()].split(";");
            int i = 0;
            double[] values = new double[numMetrics];
            for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
                values[i] = Double.parseDouble(lineSplit[j]);
                i++;
            }
            int numBugs = Integer.parseInt(lineSplitBug[bugIndex]);

            // then check whether an instance must be created, i.e., whether
            // this is a further revision of the previous file
            if (entry.getKey().entity.equals(lastFile)) {
                // create new instance
                double[] instanceValues = new double[2 * numMetrics + 1];
                for (int j = 0; j < numMetrics; j++) {
                    instanceValues[j] = values[j] - lastValues[j];
                    instanceValues[j + numMetrics] = values[j];
                }
                // check if any delta is > 0, i.e., whether a change occurred
                boolean changeOccured = false;
                for (int j = 0; j < numMetrics; j++) {
                    if (instanceValues[j] > 0) {
                        changeOccured = true;
                    }
                }
                if (changeOccured) {
                    // label is 1 if the number of bugs increased, 0 otherwise
                    instanceValues[instanceValues.length - 1] = numBugs <= lastNumBugs ? 0 : 1;
                    data.add(new DenseInstance(1.0, instanceValues));
                }
            }
            lastFile = entry.getKey().entity;
            lastValues = values;
            lastNumBugs = numBugs;
        } catch (IllegalArgumentException e) {
            System.err.println("error in line " + entry.getValue() + ": " + e.getMessage());
            System.err.println("metrics line: " + lines[entry.getValue()]);
            System.err.println("bugs line: " + linesBug[entry.getValue()]);
            System.err.println("line is ignored");
        }
    }
    return data;
}
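EntityRevisionPair is a project-internal key type that is not shown in this listing. Since it is used as the key of a TreeMap, it must be Comparable; a plausible sketch (an assumption, not the project's actual class) orders by entity name first and revision second:

class EntityRevisionPair implements Comparable<EntityRevisionPair> {
    final String entity;
    final int revision;

    EntityRevisionPair(String entity, int revision) {
        this.entity = entity;
        this.revision = revision;
    }

    @Override
    public int compareTo(EntityRevisionPair other) {
        // group revisions of the same file together, ordered by revision
        int cmp = entity.compareTo(other.entity);
        return cmp != 0 ? cmp : Integer.compare(revision, other.revision);
    }
}

This ordering matters: the loop above computes deltas between consecutive revisions of the same file via lastFile/lastValues, which only works if iteration is grouped by entity and sorted by revision.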
From source file:de.ugoe.cs.cpdp.loader.AUDIChangeLoader.java
License:Apache License
public Instances load(File file, String dummy) {
    final String[] lines;
    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // information about bugs is in another file
    String path = file.getAbsolutePath();
    path = path.substring(0, path.length() - 14) + "repro.csv";
    final String[] linesBug;
    try {
        linesBug = FileTools.getLinesFromFile(path);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // configure Instances
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    String[] lineSplit = lines[0].split(";");
    // ignore first three/four and last two columns
    int offset;
    if (lineSplit[3].equals("project_rev")) {
        offset = 4;
    } else {
        offset = 3;
    }
    for (int j = 0; j < lineSplit.length - (offset + 2); j++) {
        atts.add(new Attribute(lineSplit[j + offset]));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);
    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // fetch data
    for (int i = 1; i < lines.length; i++) {
        boolean validInstance = true;
        lineSplit = lines[i].split(";");
        String[] lineSplitBug = linesBug[i].split(";");
        double[] values = new double[data.numAttributes()];
        for (int j = 0; validInstance && j < values.length - 1; j++) {
            if (lineSplit[j + offset].trim().isEmpty()) {
                validInstance = false;
            } else {
                values[j] = Double.parseDouble(lineSplit[j + offset].trim());
            }
        }
        // the bug label is in a different column depending on the file layout
        if (offset == 3) {
            values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1;
        } else {
            values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1;
        }
        if (validInstance) {
            data.add(new DenseInstance(1.0, values));
        } else {
            System.out.println("instance " + i + " is invalid");
        }
    }
    return data;
}
From source file:de.ugoe.cs.cpdp.loader.AUDIDataLoader.java
License:Apache License
@Override
public Instances load(File file) {
    final String[] lines;
    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // information about bugs is in another file
    String path = file.getAbsolutePath();
    path = path.substring(0, path.length() - 14) + "repro.csv";
    final String[] linesBug;
    try {
        linesBug = FileTools.getLinesFromFile(path);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // configure Instances
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    String[] lineSplit = lines[0].split(";");
    // ignore first three/four and last two columns
    int offset;
    if (lineSplit[3].equals("project_rev")) {
        offset = 4;
    } else {
        offset = 3;
    }
    for (int j = 0; j < lineSplit.length - (offset + 2); j++) {
        atts.add(new Attribute(lineSplit[j + offset]));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);
    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // fetch data
    for (int i = 1; i < lines.length; i++) {
        boolean validInstance = true;
        lineSplit = lines[i].split(";");
        String[] lineSplitBug = linesBug[i].split(";");
        double[] values = new double[data.numAttributes()];
        for (int j = 0; validInstance && j < values.length - 1; j++) {
            if (lineSplit[j + offset].trim().isEmpty()) {
                validInstance = false;
            } else {
                values[j] = Double.parseDouble(lineSplit[j + offset].trim());
            }
        }
        // the bug label is in a different column depending on the file layout
        if (offset == 3) {
            values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1;
        } else {
            values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1;
        }
        if (validInstance) {
            data.add(new DenseInstance(1.0, values));
        } else {
            System.out.println("instance " + i + " is invalid");
        }
    }
    return data;
}
From source file:de.ugoe.cs.cpdp.loader.CSVMockusDataLoader.java
License:Apache License
@Override
public Instances load(File file) {
    final String[] lines;
    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // configure Instances
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    String[] lineSplit = lines[0].split(",");
    for (int j = 0; j < lineSplit.length - 3; j++) {
        atts.add(new Attribute(lineSplit[j + 2]));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);
    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // fetch data
    for (int i = 1; i < lines.length; i++) {
        lineSplit = lines[i].split(",");
        double[] values = new double[lineSplit.length - 2];
        for (int j = 0; j < values.length - 1; j++) {
            values[j] = Double.parseDouble(lineSplit[j + 2].trim());
        }
        values[values.length - 1] = lineSplit[lineSplit.length - 1].trim().equals("0") ? 0 : 1;
        data.add(new DenseInstance(1.0, values));
    }
    return data;
}
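For plain comma-separated files like this one, Weka also ships its own converter. A minimal sketch using weka.core.converters.CSVLoader (the file name is hypothetical, and the binarization of the bug column done above would still have to be applied afterwards):

import java.io.File;
import java.io.IOException;
import weka.core.Instances;
import weka.core.converters.CSVLoader;

Instances loadCsv(File f) throws IOException {
    CSVLoader loader = new CSVLoader();
    loader.setSource(f);
    Instances raw = loader.getDataSet();
    // use the last column as the class attribute
    raw.setClassIndex(raw.numAttributes() - 1);
    return raw;
}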
From source file:de.ugoe.cs.cpdp.loader.DecentDataLoader.java
License:Apache License
/**
 * Creates a Weka instance from an ARFFX model instance.
 *
 * @param dataSet
 *            Weka dataset to which the ARFFX model instances are added
 * @param i
 *            ARFFX model instance
 */
private void createWekaInstance(Instances dataSet, Instance i) {
    double[] values = new double[dataSet.numAttributes()];
    int j = 0;
    for (Value value : i.getValues()) {
        String dataValue = value.getContent();
        String attributeName = value.getOfAttribute().getName();
        if (attributeFilter.contains(attributeName)) {
            continue;
        }
        if (isLabel(attributeName)) {
            // value is a LABEL.* attribute
            values[j] = dataSet.attribute(j).indexOfValue(dataValue);
        } else if (isConfidenceLabel(attributeName)) {
            // value is a CONFIDENCE.* attribute
            values[j] = dataSet.attribute(j).indexOfValue(dataValue);
        } else if (attributeName.equals("Artifact.Name")) {
            // value is the name of the artifact
            artifactNames.add(dataValue);
            values[j] = getIndexOfArtifactName(dataValue);
        } else {
            // otherwise the value is numeric
            values[j] = Double.parseDouble(dataValue);
        }
        j++;
    }
    DenseInstance inst = new DenseInstance(1.0, values);
    dataSet.add(inst);
}
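Attribute.indexOfValue returns -1 when a nominal value is unknown, which would silently produce an invalid attribute value here. A defensive variant of the lookup (a sketch, not part of the original loader) could guard against that:

// hypothetical guard around the nominal lookup used above
int idx = dataSet.attribute(j).indexOfValue(dataValue);
if (idx < 0) {
    throw new IllegalArgumentException(
            "unknown nominal value '" + dataValue + "' for attribute " + dataSet.attribute(j).name());
}
values[j] = idx;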
From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java
License:Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but we need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    Instances copyTrainData = new Instances(trainData);

    // generate data for the clusterer (without the class attribute)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);
    Instances centroids = clusterer.getClusterCentroids();

    // Add addFilter = new Add();
    // addFilter.setAttributeIndex(new Integer(numTestLabels + i + 1).toString());
    // addFilter.setNominalLabels("0,1");
    // addFilter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
    // addFilter.setInputFormat(testData);

    trainData.clear();
    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();
        // centroidInstance is usually not a real instance, but a virtual centroid;
        // we need to find the closest point in the training data
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();
            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }
        // add the selected exemplar (with its class value) to the training data
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}
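The distance helper used above is not part of this listing. A plausible sketch (an assumption; the original may differ, e.g. by normalizing attribute ranges) is a plain Euclidean distance over the numeric attributes:

private static double distance(Instance a, Instance b) {
    // Euclidean distance over all attributes (both instances are class-free here)
    double sum = 0.0;
    for (int i = 0; i < a.numAttributes(); i++) {
        double diff = a.value(i) - b.value(i);
        sum += diff * diff;
    }
    return Math.sqrt(sum);
}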
From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java
License:Apache License
/**
 * Copies the Instances from the source Instances object to a new one, which only contains the
 * currently tested features.
 *
 * @param source The Instances object containing all the Instance objects from the source file.
 * @param targetStructure The list of {@link AbstractFeatureExtractor}s which is currently
 *        being tested.
 * @return An Instances object consisting of all Instance objects from the source file.
 */
private Instances copyInstances(Instances source, ArrayList<Attribute> targetStructure) {
    Instances target = new Instances("ACResolution", targetStructure, 0);

    for (int i = 0; i < source.numInstances(); i++) {
        double[] vals = new double[targetStructure.size()];
        for (int z = 0; z < targetStructure.size(); z++) {
            vals[z] = getAttributeValue(source.instance(i), targetStructure.get(z).name());
        }
        Instance in = new DenseInstance(1.0, vals);
        target.add(in);
    }
    return target;
}
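getAttributeValue is a private helper that is not shown in this listing. Looking the attribute up by name on the instance's own dataset would be one way to implement it (a sketch under that assumption, not the project's actual code):

private double getAttributeValue(Instance instance, String attributeName) {
    // resolve the attribute by name via the dataset the instance belongs to
    Attribute att = instance.dataset().attribute(attributeName);
    return instance.value(att);
}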