List of usage examples for weka.core.Instances.add(Instance)
@Override public boolean add(Instance instance)
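Before the per-project examples below, here is a minimal, self-contained sketch of the call itself. It assumes Weka 3.7+ (where Instances implements java.util.List and add(Instance) returns a boolean, matching the signature above); the relation, attribute, and value names are illustrative.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class InstancesAddExample {
    public static void main(String[] args) {
        // two numeric attributes plus a nominal class attribute
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("no");
        classValues.add("yes");
        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("width"));
        attributes.add(new Attribute("height"));
        attributes.add(new Attribute("class", classValues));

        // empty dataset with initial capacity 0; the class is the last attribute
        Instances dataset = new Instances("shapes", attributes, 0);
        dataset.setClassIndex(dataset.numAttributes() - 1);

        // add(Instance) stores a copy of the instance and associates it with the dataset
        Instance row = new DenseInstance(1.0, new double[] { 2.0, 3.0, 1.0 });
        boolean added = dataset.add(row);
        System.out.println(added + ", numInstances = " + dataset.numInstances());
    }
}

Because add stores a copy, the examples below can freely add instances from one Instances object to another without the two datasets sharing state.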
From source file:de.fub.maps.project.detector.model.inference.processhandler.SpecialInferenceDataProcessHandler.java
License: Open Source License

@Override
protected void handle() {
    clearResults();
    Classifier classifier = getInferenceModel().getClassifier();
    Collection<Attribute> attributeList = getInferenceModel().getAttributes();
    if (!attributeList.isEmpty()) {
        Set<String> keySet = getInferenceModel().getInput().getTrainingsSet().keySet();
        setClassesToView(keySet);

        Instances unlabeledInstances =
                new Instances("Unlabeled Tracks", new ArrayList<Attribute>(attributeList), 0); // NOI18N
        unlabeledInstances.setClassIndex(0);

        ArrayList<TrackSegment> segmentList = new ArrayList<TrackSegment>();
        for (Entry<String, HashSet<TrackSegment>> entry : getInferenceModel().getInput().getTrainingsSet()
                .entrySet()) {
            for (TrackSegment segment : entry.getValue()) {
                segment.setLabel(entry.getKey());
                Instance instance = getInstance(segment);
                unlabeledInstances.add(instance);
                segmentList.add(segment);
            }
        }

        // create a copy
        Instances labeledInstances = new Instances(unlabeledInstances);

        for (int index = 0; index < labeledInstances.numInstances(); index++) {
            try {
                Instance instance = labeledInstances.instance(index);
                // classify the instance
                double classified = classifier.classifyInstance(instance);
                instance.setClassValue(classified);
                // get the class label
                String value = unlabeledInstances.classAttribute().value((int) classified);
                if (index < segmentList.size()) {
                    instanceToTrackSegmentMap.put(instance, segmentList.get(index));
                }
                // put the label and instance into the result map
                put(value, instance);
            } catch (Exception ex) {
                Exceptions.printStackTrace(ex);
            }
        }

        // update the visual representation
        updateVisualRepresentation();

        // update the result set of the inference model
        for (Map.Entry<String, List<Instance>> entry : resultMap.entrySet()) {
            HashSet<TrackSegment> trackSegmentList = new HashSet<TrackSegment>();
            for (Instance instance : entry.getValue()) {
                TrackSegment trackSegment = instanceToTrackSegmentMap.get(instance);
                if (trackSegment != null) {
                    trackSegmentList.add(trackSegment);
                }
            }

            // only non-empty classes are put into the result data set
            if (!trackSegmentList.isEmpty()) {
                getInferenceModel().getResult().put(entry.getKey(), trackSegmentList);
            }
        }
    } else {
        throw new InferenceModelClassifyException(MessageFormat
                .format("No attributes available. Attribute list length == {0}", attributeList.size()));
    }
    resultMap.clear();
    instanceToTrackSegmentMap.clear();
}
From source file:de.fub.maps.project.detector.model.inference.processhandler.TrainingsDataProcessHandler.java
License: Open Source License

@Override
protected void handle() {
    final ProgressHandle handle = ProgressHandleFactory.createHandle("Training");
    try {
        handle.start();
        Collection<Attribute> attributeCollection = getInferenceModel().getAttributes();
        ArrayList<Attribute> arrayList = new ArrayList<Attribute>(attributeCollection);

        Instances trainingSet = new Instances("Classes", arrayList, 0);
        trainingSet.setClassIndex(0);
        Instances testingSet = new Instances("Classes", arrayList, 0);
        testingSet.setClassIndex(0);

        HashMap<String, HashSet<TrackSegment>> dataset = getInferenceModel().getInput().getTrainingsSet();
        int datasetCount = 0;
        for (HashSet<TrackSegment> list : dataset.values()) {
            for (TrackSegment trackSegment : list) {
                datasetCount += trackSegment.getWayPointList().size();
            }
        }
        handle.switchToDeterminate(datasetCount);

        int trackCount = 0;
        for (Entry<String, HashSet<TrackSegment>> entry : dataset.entrySet()) {
            int trainingsSetSize = (int) Math.ceil(entry.getValue().size() * getTrainingsSetRatioParameter());
            int index = 0;
            for (TrackSegment trackSegment : entry.getValue()) {
                Instance instance = getInstance(entry.getKey(), trackSegment);
                if (index < trainingsSetSize) {
                    trainingSet.add(instance);
                } else {
                    testingSet.add(instance);
                }
                handle.progress(trackCount++);
                index++;
            }
        }

        assert trainingSet.numInstances() > 0 : "Training set is empty and has no instances"; // NOI18N
        assert testingSet.numInstances() > 0 : "Testing set is empty and has no instances"; // NOI18N

        handle.switchToIndeterminate();
        evaluate(trainingSet, testingSet);
    } finally {
        handle.finish();
    }
}
From source file:de.tudarmstadt.ukp.similarity.experiments.coling2012.util.Evaluator.java
License: Open Source License

public static void runClassifierCV(WekaClassifier wekaClassifier, Dataset dataset) throws Exception {
    // Set parameters
    int folds = 10;
    Classifier baseClassifier = getClassifier(wekaClassifier);

    // Set up the random number generator
    long seed = new Date().getTime();
    Random random = new Random(seed);

    // Add IDs to the instances
    AddID.main(new String[] { "-i", MODELS_DIR + "/" + dataset.toString() + ".arff", "-o",
            MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff" });
    Instances data = DataSource.read(MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // Instantiate the Remove filter
    Remove removeIDFilter = new Remove();
    removeIDFilter.setAttributeIndices("first");

    // Randomize the data
    data.randomize(random);

    // Perform cross-validation
    Instances predictedData = null;
    Evaluation eval = new Evaluation(data);

    for (int n = 0; n < folds; n++) {
        Instances train = data.trainCV(folds, n, random);
        Instances test = data.testCV(folds, n);

        // Apply log filter
        // Filter logFilter = new LogFilter();
        // logFilter.setInputFormat(train);
        // train = Filter.useFilter(train, logFilter);
        // logFilter.setInputFormat(test);
        // test = Filter.useFilter(test, logFilter);

        // Copy the classifier
        Classifier classifier = AbstractClassifier.makeCopy(baseClassifier);

        // Instantiate the FilteredClassifier
        FilteredClassifier filteredClassifier = new FilteredClassifier();
        filteredClassifier.setFilter(removeIDFilter);
        filteredClassifier.setClassifier(classifier);

        // Build the classifier
        filteredClassifier.buildClassifier(train);

        // Evaluate
        eval.evaluateModel(filteredClassifier, test);

        // Add predictions
        AddClassification filter = new AddClassification();
        filter.setClassifier(filteredClassifier);
        filter.setOutputClassification(true);
        filter.setOutputDistribution(false);
        filter.setOutputErrorFlag(true);
        filter.setInputFormat(train);
        Filter.useFilter(train, filter); // trains the classifier
        Instances pred = Filter.useFilter(test, filter); // performs predictions on the test set

        if (predictedData == null) {
            predictedData = new Instances(pred, 0);
        }
        for (int j = 0; j < pred.numInstances(); j++) {
            predictedData.add(pred.instance(j));
        }
    }

    // Prepare output classification
    String[] scores = new String[predictedData.numInstances()];
    for (Instance predInst : predictedData) {
        int id = (int) predInst.value(predInst.attribute(0)) - 1;
        int valueIdx = predictedData.numAttributes() - 2;
        String value = predInst.stringValue(predInst.attribute(valueIdx));
        scores[id] = value;
    }

    // Output
    StringBuilder sb = new StringBuilder();
    for (String score : scores) {
        sb.append(score + LF);
    }
    FileUtils.writeStringToFile(
            new File(OUTPUT_DIR + "/" + dataset.toString() + "/" + wekaClassifier.toString() + "/output.csv"),
            sb.toString());
}
From source file:de.ugoe.cs.cpdp.dataprocessing.Oversampling.java
License: Apache License

@Override
public void apply(Instances testdata, Instances traindata) {
    final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;

    if (counts[1] < counts[0]) {
        Instances negatives = new Instances(traindata);
        Instances positives = new Instances(traindata);

        // split the training data into negative and positive instances
        for (int i = traindata.size() - 1; i >= 0; i--) {
            if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
                negatives.remove(i);
            }
            if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
                positives.remove(i);
            }
        }

        Resample resample = new Resample();
        resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
        try {
            resample.setInputFormat(traindata);
            positives = Filter.useFilter(positives, resample);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        traindata.clear();
        for (int i = 0; i < negatives.size(); i++) {
            traindata.add(negatives.get(i));
        }
        for (int i = 0; i < positives.size(); i++) {
            traindata.add(positives.get(i));
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.Resampling.java
License: Apache License

@Override
public void apply(Instances testdata, Instances traindata) {
    Resample resample = new Resample();
    resample.setSampleSizePercent(100);
    resample.setBiasToUniformClass(1.0);

    Instances traindataSample;
    try {
        resample.setInputFormat(traindata);
        traindataSample = Filter.useFilter(traindata, resample);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    traindata.clear();
    for (int i = 0; i < traindataSample.size(); i++) {
        traindata.add(traindataSample.get(i));
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License: Apache License

@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurrenceArtifactNames = new ArrayList<Double>();

    // Sort the dataset (StateID is connected to the date of commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the
     * hashmap (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is in the hashmap already
     * (this means: it did not have a bug before). If so, add it to the new dataset and remove
     * it from the hashmap, so that new changes from "nonBug" -> "bug" for this file can be
     * found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug at
     * its first occurrence, or this file only has bugs and no instance without a bug), then
     * (if it is not in the array list above) add it to the new dataset. This way it is
     * possible to get the first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        } else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        } else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        } else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) {
            if (!firstOccurrenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurrenceArtifactNames.add(artifactName);
            }
        }
    }

    // If we have a file that never had a bug (that is, when it is NOT in the newly
    // created dataset, but it is in the hashmap from above), add it to the new dataset
    double[] artifactNamesInNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesInNewDataSet.length; i++) {
            if (artifactNamesInNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License

/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    final double probDefect = data.numInstances() / (double) counts[1];

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                } else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0)
                        / (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            powerEntity[i] *= powerAttributes[i][j];
        }
    }

    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file:de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java
License: Apache License

/**
 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)
 */
@Override
public Instances apply(Instances testdata, Instances traindata) {
    Instances filteredTraindata = new Instances(traindata);
    filteredTraindata.clear();

    double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1];
    int classIndex = testdata.classIndex();

    for (int i = 0; i < testdata.size(); i++) {
        int k = 0;
        for (int j = 0; j < testdata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i][k] = testdata.get(i).value(j);
                k++;
            }
        }
    }
    for (int i = 0; i < traindata.size(); i++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i + testdata.size()][k] = traindata.get(i).value(j);
                k++;
            }
        }
    }

    DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);
    Database db = new StaticArrayDatabase(dbc, null);
    db.initialize();
    DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10);
    Clustering<Model> clusterer = dbscan.run(db);
    Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
    int firstInternalIndex = rel.iterDBIDs().internalGetIndex();

    for (Cluster<Model> cluster : clusterer.getAllClusters()) {
        // check if the cluster contains any test data
        DBIDIter iter = rel.iterDBIDs();
        boolean noMatch = true;
        for (int i = 0; noMatch && i < testdata.size(); i++) {
            noMatch = !cluster.getIDs().contains(iter);
            iter.advance();
        }
        if (!noMatch) {
            // the cluster contains test data
            for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter.advance()) {
                int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex;
                if (internalIndex >= 0) {
                    // the index belongs to a training instance
                    filteredTraindata.add(traindata.get(internalIndex));
                }
            }
        }
    }

    return filteredTraindata;
}
From source file:de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java
License: Apache License

@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final Instances data = characteristicInstances(testdata, traindataSet);

    final ArrayList<String> attVals = new ArrayList<String>();
    attVals.add("same");
    attVals.add("more");
    attVals.add("less");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = 0; j < data.numAttributes(); j++) {
        atts.add(new Attribute(data.attribute(j).name(), attVals));
    }
    atts.add(new Attribute("score"));
    Instances similarityData = new Instances("similarity", atts, 0);
    similarityData.setClassIndex(similarityData.numAttributes() - 1);

    try {
        Classifier classifier = new J48();
        for (int i = 0; i < traindataSet.size(); i++) {
            classifier.buildClassifier(traindataSet.get(i));
            for (int j = 0; j < traindataSet.size(); j++) {
                if (i != j) {
                    double[] similarity = new double[data.numAttributes() + 1];
                    for (int k = 0; k < data.numAttributes(); k++) {
                        if (0.9 * data.get(i + 1).value(k) > data.get(j + 1).value(k)) {
                            similarity[k] = 2.0;
                        } else if (1.1 * data.get(i + 1).value(k) < data.get(j + 1).value(k)) {
                            similarity[k] = 1.0;
                        } else {
                            similarity[k] = 0.0;
                        }
                    }

                    Evaluation eval = new Evaluation(traindataSet.get(j));
                    eval.evaluateModel(classifier, traindataSet.get(j));
                    similarity[data.numAttributes()] = eval.fMeasure(1);
                    similarityData.add(new DenseInstance(1.0, similarity));
                }
            }
        }

        REPTree repTree = new REPTree();
        if (repTree.getNumFolds() > similarityData.size()) {
            repTree.setNumFolds(similarityData.size());
        }
        repTree.setNumFolds(2); // note: this unconditionally overrides the fold adjustment above
        repTree.buildClassifier(similarityData);

        Instances testTrainSimilarity = new Instances(similarityData);
        testTrainSimilarity.clear();
        for (int i = 0; i < traindataSet.size(); i++) {
            double[] similarity = new double[data.numAttributes() + 1];
            for (int k = 0; k < data.numAttributes(); k++) {
                if (0.9 * data.get(0).value(k) > data.get(i + 1).value(k)) {
                    similarity[k] = 2.0;
                } else if (1.1 * data.get(0).value(k) < data.get(i + 1).value(k)) {
                    similarity[k] = 1.0;
                } else {
                    similarity[k] = 0.0;
                }
            }
            testTrainSimilarity.add(new DenseInstance(1.0, similarity));
        }

        int bestScoringProductIndex = -1;
        double maxScore = Double.MIN_VALUE;
        for (int i = 0; i < traindataSet.size(); i++) {
            double score = repTree.classifyInstance(testTrainSimilarity.get(i));
            if (score > maxScore) {
                maxScore = score;
                bestScoringProductIndex = i;
            }
        }
        Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex);
        traindataSet.clear();
        traindataSet.add(bestScoringProduct);
    } catch (Exception e) {
        Console.printerr("failure during DecisionTreeSelection: " + e.getMessage());
        throw new RuntimeException(e);
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.LACE2.java
License: Apache License

@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    Instances selectedData = new Instances(testdata);
    selectedData.clear();

    LinkedList<Instances> traindataCopy = new LinkedList<>(traindataSet);
    Collections.shuffle(traindataCopy);

    CLIFF cliff = new CLIFF();
    cliff.setParameter(Double.toString(percentage));
    MORPH morph = new MORPH();
    Median median = new Median();
    double minDist = Double.MIN_VALUE;

    for (Instances traindata : traindataCopy) {
        Instances cliffedData = cliff.applyCLIFF(traindata);
        if (minDist == Double.MIN_VALUE) {
            // determine the distance for the leader-follower algorithm
            Instances sample;
            if (traindata.size() > 100) {
                Resample resample = new Resample();
                resample.setSampleSizePercent(100.0 / traindata.size() * 100.0);
                resample.setBiasToUniformClass(0.0);
                resample.setNoReplacement(true);
                try {
                    resample.setInputFormat(traindata);
                    sample = Filter.useFilter(traindata, resample);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            } else {
                sample = new Instances(traindata);
            }
            double[] distances = new double[sample.size()];
            for (int i = 0; i < sample.size(); i++) {
                Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(sample.get(i), sample);
                distances[i] = MathArrays.distance(WekaUtils.instanceValues(sample.get(i)),
                        WekaUtils.instanceValues(unlikeNeighbor));
            }
            minDist = median.evaluate(distances);
        }
        for (int i = 0; i < cliffedData.size(); i++) {
            Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(cliffedData.get(i), selectedData);
            if (unlikeNeighbor == null) {
                selectedData.add(cliffedData.get(i));
            } else {
                double distance = MathArrays.distance(WekaUtils.instanceValues(cliffedData.get(i)),
                        WekaUtils.instanceValues(unlikeNeighbor));
                if (distance > minDist) {
                    morph.morphInstance(cliffedData.get(i), cliffedData);
                    selectedData.add(cliffedData.get(i));
                }
            }
        }
    }
}