List of usage examples for weka.core.Instances.add(Instance)
@Override public boolean add(Instance instance)
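Before the per-project examples below, here is a minimal, self-contained sketch of the call itself. It assumes Weka 3.7+ (where Instances implements java.util.List and add(Instance) returns a boolean, matching the signature above); the relation, attribute, and value names are illustrative.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class InstancesAddExample {
    public static void main(String[] args) {
        // two numeric attributes plus a nominal class attribute
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("no");
        classValues.add("yes");
        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("width"));
        attributes.add(new Attribute("height"));
        attributes.add(new Attribute("class", classValues));

        // empty dataset with initial capacity 0; the class is the last attribute
        Instances dataset = new Instances("shapes", attributes, 0);
        dataset.setClassIndex(dataset.numAttributes() - 1);

        // add(Instance) stores a copy of the instance and associates it with the dataset
        Instance row = new DenseInstance(1.0, new double[] { 2.0, 3.0, 1.0 });
        boolean added = dataset.add(row);
        System.out.println(added + ", numInstances = " + dataset.numInstances());
    }
}

Because add stores a copy, the examples below can freely add instances from one Instances object to another without the two datasets sharing state.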
From source file:de.fub.maps.project.detector.model.inference.processhandler.SpecialInferenceDataProcessHandler.java
License: Open Source License

@Override
protected void handle() {
    clearResults();
    Classifier classifier = getInferenceModel().getClassifier();
    Collection<Attribute> attributeList = getInferenceModel().getAttributes();
    if (!attributeList.isEmpty()) {
        Set<String> keySet = getInferenceModel().getInput().getTrainingsSet().keySet();
        setClassesToView(keySet);

        Instances unlabeledInstances =
                new Instances("Unlabeled Tracks", new ArrayList<Attribute>(attributeList), 0); // NOI18N
        unlabeledInstances.setClassIndex(0);

        ArrayList<TrackSegment> segmentList = new ArrayList<TrackSegment>();
        for (Entry<String, HashSet<TrackSegment>> entry : getInferenceModel().getInput().getTrainingsSet()
                .entrySet()) {
            for (TrackSegment segment : entry.getValue()) {
                segment.setLabel(entry.getKey());
                Instance instance = getInstance(segment);
                unlabeledInstances.add(instance);
                segmentList.add(segment);
            }
        }

        // create a copy
        Instances labeledInstances = new Instances(unlabeledInstances);

        for (int index = 0; index < labeledInstances.numInstances(); index++) {
            try {
                Instance instance = labeledInstances.instance(index);
                // classify the instance
                double classified = classifier.classifyInstance(instance);
                instance.setClassValue(classified);
                // get the class label
                String value = unlabeledInstances.classAttribute().value((int) classified);
                if (index < segmentList.size()) {
                    instanceToTrackSegmentMap.put(instance, segmentList.get(index));
                }
                // put the label and instance into the result map
                put(value, instance);
            } catch (Exception ex) {
                Exceptions.printStackTrace(ex);
            }
        }

        // update the visual representation
        updateVisualRepresentation();

        // update the result set of the inference model
        for (Map.Entry<String, List<Instance>> entry : resultMap.entrySet()) {
            HashSet<TrackSegment> trackSegmentList = new HashSet<TrackSegment>();
            for (Instance instance : entry.getValue()) {
                TrackSegment trackSegment = instanceToTrackSegmentMap.get(instance);
                if (trackSegment != null) {
                    trackSegmentList.add(trackSegment);
                }
            }

            // only non-empty classes are put into the result data set
            if (!trackSegmentList.isEmpty()) {
                getInferenceModel().getResult().put(entry.getKey(), trackSegmentList);
            }
        }
    } else {
        throw new InferenceModelClassifyException(MessageFormat
                .format("No attributes available. Attribute list length == {0}", attributeList.size()));
    }
    resultMap.clear();
    instanceToTrackSegmentMap.clear();
}
From source file:de.fub.maps.project.detector.model.inference.processhandler.TrainingsDataProcessHandler.java
License: Open Source License

@Override
protected void handle() {
    final ProgressHandle handle = ProgressHandleFactory.createHandle("Training");
    try {
        handle.start();
        Collection<Attribute> attributeCollection = getInferenceModel().getAttributes();
        ArrayList<Attribute> arrayList = new ArrayList<Attribute>(attributeCollection);

        Instances trainingSet = new Instances("Classes", arrayList, 0);
        trainingSet.setClassIndex(0);
        Instances testingSet = new Instances("Classes", arrayList, 0);
        testingSet.setClassIndex(0);

        HashMap<String, HashSet<TrackSegment>> dataset = getInferenceModel().getInput().getTrainingsSet();
        int datasetCount = 0;
        for (HashSet<TrackSegment> list : dataset.values()) {
            for (TrackSegment trackSegment : list) {
                datasetCount += trackSegment.getWayPointList().size();
            }
        }
        handle.switchToDeterminate(datasetCount);

        int trackCount = 0;
        for (Entry<String, HashSet<TrackSegment>> entry : dataset.entrySet()) {
            int trainingsSetSize = (int) Math.ceil(entry.getValue().size() * getTrainingsSetRatioParameter());
            int index = 0;
            for (TrackSegment trackSegment : entry.getValue()) {
                Instance instance = getInstance(entry.getKey(), trackSegment);
                if (index < trainingsSetSize) {
                    trainingSet.add(instance);
                } else {
                    testingSet.add(instance);
                }
                handle.progress(trackCount++);
                index++;
            }
        }

        assert trainingSet.numInstances() > 0 : "Training set is empty and has no instances"; // NOI18N
        assert testingSet.numInstances() > 0 : "Testing set is empty and has no instances"; // NOI18N

        handle.switchToIndeterminate();
        evaluate(trainingSet, testingSet);
    } finally {
        handle.finish();
    }
}
From source file:de.tudarmstadt.ukp.similarity.experiments.coling2012.util.Evaluator.java
License: Open Source License

public static void runClassifierCV(WekaClassifier wekaClassifier, Dataset dataset) throws Exception {
    // Set parameters
    int folds = 10;
    Classifier baseClassifier = getClassifier(wekaClassifier);

    // Set up the random number generator
    long seed = new Date().getTime();
    Random random = new Random(seed);

    // Add IDs to the instances
    AddID.main(new String[] { "-i", MODELS_DIR + "/" + dataset.toString() + ".arff", "-o",
            MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff" });
    Instances data = DataSource.read(MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // Instantiate the Remove filter
    Remove removeIDFilter = new Remove();
    removeIDFilter.setAttributeIndices("first");

    // Randomize the data
    data.randomize(random);

    // Perform cross-validation
    Instances predictedData = null;
    Evaluation eval = new Evaluation(data);

    for (int n = 0; n < folds; n++) {
        Instances train = data.trainCV(folds, n, random);
        Instances test = data.testCV(folds, n);

        // Apply log filter
        // Filter logFilter = new LogFilter();
        // logFilter.setInputFormat(train);
        // train = Filter.useFilter(train, logFilter);
        // logFilter.setInputFormat(test);
        // test = Filter.useFilter(test, logFilter);

        // Copy the classifier
        Classifier classifier = AbstractClassifier.makeCopy(baseClassifier);

        // Instantiate the FilteredClassifier
        FilteredClassifier filteredClassifier = new FilteredClassifier();
        filteredClassifier.setFilter(removeIDFilter);
        filteredClassifier.setClassifier(classifier);

        // Build the classifier
        filteredClassifier.buildClassifier(train);

        // Evaluate
        eval.evaluateModel(filteredClassifier, test);

        // Add predictions
        AddClassification filter = new AddClassification();
        filter.setClassifier(filteredClassifier);
        filter.setOutputClassification(true);
        filter.setOutputDistribution(false);
        filter.setOutputErrorFlag(true);
        filter.setInputFormat(train);
        Filter.useFilter(train, filter); // trains the classifier
        Instances pred = Filter.useFilter(test, filter); // performs predictions on the test set

        if (predictedData == null) {
            predictedData = new Instances(pred, 0);
        }
        for (int j = 0; j < pred.numInstances(); j++) {
            predictedData.add(pred.instance(j));
        }
    }

    // Prepare output classification
    String[] scores = new String[predictedData.numInstances()];
    for (Instance predInst : predictedData) {
        int id = (int) predInst.value(predInst.attribute(0)) - 1;
        int valueIdx = predictedData.numAttributes() - 2;
        String value = predInst.stringValue(predInst.attribute(valueIdx));
        scores[id] = value;
    }

    // Output
    StringBuilder sb = new StringBuilder();
    for (String score : scores) {
        sb.append(score + LF);
    }
    FileUtils.writeStringToFile(
            new File(OUTPUT_DIR + "/" + dataset.toString() + "/" + wekaClassifier.toString() + "/output.csv"),
            sb.toString());
}
From source file:de.ugoe.cs.cpdp.dataprocessing.Oversampling.java
License: Apache License

@Override
public void apply(Instances testdata, Instances traindata) {
    final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;

    if (counts[1] < counts[0]) {
        Instances negatives = new Instances(traindata);
        Instances positives = new Instances(traindata);

        // split the training data into negative and positive instances
        for (int i = traindata.size() - 1; i >= 0; i--) {
            if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
                negatives.remove(i);
            }
            if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
                positives.remove(i);
            }
        }

        Resample resample = new Resample();
        resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
        try {
            resample.setInputFormat(traindata);
            positives = Filter.useFilter(positives, resample);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        traindata.clear();
        for (int i = 0; i < negatives.size(); i++) {
            traindata.add(negatives.get(i));
        }
        for (int i = 0; i < positives.size(); i++) {
            traindata.add(positives.get(i));
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.Resampling.java
License: Apache License

@Override
public void apply(Instances testdata, Instances traindata) {
    Resample resample = new Resample();
    resample.setSampleSizePercent(100);
    resample.setBiasToUniformClass(1.0);

    Instances traindataSample;
    try {
        resample.setInputFormat(traindata);
        traindataSample = Filter.useFilter(traindata, resample);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    traindata.clear();
    for (int i = 0; i < traindataSample.size(); i++) {
        traindata.add(traindataSample.get(i));
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License: Apache License

@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurrenceArtifactNames = new ArrayList<Double>();

    // Sort the dataset (StateID is connected to the date of commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the
     * hashmap (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is in the hashmap already
     * (this means: it did not have a bug before). If so, add it to the new dataset and remove
     * it from the hashmap, so that new changes from "nonBug" -> "bug" for this file can be
     * found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug at
     * its first occurrence, or this file only has bugs and no instance without a bug), then
     * (if it is not in the array list above) add it to the new dataset. This way it is
     * possible to get the first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        } else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        } else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        } else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) {
            if (!firstOccurrenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurrenceArtifactNames.add(artifactName);
            }
        }
    }

    // If we have a file that never had a bug (that is, when it is NOT in the newly
    // created dataset, but it is in the hashmap from above), add it to the new dataset
    double[] artifactNamesInNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesInNewDataSet.length; i++) {
            if (artifactNamesInNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License

/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    final double probDefect = data.numInstances() / (double) counts[1];

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                } else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0)
                        / (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            powerEntity[i] *= powerAttributes[i][j];
        }
    }

    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file:de.ugoe.cs.cpdp.dataselection.DBSCANFilter.java
License: Apache License

/**
 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
 *      weka.core.Instances)
 */
@Override
public Instances apply(Instances testdata, Instances traindata) {
    Instances filteredTraindata = new Instances(traindata);
    filteredTraindata.clear();

    double[][] data = new double[testdata.size() + traindata.size()][testdata.numAttributes() - 1];
    int classIndex = testdata.classIndex();

    for (int i = 0; i < testdata.size(); i++) {
        int k = 0;
        for (int j = 0; j < testdata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i][k] = testdata.get(i).value(j);
                k++;
            }
        }
    }
    for (int i = 0; i < traindata.size(); i++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != classIndex) {
                data[i + testdata.size()][k] = traindata.get(i).value(j);
                k++;
            }
        }
    }

    DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);
    Database db = new StaticArrayDatabase(dbc, null);
    db.initialize();
    DBSCAN<DoubleVector> dbscan = new DBSCAN<DoubleVector>(EuclideanDistanceFunction.STATIC, 1.0, 10);
    Clustering<Model> clusterer = dbscan.run(db);
    Relation<DoubleVector> rel = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
    int firstInternalIndex = rel.iterDBIDs().internalGetIndex();

    for (Cluster<Model> cluster : clusterer.getAllClusters()) {
        // check if the cluster contains any test data
        DBIDIter iter = rel.iterDBIDs();
        boolean noMatch = true;
        for (int i = 0; noMatch && i < testdata.size(); i++) {
            noMatch = !cluster.getIDs().contains(iter);
            iter.advance();
        }
        if (!noMatch) {
            // the cluster contains test data
            for (DBIDIter clusterIter = cluster.getIDs().iter(); clusterIter.valid(); clusterIter.advance()) {
                int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex;
                if (internalIndex >= 0) {
                    // the index belongs to a training instance
                    filteredTraindata.add(traindata.get(internalIndex));
                }
            }
        }
    }

    return filteredTraindata;
}
From source file:de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java
License: Apache License

@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final Instances data = characteristicInstances(testdata, traindataSet);

    final ArrayList<String> attVals = new ArrayList<String>();
    attVals.add("same");
    attVals.add("more");
    attVals.add("less");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = 0; j < data.numAttributes(); j++) {
        atts.add(new Attribute(data.attribute(j).name(), attVals));
    }
    atts.add(new Attribute("score"));
    Instances similarityData = new Instances("similarity", atts, 0);
    similarityData.setClassIndex(similarityData.numAttributes() - 1);

    try {
        Classifier classifier = new J48();
        for (int i = 0; i < traindataSet.size(); i++) {
            classifier.buildClassifier(traindataSet.get(i));
            for (int j = 0; j < traindataSet.size(); j++) {
                if (i != j) {
                    double[] similarity = new double[data.numAttributes() + 1];
                    for (int k = 0; k < data.numAttributes(); k++) {
                        if (0.9 * data.get(i + 1).value(k) > data.get(j + 1).value(k)) {
                            similarity[k] = 2.0;
                        } else if (1.1 * data.get(i + 1).value(k) < data.get(j + 1).value(k)) {
                            similarity[k] = 1.0;
                        } else {
                            similarity[k] = 0.0;
                        }
                    }

                    Evaluation eval = new Evaluation(traindataSet.get(j));
                    eval.evaluateModel(classifier, traindataSet.get(j));
                    similarity[data.numAttributes()] = eval.fMeasure(1);
                    similarityData.add(new DenseInstance(1.0, similarity));
                }
            }
        }

        REPTree repTree = new REPTree();
        if (repTree.getNumFolds() > similarityData.size()) {
            repTree.setNumFolds(similarityData.size());
        }
        repTree.setNumFolds(2); // note: this unconditionally overrides the fold adjustment above
        repTree.buildClassifier(similarityData);

        Instances testTrainSimilarity = new Instances(similarityData);
        testTrainSimilarity.clear();
        for (int i = 0; i < traindataSet.size(); i++) {
            double[] similarity = new double[data.numAttributes() + 1];
            for (int k = 0; k < data.numAttributes(); k++) {
                if (0.9 * data.get(0).value(k) > data.get(i + 1).value(k)) {
                    similarity[k] = 2.0;
                } else if (1.1 * data.get(0).value(k) < data.get(i + 1).value(k)) {
                    similarity[k] = 1.0;
                } else {
                    similarity[k] = 0.0;
                }
            }
            testTrainSimilarity.add(new DenseInstance(1.0, similarity));
        }

        int bestScoringProductIndex = -1;
        double maxScore = Double.MIN_VALUE;
        for (int i = 0; i < traindataSet.size(); i++) {
            double score = repTree.classifyInstance(testTrainSimilarity.get(i));
            if (score > maxScore) {
                maxScore = score;
                bestScoringProductIndex = i;
            }
        }
        Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex);
        traindataSet.clear();
        traindataSet.add(bestScoringProduct);
    } catch (Exception e) {
        Console.printerr("failure during DecisionTreeSelection: " + e.getMessage());
        throw new RuntimeException(e);
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.LACE2.java
License: Apache License

@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    Instances selectedData = new Instances(testdata);
    selectedData.clear();

    LinkedList<Instances> traindataCopy = new LinkedList<>(traindataSet);
    Collections.shuffle(traindataCopy);

    CLIFF cliff = new CLIFF();
    cliff.setParameter(Double.toString(percentage));
    MORPH morph = new MORPH();
    Median median = new Median();
    double minDist = Double.MIN_VALUE;

    for (Instances traindata : traindataCopy) {
        Instances cliffedData = cliff.applyCLIFF(traindata);
        if (minDist == Double.MIN_VALUE) {
            // determine the distance for the leader-follower algorithm
            Instances sample;
            if (traindata.size() > 100) {
                Resample resample = new Resample();
                resample.setSampleSizePercent(100.0 / traindata.size() * 100.0);
                resample.setBiasToUniformClass(0.0);
                resample.setNoReplacement(true);
                try {
                    resample.setInputFormat(traindata);
                    sample = Filter.useFilter(traindata, resample);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            } else {
                sample = new Instances(traindata);
            }
            double[] distances = new double[sample.size()];
            for (int i = 0; i < sample.size(); i++) {
                Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(sample.get(i), sample);
                distances[i] = MathArrays.distance(WekaUtils.instanceValues(sample.get(i)),
                        WekaUtils.instanceValues(unlikeNeighbor));
            }
            minDist = median.evaluate(distances);
        }
        for (int i = 0; i < cliffedData.size(); i++) {
            Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(cliffedData.get(i), selectedData);
            if (unlikeNeighbor == null) {
                selectedData.add(cliffedData.get(i));
            } else {
                double distance = MathArrays.distance(WekaUtils.instanceValues(cliffedData.get(i)),
                        WekaUtils.instanceValues(unlikeNeighbor));
                if (distance > minDist) {
                    morph.morphInstance(cliffedData.get(i), cliffedData);
                    selectedData.add(cliffedData.get(i));
                }
            }
        }
    }
}