List of usage examples for weka.core Instances attribute
public Attribute attribute(String name)

Returns an attribute given its name. If there is more than one attribute with the same name, the first one is returned; returns null if no attribute with the given name can be found.
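A minimal usage sketch of this method (the ARFF path and attribute name are placeholders, not taken from the examples below):

    import weka.core.Attribute;
    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;

    public class AttributeLookupExample {
        public static void main(String[] args) throws Exception {
            // load a dataset; "data.arff" is a placeholder path
            Instances data = DataSource.read("data.arff");
            data.setClassIndex(data.numAttributes() - 1);

            // look up an attribute by name; returns null if no such attribute exists
            Attribute att = data.attribute("loc"); // placeholder attribute name
            if (att != null) {
                System.out.println(att.name() + " has index " + att.index());
            }
        }
    }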
From source file: de.ugoe.cs.cpdp.dataprocessing.CLAMIProcessor.java
License: Apache License
/**
 * <p>
 * Applies the CLAMI processor to the data. The test data is also required in order to
 * guarantee a consistent metric set.
 * </p>
 *
 * @param testdata
 *            test data; the data is not modified, only metrics are dropped
 * @param data
 *            data to which the CLAMI processor is applied
 */
public void applyCLAMI(Instances testdata, Instances data) {
    // first determine the median of each metric
    double[] medians = new double[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
        }
    }

    // now determine the cluster number for each instance:
    // the number of metrics whose value is above the median
    double[] clusterNumber = new double[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        int countHighValues = 0;
        Instance currentInstance = data.get(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex() && currentInstance.value(j) > medians[j]) {
                countHighValues++;
            }
        }
        clusterNumber[i] = countHighValues;
    }

    // determine the median of the cluster numbers
    Median m = new Median();
    double medianClusterNumber = m.evaluate(clusterNumber);

    // now count the violations of each metric
    int[] numMetricViolations = new int[data.numAttributes()];
    for (int j = 0; j < data.numAttributes(); j++) {
        int currentViolations = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance currentInstance = data.get(i);
            if (j != data.classIndex()) {
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy" cluster
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                }
                else {
                    // "not buggy" cluster
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
        }
        numMetricViolations[j] = currentViolations;
    }

    SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
    for (int currentViolations : numMetricViolations) {
        distinctViolationCounts.add(currentViolations);
    }
    Iterator<Integer> violationCountIterator = distinctViolationCounts.iterator();

    // now we filter the data;
    // this is first tried with the metrics with the fewest violations. If no buggy/bug-free
    // instances remain, this is repeated with the metrics with the second-fewest violations,
    // and so on.
    // this part is a bit unclear from the description in the paper, but I confirmed with the
    // author that this is how they implemented it
    // (note: the original listing advanced the iterator once before the loop, which skipped
    // the fewest-violations metrics; that extra call is removed here so the loop starts with
    // the smallest violation count, as the comment above describes)
    int violationCutoff;
    boolean[] cleanInstances;
    int numCleanBuggyInstances;
    int numCleanBugfreeInstances;
    do {
        violationCutoff = violationCountIterator.next();
        cleanInstances = new boolean[data.numInstances()];
        numCleanBuggyInstances = 0;
        numCleanBugfreeInstances = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            int currentViolations = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                    if (clusterNumber[i] > medianClusterNumber) {
                        // "buggy" cluster
                        if (currentInstance.value(j) <= medians[j]) {
                            currentViolations++;
                        }
                    }
                    else {
                        // "not buggy" cluster
                        if (currentInstance.value(j) > medians[j]) {
                            currentViolations++;
                        }
                    }
                }
            }
            if (currentViolations == 0) {
                cleanInstances[i] = true;
                if (clusterNumber[i] > medianClusterNumber) {
                    numCleanBuggyInstances++;
                }
                else {
                    numCleanBugfreeInstances++;
                }
            }
            else {
                cleanInstances[i] = false;
            }
        }
    }
    while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);

    // output some interesting information to provide insights into the CLAMI model
    Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
            Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
        }
    }

    // finally modify the instances:
    // drop the filtered metrics (also from the test data)
    for (int j = data.numAttributes() - 1; j >= 0; j--) {
        if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
            data.deleteAttributeAt(j);
            testdata.deleteAttributeAt(j);
        }
    }
    // drop the unclean instances and set the CLAMI classification for the rest
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        if (!cleanInstances[i]) {
            data.delete(i);
        }
        else {
            data.get(i).setClassValue(clusterNumber[i] > medianClusterNumber ? 1.0d : 0.0d);
        }
    }
}
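A hedged usage sketch of the processor (direct instantiation and the ARFF file names are assumptions; in CrossPare, processors are normally wired up through the experiment configuration):

    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;

    // hypothetical ARFF files with identical metrics and a binary class as last attribute
    Instances testdata = DataSource.read("test.arff");
    Instances traindata = DataSource.read("train.arff");
    testdata.setClassIndex(testdata.numAttributes() - 1);
    traindata.setClassIndex(traindata.numAttributes() - 1);

    CLAMIProcessor processor = new CLAMIProcessor(); // assumption: no-arg constructor
    // relabels and filters traindata in place; drops the same metrics from testdata
    processor.applyCLAMI(testdata, traindata);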
From source file: de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License: Apache License
/**
 * <p>
 * Applies MORPH to a single instance.
 * </p>
 *
 * @param instance
 *            instance that is morphed
 * @param data
 *            data based on which the instance is morphed
 */
public void morphInstance(Instance instance, Instances data) {
    Instance nearestUnlikeNeighbor = getNearestUnlikeNeighbor(instance, data);
    if (nearestUnlikeNeighbor == null) {
        throw new RuntimeException(
            "could not find nearest unlike neighbor within the data: " + data.relationName());
    }
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
            // random morphing factor between alpha and beta
            double randVal = rand.nextDouble() * (beta - alpha) + alpha;
            instance.setValue(j, instance.value(j)
                + randVal * (instance.value(j) - nearestUnlikeNeighbor.value(j)));
        }
    }
}
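A hedged usage sketch of the morphing step (the no-argument constructor is an assumption; alpha, beta, and the random source are fields of MORPH whose configuration is not shown in this listing):

    // morph every instance of a labeled training set in place
    MORPH morph = new MORPH(); // assumption: default construction is possible
    for (int i = 0; i < traindata.numInstances(); i++) {
        // each numeric value is shifted relative to its nearest unlike neighbor
        morph.morphInstance(traindata.get(i), traindata);
    }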
From source file: de.ugoe.cs.cpdp.dataprocessing.MORPH.java
License: Apache License
/**
 * <p>
 * Determines the nearest unlike neighbor of an instance.
 * </p>
 *
 * @param instance
 *            instance to which the nearest unlike neighbor is determined
 * @param data
 *            data where the nearest unlike neighbor is determined from
 * @return nearest unlike instance
 */
public Instance getNearestUnlikeNeighbor(Instance instance, Instances data) {
    Instance nearestUnlikeNeighbor = null;

    double[] instanceVector = new double[data.numAttributes() - 1];
    int tmp = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
            // note: the original listing did not increment tmp here, leaving all but the
            // first vector component at zero; fixed to match the parallel loop below
            instanceVector[tmp++] = instance.value(j);
        }
    }

    double minDistance = Double.MAX_VALUE;
    for (int i = 0; i < data.numInstances(); i++) {
        if (instance.classValue() != data.instance(i).classValue()) {
            double[] otherVector = new double[data.numAttributes() - 1];
            tmp = 0;
            for (int j = 0; j < data.numAttributes(); j++) {
                if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) {
                    otherVector[tmp++] = data.instance(i).value(j);
                }
            }
            // compute the Euclidean distance once per candidate
            double distance = MathArrays.distance(instanceVector, otherVector);
            if (distance < minDistance) {
                minDistance = distance;
                nearestUnlikeNeighbor = data.instance(i);
            }
        }
    }
    return nearestUnlikeNeighbor;
}
From source file: de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter.java
License: Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    int indexOfConfidenceAttribute = -1;

    // find the index of the named confidence attribute to filter for
    for (int i = 0; i < traindata.numAttributes(); i++) {
        if (traindata.attribute(i).name().equals(nominalAttributeName)) {
            indexOfConfidenceAttribute = i;
        }
    }

    // if it was not found, return
    if (indexOfConfidenceAttribute == -1) {
        return;
    }

    // find the indices of the nominal values
    Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute);
    ArrayList<Object> nominalValuesOfConfidenceAttribute =
        Collections.list(confidenceAttribute.enumerateValues());
    ArrayList<Double> indexOfNominalAttributeValues = new ArrayList<Double>();

    for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) {
        for (String attributeValue : nominalAttributeValues) {
            if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) {
                indexOfNominalAttributeValues.add((double) k);
            }
        }
    }

    // go through all instances and delete those where the nominal attribute
    // has the value of one of the parameters
    for (int j = traindata.numInstances() - 1; j >= 0; j--) {
        Instance wekaInstance = traindata.get(j);
        if (indexOfNominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) {
            traindata.delete(j);
        }
    }
}
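The nominal-value lookup above can also be written against the Attribute API directly. A minimal sketch (the attribute name "severity" is hypothetical):

    // resolve the attribute by name instead of scanning all indices
    Attribute att = traindata.attribute("severity"); // hypothetical attribute name
    if (att != null && att.isNominal()) {
        for (int k = 0; k < att.numValues(); k++) {
            // att.value(k) is the label whose internal index k is the double
            // that Instance.value() stores for that label
            System.out.println(k + " -> " + att.value(k));
        }
    }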
From source file: de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License: Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // this is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>();

    // sort the dataset (StateID is connected to the date of the commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the
     * hashmap (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is already in the hashmap
     * (this means: it did not have a bug before). If so, add it to the new dataset and
     * remove it from the hashmap, so that new changes from "nonBug" -> "bug" for this file
     * can be found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug
     * at its first occurrence, or this file only has bugs and no instance without a bug),
     * then add it to the new dataset unless it is already in the array list above. This way
     * it is possible to get the first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);
        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0) {
            // the two original branches for this case were identical; keep the
            // latest bug-free instance per artifact
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 1.0 && artifactNames.containsKey(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        }
        else if (newBugLabel == 1.0 && !firstOccurenceArtifactNames.contains(artifactName)) {
            traindata.add(wekaInstance);
            firstOccurenceArtifactNames.add(artifactName);
        }
    }

    // if we have a file that never had a bug (i.e., it is NOT in the newly created dataset,
    // but it is in the hashmap from above), add it to the new dataset
    double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesinNewDataSet.length; i++) {
            if (artifactNamesinNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
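The sort call above is plain Weka API: Instances.sort(Attribute) sorts the dataset in place, ascending by the given attribute. A minimal sketch (the attribute name is taken from the example above):

    Instances copy = new Instances(traindata);              // work on a copy, as above
    Attribute stateId = copy.attribute("Artifact.Target.StateID");
    if (stateId != null) {                                  // attribute(String) returns null if absent
        copy.sort(stateId);                                 // ascending by StateID, i.e. commit order
    }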
From source file: de.ugoe.cs.cpdp.dataprocessing.TopMetricFilter.java
License: Apache License
private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet)
    throws Exception
{
    // count in how many J48 decision trees each attribute occurs
    Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1];
    IntStream.range(0, counts.length).forEach(val -> counts[val] = 0);
    for (Instances traindata : traindataSet) {
        J48 decisionTree = new J48();
        decisionTree.buildClassifier(traindata);
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                if (decisionTree.toString().contains(traindata.attribute(j).name())) {
                    counts[k] = counts[k] + 1;
                }
                k++;
            }
        }
    }
    int[] topkIndex = new int[counts.length];
    IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);
    SortUtils.quicksort(counts, topkIndex, true);

    // get the CFS attribute sets for each training set
    List<Set<Integer>> cfsSets = new LinkedList<>();
    for (Instances traindata : traindataSet) {
        boolean selectionSuccessful = false;
        boolean secondAttempt = false;
        Instances traindataCopy = null;
        do {
            try {
                // the original listing duplicated this block for the first and second
                // attempt; merged here, since the only difference is the dataset used
                Instances currentTraindata = secondAttempt ? traindataCopy : traindata;
                AttributeSelection attsel = new AttributeSelection();
                CfsSubsetEval eval = new CfsSubsetEval();
                GreedyStepwise search = new GreedyStepwise();
                search.setSearchBackwards(true);
                attsel.setEvaluator(eval);
                attsel.setSearch(search);
                attsel.SelectAttributes(currentTraindata);
                Set<Integer> cfsSet = new HashSet<>();
                for (int attr : attsel.selectedAttributes()) {
                    cfsSet.add(attr);
                }
                cfsSets.add(cfsSet);
                selectionSuccessful = true;
            }
            catch (IllegalArgumentException e) {
                String regex = "A nominal attribute \\((.*)\\) cannot have duplicate labels.*";
                Pattern p = Pattern.compile(regex);
                Matcher m = p.matcher(e.getMessage());
                if (!m.find()) {
                    // cannot treat the problem, rethrow the exception
                    throw e;
                }
                String attributeName = m.group(1);
                int attrIndex = traindata.attribute(attributeName).index();
                if (secondAttempt) {
                    traindataCopy = WekaUtils.upscaleAttribute(traindataCopy, attrIndex);
                }
                else {
                    traindataCopy = WekaUtils.upscaleAttribute(traindata, attrIndex);
                }
                Console.traceln(Level.FINE,
                                "upscaled attribute " + attributeName + "; restarting training");
                secondAttempt = true;
                continue;
            }
        }
        while (!selectionSuccessful); // dummy loop for internal continue
    }

    // determine how well the top-k attributes cover the CFS sets, for each k
    double[] coverages = new double[topkIndex.length];
    for (Set<Integer> cfsSet : cfsSets) {
        Set<Integer> topkSet = new HashSet<>();
        for (int k = 0; k < topkIndex.length; k++) {
            topkSet.add(topkIndex[k]);
            coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size());
        }
    }
    double bestCoverageValue = Double.MIN_VALUE;
    int bestCoverageIndex = 0;
    for (int i = 0; i < coverages.length; i++) {
        if (coverages[i] > bestCoverageValue) {
            bestCoverageValue = coverages[i];
            bestCoverageIndex = i;
        }
    }

    // build the matrix of absolute Spearman correlations between the top attributes
    SpearmansCorrelation corr = new SpearmansCorrelation();
    double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex];
    for (Instances traindata : traindataSet) {
        double[][] vectors = new double[bestCoverageIndex][traindata.size()];
        for (int i = 0; i < traindata.size(); i++) {
            for (int j = 0; j < bestCoverageIndex; j++) {
                vectors[j][i] = traindata.get(i).value(topkIndex[j]);
            }
        }
        for (int j = 0; j < bestCoverageIndex; j++) {
            for (int k = j + 1; k < bestCoverageIndex; k++) {
                correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k]));
            }
        }
    }

    Set<Integer> topkSetIndexSet = new TreeSet<>();
    // j < 30 ensures that the computational time does not explode, since the power set is
    // 2^n in complexity
    for (int j = 0; j < bestCoverageIndex && j < 30; j++) {
        topkSetIndexSet.add(j);
    }
    Set<Set<Integer>> allCombinations = Sets.powerSet(topkSetIndexSet);
    double bestOptCoverage = Double.MIN_VALUE;
    Set<Integer> opttopkSetIndexSet = null;
    for (Set<Integer> combination : allCombinations) {
        if (isUncorrelated(correlationMatrix, combination)) {
            double currentCoverage = 0.0;
            Set<Integer> topkCombination = new TreeSet<>();
            for (Integer index : combination) {
                topkCombination.add(topkIndex[index]);
            }
            for (Set<Integer> cfsSet : cfsSets) {
                currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size());
            }
            if (currentCoverage > bestOptCoverage) {
                bestOptCoverage = currentCoverage;
                opttopkSetIndexSet = combination;
            }
        }
    }
    Set<Integer> opttopkIndex = new TreeSet<>();
    for (Integer index : opttopkSetIndexSet) {
        opttopkIndex.add(topkIndex[index]);
    }
    Console.traceln(Level.FINE, "selected the following metrics:");
    for (Integer index : opttopkIndex) {
        Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name());
    }

    // finally remove the attributes that were not selected
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex() && !opttopkIndex.contains(j)) {
            testdata.deleteAttributeAt(j);
            for (Instances traindata : traindataSet) {
                traindata.deleteAttributeAt(j);
            }
        }
    }
}
From source file: de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    // note: the original listing computed numInstances / counts[1], which is always >= 1
    // and cannot be a probability; inverted here
    final double probDefect = counts[1] / (double) data.numInstances();

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                }
                else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) /
                    (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            // note: skip the class attribute; its power value is never set, so the
            // original listing multiplied every product by zero
            if (j != data.classIndex()) {
                powerEntity[i] *= powerAttributes[i][j];
            }
        }
    }

    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
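A hedged usage sketch (applyCLIFF is protected, so calling code would live inside CLIFF or a subclass; that the percentage field is configurable beforehand is an assumption based on its use above):

    // inside de.ugoe.cs.cpdp.dataselection.CLIFF or a subclass;
    // assumption: this.percentage was configured, e.g. to 0.10 to retain 10%
    Instances filtered = applyCLIFF(traindata);
    System.out.println("kept " + filtered.numInstances() + " of "
        + traindata.numInstances() + " instances");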
From source file: de.ugoe.cs.cpdp.dataselection.DecisionTreeSelection.java
License: Apache License
@Override
public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final Instances data = characteristicInstances(testdata, traindataSet);

    final ArrayList<String> attVals = new ArrayList<String>();
    attVals.add("same");
    attVals.add("more");
    attVals.add("less");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = 0; j < data.numAttributes(); j++) {
        atts.add(new Attribute(data.attribute(j).name(), attVals));
    }
    atts.add(new Attribute("score"));
    Instances similarityData = new Instances("similarity", atts, 0);
    similarityData.setClassIndex(similarityData.numAttributes() - 1);

    try {
        Classifier classifier = new J48();
        for (int i = 0; i < traindataSet.size(); i++) {
            classifier.buildClassifier(traindataSet.get(i));
            for (int j = 0; j < traindataSet.size(); j++) {
                if (i != j) {
                    // data.get(0) is the test data, data.get(i + 1) the i-th training set
                    double[] similarity = new double[data.numAttributes() + 1];
                    for (int k = 0; k < data.numAttributes(); k++) {
                        if (0.9 * data.get(i + 1).value(k) > data.get(j + 1).value(k)) {
                            similarity[k] = 2.0; // nominal label "less"
                        }
                        else if (1.1 * data.get(i + 1).value(k) < data.get(j + 1).value(k)) {
                            similarity[k] = 1.0; // nominal label "more"
                        }
                        else {
                            similarity[k] = 0.0; // nominal label "same"
                        }
                    }

                    Evaluation eval = new Evaluation(traindataSet.get(j));
                    eval.evaluateModel(classifier, traindataSet.get(j));
                    similarity[data.numAttributes()] = eval.fMeasure(1);
                    similarityData.add(new DenseInstance(1.0, similarity));
                }
            }
        }

        REPTree repTree = new REPTree();
        if (repTree.getNumFolds() > similarityData.size()) {
            repTree.setNumFolds(similarityData.size());
        }
        // note: this unconditional call overrides the guard above; kept as in the original
        repTree.setNumFolds(2);
        repTree.buildClassifier(similarityData);

        Instances testTrainSimilarity = new Instances(similarityData);
        testTrainSimilarity.clear();
        for (int i = 0; i < traindataSet.size(); i++) {
            double[] similarity = new double[data.numAttributes() + 1];
            for (int k = 0; k < data.numAttributes(); k++) {
                if (0.9 * data.get(0).value(k) > data.get(i + 1).value(k)) {
                    similarity[k] = 2.0;
                }
                else if (1.1 * data.get(0).value(k) < data.get(i + 1).value(k)) {
                    similarity[k] = 1.0;
                }
                else {
                    similarity[k] = 0.0;
                }
            }
            testTrainSimilarity.add(new DenseInstance(1.0, similarity));
        }

        int bestScoringProductIndex = -1;
        double maxScore = Double.MIN_VALUE;
        for (int i = 0; i < traindataSet.size(); i++) {
            double score = repTree.classifyInstance(testTrainSimilarity.get(i));
            if (score > maxScore) {
                maxScore = score;
                bestScoringProductIndex = i;
            }
        }
        Instances bestScoringProduct = traindataSet.get(bestScoringProductIndex);
        traindataSet.clear();
        traindataSet.add(bestScoringProduct);
    }
    catch (Exception e) {
        Console.printerr("failure during DecisionTreeSelection: " + e.getMessage());
        throw new RuntimeException(e);
    }
}
From source file: de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License: Apache License
/**
 * Returns test and training data with only the project context factors which were chosen in
 * the configuration. This is later used for clustering.
 *
 * @param testdata
 * @param traindataSet
 * @return
 */
protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // set up weka Instances for clustering
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();

    // we only want the project context factors
    for (String pcf : this.project_context_factors) {
        atts.add(new Attribute(pcf));
    }

    // set up the data
    final Instances data = new Instances("project_context_factors", atts, 0);
    double[] instanceValues = new double[atts.size()];

    // only project context factors + only one instance per project needed
    int i = 0;
    for (String pcf : this.project_context_factors) {
        instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
        i++;
    }
    data.add(new DenseInstance(1.0, instanceValues));

    // now for the projects of the training set
    for (Instances traindata : traindataSet) {
        // without this reallocation, the same values would end up here every time?!
        instanceValues = new double[atts.size()];
        i = 0;
        for (String pcf : this.project_context_factors) {
            instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
            i++;
        }
        data.add(new DenseInstance(1.0, instanceValues));
    }

    return data;
}
From source file: de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License: Apache License
/**
 * Delete projects where the project context does not match the training project.
 *
 * @param testdata
 * @param traindataSet
 * @param attribute
 */
protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet,
                                  String attribute)
{
    Set<Instances> remove = new HashSet<Instances>();
    for (Instances traindata : traindataSet) {
        if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
            .firstInstance().value(testdata.attribute(attribute)))
        {
            remove.add(traindata);
        }
    }

    // now delete the projects from the set
    for (Instances i : remove) {
        traindataSet.remove(i);
    }
}
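Note that Instance.value(Attribute) returns the internal index of a nominal value, so the comparison above matches label indices, not label strings. A minimal sketch of the string-based equivalent (the attribute name "host" is hypothetical):

    // compare the actual label strings instead of the internal double indices
    Attribute testAtt = testdata.attribute("host");   // hypothetical context factor
    Attribute trainAtt = traindata.attribute("host");
    String testLabel = testdata.firstInstance().stringValue(testAtt);
    String trainLabel = traindata.firstInstance().stringValue(trainAtt);
    boolean sameContext = testLabel.equals(trainLabel);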