List of usage examples for weka.core Instances instance
public Instance instance(int index)
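The method returns the Instance stored at the given zero-based position of the dataset. Before the project examples below, here is a minimal sketch of the typical access pattern; the class name, attribute names, and values are illustrative only and are not taken from any of the source files listed afterwards.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class InstanceAccessExample {

    public static void main(String[] args) {
        // build a tiny in-memory dataset with two numeric attributes (hypothetical names)
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("metricA"));
        atts.add(new Attribute("metricB"));
        Instances data = new Instances("example", atts, 0);
        data.add(new DenseInstance(1.0, new double[] { 1.0, 2.0 }));
        data.add(new DenseInstance(1.0, new double[] { 3.0, 4.0 }));

        // access each row by index via instance(int) and read or modify its values
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            double sum = inst.value(0) + inst.value(1);
            inst.setValue(0, sum); // writes through to the dataset, as in the examples below
        }
    }
}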
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/**
 * <p>
 * Min-Max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param data
 *            data that is normalized
 */
public static void minMax(Instances data) {
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            double min = data.attributeStats(j).numericStats.min;
            double max = data.attributeStats(j).numericStats.max;
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                double newValue = (inst.value(j) - min) / (max - min);
                inst.setValue(j, newValue);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License:Apache License
/**
 * <p>
 * Internal helper function that applies the z-score transformation.
 * </p>
 */
private static void applyZScore(Instances data, double[] mean, double[] std) {
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (data.classIndex() != j) {
                // z-score is (x - mean) / std; the parentheses are required, otherwise
                // only mean[j] / std[j] would be subtracted
                instance.setValue(j, (instance.value(j) - mean[j]) / std[j]);
            }
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License:Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>();

    // Sort dataset (StateID is connected to the date of commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the hashmap
     * (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is already in the hashmap (this
     * means: it did not have a bug before). If so, add it to a new dataset and remove it from the
     * hashmap, so that new changes from "nonBug" -> "bug" for this file can be found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug at its
     * first occurrence, or this file only has bugs and no instance without a bug), then (if it is
     * not in the array list above) add it to the new dataset. This way it is possible to get the
     * first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        }
        else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) {
            if (!firstOccurenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurenceArtifactNames.add(artifactName);
            }
        }
    }

    // If we have a file that never had a bug (i.e., it is NOT in the newly created dataset,
    // but it is in the HashMap from above), add it to the new dataset
    double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);

    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesinNewDataSet.length; i++) {
            if (artifactNamesinNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }

    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
From source file:de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License:Apache License
/**
 * <p>
 * Applies TCA to the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applyTCA(Instances testdata, Instances traindata) {
    final int sizeTest = testdata.numInstances();
    final int sizeTrain = traindata.numInstances();
    final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata);
    final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in the paper
    final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the paper
    final double mu = 1.0; // default from the MATLAB implementation
    final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu);
    PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain);

    Console.traceln(Level.FINEST,
                    "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")");
    final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix)
        .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix)
        .multiplyRight(centerMatrix).multiplyRight(kernelMatrix);
    Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem");

    General eigenvalueDecomposition = new JamaEigenvalue.General();
    eigenvalueDecomposition.compute(optimizationProblem);
    Console.traceln(Level.FINEST, "eigenvalue problem solved");

    Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues();
    System.out.println(eigenvaluesArray.length);
    final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length];
    final int[] index = new int[(int) eigenvaluesArray.length];

    // create kernel transformation matrix from eigenvectors
    for (int i = 0; i < eigenvaluesArray.length; i++) {
        eigenvalues[i] = eigenvaluesArray.doubleValue(i);
        index[i] = i;
    }
    SortUtils.quicksort(eigenvalues, index);

    final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight(eigenvalueDecomposition
        .getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension)));

    // update testdata and traindata
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex()) {
            testdata.deleteAttributeAt(j);
            traindata.deleteAttributeAt(j);
        }
    }
    for (int j = 0; j < reducedDimension; j++) {
        testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
    }
    for (int i = 0; i < sizeTrain; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j));
        }
    }
    for (int i = 0; i < sizeTest; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j));
        }
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    final double probDefect = data.numInstances() / (double) counts[1];

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                }
                else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) /
                    (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            powerEntity[i] *= powerAttributes[i][j];
        }
    }
    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file:de.ugoe.cs.cpdp.dataselection.CLIFF.java
License:Apache License
/**
 * <p>
 * Gets the probabilities of a positive prediction for each range for a given attribute
 * </p>
 *
 * @param data
 *            the data
 * @param j
 *            index of the attribute
 * @param ranges
 *            the ranges
 * @return probabilities for each range
 */
private double[] getRangeProbabilities(Instances data, int j, double[] ranges) {
    double[] probDefectRange = new double[numRanges];
    int[] countRange = new int[numRanges];
    int[] countDefect = new int[numRanges];
    for (int i = 0; i < data.numInstances(); i++) {
        int range = determineRange(ranges, data.instance(i).value(j));
        countRange[range]++;
        if (data.instance(i).classValue() == 1) {
            countDefect[range]++;
        }
    }
    for (int k = 0; k < numRanges; k++) {
        probDefectRange[k] = ((double) countDefect[k]) / countRange[k];
    }
    return probDefectRange;
}
From source file:de.ugoe.cs.cpdp.dataselection.NeighborhoodFilter.java
License:Apache License
/**
 * <p>
 * Applies the relevancy filter after Ryu et al.
 * </p>
 *
 * @param testdata
 *            test data
 * @param traindata
 *            training data
 * @return filtered training data
 */
private Instances applyNeighborhoodFilter(Instances testdata, Instances traindata) {
    TreeSet<Integer> selectedInstances = new TreeSet<>();
    for (int i = 0; i < testdata.size(); i++) {
        double minHam = Double.MAX_VALUE;
        for (int j = 0; j < traindata.size(); j++) {
            double distance = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distance < minHam) {
                minHam = distance;
            }
        }
        for (int j = 0; j < traindata.size(); j++) {
            double distance = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distance <= minHam) {
                selectedInstances.add(j);
            }
        }
    }
    Instances selectedTraindata = new Instances(testdata);
    selectedTraindata.clear();
    for (Integer index : selectedInstances) {
        selectedTraindata.add(traindata.instance(index));
    }
    return selectedTraindata;
}
From source file:de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License:Apache License
/**
 * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context
 * factors. The project context factors are first normalized and then used for clustering. They
 * can be configured in the configuration param.
 *
 * @param testdata
 * @param traindataSet
 */
protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // now do the clustering; normalizedCharacteristicInstances calls getContextFactors
    final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
    final Instance targetInstance = data.instance(0);
    final List<Instance> candidateInstances = new LinkedList<Instance>();
    for (int i = 1; i < data.numInstances(); i++) {
        candidateInstances.add(data.instance(i));
    }

    // cluster and select
    try {
        final EM emeans = new EM();
        boolean onlyTarget = true;
        int targetCluster;
        int maxNumClusters = candidateInstances.size();
        do { // while(onlyTarget)
            emeans.setMaximumNumberOfClusters(maxNumClusters);
            emeans.buildClusterer(data);

            targetCluster = emeans.clusterInstance(targetInstance);

            // check if cluster only contains target project
            for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
                onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
            }
            maxNumClusters = emeans.numberOfClusters() - 1;

            // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
        }
        while (onlyTarget);

        Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
        Console.traceln(Level.INFO, "instances before clustering: " + traindataSet.size());
        int numRemoved = 0;
        for (int i = 0; i < candidateInstances.size(); i++) {
            if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
                traindataSet.remove(i - numRemoved++);
            }
        }
        Console.traceln(Level.INFO, "instances after clustering: " + traindataSet.size());
    }
    catch (Exception e) {
        throw new RuntimeException("error applying setwise EM clustering training data selection", e);
    }
}
From source file:de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License:Apache License
/**
 * Returns test- and training data with only the project context factors which were chosen in
 * the configuration. This is later used for clustering.
 *
 * @param testdata
 * @param traindataSet
 * @return
 */
protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // setup weka Instances for clustering
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();

    // we only want the project context factors
    for (String pcf : this.project_context_factors) {
        atts.add(new Attribute(pcf));
    }

    // set up the data
    final Instances data = new Instances("project_context_factors", atts, 0);
    double[] instanceValues = new double[atts.size()];

    // only project context factors + only one instance per project needed
    int i = 0;
    for (String pcf : this.project_context_factors) {
        instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
        // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
        i++;
    }
    data.add(new DenseInstance(1.0, instanceValues));

    // now for the projects of the training set
    for (Instances traindata : traindataSet) {
        instanceValues = new double[atts.size()]; // without this, the same values would be reused every time
        i = 0;
        for (String pcf : this.project_context_factors) {
            instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
            // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
            i++;
        }
        data.add(new DenseInstance(1.0, instanceValues));
    }

    return data;
}
From source file:de.ugoe.cs.cpdp.execution.AbstractCrossProjectExperiment.java
License:Apache License
/**
 * Helper method that combines a set of Weka {@link Instances} sets into a single
 * {@link Instances} set.
 *
 * @param traindataSet
 *            set of {@link Instances} to be combined
 * @return single {@link Instances} set
 */
public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) {
    Instances traindataFull = null;
    for (Instances traindata : traindataSet) {
        if (traindataFull == null) {
            traindataFull = new Instances(traindata);
        }
        else {
            for (int i = 0; i < traindata.numInstances(); i++) {
                traindataFull.add(traindata.instance(i));
            }
        }
    }
    return traindataFull;
}