List of usage examples for weka.core.Instances.numInstances()
public int numInstances()
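The examples below all use numInstances() as the upper bound of an index-based loop over the rows of an Instances object. The following minimal, self-contained sketch illustrates that pattern; the file name "iris.arff" is only a placeholder and any ARFF file readable by Weka works the same way.

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NumInstancesDemo {
    public static void main(String[] args) throws Exception {
        // load an ARFF file; the path is a placeholder
        Instances data = DataSource.read("iris.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // numInstances() returns the number of rows currently in the data set
        System.out.println("number of instances: " + data.numInstances());

        // the pattern used throughout the examples below: index-based iteration
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            // ... process inst ...
        }
    }
}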
From source file: de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License: Apache License
/**
 * <p>
 * Min-Max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param data
 *            data that is normalized
 */
public static void minMax(Instances data) {
    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.classIndex() != j) {
            double min = data.attributeStats(j).numericStats.min;
            double max = data.attributeStats(j).numericStats.max;
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                double newValue = (inst.value(j) - min) / (max - min);
                inst.setValue(j, newValue);
            }
        }
    }
}
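A minimal usage sketch for minMax, assuming NormalizationUtil is on the classpath and the snippet runs inside a method that may throw Exception; the ARFF path is a placeholder:

Instances data = DataSource.read("project.arff"); // placeholder path
data.setClassIndex(data.numAttributes() - 1);
NormalizationUtil.minMax(data); // scales every non-class attribute to [0, 1] in place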
From source file: de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java
License: Apache License
/**
 * <p>
 * Internal helper function that applies the z-score transformation to all non-class attributes.
 * </p>
 */
private static void applyZScore(Instances data, double[] mean, double[] std) {
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < data.numAttributes(); j++) {
            if (data.classIndex() != j) {
                // z-score: subtract the mean first, then divide by the standard deviation
                instance.setValue(j, (instance.value(j) - mean[j]) / std[j]);
            }
        }
    }
}
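The helper expects one mean and one standard deviation per attribute. A minimal sketch of how such arrays could be computed with plain Weka calls (the array names are illustrative and not part of NormalizationUtil's API):

double[] mean = new double[data.numAttributes()];
double[] std = new double[data.numAttributes()];
for (int j = 0; j < data.numAttributes(); j++) {
    if (j != data.classIndex()) {
        // weka.experiment.Stats provides mean and standard deviation per numeric attribute
        mean[j] = data.attributeStats(j).numericStats.mean;
        std[j] = data.attributeStats(j).numericStats.stdDev;
    }
}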
From source file: de.ugoe.cs.cpdp.dataprocessing.SimulationFilter.java
License: Apache License
@Override
public void apply(Instances testdata, Instances traindata) {
    Instances newDataSet = new Instances(traindata);
    traindata.delete();

    HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>();

    // This is to add all data where the first occurrence of the file has a bug
    ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>();

    // Sort the data set (StateID is connected to the date of commit: a lower StateID
    // means an earlier commit than a higher StateID)
    Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID");
    newDataSet.sort(wekaAttribute);

    /*
     * Logical summary: If there is an instance that does not have a bug, put it into the hashmap
     * (only unique values in there).
     *
     * If there is an instance that has a bug, look up whether it is already in the hashmap (this
     * means: it did not have a bug before). If so, add it to the new data set and remove it from
     * the hashmap, so that further changes from "nonBug" -> "bug" for this file can be found.
     *
     * If the instance has a bug and is not in the hashmap (this means: the file has a bug at its
     * first occurrence, or this file only has bugs and no instance without a bug), then (if it is
     * not in the array list above) add it to the new data set. This way it is possible to get the
     * first occurrence of a file which has a bug.
     */
    for (int i = 0; i < newDataSet.numInstances(); i++) {
        Instance wekaInstance = newDataSet.instance(i);

        double newBugLabel = wekaInstance.classValue();
        Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name");
        Double artifactName = wekaInstance.value(wekaArtifactName);

        if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) {
            artifactNames.put(artifactName, wekaInstance);
        }
        else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) {
            traindata.add(wekaInstance);
            artifactNames.remove(artifactName);
        }
        else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) {
            if (!firstOccurenceArtifactNames.contains(artifactName)) {
                traindata.add(wekaInstance);
                firstOccurenceArtifactNames.add(artifactName);
            }
        }
    }

    // If a file never had a bug (i.e., it is NOT in the newly created data set,
    // but it is in the hashmap from above), add it to the new data set
    double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0);
    HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames);
    for (Double artifactName : artifactNames.keySet()) {
        for (int i = 0; i < artifactNamesinNewDataSet.length; i++) {
            if (artifactNamesinNewDataSet[i] == artifactName) {
                artifactNamesCopy.remove(artifactName);
            }
        }
    }
    for (Double artifact : artifactNamesCopy.keySet()) {
        traindata.add(artifactNamesCopy.get(artifact));
    }
}
From source file: de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License: Apache License
/**
 * <p>
 * Applies TCA to the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 */
private void applyTCA(Instances testdata, Instances traindata) {
    final int sizeTest = testdata.numInstances();
    final int sizeTrain = traindata.numInstances();
    final PrimitiveMatrix kernelMatrix = buildKernel(testdata, traindata);
    final PrimitiveMatrix kernelNormMatrix = buildKernelNormMatrix(sizeTest, sizeTrain); // L in the paper
    final PrimitiveMatrix centerMatrix = buildCenterMatrix(sizeTest, sizeTrain); // H in the paper
    final double mu = 1.0; // default from the MATLAB implementation
    final PrimitiveMatrix muMatrix = buildMuMatrix(sizeTest, sizeTrain, mu);
    PrimitiveMatrix.FACTORY.makeEye(sizeTest + sizeTrain, sizeTest + sizeTrain);

    Console.traceln(Level.FINEST,
                    "creating optimization matrix (dimension " + (sizeTest + sizeTrain) + ")");
    final PrimitiveMatrix optimizationProblem = kernelMatrix.multiplyRight(kernelNormMatrix)
        .multiplyRight(kernelMatrix).add(muMatrix).invert().multiplyRight(kernelMatrix)
        .multiplyRight(centerMatrix).multiplyRight(kernelMatrix);
    Console.traceln(Level.FINEST, "optimization matrix created, now solving eigenvalue problem");

    General eigenvalueDecomposition = new JamaEigenvalue.General();
    eigenvalueDecomposition.compute(optimizationProblem);
    Console.traceln(Level.FINEST, "eigenvalue problem solved");

    Array1D<ComplexNumber> eigenvaluesArray = eigenvalueDecomposition.getEigenvalues();
    System.out.println(eigenvaluesArray.length);
    final Double[] eigenvalues = new Double[(int) eigenvaluesArray.length];
    final int[] index = new int[(int) eigenvaluesArray.length];
    // create kernel transformation matrix from eigenvectors
    for (int i = 0; i < eigenvaluesArray.length; i++) {
        eigenvalues[i] = eigenvaluesArray.doubleValue(i);
        index[i] = i;
    }
    SortUtils.quicksort(eigenvalues, index);

    final PrimitiveMatrix transformedKernel = kernelMatrix.multiplyRight(
        eigenvalueDecomposition.getV().selectColumns(Arrays.copyOfRange(index, 0, reducedDimension)));

    // update testdata and traindata
    for (int j = testdata.numAttributes() - 1; j >= 0; j--) {
        if (j != testdata.classIndex()) {
            testdata.deleteAttributeAt(j);
            traindata.deleteAttributeAt(j);
        }
    }
    for (int j = 0; j < reducedDimension; j++) {
        testdata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
        traindata.insertAttributeAt(new Attribute("kerneldim" + j), 1);
    }
    for (int i = 0; i < sizeTrain; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            traindata.instance(i).setValue(j + 1, transformedKernel.get(i, j));
        }
    }
    for (int i = 0; i < sizeTest; i++) {
        for (int j = 0; j < reducedDimension; j++) {
            testdata.instance(i).setValue(j + 1, transformedKernel.get(i + sizeTrain, j));
        }
    }
}
From source file: de.ugoe.cs.cpdp.dataprocessing.TransferComponentAnalysis.java
License: Apache License
/**
 * <p>
 * Creates the kernel matrix of the test and training data.
 * </p>
 *
 * @param testdata
 *            the test data
 * @param traindata
 *            the training data
 * @return kernel matrix
 */
private PrimitiveMatrix buildKernel(Instances testdata, Instances traindata) {
    final int kernelDim = traindata.numInstances() + testdata.numInstances();

    Builder<PrimitiveMatrix> kernelBuilder = PrimitiveMatrix.getBuilder(kernelDim, kernelDim);
    // build upper left quadrant (source, source)
    for (int i = 0; i < traindata.numInstances(); i++) {
        for (int j = 0; j < traindata.numInstances(); j++) {
            kernelBuilder.set(i, j, linearKernel(traindata.get(i), traindata.get(j)));
        }
    }
    // build upper right quadrant (source, target)
    for (int i = 0; i < traindata.numInstances(); i++) {
        for (int j = 0; j < testdata.numInstances(); j++) {
            kernelBuilder.set(i, j + traindata.numInstances(),
                              linearKernel(traindata.get(i), testdata.get(j)));
        }
    }
    // build lower left quadrant (target, source)
    for (int i = 0; i < testdata.numInstances(); i++) {
        for (int j = 0; j < traindata.numInstances(); j++) {
            kernelBuilder.set(i + traindata.numInstances(), j,
                              linearKernel(testdata.get(i), traindata.get(j)));
        }
    }
    // build lower right quadrant (target, target)
    for (int i = 0; i < testdata.numInstances(); i++) {
        for (int j = 0; j < testdata.numInstances(); j++) {
            kernelBuilder.set(i + traindata.numInstances(), j + traindata.numInstances(),
                              linearKernel(testdata.get(i), testdata.get(j)));
        }
    }
    return kernelBuilder.build();
}
From source file: de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License
/**
 * <p>
 * Applies the CLIFF relevancy filter to the data.
 * </p>
 *
 * @param data
 *            the data
 * @return CLIFF-filtered data
 */
protected Instances applyCLIFF(Instances data) {
    final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
    final double[] powerEntity = new double[data.size()];

    final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
    // probability that an instance is defective: defective count divided by the total count
    final double probDefect = counts[1] / (double) data.numInstances();

    for (int j = 0; j < data.numAttributes(); j++) {
        if (data.attribute(j) != data.classAttribute()) {
            final double[] ranges = getRanges(data, j);
            final double[] probDefectRange = getRangeProbabilities(data, j, ranges);

            for (int i = 0; i < data.numInstances(); i++) {
                final double value = data.instance(i).value(j);
                final int range = determineRange(ranges, value);
                double probClass, probNotClass, probRangeClass, probRangeNotClass;
                if (data.instance(i).classValue() == 1) {
                    probClass = probDefect;
                    probNotClass = 1.0 - probDefect;
                    probRangeClass = probDefectRange[range];
                    probRangeNotClass = 1.0 - probDefectRange[range];
                }
                else {
                    probClass = 1.0 - probDefect;
                    probNotClass = probDefect;
                    probRangeClass = 1.0 - probDefectRange[range];
                    probRangeNotClass = probDefectRange[range];
                }
                powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) /
                    (probRangeClass * probClass + probRangeNotClass * probNotClass);
            }
        }
    }

    for (int i = 0; i < data.numInstances(); i++) {
        powerEntity[i] = 1.0;
        for (int j = 0; j < data.numAttributes(); j++) {
            // skip the class attribute, whose column of the power matrix is never filled
            if (data.attribute(j) != data.classAttribute()) {
                powerEntity[i] *= powerAttributes[i][j];
            }
        }
    }
    double[] sortedPower = powerEntity.clone();
    Arrays.sort(sortedPower);
    double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];

    final Instances selected = new Instances(data);
    selected.delete();
    for (int i = 0; i < data.numInstances(); i++) {
        if (powerEntity[i] >= cutOff) {
            selected.add(data.instance(i));
        }
    }
    return selected;
}
From source file: de.ugoe.cs.cpdp.dataselection.CLIFF.java
License: Apache License
/**
 * <p>
 * Gets the probabilities of a positive prediction for each range for a given attribute.
 * </p>
 *
 * @param data
 *            the data
 * @param j
 *            index of the attribute
 * @param ranges
 *            the ranges
 * @return probabilities for each range
 */
private double[] getRangeProbabilities(Instances data, int j, double[] ranges) {
    double[] probDefectRange = new double[numRanges];
    int[] countRange = new int[numRanges];
    int[] countDefect = new int[numRanges];
    for (int i = 0; i < data.numInstances(); i++) {
        int range = determineRange(ranges, data.instance(i).value(j));
        countRange[range]++;
        if (data.instance(i).classValue() == 1) {
            countDefect[range]++;
        }
    }
    for (int k = 0; k < numRanges; k++) {
        probDefectRange[k] = ((double) countDefect[k]) / countRange[k];
    }
    return probDefectRange;
}
From source file: de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java
License: Apache License
/**
 * Uses the Weka EM clustering algorithm to cluster the projects by their project context
 * factors. The project context factors are first normalized and then used for clustering. They
 * can be configured in the configuration param.
 *
 * @param testdata
 * @param traindataSet
 */
protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // now do the clustering; normalizedCharacteristicInstances calls getContextFactors internally
    final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
    final Instance targetInstance = data.instance(0);
    final List<Instance> candidateInstances = new LinkedList<Instance>();
    for (int i = 1; i < data.numInstances(); i++) {
        candidateInstances.add(data.instance(i));
    }

    // cluster and select
    try {
        final EM emeans = new EM();
        boolean onlyTarget = true;
        int targetCluster;
        int maxNumClusters = candidateInstances.size();
        do { // while(onlyTarget)
            emeans.setMaximumNumberOfClusters(maxNumClusters);
            emeans.buildClusterer(data);

            targetCluster = emeans.clusterInstance(targetInstance);

            // check whether the cluster contains only the target project
            for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
                onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
            }
            maxNumClusters = emeans.numberOfClusters() - 1;
            // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
        }
        while (onlyTarget);

        Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
        Console.traceln(Level.INFO, "instances before clustering: " + traindataSet.size());
        int numRemoved = 0;
        for (int i = 0; i < candidateInstances.size(); i++) {
            if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
                traindataSet.remove(i - numRemoved++);
            }
        }
        Console.traceln(Level.INFO, "instances after clustering: " + traindataSet.size());
    }
    catch (Exception e) {
        throw new RuntimeException("error applying setwise EM clustering training data selection", e);
    }
}
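The selection above relies on Weka's EM clusterer. A minimal, stand-alone sketch of the calls involved (buildClusterer, clusterInstance, numberOfClusters), assuming a data set of context factors without a class index and a method that may throw Exception; the ARFF path is a placeholder:

Instances contexts = DataSource.read("contexts.arff"); // placeholder path, no class index set
EM em = new EM(); // weka.clusterers.EM
em.buildClusterer(contexts);
System.out.println("number of clusters: " + em.numberOfClusters());
int clusterOfFirst = em.clusterInstance(contexts.instance(0)); // cluster of the first instance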
From source file: de.ugoe.cs.cpdp.execution.AbstractCrossProjectExperiment.java
License: Apache License
/**
 * Helper method that combines a set of Weka {@link Instances} sets into a single
 * {@link Instances} set.
 *
 * @param traindataSet
 *            set of {@link Instances} to be combined
 * @return single {@link Instances} set
 */
public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) {
    Instances traindataFull = null;
    for (Instances traindata : traindataSet) {
        if (traindataFull == null) {
            traindataFull = new Instances(traindata);
        }
        else {
            for (int i = 0; i < traindata.numInstances(); i++) {
                traindataFull.add(traindata.instance(i));
            }
        }
    }
    return traindataFull;
}
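A brief usage sketch for makeSingleTrainingSet, assuming Apache commons-collections4 provides the SetUniqueList used by the surrounding code and that both ARFF files (placeholder paths) share the same attribute structure:

SetUniqueList<Instances> traindataSet = SetUniqueList.setUniqueList(new LinkedList<Instances>());
traindataSet.add(DataSource.read("projectA.arff")); // placeholder path
traindataSet.add(DataSource.read("projectB.arff")); // placeholder path

Instances traindataFull = AbstractCrossProjectExperiment.makeSingleTrainingSet(traindataSet);
System.out.println("combined size: " + traindataFull.numInstances());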
From source file: de.ugoe.cs.cpdp.util.WekaUtils.java
License: Apache License
/**
 * <p>
 * Calculates the distributional characteristics of the distances the instances within a data
 * set have to each other.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @return characteristics
 */
public static DistChar datasetDistance(Instances data) {
    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    double max = Double.MIN_VALUE;
    int numCmp = 0;
    int l = 0;
    double[] inst1 = new double[data.numAttributes() - 1];
    double[] inst2 = new double[data.numAttributes() - 1];
    EuclideanDistance euclideanDistance = new EuclideanDistance();
    for (int i = 0; i < data.numInstances(); i++) {
        l = 0;
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != data.classIndex()) {
                // copy all non-class attribute values; l tracks the position in the array
                inst1[l++] = data.instance(i).value(k);
            }
        }
        for (int j = 0; j < data.numInstances(); j++) {
            if (j != i) {
                l = 0;
                for (int k = 0; k < data.numAttributes(); k++) {
                    if (k != data.classIndex()) {
                        inst2[l++] = data.instance(j).value(k);
                    }
                }
                distance = euclideanDistance.compute(inst1, inst2);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }
    double mean = sumAll / numCmp;
    double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
    return new DistChar(mean, std, min, max, data.numInstances());
}
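A short usage sketch for datasetDistance; the exact package and field names of the returned DistChar value object are not shown above, so it is treated as an opaque result here:

Instances data = DataSource.read("project.arff"); // placeholder path
data.setClassIndex(data.numAttributes() - 1);
DistChar characteristics = WekaUtils.datasetDistance(data); // mean/std/min/max of pairwise distances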