Example usage for weka.core Instances numAttributes

List of usage examples for weka.core Instances numAttributes

Introduction

On this page you can find example usage for weka.core Instances numAttributes.

Prototype


public int numAttributes() 

Document

Returns the number of attributes.
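A minimal sketch of calling numAttributes() on a loaded dataset; the ARFF path is a placeholder and the class-index convention is an assumption, not part of the original page.

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class NumAttributesExample {
    public static void main(String[] args) throws Exception {
        // "data.arff" is a placeholder path; numAttributes() also counts the class attribute.
        Instances data = new Instances(new BufferedReader(new FileReader("data.arff")));
        data.setClassIndex(data.numAttributes() - 1); // common convention: last attribute is the class
        System.out.println("attributes: " + data.numAttributes());
        System.out.println("instances:  " + data.numInstances());
    }
}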

Usage

From source file: de.ugoe.cs.cpdp.dataselection.MahalanobisOutlierRemoval.java

License: Apache License

/**
 * <p>
 * Removes all instances whose Mahalanobis distance to the mean of the data is greater than
 * epsilon.
 * </p>
 *
 * @param data
 *            data where the outliers are removed
 */
private void applyMahalanobisDistancesRemoval(Instances data) {
    RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1);
    for (int i = 0; i < data.size(); i++) {
        values.setRow(i, WekaUtils.instanceValues(data.get(i)));
    }
    RealMatrix inverseCovariance;
    try {
        inverseCovariance = new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver()
                .getInverse();
    } catch (SingularMatrixException e) {
        Console.traceln(Level.WARNING,
                "could not perform Mahalanobis outlier removal due to singular covariance matrix");
        return;
    }
    // create mean vector
    double[] meanValues = new double[data.numAttributes() - 1];
    int k = 0;
    for (int j = 0; j < data.numAttributes(); j++) {
        if (j != data.classIndex()) {
            meanValues[k] = data.attributeStats(j).numericStats.mean;
            k++;
        }
    }

    for (int i = data.size() - 1; i >= 0; i--) {
        double distance = mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)),
                meanValues);
        if (distance > epsilon) {
            data.remove(i);
        }
    }
}
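The mahalanobisDistance helper called above is not included in this snippet. A minimal sketch of what it presumably computes (an assumption, using the same Apache Commons Math types as the method above): sqrt((x - mean)^T * S^-1 * (x - mean)).

// Hypothetical helper matching the call above; not part of the original source.
// Requires org.apache.commons.math3.linear.ArrayRealVector and RealVector.
private double mahalanobisDistance(RealMatrix inverseCovariance, double[] x, double[] mean) {
    RealVector diff = new ArrayRealVector(x).subtract(new ArrayRealVector(mean));
    // d^2 = (x - mean)^T * S^-1 * (x - mean)
    return Math.sqrt(inverseCovariance.preMultiply(diff).dotProduct(diff));
}
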

From source file: de.ugoe.cs.cpdp.dataselection.SynonymOutlierRemoval.java

License: Apache License

/**
 * <p>
 * Applies the synonym outlier removal.
 * </p>
 *
 * @param traindata
 *            data from which the outliers are removed.
 */
public void applySynonymRemoval(Instances traindata) {
    double minDistance[][] = new double[traindata.size()][traindata.numAttributes() - 1];
    double minDistanceAttribute[] = new double[traindata.numAttributes() - 1];
    double distance;
    for (int j = 0; j < minDistanceAttribute.length; j++) {
        minDistanceAttribute[j] = Double.MAX_VALUE;
    }
    // determine the minimal per-instance and per-attribute distances over all instances
    for (int i1 = 0; i1 < traindata.size(); i1++) {
        int k = 0;
        for (int j = 0; j < traindata.numAttributes(); j++) {
            if (j != traindata.classIndex()) {
                minDistance[i1][k] = Double.MAX_VALUE;
                for (int i2 = 0; i2 < traindata.size(); i2++) {
                    if (i1 != i2) {
                        distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j));
                        if (distance < minDistance[i1][k]) {
                            minDistance[i1][k] = distance;
                        }
                        if (distance < minDistanceAttribute[k]) {
                            minDistanceAttribute[k] = distance;
                        }
                    }
                }
                k++;
            }
        }
    }
    for (int i = traindata.size() - 1; i >= 0; i--) {
        boolean hasClosest = false;
        for (int j = 0; !hasClosest && j < minDistance[i].length; j++) {
            hasClosest = minDistance[i][j] <= minDistanceAttribute[j];
        }
        if (!hasClosest) {
            traindata.delete(i);
        }
    }
}

From source file: de.ugoe.cs.cpdp.loader.AUDIChangeLoader.java

License: Apache License

public Instances load(File file, String dummy) {
    final String[] lines;
    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // information about bugs is in another file
    String path = file.getAbsolutePath();
    path = path.substring(0, path.length() - 14) + "repro.csv";
    final String[] linesBug;
    try {
        linesBug = FileTools.getLinesFromFile(path);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // configure Instances
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();

    String[] lineSplit = lines[0].split(";");
    // ignore first three/four and last two columns
    int offset;
    if (lineSplit[3].equals("project_rev")) {
        offset = 4;
    } else {
        offset = 3;
    }
    for (int j = 0; j < lineSplit.length - (offset + 2); j++) {
        atts.add(new Attribute(lineSplit[j + offset]));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);

    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // fetch data
    for (int i = 1; i < lines.length; i++) {
        boolean validInstance = true;
        lineSplit = lines[i].split(";");
        String[] lineSplitBug = linesBug[i].split(";");
        double[] values = new double[data.numAttributes()];
        for (int j = 0; validInstance && j < values.length - 1; j++) {
            if (lineSplit[j + offset].trim().isEmpty()) {
                validInstance = false;
            } else {
                values[j] = Double.parseDouble(lineSplit[j + offset].trim());
            }
        }
        if (offset == 3) {
            values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1;
        } else {
            values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1;
        }

        if (validInstance) {
            data.add(new DenseInstance(1.0, values));
        } else {
            System.out.println("instance " + i + " is invalid");
        }
    }
    return data;
}

From source file: de.ugoe.cs.cpdp.loader.AUDIDataLoader.java

License: Apache License

@Override
public Instances load(File file) {
    final String[] lines;
    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // information about bugs is in another file
    String path = file.getAbsolutePath();
    path = path.substring(0, path.length() - 14) + "repro.csv";
    final String[] linesBug;
    try {
        linesBug = FileTools.getLinesFromFile(path);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // configure Instances
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();

    String[] lineSplit = lines[0].split(";");
    // ignore first three/four and last two columns
    int offset;
    if (lineSplit[3].equals("project_rev")) {
        offset = 4;
    } else {
        offset = 3;
    }
    for (int j = 0; j < lineSplit.length - (offset + 2); j++) {
        atts.add(new Attribute(lineSplit[j + offset]));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);

    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // fetch data
    for (int i = 1; i < lines.length; i++) {
        boolean validInstance = true;
        lineSplit = lines[i].split(";");
        String[] lineSplitBug = linesBug[i].split(";");
        double[] values = new double[data.numAttributes()];
        for (int j = 0; validInstance && j < values.length - 1; j++) {
            if (lineSplit[j + offset].trim().isEmpty()) {
                validInstance = false;
            } else {
                values[j] = Double.parseDouble(lineSplit[j + offset].trim());
            }
        }
        if (offset == 3) {
            values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1;
        } else {
            values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1;
        }

        if (validInstance) {
            data.add(new DenseInstance(1.0, values));
        } else {
            System.out.println("instance " + i + " is invalid");
        }
    }
    return data;
}

From source file: de.ugoe.cs.cpdp.loader.DecentDataLoader.java

License: Apache License

/**
 * Creates a WekaInstance from an ARFFX Model Instance
 *
 * @param dataSet
 *            WekaInstance dataset, where the arffx model instances should be added to
 * @param i
 *            arffx model instance
 */
private void createWekaInstance(Instances dataSet, Instance i) {
    double[] values = new double[dataSet.numAttributes()];
    int j = 0;

    for (Value value : i.getValues()) {
        String dataValue = value.getContent();
        String attributeName = value.getOfAttribute().getName();

        if (attributeFilter.contains(attributeName)) {
            continue;
        }

        // Is value a LABEL.* attribute?
        if (isLabel(attributeName)) {
            values[j] = dataSet.attribute(j).indexOfValue(dataValue);
        } else if (isConfidenceLabel(attributeName)) {
            // Is value a CONFIDENCE.* attribute?
            values[j] = dataSet.attribute(j).indexOfValue(dataValue);
        } else if (attributeName.equals("Artifact.Name")) {
            // Is it the name of the artifact?
            artifactNames.add(dataValue);
            values[j] = getIndexOfArtifactName(dataValue);
        } else {
            // Is it a numeric value?
            values[j] = Double.parseDouble(dataValue);
        }

        j++;
    }

    DenseInstance inst = new DenseInstance(1.0, values);
    dataSet.add(inst);
}

From source file: de.ugoe.cs.cpdp.loader.NetgeneLoader.java

License: Apache License

@Override
public Instances load(File fileMetricsFile) {
    // first determine all files
    String path = fileMetricsFile.getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;

    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values)
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isNominal()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        } catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    } else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }
        // fix string attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isString()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        } catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    } else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }

        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }
        // merge with network data
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                            networkData.instance(i).value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                }
            }
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    } catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}

From source file: de.ugoe.cs.cpdp.training.MetricMatchingTraining.java

License: Apache License

/**
 * We need the test data instances to do the metric matching, so in this special case we get this
 * data before evaluation.
 */
@Override
public void apply(SetUniqueList<Instances> traindataSet, Instances testdata) {
    // reset these for each run
    this.mm = null;
    this.classifier = null;

    double score = 0; // matching score to select the best matching training data from the set
    int num = 0;
    int biggest_num = 0;
    MetricMatch tmp;
    for (Instances traindata : traindataSet) {
        num++;

        tmp = new MetricMatch(traindata, testdata);

        // metric selection may create error, continue to next training set
        try {
            tmp.attributeSelection();
            tmp.matchAttributes(this.method, this.threshold);
        } catch (Exception e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }

        // we only select the training data from our set with the most matching attributes
        if (tmp.getScore() > score && tmp.attributes.size() > 0) {
            score = tmp.getScore();
            this.mm = tmp;
            biggest_num = num;
        }
    }

    // if we have found a matching instance we use it, log information about the match for
    // additional eval later
    Instances ilist = null;
    if (this.mm != null) {
        ilist = this.mm.getMatchedTrain();
        Console.traceln(Level.INFO,
                "[MATCH FOUND] match: [" + biggest_num + "], score: [" + score + "], instances: ["
                        + ilist.size() + "], attributes: [" + this.mm.attributes.size() + "], ilist attrs: ["
                        + ilist.numAttributes() + "]");
        for (Map.Entry<Integer, Integer> attmatch : this.mm.attributes.entrySet()) {
            Console.traceln(Level.INFO,
                    "[MATCHED ATTRIBUTE] source attribute: ["
                            + this.mm.train.attribute(attmatch.getKey()).name() + "], target attribute: ["
                            + this.mm.test.attribute(attmatch.getValue()).name() + "]");
        }
    } else {
        Console.traceln(Level.INFO, "[NO MATCH FOUND]");
    }

    // if we have a match we build the MetricMatchingClassifier, if not we fall back to FixClass
    // Classifier
    try {
        if (this.mm != null) {
            this.classifier = new MetricMatchingClassifier();
            this.classifier.buildClassifier(ilist);
            ((MetricMatchingClassifier) this.classifier).setMetricMatching(this.mm);
        } else {
            this.classifier = new FixClass();
            this.classifier.buildClassifier(ilist); // this is null, but the FixClass Classifier
                                                    // does not use it anyway
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}

From source file: de.ugoe.cs.cpdp.util.WekaUtils.java

License: Apache License

/**
 * <p>
 * Calculates the distributional characteristics of the pairwise distances between the instances
 * within a data set.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @return characteristics
 */
public static DistChar datasetDistance(Instances data) {
    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    double max = Double.MIN_VALUE;
    int numCmp = 0;
    int l = 0;
    double[] inst1 = new double[data.numAttributes() - 1];
    double[] inst2 = new double[data.numAttributes() - 1];
    EuclideanDistance euclideanDistance = new EuclideanDistance();
    for (int i = 0; i < data.numInstances(); i++) {
        l = 0;
        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != data.classIndex()) {
                inst1[l] = data.instance(i).value(k);
                l++;
            }
        }
        for (int j = 0; j < data.numInstances(); j++) {
            if (j != i) {
                l = 0;
                for (int k = 0; k < data.numAttributes(); k++) {
                    if (k != data.classIndex()) {
                        inst2[l] = data.instance(j).value(k);
                        l++;
                    }
                }
                distance = euclideanDistance.compute(inst1, inst2);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }
    double mean = sumAll / numCmp;
    double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
    return new DistChar(mean, std, min, max, data.numInstances());
}
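A possible call site for this utility (hypothetical; the DistChar field names are assumed from the constructor arguments above and are not confirmed by the snippet):

// Hypothetical usage; 'mean', 'std', 'min', and 'max' are assumed DistChar field names.
DistChar dc = WekaUtils.datasetDistance(data);
Console.traceln(Level.INFO, "mean distance: " + dc.mean + ", std: " + dc.std
        + ", min: " + dc.min + ", max: " + dc.max);
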

From source file: de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java

License: Apache License

/**
 * Creates an Instance object for the specified List of Features.
 * <br>
 * Extracts the Instance objects from a source file and suppresses all features but the ones 
 * specified.
 * 
 * @param fileName File to the training results in ARFF format.
 * @param features List of {@link AbstractFeatureExtractor}s which are currently being tested.
 * @return Instances object consisting of the desired attribute structure.
 * @throws Exception If the ARFF file couldn't be read, an exception is thrown.
 */
public Instances createInstances(String fileName, List<AbstractFeatureExtractor> features) throws Exception {
    final Instances train = new Instances(new BufferedReader(new FileReader(fileName)));
    ArrayList<Attribute> newAttributes = new ArrayList<Attribute>();

    for (int i = 0; i < train.numAttributes(); i++) {
        for (AbstractFeatureExtractor feature : features) {
            if (train.attribute(i).name().equals(feature.getName())) {
                newAttributes.add(train.attribute(i));

                continue;
            }
        }
    }

    /* 
     * add the last two attributes (ACR-System + correct/false predictions) as those
     * are not features gathered by a FeatureExtractor.
     */
    newAttributes.add(train.attribute(train.numAttributes() - 2));
    newAttributes.add(train.attribute(train.numAttributes() - 1));
    Instances trainCopy = copyInstances(train, newAttributes);
    trainCopy.setClassIndex(trainCopy.numAttributes() - 1);

    return trainCopy;
}
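The copyInstances helper used above is not shown here. A minimal sketch of what such a helper might do (an assumption): build a new Instances header from copies of the selected attributes and copy over the matching values.

// Hypothetical helper, not from the original source: keeps only the given attributes.
private Instances copyInstances(Instances source, ArrayList<Attribute> attributes) {
    // Build a new header from copies of the selected attributes.
    ArrayList<Attribute> header = new ArrayList<Attribute>();
    for (Attribute att : attributes) {
        header.add((Attribute) att.copy());
    }
    Instances copy = new Instances(source.relationName(), header, source.numInstances());
    for (int i = 0; i < source.numInstances(); i++) {
        double[] values = new double[attributes.size()];
        for (int j = 0; j < attributes.size(); j++) {
            // attributes.get(j).index() is the column index in the source data
            values[j] = source.instance(i).value(attributes.get(j).index());
        }
        copy.add(new DenseInstance(1.0, values));
    }
    return copy;
}
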

From source file: de.uniheidelberg.cl.swp.mlprocess.WEKARunner.java

License: Apache License

/**
 * Evaluates our classifier with a test set.
 * <br>
 * Not used yet.
 *
 * @param testArff ARFF file to evaluate against.
 * @throws Exception If the evaluation couldn't be initialized.
 */
public void buildEvaluation(String testArff) throws Exception {
    Instances evalIns = new Instances(new BufferedReader(new FileReader(testArff)));
    evalIns.setClassIndex(evalIns.numAttributes() - 1);
    evaluation = new Evaluation(train);
}