Example usage for weka.core Instances add

List of usage examples for weka.core Instances add

Introduction

On this page you can find example usages of weka.core Instances.add.

Prototype

@Override
public boolean add(Instance instance) 

Source Link

Document

Adds one instance to the end of the set.

Usage

From source file:de.ugoe.cs.cpdp.dataselection.NeighborhoodFilter.java

License:Apache License

/**
 * <p>
 * Applies the relevancy filter after Ryu et al.
 * </p>
 *
 * @param testdata
 *            test data
 * @param traindata
 *            training data
 * @return filtered training data
 */
private Instances applyNeighborhoodFilter(Instances testdata, Instances traindata) {
    TreeSet<Integer> selectedInstances = new TreeSet<>();
    // cache per-training-instance distances so they are computed only once
    // per test instance (the original recomputed them in a second pass)
    double[] distances = new double[traindata.size()];
    for (int i = 0; i < testdata.size(); i++) {
        double minHam = Double.MAX_VALUE;
        for (int j = 0; j < traindata.size(); j++) {
            distances[j] = WekaUtils.hammingDistance(testdata.get(i), traindata.get(j));
            if (distances[j] < minHam) {
                minHam = distances[j];
            }
        }
        // select every training instance whose distance equals the minimum;
        // the TreeSet keeps the selected indices unique and sorted
        for (int j = 0; j < traindata.size(); j++) {
            if (distances[j] <= minHam) {
                selectedInstances.add(j);
            }
        }
    }
    // copy the header structure, then fill with the selected training instances
    // NOTE(review): header is taken from testdata — presumably test and training
    // data share the same attribute structure; confirm with callers
    Instances selectedTraindata = new Instances(testdata);
    selectedTraindata.clear();
    for (Integer index : selectedInstances) {
        selectedTraindata.add(traindata.instance(index));
    }
    return selectedTraindata;
}

From source file:de.ugoe.cs.cpdp.dataselection.SetWiseEMContextSelection.java

License:Apache License

/**
 * Returns test- and training data with only the project context factors which were chosen in
 * the configuration. This is later used for clustering.
 *
 * @param testdata
 *            test data of the target project
 * @param traindataSet
 *            training data sets of the candidate projects
 * @return data set with one instance per project, containing only the context factor values
 */
protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
    // setup weka Instances for clustering: one numeric attribute per
    // configured project context factor
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (String pcf : this.project_context_factors) {
        atts.add(new Attribute(pcf));
    }
    final Instances data = new Instances("project_context_factors", atts, 0);

    // only one instance per project is needed; first the test project,
    // then each project of the training set
    data.add(new DenseInstance(1.0, contextFactorValues(testdata, atts.size())));
    for (Instances traindata : traindataSet) {
        data.add(new DenseInstance(1.0, contextFactorValues(traindata, atts.size())));
    }

    return data;
}

/**
 * Reads the configured project context factor values from the first instance of a project's
 * data set.
 *
 * @param projectData
 *            data of one project
 * @param numFactors
 *            number of configured context factors
 * @return freshly allocated array with one value per context factor
 */
private double[] contextFactorValues(Instances projectData, int numFactors) {
    // a fresh array per project is required: DenseInstance keeps a reference
    // to the array, so reusing one buffer would make all instances share the
    // same (last) values
    double[] instanceValues = new double[numFactors];
    int i = 0;
    for (String pcf : this.project_context_factors) {
        instanceValues[i] = projectData.instance(0).value(projectData.attribute(pcf));
        i++;
    }
    return instanceValues;
}

From source file:de.ugoe.cs.cpdp.execution.AbstractCrossProjectExperiment.java

License:Apache License

/**
 * Helper method that combines a set of Weka {@link Instances} sets into a single
 * {@link Instances} set.
 *
 * @param traindataSet
 *            set of {@link Instances} to be combined
 * @return single {@link Instances} set (null if the input set is empty)
 */
public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) {
    Instances traindataFull = null;
    for (Instances traindata : traindataSet) {
        if (traindataFull == null) {
            // the first set defines the structure and contributes its data
            traindataFull = new Instances(traindata);
            continue;
        }
        // append all instances of every further set
        int numInstances = traindata.numInstances();
        for (int i = 0; i < numInstances; i++) {
            traindataFull.add(traindata.instance(i));
        }
    }
    return traindataFull;
}

From source file:de.ugoe.cs.cpdp.loader.AUDIChangeLoader.java

License:Apache License

@Override
public Instances load(File file) {
    // raw lines of the change metrics file and of the bug data file
    final String[] lines;
    String[] lineSplit;
    String[] lineSplitBug;

    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // information about bugs are in another file
    // NOTE(review): assumes the metrics file name ends in a fixed 14-character
    // suffix that is replaced by "repro.csv" — confirm the naming scheme
    String path = file.getAbsolutePath();
    path = path.substring(0, path.length() - 14) + "repro.csv";
    final String[] linesBug;
    try {
        linesBug = FileTools.getLinesFromFile(path);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // locate the SVN revision and bug count columns in the bug file header
    int revisionIndex = -1;
    int bugIndex = -1;
    lineSplitBug = linesBug[0].split(";");
    for (int j = 0; j < lineSplitBug.length; j++) {
        if (lineSplitBug[j].equals("svnrev")) {
            revisionIndex = j;
        }
        if (lineSplitBug[j].equals("num_bugs_trace")) {
            bugIndex = j;
        }
    }
    if (revisionIndex < 0) {
        throw new RuntimeException("could not find SVN revisions");
    }
    if (bugIndex < 0) {
        throw new RuntimeException("could not find bug information");
    }

    // locate the contiguous range of metric columns (lm_LOC .. h_E) in the
    // metrics file header
    int metricsStartIndex = -1;
    int metricsEndIndex = -1;
    lineSplit = lines[0].split(";");
    for (int j = 0; j < lineSplit.length; j++) {
        if (lineSplit[j].equals("lm_LOC")) {
            metricsStartIndex = j;
        }
        if (lineSplit[j].equals("h_E")) {
            metricsEndIndex = j;
        }
    }
    if (metricsStartIndex < 0) {
        throw new RuntimeException("could not find first metric, i.e., lm_LOC");
    }
    if (metricsEndIndex < 0) {
        throw new RuntimeException("could not find last metric, i.e., h_E");
    }
    int numMetrics = metricsEndIndex - metricsStartIndex + 1;

    // create sets of all filenames and revisions, mapped to their line index;
    // the sorted map keeps revisions of the same entity adjacent and in order,
    // which the delta computation below relies on
    SortedMap<EntityRevisionPair, Integer> entityRevisionPairs = new TreeMap<>();
    for (int i = 1; i < linesBug.length; i++) {
        lineSplitBug = linesBug[i].split(";");
        entityRevisionPairs
                .put(new EntityRevisionPair(lineSplitBug[0], Integer.parseInt(lineSplitBug[revisionIndex])), i);
    }

    // prepare weka instances: one "_delta" and one "_abs" attribute per metric,
    // plus a binary class attribute "bug"
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    lineSplit = lines[0].split(";");
    for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
        atts.add(new Attribute(lineSplit[j] + "_delta"));
    }
    for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
        atts.add(new Attribute(lineSplit[j] + "_abs"));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);

    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // create data: each instance describes the change between two consecutive
    // revisions of the same file
    String lastFile = null;
    double[] lastValues = null;
    int lastNumBugs = 0;
    for (Entry<EntityRevisionPair, Integer> entry : entityRevisionPairs.entrySet()) {
        try {
            // first get values
            lineSplit = lines[entry.getValue()].split(";");
            lineSplitBug = linesBug[entry.getValue()].split(";");
            int i = 0;
            double[] values = new double[numMetrics];
            for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
                values[i] = Double.parseDouble(lineSplit[j]);
                i++;
            }
            int numBugs = Integer.parseInt(lineSplitBug[bugIndex]);

            // then check if an entity must be created; the first revision of a
            // file only initializes lastValues and produces no instance
            if (entry.getKey().entity.equals(lastFile)) {
                // create new instance: metric deltas followed by absolute values
                double[] instanceValues = new double[2 * numMetrics + 1];
                for (int j = 0; j < numMetrics; j++) {
                    instanceValues[j] = values[j] - lastValues[j];
                    instanceValues[j + numMetrics] = values[j];
                }
                // check if any value>0, i.e., whether any metric increased
                boolean changeOccured = false;
                for (int j = 0; j < numMetrics; j++) {
                    if (instanceValues[j] > 0) {
                        changeOccured = true;
                    }
                }
                if (changeOccured) {
                    // class is 1 if the number of bugs increased, 0 otherwise
                    instanceValues[instanceValues.length - 1] = numBugs <= lastNumBugs ? 0 : 1;
                    data.add(new DenseInstance(1.0, instanceValues));
                }
            }
            lastFile = entry.getKey().entity;
            lastValues = values;
            lastNumBugs = numBugs;
        } catch (IllegalArgumentException e) {
            // NumberFormatException is an IllegalArgumentException, so malformed
            // lines are reported and skipped instead of aborting the load
            System.err.println("error in line " + entry.getValue() + ": " + e.getMessage());
            System.err.println("metrics line: " + lines[entry.getValue()]);
            System.err.println("bugs line: " + linesBug[entry.getValue()]);
            System.err.println("line is ignored");
        }
    }

    return data;
}

From source file:de.ugoe.cs.cpdp.loader.AUDIChangeLoader.java

License:Apache License

/**
 * Loads a semicolon-separated metrics file; bug information is read from the
 * accompanying "repro.csv" file.
 *
 * @param file metrics file to load
 * @param dummy unused parameter that distinguishes this overload
 * @return Weka {@link Instances} with numeric metric attributes and a binary
 *         class attribute "bug"
 */
public Instances load(File file, String dummy) {
    // read the metrics file
    final String[] metricLines;
    try {
        metricLines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // the bug information lives in a sibling "repro.csv" file
    final String basePath = file.getAbsolutePath();
    final String bugPath = basePath.substring(0, basePath.length() - 14) + "repro.csv";
    final String[] bugLines;
    try {
        bugLines = FileTools.getLinesFromFile(bugPath);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // determine how many leading columns to skip (3 or 4, depending on
    // whether a project_rev column is present); the last two columns are
    // always ignored as well
    final String[] header = metricLines[0].split(";");
    final int offset = header[3].equals("project_rev") ? 4 : 3;

    // one numeric attribute per metric column, plus a binary class "bug"
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int j = offset; j < header.length - 2; j++) {
        atts.add(new Attribute(header[j]));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);

    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // the bug flag column in the repro file depends on the header layout
    final int bugColumn = (offset == 3) ? 7 : 8;

    // parse the data rows; rows with empty metric cells are reported and skipped
    for (int i = 1; i < metricLines.length; i++) {
        final String[] row = metricLines[i].split(";");
        final String[] bugRow = bugLines[i].split(";");
        final double[] values = new double[data.numAttributes()];
        boolean validInstance = true;
        for (int j = 0; validInstance && j < values.length - 1; j++) {
            final String cell = row[j + offset].trim();
            if (cell.isEmpty()) {
                validInstance = false;
            } else {
                values[j] = Double.parseDouble(cell);
            }
        }
        // anything other than "0" in the bug column counts as buggy
        values[values.length - 1] = bugRow[bugColumn].equals("0") ? 0 : 1;

        if (validInstance) {
            data.add(new DenseInstance(1.0, values));
        } else {
            System.out.println("instance " + i + " is invalid");
        }
    }
    return data;
}

From source file:de.ugoe.cs.cpdp.loader.AUDIDataLoader.java

License:Apache License

/**
 * Loads a semicolon-separated metrics file; bug information is read from the
 * accompanying "repro.csv" file.
 *
 * @param file metrics file to load
 * @return Weka {@link Instances} with numeric metric attributes and a binary
 *         class attribute "bug"
 */
@Override
public Instances load(File file) {
    // load the metrics CSV
    final String[] lines;
    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // bug information is stored in an accompanying "repro.csv" file
    final String fullPath = file.getAbsolutePath();
    final String[] linesBug;
    try {
        linesBug = FileTools.getLinesFromFile(fullPath.substring(0, fullPath.length() - 14) + "repro.csv");
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // configure Instances
    String[] lineSplit = lines[0].split(";");
    // ignore the first three/four and the last two columns
    final int offset;
    if (lineSplit[3].equals("project_rev")) {
        offset = 4;
    } else {
        offset = 3;
    }
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    final int numMetrics = lineSplit.length - (offset + 2);
    for (int j = 0; j < numMetrics; j++) {
        atts.add(new Attribute(lineSplit[j + offset]));
    }
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);

    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // fetch data; rows with empty metric cells are reported and skipped
    for (int i = 1; i < lines.length; i++) {
        lineSplit = lines[i].split(";");
        final String[] lineSplitBug = linesBug[i].split(";");
        final double[] values = new double[data.numAttributes()];

        boolean validInstance = true;
        int j = 0;
        while (validInstance && j < values.length - 1) {
            final String cell = lineSplit[j + offset].trim();
            if (cell.isEmpty()) {
                validInstance = false;
            } else {
                values[j] = Double.parseDouble(cell);
            }
            j++;
        }

        // the bug flag column in the repro file depends on the header layout
        final int bugColumn = offset == 3 ? 7 : 8;
        values[values.length - 1] = lineSplitBug[bugColumn].equals("0") ? 0 : 1;

        if (validInstance) {
            data.add(new DenseInstance(1.0, values));
        } else {
            System.out.println("instance " + i + " is invalid");
        }
    }
    return data;
}

From source file:de.ugoe.cs.cpdp.loader.CSVMockusDataLoader.java

License:Apache License

/**
 * Loads a comma-separated data file; the first two columns are skipped and
 * the last column carries the bug information.
 *
 * @param file CSV file to load
 * @return Weka {@link Instances} with numeric metric attributes and a binary
 *         class attribute "bug"
 */
@Override
public Instances load(File file) {
    final String[] lines;
    try {
        lines = FileTools.getLinesFromFile(file.getAbsolutePath());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // one numeric attribute per metric column; columns 0 and 1 as well as
    // the last column are not metrics
    final String[] header = lines[0].split(",");
    final ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int col = 2; col < header.length - 1; col++) {
        atts.add(new Attribute(header[col]));
    }

    // binary class attribute: 0 = no bug, 1 = bug
    final ArrayList<String> classAttVals = new ArrayList<String>();
    classAttVals.add("0");
    classAttVals.add("1");
    final Attribute classAtt = new Attribute("bug", classAttVals);
    atts.add(classAtt);

    final Instances data = new Instances(file.getName(), atts, 0);
    data.setClass(classAtt);

    // parse the data rows
    for (int i = 1; i < lines.length; i++) {
        final String[] row = lines[i].split(",");
        final double[] values = new double[row.length - 2];
        for (int j = 0; j < values.length - 1; j++) {
            values[j] = Double.parseDouble(row[j + 2].trim());
        }
        // anything other than "0" in the last column counts as buggy
        values[values.length - 1] = row[row.length - 1].trim().equals("0") ? 0 : 1;
        data.add(new DenseInstance(1.0, values));
    }

    return data;
}

From source file:de.ugoe.cs.cpdp.loader.DecentDataLoader.java

License:Apache License

/**
 * Creates a Weka instance from an ARFFX model instance and appends it to the
 * given data set.
 *
 * @param dataSet
 *            Weka data set to which the converted instance is added
 * @param i
 *            ARFFX model instance to convert
 */
private void createWekaInstance(Instances dataSet, Instance i) {
    final double[] values = new double[dataSet.numAttributes()];
    int attIndex = 0;

    for (Value value : i.getValues()) {
        final String dataValue = value.getContent();
        final String attributeName = value.getOfAttribute().getName();

        // filtered attributes are skipped entirely
        if (attributeFilter.contains(attributeName)) {
            continue;
        }

        if (isLabel(attributeName) || isConfidenceLabel(attributeName)) {
            // LABEL.* and CONFIDENCE.* attributes are nominal: store the
            // index of the value within the attribute
            values[attIndex] = dataSet.attribute(attIndex).indexOfValue(dataValue);
        } else if (attributeName.equals("Artifact.Name")) {
            // artifact names are registered and mapped to a numeric index
            artifactNames.add(dataValue);
            values[attIndex] = getIndexOfArtifactName(dataValue);
        } else {
            // everything else is treated as a numeric value
            values[attIndex] = Double.parseDouble(dataValue);
        }

        attIndex++;
    }

    dataSet.add(new DenseInstance(1.0, values));
}

From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java

License:Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    // clustering only supports single-label setups
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // instantiate the clusterer from the configured class name and options
    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    // keep a copy that still contains the class attribute; the selected
    // exemplars are taken from this copy below
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);
    Instances centroids = clusterer.getClusterCentroids();

    // trainData is rebuilt below: it will hold one exemplar per cluster
    trainData.clear();

    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();

        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();

            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }

        // add selected instance to instances; taken from the copy that still
        // has the class attribute (clusterTrainData had it removed)
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}

From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java

License:Apache License

/**
 * Copies the Instances from the source Instances object to a new one, which only contains the 
 * currently tested features./*from  w ww  .j a v a2s .co m*/
 * 
 * @param source The Instances object containing all the Instance objects from the source file. 
 * @param targetStructure The list of {@link AbstractFeatureExtractor}s which is currently 
 *          being tested.
 * @return An instances object consisting of all Instance objects from the source file.  
 */
private Instances copyInstances(Instances source, ArrayList<Attribute> targetStructure) {
    Instances target = new Instances("ACResolution", targetStructure, 0);

    for (int i = 0; i < source.numInstances(); i++) {
        double[] vals = new double[targetStructure.size()];

        for (int z = 0; z < targetStructure.size(); z++) {
            vals[z] = getAttributeValue(source.instance(i), targetStructure.get(z).name());
        }
        Instance in = new DenseInstance(1.0, vals);
        target.add(in);
    }
    return target;
}