Example usage for weka.core Instances instance

List of usage examples for weka.core Instances instance

Introduction

In this page you can find the example usage for weka.core Instances instance.

Prototype



public Instance instance(int index)

Source Link

Document

Returns the instance at the given position.

Usage

From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java

License:Apache License

/**
 * Loads the file metrics of a project and merges them with the network metrics and bug
 * counts stored in sibling CSV files.
 * <p>
 * The project name is the part of the metrics file name before the first underscore. The
 * files {@code <project>_bugs_per_file.csv} and {@code <project>_network_metrics.csv} are read
 * from the same directory as the metrics file.
 * </p>
 *
 * @param fileMetricsFile
 *            CSV file with the file-level metrics; its first attribute is used as the file name
 *            key for merging
 * @return the merged data with a nominal class attribute {@code bug} ("0"/"1") as last
 *         attribute, or {@code null} if one of the files could not be read
 */
@Override
public Instances load(File fileMetricsFile) {
    // resolve against the absolute file so that a bare relative file name (no parent
    // component) does not yield a null parent directory
    File directory = fileMetricsFile.getAbsoluteFile().getParentFile();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(directory, project + "_bugs_per_file.csv");
    File networkMetrics = new File(directory, project + "_network_metrics.csv");
    Instances metricsData = null;

    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // columns that were read as nominal or string (e.g., because of NA values) are
        // converted into numeric columns; the first two attributes are left untouched
        for (int j = 2; j < networkData.numAttributes(); j++) {
            Attribute attribute = networkData.attribute(j);
            if (attribute.isNominal() || attribute.isString()) {
                replaceWithNumericAttribute(networkData, j);
            }
        }

        // maps the file name (first attribute of the metrics data) to its row index
        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }

        // merge with network data; rows are matched via the second network attribute
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Instance networkInstance = networkData.instance(i);
                Integer instanceIndex = filenames.get(networkInstance.stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                            networkInstance.value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            Instance bugInstance = bugsData.instance(i);
            // only rows with a positive value in the third attribute mark a file as buggy
            if (bugInstance.value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugInstance.stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);
        // looked up after the deletion above, so the index is already up to date
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            metricsData.deleteAttributeAt(eigenvector.index());
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0; for the class attribute this selects the label "0"
        for (int i = 0; i < metricsData.size(); i++) {
            Instance instance = metricsData.instance(i);
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (instance.isMissing(j)) {
                    instance.setValue(j, 0.0d);
                }
            }
        }
    } catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}

/**
 * Replaces the attribute at the given index with a numeric attribute of the same name.
 * Values that are missing or cannot be parsed as a number become 0.0.
 *
 * @param data
 *            data whose attribute is replaced in place
 * @param attributeIndex
 *            index of the nominal or string attribute to convert
 */
private static void replaceWithNumericAttribute(Instances data, int attributeIndex) {
    String attributeName = data.attribute(attributeIndex).name();
    double[] numericValues = new double[data.size()];

    // the values have to be buffered, because deleting the attribute discards them
    for (int i = 0; i < data.size(); i++) {
        Instance instance = data.instance(i);
        if (!instance.isMissing(attributeIndex)) {
            try {
                numericValues[i] = Double.parseDouble(instance.stringValue(attributeIndex));
            } catch (NumberFormatException e) {
                // not a number, using 0.0
                numericValues[i] = 0.0;
            }
        }
        // missing values keep the array default of 0.0
    }

    data.deleteAttributeAt(attributeIndex);
    data.insertAttributeAt(new Attribute(attributeName), attributeIndex);
    for (int i = 0; i < data.size(); i++) {
        data.instance(i).setValue(attributeIndex, numericValues[i]);
    }
}

From source file:de.ugoe.cs.cpdp.util.WekaUtils.java

License:Apache License

/**
 * <p>
 * Calculates the distributional characteristics of the distances the instances within a data
 * set have to each other.
 * </p>
 * <p>
 * Every ordered pair of distinct instances is compared, i.e., each unordered pair contributes
 * twice. The class attribute, if one is set, is excluded from the distance computation.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @return characteristics (mean, standard deviation, minimum and maximum of the distances,
 *         and the number of instances); mean and standard deviation are NaN if the data
 *         contains fewer than two instances
 */
public static DistChar datasetDistance(Instances data) {
    final int numInstances = data.numInstances();
    // without a class attribute (class index < 0) every attribute is a feature
    final int numFeatures =
        data.classIndex() >= 0 ? data.numAttributes() - 1 : data.numAttributes();

    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    // Double.MIN_VALUE is the smallest positive double and therefore not a valid lower bound
    double max = -Double.MAX_VALUE;
    int numCmp = 0;
    double[] inst1 = new double[numFeatures];
    double[] inst2 = new double[numFeatures];
    EuclideanDistance euclideanDistance = new EuclideanDistance();
    for (int i = 0; i < numInstances; i++) {
        fillFeatureVector(data, i, inst1);
        for (int j = 0; j < numInstances; j++) {
            if (j != i) {
                fillFeatureVector(data, j, inst2);
                distance = euclideanDistance.compute(inst1, inst2);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }
    double mean = sumAll / numCmp;
    // rounding errors can make the sum of squared deviations slightly negative, which would
    // turn the square root into NaN; NaN itself (no comparisons) is passed through unchanged
    double squaredDeviations = Math.max(0.0, sumAllQ - (sumAll * sumAll) / numCmp);
    double std = Math.sqrt(squaredDeviations * (1.0d / (numCmp - 1)));
    return new DistChar(mean, std, min, max, numInstances);
}

/**
 * Copies the values of all attributes except the class attribute of one instance into the
 * target array, in attribute order.
 *
 * @param data
 *            data that contains the instance
 * @param instanceIndex
 *            index of the instance whose values are copied
 * @param target
 *            array that receives the values; must have one entry per non-class attribute
 */
private static void fillFeatureVector(Instances data, int instanceIndex, double[] target) {
    int targetIndex = 0;
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != data.classIndex()) {
            target[targetIndex++] = data.instance(instanceIndex).value(k);
        }
    }
}

From source file:de.ugoe.cs.cpdp.util.WekaUtils.java

License:Apache License

/**
 * <p>
 * Calculates the distributional characteristics of the distances of a single attribute the
 * instances within a data set have to each other.
 * </p>
 * <p>
 * The distance between two instances is the absolute difference of their values for the
 * attribute. Every ordered pair of distinct instances is compared, i.e., each unordered pair
 * contributes twice.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @param index
 *            attribute for which the distances are characterized
 * @return characteristics (mean, standard deviation, minimum and maximum of the distances,
 *         and the number of instances); mean and standard deviation are NaN if the data
 *         contains fewer than two instances
 */
public static DistChar attributeDistance(Instances data, int index) {
    final int numInstances = data.numInstances();
    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    // Double.MIN_VALUE is the smallest positive double and therefore not a valid lower bound
    double max = -Double.MAX_VALUE;
    int numCmp = 0;
    double value1, value2;
    for (int i = 0; i < numInstances; i++) {
        value1 = data.instance(i).value(index);
        for (int j = 0; j < numInstances; j++) {
            if (j != i) {
                value2 = data.instance(j).value(index);
                distance = Math.abs(value1 - value2);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }
    double mean = sumAll / numCmp;
    // rounding errors can make the sum of squared deviations slightly negative, which would
    // turn the square root into NaN; NaN itself (no comparisons) is passed through unchanged
    double squaredDeviations = Math.max(0.0, sumAllQ - (sumAll * sumAll) / numCmp);
    double std = Math.sqrt(squaredDeviations * (1.0d / (numCmp - 1)));
    return new DistChar(mean, std, min, max, numInstances);
}

From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java

License:Apache License

/**
 * Creates a new data set named "ACResolution" that contains one instance per source instance,
 * restricted to the attributes of the currently tested feature set.
 * <p>
 * Each value is looked up by attribute name via {@code getAttributeValue}; every created
 * instance has weight 1.0.
 * </p>
 *
 * @param source The Instances object containing all the Instance objects from the source file.
 * @param targetStructure The attributes that make up the currently tested feature set, in the
 *          order in which they appear in the result.
 * @return A new Instances object with one Instance per Instance of the source.
 */
private Instances copyInstances(Instances source, ArrayList<Attribute> targetStructure) {
    final int attributeCount = targetStructure.size();
    final Instances reduced = new Instances("ACResolution", targetStructure, 0);

    for (int row = 0; row < source.numInstances(); row++) {
        final Instance original = source.instance(row);
        final double[] projected = new double[attributeCount];

        int column = 0;
        for (Attribute attribute : targetStructure) {
            projected[column++] = getAttributeValue(original, attribute.name());
        }
        reduced.add(new DenseInstance(1.0, projected));
    }
    return reduced;
}

From source file:de.uniheidelberg.cl.swp.mlprocess.WEKARunner.java

License:Apache License

/**
 * Classifies every instance of the given data and returns a labeled copy.
 * <p>
 * The passed Instances object itself is not modified; the predicted class values are written
 * to a copy of it.
 * </p>
 *
 * @param unkIns Instances whose class values are unknown.
 * @return A copy of the Instances in which each instance carries its predicted class value.
 * @throws Exception If the Instances couldn't be labeled.
 */
public Instances labelUnknownInstances(Instances unkIns) throws Exception {
    final Instances labeled = new Instances(unkIns);

    int position = 0;
    while (position < unkIns.numInstances()) {
        final double predictedLabel = classifier.classifyInstance(unkIns.instance(position));
        labeled.instance(position).setClassValue(predictedLabel);
        position++;
    }
    return labeled;
}

From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java

License:Open Source License

/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options.
 * <p>
 * The method works on a copy of the passed data. It first computes statistics
 * for the full data set, then picks distinct random instances as initial
 * centroids and finally alternates between assigning instances to clusters
 * and recomputing the centroids until the assignments no longer change or
 * the maximum number of iterations is reached.
 * </p>
 * 
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    // work on a copy so that the caller's data is left untouched
    Instances instances = new Instances(data);

    // clustering is unsupervised: unset the class attribute
    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    // statistics over the full data set
    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    // with updateClusterInfo == false only the centroid values are returned,
    // no cluster-specific fields are modified
    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common
                                                     // value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder) {
        // expose the assignments array; it is filled in by the loop below
        m_Assignments = clusterAssignments;
    }

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    // keys of the instances already chosen as centroid; the values are unused
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    // the initialization below swaps instances; if the order has to be
    // preserved, the swaps are done on a copy
    Instances initInstances = null;
    if (m_PreserveOrder) {
        initInstances = new Instances(instances);
    } else {
        initInstances = instances;
    }

    // draw random instances from the not yet visited range [0, j]; an instance
    // only becomes a centroid if no instance with the same hash key was chosen before
    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
        instIndex = RandomO.nextInt(j + 1);
        hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
        if (!initC.containsKey(hk)) {
            m_ClusterCentroids.add(initInstances.instance(instIndex));
            initC.put(hk, null);
        }
        // move the drawn instance out of the range of the following draws
        initInstances.swap(j, instIndex);

        if (m_ClusterCentroids.numInstances() == m_NumClusters) {
            break;
        }
    }

    // fewer distinct instances than requested clusters may have been found
    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    // members of each cluster in the current iteration
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        // assignment step: any changed assignment means another iteration is required
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            int newC = clusterProcessedInstance(toCluster, true);
            if (newC != clusterAssignments[i]) {
                converged = false;
            }
            clusterAssignments[i] = newC;
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                // appends the new centroid to m_ClusterCentroids
                moveCentroid(i, tempI[i], true);
            }
        }

        // stop after the configured number of iterations even without convergence
        if (m_Iterations == m_MaxIterations) {
            converged = true;
        }

        if (emptyClusterCount > 0) {
            // empty clusters are dropped
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                // final iteration: compact the member lists and the nominal
                // counts so that they only contain the non-empty clusters
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index] = tempI[k];

                        for (i = 0; i < tempI[k].numAttributes(); i++) {
                            m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
                        }
                        index++;
                    }
                }
                tempI = t;
            } else {
                // the member lists are rebuilt in the next iteration anyway
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            // reset the per-cluster statistics for the next iteration
            m_squaredErrors = new double[m_NumClusters];
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            // standard deviation per numeric attribute; missing for all other attributes
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Instance.missingValue();
                }
            }
            m_ClusterStdDevs.add(new Instance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }

    // Save memory!!
    m_DistanceFunction.clean();
}

From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid
 * coordinates based on it's members (objects assigned to the cluster of the
 * centroid) and the distance function being used.
 * /*from  w  w  w. j  av a  2 s . c  om*/
 * @param centroidIndex index of the centroid which the coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance
            || m_DistanceFunction instanceof CustomPairWiseDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        // in case of Euclidian distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance
                || m_DistanceFunction instanceof CustomPairWiseDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                if (dataIsEven) {
                    vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Instance.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Instance.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo) {
        m_ClusterCentroids.add(new Instance(1.0, vals));
    }
    return vals;
}

From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedoids.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid
 * coordinates based on it's members (objects assigned to the cluster of the
 * centroid) and the distance function being used.
 * /*  w  w w  . j  a va2s  . c om*/
 * @param centroidIndex index of the centroid which the coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    if (!updateClusterInfo) {
        vals[0] = 100D;
        return vals;
    }

    double smallestError = Double.MAX_VALUE;
    Instance currentCentroid = null;

    for (int j = 0; j < members.numInstances(); j++) {

        Instance currentInstance = members.instance(j);
        double distanceError = 0D;
        for (int i = 0; i < members.numInstances(); i++) {
            distanceError += m_DistanceFunction.distance(currentInstance, members.instance(i));
        }
        if (distanceError < smallestError) {
            smallestError = distanceError;
            currentCentroid = currentInstance;
        }
    }

    vals[0] = currentCentroid.valueSparse(0);

    for (int j = 0; j < members.numAttributes(); j++) {
        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Instance.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Instance.missingValue(); // mark mean as missing
                }
            }
        }
    }

    if (updateClusterInfo) {
        m_ClusterCentroids.add(new Instance(1.0, vals));
    }
    return vals;
}

From source file:decisiontree.ID3tree.java

/**
 * Determines the attribute to split on, i.e., the attribute with the smallest entropy.
 * The last attribute is not considered as split candidate.
 *
 * @param inst the instances for which the split attribute is determined
 * @return the index of the attribute with the smallest entropy below 5.0, or 5 if no
 *         attribute has an entropy below 5.0
 */
private int calculateSplit(Instances inst) {
    final int numAttr = inst.numAttributes();
    final int numInst = inst.numInstances();

    // calculateEntropy works on a list, so all instances are copied into one
    ArrayList<Instance> subset = new ArrayList<Instance>(numInst);
    for (int i = 0; i < numInst; i++) {
        subset.add(inst.instance(i));
    }

    // NOTE(review): both the fallback index 5 and the initial threshold 5.0 are kept from
    // the original code; they look arbitrary - confirm whether the callers rely on them
    int splitVal = 5;
    double smallestEntropy = 5.0;
    for (int j = 0; j < numAttr - 1; j++) {
        double entropy = calculateEntropy(subset, numAttr, j);
        // strict comparison: on ties the attribute with the smaller index wins
        if (smallestEntropy > entropy) {
            smallestEntropy = entropy;
            splitVal = j;
        }
    }

    return splitVal;
}

From source file:decisiontree.MyC45.java

/**
 * Replaces missing values: nominal attributes get the index of their most frequent value,
 * numeric attributes get their mean. Attributes of other types are left unchanged.
 * <p>
 * NOTE(review): the data is not copied; the passed Instances object is modified in place and
 * returned - confirm that the callers expect this.
 * </p>
 *
 * @param data the data whose missing values are replaced
 * @return the passed data object, with the missing values replaced
 * @throws Exception declared by the signature; nothing in this method throws it explicitly
 */
private Instances handleMissingValues(Instances data) throws Exception {
    Instances result = data;

    for (Enumeration attributes = result.enumerateAttributes(); attributes.hasMoreElements();) {
        Attribute current = (Attribute) attributes.nextElement();
        int position = current.index();
        AttributeStats stats = result.attributeStats(position);

        double replacement;
        if (current.isNominal()) {
            // first value with the highest count
            int mostFrequent = 0;
            for (int v = 0; v < current.numValues(); v++) {
                if (stats.nominalCounts[v] > stats.nominalCounts[mostFrequent]) {
                    mostFrequent = v;
                }
            }
            replacement = mostFrequent;
        } else if (current.isNumeric()) {
            replacement = stats.numericStats.mean;
        } else {
            // neither nominal nor numeric: nothing to replace
            continue;
        }

        for (int row = 0; row < result.numInstances(); row++) {
            if (result.instance(row).isMissing(position)) {
                result.instance(row).setValue(position, replacement);
            }
        }
    }

    return result;
}