Example usage for weka.core Instances numInstances

List of usage examples for weka.core Instances numInstances

Introduction

In this page you can find the example usage for weka.core Instances numInstances.

Prototype


public int numInstances()

Source Link

Document

Returns the number of instances in the dataset.

Usage

From source file:decisiontree.MyC45.java

/**
* Splits a dataset according to the values of a numeric attribute.
*
* @param data the data which is to be split
* @param att the attribute to be used for splitting
* @return the sets of instances produced by the split
*//*from   w w  w .ja v  a2s  . c  om*/
/**
 * Splits a dataset in two according to a threshold on a numeric attribute.
 * Instances with value >= threshold go into splitData[1] (clamped to the
 * threshold), the rest into splitData[0] (value zeroed).
 *
 * @param data      the data which is to be split
 * @param att       the numeric attribute to split on
 * @param threshold the cut point for the binary split
 * @return a two-element array: [0] = values below threshold, [1] = values at or above
 */
private Instances[] splitData(Instances data, Attribute att, double threshold) {
    Instances[] splitData = new Instances[2];
    for (int i = 0; i < 2; i++) {
        splitData[i] = new Instances(data, data.numInstances());
    }

    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        // BUG FIX: the original called setValue() directly on the source
        // instance, silently corrupting the caller's dataset. Since this
        // method is invoked once per candidate threshold (see
        // computeThreshold), every evaluation after the first saw already
        // clamped/zeroed attribute values. Work on a copy instead.
        Instance inst = (Instance) ((Instance) instEnum.nextElement()).copy();
        if (inst.value(att) >= threshold) {
            inst.setValue(att, threshold);
            splitData[1].add(inst);
        } else {
            inst.setValue(att, 0);
            splitData[0].add(inst);
        }
    }
    for (int i = 0; i < splitData.length; i++) {
        splitData[i].compactify();
    }
    return splitData;
}

From source file:decisiontree.MyC45.java

/**
* Computes information gain ratio for a nominal attribute.
*
* @param data the data for which info gain is to be computed
* @param att the attribute//from  w  ww  . j a va  2 s .c o m
* @return the information gain ratio for the given attribute and data
* @throws Exception if computation fails
*/
/**
 * Computes the information gain ratio of a nominal attribute:
 * gain(attr) divided by the split information of the partition.
 * Falls back to the raw gain when the split information is zero.
 *
 * @param instances the data for which the gain ratio is to be computed
 * @param attr      the nominal attribute under consideration
 * @return the information gain ratio for the given attribute and data
 * @throws Exception if entropy computation fails
 */
private double computeGainRatio(Instances instances, Attribute attr) throws Exception {
    final double total = instances.numInstances();
    double infoGain = computeEntropy(instances);
    double splitInfo = 0.0;

    Instances[] subsets = splitData(instances, attr);
    for (int v = 0; v < attr.numValues(); v++) {
        int size = subsets[v].numInstances();
        if (size > 0) {
            double weight = size / total;
            // Subtract the size-weighted entropy of this subset from the gain.
            infoGain -= weight * computeEntropy(subsets[v]);
            if (weight != 0) {
                splitInfo += weight * Utils.log2(weight);
            }
        }
    }
    // splitInfo is a sum of p*log2(p) terms and thus <= 0; negate to ratio.
    return (splitInfo == 0) ? infoGain : -1 * infoGain / splitInfo;
}

From source file:decisiontree.MyC45.java

/**
* Computes information gain ratio for a numeric attribute.
*
* @param data the data for which info gain is to be computed
* @param att the attribute//www. ja va  2 s  .  c om
* @return the information gain ratio for the given attribute and data
* @throws Exception if computation fails
*/
/**
 * Computes the information gain ratio of a binary split of a numeric
 * attribute at the given threshold. Falls back to the raw gain when the
 * split information is zero.
 *
 * @param instances the data for which the gain ratio is to be computed
 * @param attr      the numeric attribute under consideration
 * @param threshold the candidate cut point
 * @return the information gain ratio for the given attribute, data and threshold
 * @throws Exception if entropy computation fails
 */
private double computeGainRatio(Instances instances, Attribute attr, double threshold) throws Exception {
    final double total = instances.numInstances();
    double infoGain = computeEntropy(instances);
    double splitInfo = 0.0;

    Instances[] halves = splitData(instances, attr, threshold);
    for (int side = 0; side < 2; side++) {
        int size = halves[side].numInstances();
        if (size > 0) {
            double weight = size / total;
            // Subtract the size-weighted entropy of this half from the gain.
            infoGain -= weight * computeEntropy(halves[side]);
            if (weight != 0) {
                splitInfo += weight * Utils.log2(weight);
            }
        }
    }
    // splitInfo is a sum of p*log2(p) terms and thus <= 0; negate to ratio.
    return (splitInfo == 0) ? infoGain : -1 * infoGain / splitInfo;
}

From source file:decisiontree.MyC45.java

/**
 * Replaces missing values: nominal attributes get their most frequent
 * (modal) value, numeric attributes get their mean.
 *
 * NOTE(review): despite the original local name "newData", no copy is made —
 * the argument itself is modified in place and then returned.
 *
 * @param data the dataset to repair (mutated in place)
 * @return the same dataset, with missing values imputed
 * @throws Exception if attribute statistics cannot be computed
 */
private Instances handleMissingValues(Instances data) throws Exception {
    // enumerateAttributes presumably skips the class attribute — confirm
    // against the Weka API if the class ever needs imputation too.
    Enumeration attrEnum = data.enumerateAttributes();
    while (attrEnum.hasMoreElements()) {
        Attribute attr = (Attribute) attrEnum.nextElement();
        int idx = attr.index();
        AttributeStats stats = data.attributeStats(idx);

        if (attr.isNominal()) {
            // Locate the index of the most frequent nominal value.
            int modal = 0;
            for (int v = 1; v < attr.numValues(); v++) {
                if (stats.nominalCounts[v] > stats.nominalCounts[modal]) {
                    modal = v;
                }
            }
            for (int i = 0, n = data.numInstances(); i < n; i++) {
                Instance inst = data.instance(i);
                if (inst.isMissing(idx)) {
                    inst.setValue(idx, modal);
                }
            }
        } else if (attr.isNumeric()) {
            double mean = stats.numericStats.mean;
            for (int i = 0, n = data.numInstances(); i < n; i++) {
                Instance inst = data.instance(i);
                if (inst.isMissing(idx)) {
                    inst.setValue(idx, mean);
                }
            }
        }
    }

    return data;
}

From source file:decisiontree.MyC45.java

// Finds the best binary-split threshold for a numeric attribute by trying
// candidate cut points midway between consecutive instances whose class
// labels differ (the classic C4.5 candidate set), and returning the one
// with the highest gain ratio.
//
// NOTE(review): this only makes sense if `instances` is already sorted by
// `attr` — midpoints between *consecutive* instances are otherwise
// meaningless. Confirm the sort happens at the call site.
// NOTE(review): gainRatio[] entries at non-boundary positions stay 0.0, so
// if every gain ratio is <= 0 (or no class boundary exists at all),
// Utils.maxIndex returns index 0 and the method returns threshold[0],
// which may be the default 0.0.
private double computeThreshold(Instances instances, Attribute attr) throws Exception {
    double[] threshold = new double[instances.numInstances()];
    double[] gainRatio = new double[instances.numInstances()];
    for (int i = 0; i < instances.numInstances() - 1; i++) {
        // Only positions where the class label changes are candidate cuts.
        if (instances.instance(i).classValue() != instances.instance(i + 1).classValue()) {
            threshold[i] = (instances.instance(i).value(attr) + instances.instance(i + 1).value(attr)) / 2;
            gainRatio[i] = computeGainRatio(instances, attr, threshold[i]);
        }
    }
    return (double) threshold[Utils.maxIndex(gainRatio)];
}

From source file:decisiontree.MyID3.java

// Recursively builds the ID3 tree for `data`, mutating this node's fields
// (splitAttr, leafValue, leafDist, classAttr, child) rather than returning
// a value.
private void makeTree(Instances data) {
    // Check if no instances have reached this node.
    if (data.numInstances() == 0) {
        // Empty leaf: NaN class value and an all-zero class distribution.
        splitAttr = null;
        leafValue = Double.NaN;
        leafDist = new double[data.numClasses()];
        return;
    }

    // Pure node: every instance shares one class value, so predict it.
    // NOTE(review): unlike the zero-gain leaf below, this branch sets
    // neither leafDist nor classAttr — confirm downstream code tolerates
    // those staying null/unset on pure leaves.
    if (data.numDistinctValues(data.classIndex()) == 1) {
        leafValue = data.firstInstance().classValue();
        return;
    }

    // Compute attribute with maximum information gain.
    double[] infoGains = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        infoGains[att.index()] = computeInfoGain(data, att);
    }
    splitAttr = data.attribute(maxIndex(infoGains));

    // Make leaf if information gain is zero.
    // Otherwise create successors.
    if (Utils.eq(infoGains[splitAttr.index()], 0)) {
        // Zero-gain leaf: predict the majority class, keeping the
        // normalized class distribution for probability estimates.
        splitAttr = null;
        leafDist = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            leafDist[(int) inst.classValue()]++;
        }
        normalize(leafDist);
        leafValue = Utils.maxIndex(leafDist);
        classAttr = data.classAttribute();
    } else {
        // Internal node: one child subtree per value of the split attribute.
        Instances[] splitData = splitData(data, splitAttr);
        child = new MyID3[splitAttr.numValues()];
        for (int j = 0; j < splitAttr.numValues(); j++) {
            child[j] = new MyID3();
            child[j].makeTree(splitData[j]);
        }
    }
}

From source file:decisiontree.MyID3.java

/**
 * Computes the information gain of splitting {@code data} on {@code att}:
 * the entropy of the full set minus the size-weighted entropies of the
 * subsets induced by each attribute value.
 *
 * @param data the data for which the gain is to be computed
 * @param att  the nominal attribute under consideration
 * @return the information gain for the given attribute and data
 */
private double computeInfoGain(Instances data, Attribute att) {
    double gain = computeEntropy(data);
    final double total = data.numInstances();
    for (Instances subset : splitData(data, att)) {
        int size = subset.numInstances();
        if (size > 0) {
            gain -= (size / total) * computeEntropy(subset);
        }
    }
    return gain;
}

From source file:decisiontree.MyID3.java

/**
 * Computes the entropy (in bits) of the class distribution of {@code data}.
 *
 * @param data the data whose class entropy is to be computed
 * @return -sum(p_c * log2(p_c)) over classes c with at least one instance
 */
private double computeEntropy(Instances data) {
    int[] classCount = new int[data.numClasses()];
    Enumeration<Instance> instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        // classValue() is the index of the nominal class label (the same
        // cast is used elsewhere in this file to index leafDist), so it can
        // serve directly as the bucket index. The original maintained a
        // redundant ArrayList + indexOf lookup, costing O(n) per instance.
        classCount[(int) instEnum.nextElement().classValue()]++;
    }

    final double total = data.numInstances();
    double entropy = 0.0;
    for (int count : classCount) {
        if (count > 0) {
            double p = count / total;
            entropy -= p * Utils.log2(p);
        }
    }
    return entropy;
}

From source file:decisiontree.MyID3.java

/**
 * Partitions a dataset by the values of a nominal attribute: subset j
 * receives every instance whose value for {@code att} is j.
 *
 * @param data the data which is to be split
 * @param att  the nominal attribute to partition by
 * @return one {@code Instances} per attribute value, compacted
 */
private Instances[] splitData(Instances data, Attribute att) {
    final int numValues = att.numValues();
    Instances[] partitions = new Instances[numValues];
    for (int v = 0; v < numValues; v++) {
        // Pre-size each partition with the full capacity; compactify trims it.
        partitions[v] = new Instances(data, data.numInstances());
    }

    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        partitions[(int) inst.value(att)].add(inst);
    }
    for (Instances partition : partitions) {
        partition.compactify();
    }
    return partitions;
}

From source file:decisiontreeclassifier.ITree2.java

/********************************************************************
 * Changes the missing data to 0.0. For the voting data set, this 
 * should be sufficient seeing as 0.00 is more or less random.
 ********************************************************************/
/********************************************************************
 * Replaces every missing attribute value with 0.0, in place. For the
 * voting data set this should be sufficient, seeing as 0.00 is more
 * or less random. Returns the same (mutated) dataset.
 ********************************************************************/
public Instances fixMissingData(Instances iToFix) {
    final int numInst = iToFix.numInstances();
    final int numAttr = iToFix.numAttributes();
    for (int row = 0; row < numInst; row++) {
        Instance inst = iToFix.instance(row);
        for (int col = 0; col < numAttr; col++) {
            if (inst.isMissing(col)) {
                inst.setValue(col, 0.0);
            }
        }
    }
    return iToFix;
}