Example usage for weka.core Instances numClasses

Introduction

In this page you can find the example usage for weka.core Instances numClasses.

Prototype


public int numClasses()

Source Link

Document

Returns the number of class labels.

Usage

From source file:j48.BinC45Split.java

License:Open Source License

/**
 * Creates split on numeric attribute./*ww  w  .j  a v  a2  s .com*/
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    int firstMiss;
    int next = 1;
    int last = 0;
    int index = 0;
    int splitIndex = -1;
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (instance.isMissing(m_attIndex))
            break;
        m_distribution.add(1, instance);
        i++;
    }
    firstMiss = i;

    // Compute minimum number of Instances required in each
    // subset.
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
        minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25))
        minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
        return;

    // Compute values of criteria for all possible split
    // indices.
    defaultEnt = m_infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

        if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next)
                .value(m_attIndex)) {

            // Move class values for all Instances up to next 
            // possible split point.
            m_distribution.shiftRange(1, 0, trainInstances, last, next);

            // Check if enough Instances in each subset and compute
            // values for criteria.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
                    && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
                currentInfoGain = m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
                if (Utils.gr(currentInfoGain, m_infoGain)) {
                    m_infoGain = currentInfoGain;
                    splitIndex = next - 1;
                }
                index++;
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split?
    if (index == 0)
        return;

    // Compute modified information gain for best split.
    m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
        return;

    // Set instance variables' values to values for
    // best split.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex)
            + trainInstances.instance(splitIndex).value(m_attIndex)) / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
        m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distributioN for best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}

From source file:j48.C45PruneableClassifierTreeG.java

License:Open Source License

/**
 * Finds new nodes that improve accuracy and grafts them onto the tree.
 *
 * <p>Outline of what the body does:
 * <ol>
 *   <li>determines the class and Laplace value of this leaf (falling back to
 *       the values supplied by the parent when the leaf is empty);</li>
 *   <li>splits {@code fulldata} into {@code l} (instances at the leaf,
 *       selected by {@code iindex[0]}) and {@code n} (the "atbop" instances,
 *       selected by {@code iindex[1]});</li>
 *   <li>returns early unless some class other than the leaf class passes the
 *       Laplace and {@code biprob} checks on {@code n};</li>
 *   <li>for every non-class attribute collects candidate {@code GraftSplit}s
 *       (two one-sided tests for numeric attributes, one equality test per
 *       value for nominal attributes);</li>
 *   <li>sorts the candidates, drops the leading ones that predict the leaf
 *       class, builds each remaining graft on {@code l} and passes the list
 *       to the parent via {@code setDescendents}.</li>
 * </ol>
 *
 * <p>Side effects visible in this method: the leading entries of
 * {@code iindex[0]} and {@code iindex[1]} are overwritten (compacted) so that
 * they line up with {@code l} and {@code n}; a failure while constructing a
 * {@code GraftSplit} terminates the JVM through {@code System.exit(1)};
 * {@code parent} is cast to {@code C45PruneableClassifierTreeG}.
 *
 * @param fulldata the instances in whole trainset
 * @param iindex records num tests each instance has failed up to this node
 * @param limits the upper/lower limits for numeric attributes
 *        ({@code limits[a][0]} is used as the lower and {@code limits[a][1]}
 *        as the upper limit; for nominal attributes {@code limits[a][1] == 1}
 *        is treated as "already used")
 * @param parent the node immediately before the current one
 * @param pLaplace laplace for leaf, calculated by parent (in case leaf empty)
 * @param pLeafClass class of leaf, determined by parent (in case leaf empty)
 * @throws Exception declared by the signature; propagated from the helpers
 *         called outside the local try/catch blocks
 */
private void findGraft(Instances fulldata, double[][] iindex, double[][] limits, ClassifierTree parent,
        double pLaplace, int pLeafClass) throws Exception {

    // get the class for this leaf
    int leafClass = (m_isEmpty) ? pLeafClass : localModel().distribution().maxClass();

    // get the laplace value for this leaf
    double leafLaplace = (m_isEmpty) ? pLaplace : laplaceLeaf(leafClass);

    // sort the instances into those at the leaf, those in atbop, and discarded
    Instances l = new Instances(fulldata, fulldata.numInstances());
    Instances n = new Instances(fulldata, fulldata.numInstances());
    int lcount = 0;
    int acount = 0;
    for (int x = 0; x < fulldata.numInstances(); x++) {
        // discarded: no positive weight in either row of iindex
        if (iindex[0][x] <= 0 && iindex[1][x] <= 0)
            continue;
        if (iindex[0][x] != 0) {
            l.add(fulldata.instance(x));
            l.instance(lcount).setWeight(iindex[0][x]);
            // move instance's weight in iindex to same index as in l
            iindex[0][lcount++] = iindex[0][x];
        }
        if (iindex[1][x] > 0) {
            n.add(fulldata.instance(x));
            n.instance(acount).setWeight(iindex[1][x]);
            // move instance's weight in iindex to same index as in n
            iindex[1][acount++] = iindex[1][x];
        }
    }

    // weighted class distribution of the atbop instances
    boolean graftPossible = false;
    double[] classDist = new double[n.numClasses()];
    for (int x = 0; x < n.numInstances(); x++) {
        if (iindex[1][x] > 0 && !n.instance(x).classIsMissing())
            classDist[(int) n.instance(x).classValue()] += iindex[1][x];
    }

    // a graft is only worth searching for if some other class could beat the leaf
    for (int cVal = 0; cVal < n.numClasses(); cVal++) {
        double theLaplace = (classDist[cVal] + 1.0) / (classDist[cVal] + 2.0);
        // NOTE(review): biprob receives classDist[cVal] for both of its count
        // arguments here, whereas later calls pass (count, total) -- confirm
        // this is intended.
        if (cVal != leafClass && (theLaplace > leafLaplace)
                && (biprob(classDist[cVal], classDist[cVal], leafLaplace) > m_BiProbCrit)) {
            graftPossible = true;
            break;
        }
    }

    if (!graftPossible) {
        return;
    }

    // 1. Initialize to {} a set of tuples t containing potential tests
    ArrayList t = new ArrayList();

    // go through each attribute
    for (int a = 0; a < n.numAttributes(); a++) {
        if (a == n.classIndex())
            continue; // skip the class

        // sort instances in atbop by $a
        int[] sorted = sortByAttribute(n, a);

        // 2. For each continuous attribute $a:
        if (n.attribute(a).isNumeric()) {

            // find min and max values for this attribute at the leaf
            boolean prohibited = false;
            double minLeaf = Double.POSITIVE_INFINITY;
            double maxLeaf = Double.NEGATIVE_INFINITY;
            for (int i = 0; i < l.numInstances(); i++) {
                // a leaf-class instance with a missing value rules this attribute out
                if (l.instance(i).isMissing(a)) {
                    if (l.instance(i).classValue() == leafClass) {
                        prohibited = true;
                        break;
                    }
                }
                double value = l.instance(i).value(a);
                if (!m_relabel || l.instance(i).classValue() == leafClass) {
                    if (value < minLeaf)
                        minLeaf = value;
                    if (value > maxLeaf)
                        maxLeaf = value;
                }
            }
            if (prohibited) {
                continue;
            }

            // (a) find values of
            //    $n: instances in atbop (already have that, actually)
            //    $v: a value for $a that exists for a case in the atbop, where
            //       $v is < the min value for $a for a case at the leaf which
            //       has the class $c, and $v is > the lowerlimit of $a at
            //       the leaf.
            //       (note: error in original paper stated that $v must be
            //       smaller OR EQUAL TO the min value).
            //    $k: $k is a class
            //  that maximize L' = Laplace({$x: $x contained in cases($n)
            //    & value($a,$x) <= $v & value($a,$x) > lowerlim($l,$a)}, $k).
            double minBestClass = Double.NaN;
            double minBestLaplace = leafLaplace;
            double minBestVal = Double.NaN;
            double minBestPos = Double.NaN;
            double minBestTotal = Double.NaN;
            double[][] minBestCounts = null;
            // counts[0] accumulates the "<=" side, counts[1] the ">" side
            double[][] counts = new double[2][n.numClasses()];
            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    break; // missing are sorted to end: no more valid vals

                double theval = n.instance(sorted[x]).value(a);
                if (m_Debug)
                    System.out.println("\t " + theval);

                if (theval <= limits[a][0]) {
                    if (m_Debug)
                        System.out.println("\t  <= lowerlim: continuing...");
                    continue;
                }
                // note: error in paper would have this read "theVal > minLeaf)
                if (theval >= minLeaf) {
                    if (m_Debug)
                        System.out.println("\t  >= minLeaf; breaking...");
                    break;
                }
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                // absorb all following instances that share the same value
                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                // work out the best laplace/class (for <= theval)
                double total = Utils.sum(counts[0]);
                for (int c = 0; c < n.numClasses(); c++) {
                    double temp = (counts[0][c] + 1.0) / (total + 2.0);
                    if (temp > minBestLaplace) {
                        minBestPos = counts[0][c];
                        minBestTotal = total;
                        minBestLaplace = temp;
                        minBestClass = c;
                        minBestCounts = copyCounts(counts);

                        // cut point: midway to the next sorted value, or the
                        // value itself when this is the last instance
                        minBestVal = (x == n.numInstances() - 1) ? theval
                                : ((theval + n.instance(sorted[x + 1]).value(a)) / 2.0);
                    }
                }
            }

            // (b) add to t tuple <n,a,v,k,L',"<=">
            // (third constructor argument 0 presumably encodes the "<=" test -- confirm in GraftSplit)
            if (!Double.isNaN(minBestVal) && biprob(minBestPos, minBestTotal, leafLaplace) > m_BiProbCrit) {
                GraftSplit gsplit = null;
                try {
                    gsplit = new GraftSplit(a, minBestVal, 0, leafClass, minBestCounts);
                } catch (Exception e) {
                    // NOTE(review): terminates the whole JVM instead of propagating
                    System.err.println("graftsplit error: " + e.getMessage());
                    System.exit(1);
                }
                t.add(gsplit);
            }
            // free space
            minBestCounts = null;

            // (c) find values of
            //    n: instances in atbop (already have that, actually)
            //    $v: a value for $a that exists for a case in the atbop, where
            //       $v is > the max value for $a for a case at the leaf which
            //       has the class $c, and $v is <= the upperlimit of $a at
            //       the leaf.
            //    k: k is a class
            //   that maximize L' = Laplace({x: x contained in cases(n)
            //       & value(a,x) > v & value(a,x) <= upperlim(l,a)}, k).
            double maxBestClass = -1;
            double maxBestLaplace = leafLaplace;
            double maxBestVal = Double.NaN;
            double maxBestPos = Double.NaN;
            double maxBestTotal = Double.NaN;
            double[][] maxBestCounts = null;
            for (int c = 0; c < n.numClasses(); c++) { // zero the counts
                counts[0][c] = 0;
                counts[1][c] = 0; // shouldn't need to do this ...
            }

            // check smallest val for a in atbop is < upper limit
            if (n.numInstances() >= 1 && n.instance(sorted[0]).value(a) < limits[a][1]) {
                // walk the sorted instances from the largest value downwards
                for (int x = n.numInstances() - 1; x >= 0; x--) {
                    if (n.instance(sorted[x]).isMissing(a))
                        continue;

                    double theval = n.instance(sorted[x]).value(a);
                    if (m_Debug)
                        System.out.println("\t " + theval);

                    // NOTE(review): the test is strictly ">" although the debug text says ">="
                    if (theval > limits[a][1]) {
                        if (m_Debug)
                            System.out.println("\t  >= upperlim; continuing...");
                        continue;
                    }
                    // NOTE(review): the test is "<=" although the debug text says "<"
                    if (theval <= maxLeaf) {
                        if (m_Debug)
                            System.out.println("\t  < maxLeaf; breaking...");
                        break;
                    }

                    // increment counts
                    counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                    // absorb all preceding instances that share the same value
                    if (x != 0 && !n.instance(sorted[x - 1]).isMissing(a)) {
                        int z = x - 1;
                        while (z >= 0 && n.instance(sorted[z]).value(a) == theval) {
                            z--;
                            x--;
                            counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                        }
                    }

                    // work out best laplace for > theval
                    double total = Utils.sum(counts[1]);
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[1][c] + 1.0) / (total + 2.0);
                        if (temp > maxBestLaplace) {
                            maxBestPos = counts[1][c];
                            maxBestTotal = total;
                            maxBestLaplace = temp;
                            maxBestClass = c;
                            maxBestCounts = copyCounts(counts);
                            // cut point: midway to the previous sorted value,
                            // or the value itself at the first instance
                            maxBestVal = (x == 0) ? theval
                                    : ((theval + n.instance(sorted[x - 1]).value(a)) / 2.0);
                        }
                    }
                }

                // (d) add to t tuple <n,a,v,k,L',">">
                // (third constructor argument 1 presumably encodes the ">" test -- confirm in GraftSplit)
                if (!Double.isNaN(maxBestVal) && biprob(maxBestPos, maxBestTotal, leafLaplace) > m_BiProbCrit) {
                    GraftSplit gsplit = null;
                    try {
                        gsplit = new GraftSplit(a, maxBestVal, 1, leafClass, maxBestCounts);
                    } catch (Exception e) {
                        // NOTE(review): terminates the whole JVM instead of propagating
                        System.err.println("graftsplit error:" + e.getMessage());
                        System.exit(1);
                    }
                    t.add(gsplit);
                }
            }
        } else { // must be a nominal attribute

            // 3. for each discrete attribute a for which there is no
            //    test at an ancestor of l

            // skip if this attribute has already been used
            if (limits[a][1] == 1) {
                continue;
            }

            // prohibit[v] is set when a relevant leaf instance has value v
            // (or a missing value) for this attribute
            boolean[] prohibit = new boolean[l.attribute(a).numValues()];
            for (int aval = 0; aval < n.attribute(a).numValues(); aval++) {
                for (int x = 0; x < l.numInstances(); x++) {
                    if ((l.instance(x).isMissing(a) || l.instance(x).value(a) == aval)
                            && (!m_relabel || (l.instance(x).classValue() == leafClass))) {
                        prohibit[aval] = true;
                        break;
                    }
                }
            }

            // (a) find values of
            //       $n: instances in atbop (already have that, actually)
            //       $v: $v is a value for $a
            //       $k: $k is a class
            //     that maximize L' = Laplace({$x: $x contained in cases($n)
            //           & value($a,$x) = $v}, $k).
            double bestVal = Double.NaN;
            double bestClass = Double.NaN;
            double bestLaplace = leafLaplace;
            double[][] bestCounts = null;
            double[][] counts = new double[2][n.numClasses()];

            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    continue;

                // zero the counts
                for (int c = 0; c < n.numClasses(); c++)
                    counts[0][c] = 0;

                double theval = n.instance(sorted[x]).value(a);
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                // absorb all following instances that share the same value
                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                if (!prohibit[(int) theval]) {
                    // work out best laplace/class for instances whose value equals theval
                    // (best* trackers are reset for every distinct value)
                    double total = Utils.sum(counts[0]);
                    bestLaplace = leafLaplace;
                    bestClass = Double.NaN;
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[0][c] + 1.0) / (total + 2.0);
                        if (temp > bestLaplace && biprob(counts[0][c], total, leafLaplace) > m_BiProbCrit) {
                            bestLaplace = temp;
                            bestClass = c;
                            bestVal = theval;
                            bestCounts = copyCounts(counts);
                        }
                    }
                    // add to graft list
                    // (third constructor argument 2 presumably encodes the "=" test -- confirm in GraftSplit)
                    if (!Double.isNaN(bestClass)) {
                        GraftSplit gsplit = null;
                        try {
                            gsplit = new GraftSplit(a, bestVal, 2, leafClass, bestCounts);
                        } catch (Exception e) {
                            // NOTE(review): terminates the whole JVM instead of propagating
                            System.err.println("graftsplit error: " + e.getMessage());
                            System.exit(1);
                        }
                        t.add(gsplit);
                    }
                }
            }
            // (b) add to t tuple <n,a,v,k,L',"=">
            // done this already
        }
    }

    // 4. remove from t all tuples <n,a,v,c,L,x> such that L <=
    //    Laplace(cases(l),c) or prob(x,n,Laplace(cases(l),c) <= 0.05
    //      -- checked this constraint prior to adding a tuple --

    // *** step six done before step five for efficiency ***
    // 6. for each <n,a,v,k,L,x> in t ordered on L from highest to lowest
    // order the tuples from highest to lowest laplace
    // (this actually orders lowest to highest)
    Collections.sort(t);

    // 5. remove from t all tuples <n,a,v,c,L,x> such that there is
    //    no tuple <n',a',v',k',L',x'> such that k' != c & L' < L.
    // (x is decremented after each removal so index 0 is re-examined)
    for (int x = 0; x < t.size(); x++) {
        GraftSplit gs = (GraftSplit) t.get(x);
        if (gs.maxClassForSubsetOfInterest() != leafClass) {
            break; // reached a graft with class != leafClass, so stop deleting
        } else {
            t.remove(x);
            x--;
        }
    }

    // if no potential grafts were found, do nothing and return
    if (t.size() < 1) {
        return;
    }

    // create the distributions for each graft
    // (processed from the end of the sorted list towards the front)
    for (int x = t.size() - 1; x >= 0; x--) {
        GraftSplit gs = (GraftSplit) t.get(x);
        try {
            gs.buildClassifier(l);
            gs.deleteGraftedCases(l); // so they don't go down the other branch
        } catch (Exception e) {
            // failure is only reported; processing continues with the next graft
            System.err.println("graftsplit build error: " + e.getMessage());
        }
    }

    // add this stuff to the tree
    ((C45PruneableClassifierTreeG) parent).setDescendents(t, this);
}

From source file:j48.C45Split.java

License:Open Source License

/**
 * Creates split on enumerated attribute.
 * /*from   w  w w. ja v  a  2  s .co m*/
 * @exception Exception
 *                if something goes wrong
 */
private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {

    Instance instance;

    m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (!instance.isMissing(m_attIndex))
            m_distribution.add((int) instance.value(m_attIndex), instance);
    }

    // Check if minimum number of Instances in at least two
    // subsets.
    if (m_distribution.check(m_minNoObj)) {
        m_numSubsets = m_complexityIndex;
        m_infoGain = infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights);
        m_gainRatio = gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
    }
}

From source file:j48.C45Split.java

License:Open Source License

/**
 * Creates split on numeric attribute./*from w  w  w .  j a v a  2 s  . c om*/
 * 
 * @exception Exception
 *                if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    int firstMiss;
    int next = 1;
    int last = 0;
    int splitIndex = -1;
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (instance.isMissing(m_attIndex))
            break;
        m_distribution.add(1, instance);
        i++;
    }
    firstMiss = i;

    // Compute minimum number of Instances required in each
    // subset.
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
        minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25))
        minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
        return;

    // Compute values of criteria for all possible split
    // indices.
    defaultEnt = infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

        if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next)
                .value(m_attIndex)) {

            // Move class values for all Instances up to next
            // possible split point.
            m_distribution.shiftRange(1, 0, trainInstances, last, next);

            // Check if enough Instances in each subset and compute
            // values for criteria.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
                    && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
                currentInfoGain = infoGainCrit.splitCritValue1(m_distribution, m_sumOfWeights, defaultEnt,
                        rrrrr);
                if (Utils.gr(currentInfoGain, m_infoGain)) {
                    m_infoGain = currentInfoGain;
                    splitIndex = next - 1;
                }
                m_index++;
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split?
    if (m_index == 0)
        return;

    // Compute modified information gain for best split.
    m_infoGain = m_infoGain - (Utils.log2(m_index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
        return;

    // Set instance variables' values to values for
    // best split.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex)
            + trainInstances.instance(splitIndex).value(m_attIndex)) / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
        m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distributioN for best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = gainRatioCrit.splitCritValue1(m_distribution, m_sumOfWeights, m_infoGain, lllll);
}

From source file:j48.Distribution.java

License:Open Source License

/**
 * Creates a single-bag distribution and fills it with every instance of
 * {@code source}.
 *
 * @param source the instances to add to bag 0
 * @exception Exception if something goes wrong
 */
public Distribution(Instances source) throws Exception {

    final int classCount = source.numClasses();

    // One bag, one counter per class; everything starts at zero.
    m_perClassPerBag = new double[][] { new double[classCount] };
    m_perBag = new double[1];
    m_perClass = new double[classCount];
    totaL = 0;

    for (Enumeration e = source.enumerateInstances(); e.hasMoreElements();) {
        add(0, (Instance) e.nextElement());
    }
}

From source file:j48.Distribution.java

License:Open Source License

/**
 * Creates a distribution with one bag per subset of {@code modelToUse} and
 * fills it from {@code source}.
 *
 * <p>An instance the model assigns to a subset is added to that bag; when
 * {@code whichSubset} returns -1 the instance is spread using the weights
 * returned by {@code modelToUse.weights(instance)}.
 *
 * @param source the instances to distribute
 * @param modelToUse the split model deciding the bag of each instance
 * @exception Exception if something goes wrong
 */

public Distribution(Instances source, ClassifierSplitModel modelToUse) throws Exception {

    final int bagCount = modelToUse.numSubsets();
    final int classCount = source.numClasses();

    // Allocate zeroed counters: per class per bag, per bag, per class.
    m_perClassPerBag = new double[bagCount][classCount];
    m_perBag = new double[bagCount];
    m_perClass = new double[classCount];
    totaL = 0;

    for (Enumeration e = source.enumerateInstances(); e.hasMoreElements();) {
        Instance current = (Instance) e.nextElement();
        int bag = modelToUse.whichSubset(current);
        if (bag == -1) {
            // No single bag: distribute according to the model's weights.
            addWeights(current, modelToUse.weights(current));
        } else {
            add(bag, current);
        }
    }
}

From source file:j48.GraftSplit.java

License:Open Source License

/**
 * Builds {@code m_graftdistro}, the two-bag distribution for this graft,
 * from the passed data.
 *
 * <p>Instances with a known value for {@code m_attIndex} are added to the
 * bag chosen by {@code whichSubset}. Instances with a missing value are
 * added in a second pass after their weight has been multiplied by the
 * fraction of known weight that fell into the subset of interest (0.5 when
 * no weight is known); note that this rescales the weight of the instance
 * objects held by {@code data}. A bag that ends up empty receives a token
 * count of 0.01 for its preferred class.
 *
 * @param data the instances to use when creating the distribution
 * @throws Exception if something goes wrong
 */
public void buildClassifier(Instances data) throws Exception {

    final int classCount = data.numClasses();

    // distribution for the graft, not counting cases in atbop, only orig leaf
    m_graftdistro = new Distribution(2, classCount);

    // the bag that corresponds to m_leaf, and its sibling
    final int leafBag = subsetOfInterest();
    final int otherBag = (leafBag == 0) ? 1 : 0;

    final int total = data.numInstances();
    double leafWeight = 0;
    double knownWeight = 0;
    boolean sawMissing = false;

    // first pass: instances whose attribute value is known
    for (int k = 0; k < total; k++) {
        Instance inst = data.instance(k);
        if (inst.isMissing(m_attIndex)) {
            sawMissing = true;
            continue;
        }
        knownWeight += inst.weight();
        int bag = whichSubset(inst);
        if (bag != -1) {
            m_graftdistro.add(bag, inst);
            if (bag == leafBag) { // instance belongs at m_leaf
                leafWeight += inst.weight();
            }
        }
    }

    // share of the known weight that landed in the bag of interest
    double factor;
    if (knownWeight == 0) {
        factor = 0.5;
    } else {
        factor = leafWeight / knownWeight;
    }

    // second pass: instances with a missing value, down-weighted by factor
    if (sawMissing) {
        for (int k = 0; k < total; k++) {
            Instance inst = data.instance(k);
            if (!inst.isMissing(m_attIndex)) {
                continue;
            }
            int bag = whichSubset(inst);
            if (bag == -1) {
                continue;
            }
            inst.setWeight(inst.weight() * factor);
            m_graftdistro.add(bag, inst);
        }
    }

    // if there are no cases in a bag, make sure the desired class is
    // chosen, by setting its count to 0.01
    if (m_graftdistro.perBag(leafBag) == 0) {
        double[] seed = new double[classCount];
        seed[m_maxClass] = 0.01;
        m_graftdistro.add(leafBag, seed);
    }
    if (m_graftdistro.perBag(otherBag) == 0) {
        double[] seed = new double[classCount];
        seed[(int) m_otherLeafMaxClass] = 0.01;
        m_graftdistro.add(otherBag, seed);
    }
}

From source file:liac.igmn.loader.DataLoader.java

License:Open Source License

/**
 * Carrega dataset a partir de arquivo ARFF e binariza os atributos nominais.
 * Assume que a classe seja o ultimo atributo.
 * /*from w w  w  .  j  a  va  2s  . com*/
 * @param filename path do arquivo
 * @return dataset
 * @throws DataLoaderException lancado quando o arquivo nao e encontrado
 * ou quando ocorre algum erro de IO
 */
public static Dataset loadARFF(String filename) throws DataLoaderException {
    Dataset dataset = new Dataset();
    try {
        ArffLoader loader = new ArffLoader();

        loader.setSource(new File(filename));
        Instances data = loader.getDataSet();
        Instances m_Intances = new Instances(data);

        data.setClassIndex(data.numAttributes() - 1);

        String[] classes = new String[data.numClasses()];
        for (int i = 0; i < data.numClasses(); i++)
            classes[i] = data.classAttribute().value(i);
        dataset.setClassesNames(classes);

        NominalToBinary filter = new NominalToBinary();
        filter.setInputFormat(m_Intances);
        filter.setOptions(new String[] { "-A" });
        m_Intances = Filter.useFilter(m_Intances, filter);

        int inputSize = m_Intances.numAttributes() - data.numClasses();

        dataset.setInputSize(inputSize);
        dataset.setNumClasses(data.numClasses());

        dataset.setWekaDataset(m_Intances);
    } catch (IOException e) {
        throw new DataLoaderException("Arquivo no encontrado", e.getCause());
    } catch (Exception e) {
        throw new DataLoaderException("Falha na converso do arquivo", e.getCause());
    }

    return dataset;
}

From source file:LogReg.Logistic.java

License:Open Source License

/**
 * Builds the classifier.
 *
 * @param train the training data to be used for generating the
 * boosted classifier.
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances train) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(train);

    // remove instances with missing class
    train = new Instances(train);
    train.deleteWithMissingClass();

    // Replace missing values   
    m_ReplaceMissingValues = new ReplaceMissingValues();
    m_ReplaceMissingValues.setInputFormat(train);
    train = Filter.useFilter(train, m_ReplaceMissingValues);

    // Remove useless attributes
    m_AttFilter = new RemoveUseless();
    m_AttFilter.setInputFormat(train);
    train = Filter.useFilter(train, m_AttFilter);

    // Transform attributes
    m_NominalToBinary = new NominalToBinary();
    m_NominalToBinary.setInputFormat(train);
    train = Filter.useFilter(train, m_NominalToBinary);

    // Save the structure for printing the model
    m_structure = new Instances(train, 0);

    // Extract data
    m_ClassIndex = train.classIndex();
    m_NumClasses = train.numClasses();

    int nK = m_NumClasses - 1; // Only K-1 class labels needed 
    int nR = m_NumPredictors = train.numAttributes() - 1;
    int nC = train.numInstances();

    m_Data = new double[nC][nR + 1]; // Data values
    int[] Y = new int[nC]; // Class labels
    double[] xMean = new double[nR + 1]; // Attribute means
    xSD = new double[nR + 1]; // Attribute stddev's
    double[] sY = new double[nK + 1]; // Number of classes
    double[] weights = new double[nC]; // Weights of instances
    double totWeights = 0; // Total weights of the instances
    m_Par = new double[nR + 1][nK]; // Optimized parameter values

    if (m_Debug) {
        System.out.println("Extracting data...");
    }

    for (int i = 0; i < nC; i++) {
        // initialize X[][]
        Instance current = train.instance(i);
        Y[i] = (int) current.classValue(); // Class value starts from 0
        weights[i] = current.weight(); // Dealing with weights
        totWeights += weights[i];

        m_Data[i][0] = 1;
        int j = 1;
        for (int k = 0; k <= nR; k++) {
            if (k != m_ClassIndex) {
                double x = current.value(k);
                m_Data[i][j] = x;
                xMean[j] += weights[i] * x;
                xSD[j] += weights[i] * x * x;
                j++;
            }
        }

        // Class count
        sY[Y[i]]++;
    }

    if ((totWeights <= 1) && (nC > 1))
        throw new Exception("Sum of weights of instances less than 1, please reweight!");

    xMean[0] = 0;
    xSD[0] = 1;
    for (int j = 1; j <= nR; j++) {
        xMean[j] = xMean[j] / totWeights;
        if (totWeights > 1)
            xSD[j] = Math.sqrt(Math.abs(xSD[j] - totWeights * xMean[j] * xMean[j]) / (totWeights - 1));
        else
            xSD[j] = 0;
    }

    if (m_Debug) {
        // Output stats about input data
        System.out.println("Descriptives...");
        for (int m = 0; m <= nK; m++)
            System.out.println(sY[m] + " cases have class " + m);
        System.out.println("\n Variable     Avg       SD    ");
        for (int j = 1; j <= nR; j++)
            System.out.println(Utils.doubleToString(j, 8, 4) + Utils.doubleToString(xMean[j], 10, 4)
                    + Utils.doubleToString(xSD[j], 10, 4));
    }

    // Normalise input data 
    for (int i = 0; i < nC; i++) {
        for (int j = 0; j <= nR; j++) {
            if (xSD[j] != 0) {
                m_Data[i][j] = (m_Data[i][j] - xMean[j]) / xSD[j];
            }
        }
    }

    if (m_Debug) {
        System.out.println("\nIteration History...");
    }

    double x[] = new double[(nR + 1) * nK];
    double[][] b = new double[2][x.length]; // Boundary constraints, N/A here

    // Initialize
    for (int p = 0; p < nK; p++) {
        int offset = p * (nR + 1);
        x[offset] = Math.log(sY[p] + 1.0) - Math.log(sY[nK] + 1.0); // Null model
        b[0][offset] = Double.NaN;
        b[1][offset] = Double.NaN;
        for (int q = 1; q <= nR; q++) {
            x[offset + q] = 0.0;
            b[0][offset + q] = Double.NaN;
            b[1][offset + q] = Double.NaN;
        }
    }

    OptEng opt = new OptEng();
    opt.setDebug(m_Debug);
    opt.setWeights(weights);
    opt.setClassLabels(Y);

    if (m_MaxIts == -1) { // Search until convergence
        x = opt.findArgmin(x, b);
        while (x == null) {
            x = opt.getVarbValues();
            if (m_Debug)
                System.out.println("200 iterations finished, not enough!");
            x = opt.findArgmin(x, b);
        }
        if (m_Debug)
            System.out.println(" -------------<Converged>--------------");
    } else {
        opt.setMaxIteration(m_MaxIts);
        x = opt.findArgmin(x, b);
        if (x == null) // Not enough, but use the current value
            x = opt.getVarbValues();
    }

    m_LL = -opt.getMinFunction(); // Log-likelihood

    // Don't need data matrix anymore
    m_Data = null;

    // Convert coefficients back to non-normalized attribute units
    for (int i = 0; i < nK; i++) {
        m_Par[0][i] = x[i * (nR + 1)];
        for (int j = 1; j <= nR; j++) {
            m_Par[j][i] = x[i * (nR + 1) + j];
            if (xSD[j] != 0) {
                m_Par[j][i] /= xSD[j];
                m_Par[0][i] -= m_Par[j][i] * xMean[j];
            }
        }
    }
}

From source file:machinelearninglabs.OENaiveBayesClassifier.java

/**
 * Counts, for every class label, how often each value of the given attribute
 * occurs in the data.
 *
 * <p>Instances whose class or whose value for {@code att} is missing are not
 * counted.
 *
 * @param data the instances to count over; its class attribute must be set
 * @param att index of the attribute whose values are counted
 * @return a matrix indexed as {@code [classValue][attributeValue]} holding the
 *         number of instances with that combination; all zeros when
 *         {@code data} contains no instances
 */
public int[][] attributeCounts(Instances data, int att) {
    // Read the attribute from the dataset header instead of the first
    // instance, so that an empty dataset yields an all-zero matrix rather
    // than an exception.
    int numberOfPossibleValuesForAttribute = data.attribute(att).numValues();
    int[][] result = new int[data.numClasses()][numberOfPossibleValuesForAttribute];

    for (Instance eachInstance : data) {
        // Weka represents a missing value as NaN, and (int) NaN is 0 in Java:
        // without this guard missing entries would be counted as value 0.
        if (eachInstance.classIsMissing() || eachInstance.isMissing(att)) {
            continue;
        }
        int classValue = (int) eachInstance.classValue();
        int attributeValue = (int) eachInstance.value(att);
        result[classValue][attributeValue]++;
    }
    return result;
}