Example usage for weka.core Instances numClasses

List of usage examples for weka.core Instances numClasses

Introduction

On this page you can find example usages of weka.core Instances numClasses.

Prototype


public int numClasses()

Document

Returns the number of class labels.
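
The sketch below is a minimal, self-contained call to numClasses(); the ARFF path is a placeholder and the last attribute is assumed to be the class. The class index must be set before calling numClasses(); for a nominal class attribute the method returns the number of labels, and for a numeric class attribute it returns 1.

import java.io.FileReader;
import weka.core.Instances;

public class NumClassesDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset; "iris.arff" is a placeholder path.
        Instances data = new Instances(new FileReader("iris.arff"));

        // numClasses() requires the class attribute to be set;
        // here the last attribute is assumed to be the class.
        data.setClassIndex(data.numAttributes() - 1);

        // Number of class labels for a nominal class (1 for a numeric class).
        System.out.println("Number of classes: " + data.numClasses());
    }
}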

Usage

From source file: ID3Chi.java

License: Open Source License

private void SetNullDistribution(Instances data) {
    m_Attribute = null;
    m_ClassValue = Instance.missingValue();

    // TODO: think if it's better to keep all the distributions equal to 0
    m_Distribution = new double[data.numClasses()];
    for (int i = 0; i < m_Distribution.length; i++) {
        m_Distribution[i] = 1.0 / (double) data.numClasses();
    }
}

From source file: ID3Chi.java

License: Open Source License

/**
 * Computes Chi-Square element for given subset.
 *
 * @param subset
 *            the data for which the chi-square value is to be computed
 * @param att
 *            the attribute
 * @param setClassCounts
 *            class counts for the initial set of instances
 * @param setNumInstances
 *            number of instances in the set of data
 * @return the chi-square for the given attribute and data
 * @throws Exception
 *             if computation fails
 */
private double computeChiSquareForSubset(Instances subset, Attribute att, double[] setClassCounts,
        double setNumInstances) {

    double[] subsetClassCounts = GetClassCounts(subset);
    double result = 0;
    double d = subset.numInstances() / setNumInstances;
    for (int j = 0; j < subset.numClasses(); j++) {
        double ciNew = setClassCounts[j] * d;
        if (ciNew > 0) {
            result += Math.pow(subsetClassCounts[j] - ciNew, 2) / ciNew;
        }
    }
    return result;
}
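
As a hedged illustration of the statistic computed above, the following toy check uses made-up counts: a parent set with class counts {12, 8} over 20 instances and a 5-instance subset with observed counts {4, 1}. Each class's expected count is the parent count scaled by the subset fraction, and each class contributes (observed - expected)^2 / expected.

public class ChiSquareToyCheck {
    public static void main(String[] args) {
        // Made-up counts: parent set {12, 8} over 20 instances,
        // subset of 5 instances with observed counts {4, 1}.
        double[] setClassCounts = { 12, 8 };
        double setNumInstances = 20;
        double[] subsetClassCounts = { 4, 1 };
        double subsetSize = 5;

        double d = subsetSize / setNumInstances;      // subset fraction: 0.25
        double result = 0;
        for (int j = 0; j < setClassCounts.length; j++) {
            double expected = setClassCounts[j] * d;  // expected counts: 3.0 and 2.0
            result += Math.pow(subsetClassCounts[j] - expected, 2) / expected;
        }
        System.out.println(result);  // (4-3)^2/3 + (1-2)^2/2 = 0.8333...
    }
}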

From source file: ID3Chi.java

License: Open Source License

/**
 * Computes the entropy of a dataset.
 * 
 * @param data
 *            the data for which entropy is to be computed
 * @return the entropy of the data's class distribution
 * @throws Exception
 *             if computation fails
 */
private double computeEntropy(Instances data) throws Exception {

    double[] classCounts = GetClassCounts(data);
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        if (classCounts[j] > 0) {
            entropy -= classCounts[j] * Utils.log2(classCounts[j]);
        }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
}
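
The loop above uses an algebraic rearrangement: instead of summing -p_j * log2(p_j) directly, it accumulates count_j * log2(count_j), divides by the number of instances, and adds log2(numInstances), which yields the same value. A minimal sketch of the direct form, reusing GetClassCounts and weka's Utils.log2, is shown below for comparison; it is not part of ID3Chi.

// Direct form of the same class-distribution entropy, for comparison only.
private double computeEntropyDirect(Instances data) {
    double[] classCounts = GetClassCounts(data);
    double numInstances = data.numInstances();
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        if (classCounts[j] > 0) {
            double p = classCounts[j] / numInstances;   // class probability
            entropy -= p * Utils.log2(p);
        }
    }
    return entropy;
}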

From source file: ID3Chi.java

License: Open Source License

private double computeEntropyWithUnknowns(Instances data, Instances unknownData,
        double[] classCountsUnknownData, double ratio) throws Exception {

    double[] classCounts = GetClassCounts(data);
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        double p = classCounts[j] + classCountsUnknownData[j] * ratio;
        if (p > 0) {
            entropy -= p * Utils.log2(p);
        }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
}

From source file: ID3Chi.java

License: Open Source License

private double[] GetClassCounts(Instances data) {

    double[] classCounts = new double[data.numClasses()];
    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        classCounts[(int) inst.classValue()]++;
    }
    return classCounts;
}
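
In newer weka releases (3.7 and later) Instances is iterable, so the same counting idiom can be written without the raw Enumeration. A hedged sketch under that assumption, not part of ID3Chi:

// Same class counting, written against the newer weka API (Instances is iterable).
private double[] getClassCounts(Instances data) {
    double[] classCounts = new double[data.numClasses()];
    for (Instance inst : data) {
        classCounts[(int) inst.classValue()]++;   // index the counts by class value
    }
    return classCounts;
}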

From source file: MPCKMeans.java

License: Open Source License

public static void testCase() {
    try {
        String dataset = new String("lowd");
        //String dataset = new String("highd");
        if (dataset.equals("lowd")) {
            //////// Low-D data

            //   String datafile = "/u/ml/data/bio/arffFromPhylo/ecoli_K12-100.arff";
            //   String datafile = "/u/sugato/weka/data/digits-0.1-389.arff";
            String datafile = "/u/sugato/weka/data/iris.arff";
            int numPairs = 200, num = 0;

            // set up the data
            FileReader reader = new FileReader(datafile);
            Instances data = new Instances(reader);

            // Make the last attribute be the class 
            int classIndex = data.numAttributes() - 1;
            data.setClassIndex(classIndex); // starts with 0
            System.out.println("ClassIndex is: " + classIndex);

            // Remove the class labels before clustering
            Instances clusterData = new Instances(data);
            clusterData.deleteClassAttribute();

            // create the pairs
            ArrayList labeledPair = InstancePair.getPairs(data, numPairs);

            System.out.println("Finished initializing constraint matrix");

            MPCKMeans mpckmeans = new MPCKMeans();
            mpckmeans.setUseMultipleMetrics(false);
            System.out.println("\nClustering the data using MPCKmeans...\n");

            WeightedEuclidean metric = new WeightedEuclidean();
            WEuclideanLearner metricLearner = new WEuclideanLearner();

            //     LearnableMetric metric = new WeightedDotP();
            //     MPCKMeansMetricLearner metricLearner = new DotPGDLearner();

            //     KL metric = new KL();
            //     KLGDLearner metricLearner = new KLGDLearner();
            //   ((KL)metric).setUseIDivergence(true);

            //   BarHillelMetric metric = new BarHillelMetric();
            //   BarHillelMetricMatlab metric = new BarHillelMetricMatlab();
            //     XingMetric metric = new XingMetric();
            //   WeightedMahalanobis metric = new WeightedMahalanobis(); 

            mpckmeans.setMetric(metric);
            mpckmeans.setMetricLearner(metricLearner);
            mpckmeans.setVerbose(false);
            mpckmeans.setRegularize(false);
            mpckmeans.setTrainable(new SelectedTag(TRAINING_INTERNAL, TAGS_TRAINING));
            mpckmeans.setSeedable(true);
            mpckmeans.buildClusterer(labeledPair, clusterData, data, data.numClasses(), data.numInstances());
            mpckmeans.getIndexClusters();
            mpckmeans.printIndexClusters();

            SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(mpckmeans.m_TotalTrainWithLabels,
                    mpckmeans.m_TotalTrainWithLabels.numClasses(),
                    mpckmeans.m_TotalTrainWithLabels.numClasses());
            eval.evaluateModel(mpckmeans, mpckmeans.m_TotalTrainWithLabels, mpckmeans.m_Instances);
            System.out.println("MI=" + eval.mutualInformation());
            System.out.print("FM=" + eval.pairwiseFMeasure());
            System.out.print("\tP=" + eval.pairwisePrecision());
            System.out.print("\tR=" + eval.pairwiseRecall());
        } else if (dataset.equals("highd")) {
            //////// Newsgroup data
            String datafile = "/u/ml/users/sugato/groupcode/weka335/data/arffFromCCS/sanitized/different-1000_sanitized.arff";
            //String datafile = "/u/ml/users/sugato/groupcode/weka335/data/20newsgroups/small-newsgroup_fromCCS.arff";
            //String datafile = "/u/ml/users/sugato/groupcode/weka335/data/20newsgroups/same-100_fromCCS.arff";

            // set up the data
            FileReader reader = new FileReader(datafile);
            Instances data = new Instances(reader);

            // Make the last attribute be the class 
            int classIndex = data.numAttributes() - 1;
            data.setClassIndex(classIndex); // starts with 0
            System.out.println("ClassIndex is: " + classIndex);

            // Remove the class labels before clustering
            Instances clusterData = new Instances(data);
            clusterData.deleteClassAttribute();

            // create the pairs
            int numPairs = 0, num = 0;
            ArrayList labeledPair = new ArrayList(numPairs);
            Random rand = new Random(42);
            System.out.println("Initializing constraint matrix:");
            while (num < numPairs) {
                int i = (int) (data.numInstances() * rand.nextFloat());
                int j = (int) (data.numInstances() * rand.nextFloat());
                int first = (i < j) ? i : j;
                int second = (i >= j) ? i : j;
                int linkType = (data.instance(first).classValue() == data.instance(second).classValue())
                        ? InstancePair.MUST_LINK
                        : InstancePair.CANNOT_LINK;
                InstancePair pair = new InstancePair(first, second, linkType);
                if (first != second && !labeledPair.contains(pair)) {
                    labeledPair.add(pair);
                    //System.out.println(num + "th entry is: " + pair);
                    num++;
                }
            }
            System.out.println("Finished initializing constraint matrix");

            MPCKMeans mpckmeans = new MPCKMeans();
            mpckmeans.setUseMultipleMetrics(false);
            System.out.println("\nClustering the highd data using MPCKmeans...\n");

            LearnableMetric metric = new WeightedDotP();
            MPCKMeansMetricLearner metricLearner = new DotPGDLearner();

            //     KL metric = new KL();
            //     KLGDLearner metricLearner = new KLGDLearner();

            mpckmeans.setMetric(metric);
            mpckmeans.setMetricLearner(metricLearner);
            mpckmeans.setVerbose(false);
            mpckmeans.setRegularize(true);
            mpckmeans.setTrainable(new SelectedTag(TRAINING_INTERNAL, TAGS_TRAINING));
            mpckmeans.setSeedable(true);
            mpckmeans.buildClusterer(labeledPair, clusterData, data, data.numClasses(), data.numInstances());
            mpckmeans.getIndexClusters();

            SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(mpckmeans.m_TotalTrainWithLabels,
                    mpckmeans.m_TotalTrainWithLabels.numClasses(),
                    mpckmeans.m_TotalTrainWithLabels.numClasses());

            mpckmeans.getMetric().resetMetric(); // Vital: to reset m_attrWeights to 1 for proper normalization
            eval.evaluateModel(mpckmeans, mpckmeans.m_TotalTrainWithLabels, mpckmeans.m_Instances);
            System.out.println("MI=" + eval.mutualInformation());
            System.out.print("FM=" + eval.pairwiseFMeasure());
            System.out.print("\tP=" + eval.pairwisePrecision());
            System.out.print("\tR=" + eval.pairwiseRecall());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: MeansClassifier.java

@Override
public void buildClassifier(Instances i) throws Exception {

    double[][] means = new double[i.numClasses()][i.numAttributes() - 1];

    for (int j = 0; j < i.size(); j++) {
        if (i.instance(j).classValue() == 0) {
            zeros.add(i.instance(j).value(0));
        }
        if (i.instance(j).classValue() == 1) {
            ones.add(i.instance(j).value(0));
        }
    }
    double zero1 = 0, one1 = 0;
    for (int k = 0; k < zeros.size(); k++) {
        zero1 = zero1 + zeros.get(k);
    }
    for (int l = 0; l < ones.size(); l++) {
        one1 = one1 + ones.get(l);
    }
    System.out.println("For class 0, mean is " + (zero1 / zeros.size()));
    System.out.println("For class 1, mean is " + (one1 / ones.size()));
}

From source file: ai.BalancedRandomForest.java

License: GNU General Public License

/**
 * Build Balanced Random Forest
 */
public void buildClassifier(final Instances data) throws Exception {
    // If number of features is 0 then set it to log2 of M (number of attributes)
    if (numFeatures < 1)
        numFeatures = (int) Utils.log2(data.numAttributes()) + 1;
    // Check maximum number of random features
    if (numFeatures >= data.numAttributes())
        numFeatures = data.numAttributes() - 1;

    // Initialize array of trees
    tree = new BalancedRandomTree[numTrees];

    // total number of instances
    final int numInstances = data.numInstances();
    // total number of classes
    final int numClasses = data.numClasses();

    final ArrayList<Integer>[] indexSample = new ArrayList[numClasses];
    for (int i = 0; i < numClasses; i++)
        indexSample[i] = new ArrayList<Integer>();

    //System.out.println("numClasses = " + numClasses);

    // fill indexSample with the indices of each class
    for (int i = 0; i < numInstances; i++) {
        //System.out.println("data.get("+i+").classValue() = " + data.get(i).classValue());
        indexSample[(int) data.get(i).classValue()].add(i);
    }

    final Random random = new Random(seed);

    // Executor service to run concurrent trees
    final ExecutorService exe = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());

    List<Future<BalancedRandomTree>> futures = new ArrayList<Future<BalancedRandomTree>>(numTrees);

    final boolean[][] inBag = new boolean[numTrees][numInstances];

    try {
        for (int i = 0; i < numTrees; i++) {
            final ArrayList<Integer> bagIndices = new ArrayList<Integer>();

            // Randomly select the indices in a balanced way
            for (int j = 0; j < numInstances; j++) {
                // Select first the class
                final int randomClass = random.nextInt(numClasses);
                // Select then a random sample of that class
                final int randomSample = random.nextInt(indexSample[randomClass].size());
                bagIndices.add(indexSample[randomClass].get(randomSample));
                inBag[i][indexSample[randomClass].get(randomSample)] = true;
            }

            // Create random tree
            final Splitter splitter = new Splitter(
                    new GiniFunction(numFeatures, data.getRandomNumberGenerator(random.nextInt())));

            futures.add(exe.submit(new Callable<BalancedRandomTree>() {
                public BalancedRandomTree call() {
                    return new BalancedRandomTree(data, bagIndices, splitter);
                }
            }));
        }

        // Grab all trained trees before proceeding
        for (int treeIdx = 0; treeIdx < numTrees; treeIdx++)
            tree[treeIdx] = futures.get(treeIdx).get();

        // Calculate out of bag error
        final boolean numeric = data.classAttribute().isNumeric();

        List<Future<Double>> votes = new ArrayList<Future<Double>>(data.numInstances());

        for (int i = 0; i < data.numInstances(); i++) {
            VotesCollector aCollector = new VotesCollector(tree, i, data, inBag);
            votes.add(exe.submit(aCollector));
        }

        double outOfBagCount = 0.0;
        double errorSum = 0.0;

        for (int i = 0; i < data.numInstances(); i++) {

            double vote = votes.get(i).get();

            // error for instance
            outOfBagCount += data.instance(i).weight();
            if (numeric) {
                errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();
            } else {
                if (vote != data.instance(i).classValue())
                    errorSum += data.instance(i).weight();
            }

        }

        outOfBagError = errorSum / outOfBagCount;

    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        exe.shutdownNow();
    }

}

From source file: ai.GiniFunction.java

License: GNU General Public License

/**
 * Create split function based on Gini coefficient
 *
 * @param data original data
 * @param indices indices of the samples to use
 */
public void init(Instances data, ArrayList<Integer> indices) {
    if (indices.size() == 0) {
        this.index = 0;
        this.threshold = 0;
        this.allSame = true;
        return;
    }

    final int len = data.numAttributes();
    final int numElements = indices.size();
    final int numClasses = data.numClasses();
    final int classIndex = data.classIndex();

    /** Attribute-class pair comparator (by attribute value) */
    final Comparator<AttributeClassPair> comp = new Comparator<AttributeClassPair>() {
        public int compare(AttributeClassPair o1, AttributeClassPair o2) {
            final double diff = o2.attributeValue - o1.attributeValue;
            if (diff < 0)
                return 1;
            else if (diff == 0)
                return 0;
            else
                return -1;
        }

        public boolean equals(Object o) {
            return false;
        }
    };

    // Create and shuffle indices of features to use
    ArrayList<Integer> allIndices = new ArrayList<Integer>();
    for (int i = 0; i < len; i++)
        if (i != classIndex)
            allIndices.add(i);

    double minimumGini = Double.MAX_VALUE;

    for (int i = 0; i < numOfFeatures; i++) {
        // Select the random feature
        final int index = random.nextInt(allIndices.size());
        final int featureToUse = allIndices.get(index);
        allIndices.remove(index); // remove that element to prevent from repetitions

        // Get the smallest Gini coefficient

        // Create list with pairs attribute-class
        final ArrayList<AttributeClassPair> list = new ArrayList<AttributeClassPair>();
        for (int j = 0; j < numElements; j++) {
            final Instance ins = data.get(indices.get(j));
            list.add(new AttributeClassPair(ins.value(featureToUse), (int) ins.value(classIndex)));
        }

        // Sort pairs in increasing order
        Collections.sort(list, comp);

        final double[] probLeft = new double[numClasses];
        final double[] probRight = new double[numClasses];
        // initial probabilities (all samples on the right)
        for (int n = 0; n < list.size(); n++)
            probRight[list.get(n).classValue]++;

        // Try all splitting points, from position 0 to the end
        for (int splitPoint = 0; splitPoint < numElements; splitPoint++) {
            // Calculate Gini coefficient
            double giniLeft = 0;
            double giniRight = 0;
            final int rightNumElements = numElements - splitPoint;

            for (int nClass = 0; nClass < numClasses; nClass++) {
                // left set
                double prob = probLeft[nClass];
                // Divide by the number of elements to get probabilities
                if (splitPoint != 0)
                    prob /= (double) splitPoint;
                giniLeft += prob * prob;

                // right set
                prob = probRight[nClass];
                // Divide by the number of elements to get probabilities
                if (rightNumElements != 0)
                    prob /= (double) rightNumElements;
                giniRight += prob * prob;
            }

            // Total Gini value
            final double gini = ((1.0 - giniLeft) * splitPoint + (1.0 - giniRight) * rightNumElements)
                    / (double) numElements;

            // Save values of minimum Gini coefficient
            if (gini < minimumGini) {
                minimumGini = gini;
                this.index = featureToUse;
                this.threshold = list.get(splitPoint).attributeValue;
            }

            // update probabilities for next iteration
            probLeft[list.get(splitPoint).classValue]++;
            probRight[list.get(splitPoint).classValue]--;
        }
    }

    // free list of possible indices to help garbage collector
    //allIndices.clear();
    //allIndices = null;
}

From source file: algoritmogeneticocluster.Cromossomo.java

private double getMacroAverage(Evaluation eval, Instances data) {
    double macroMeasure;
    double macroPrecision = 0;
    double macrorecall = 0;

    for (int i = 0; i < data.numClasses(); i++) {
        macroPrecision += eval.precision(i);
        macrorecall += eval.recall(i);
    }
    macroPrecision = macroPrecision / data.numClasses();
    macrorecall = macrorecall / data.numClasses();
    macroMeasure = (macroPrecision * macrorecall * 2) / (macroPrecision + macrorecall);
    //System.out.println("macroMeasure: " + macroMeasure);

    return macroMeasure;
}
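
A hypothetical call site for this method (not part of Cromossomo) could run a 10-fold cross-validation with weka's Evaluation class and then macro-average precision and recall over data.numClasses() classes. The J48 classifier is an assumption; weka.classifiers.Evaluation, weka.classifiers.trees.J48, and java.util.Random are assumed to be imported.

// Hypothetical usage: cross-validate a classifier, then macro-average per-class scores.
private double evaluateMacroF(Instances data) throws Exception {
    data.setClassIndex(data.numAttributes() - 1);        // last attribute as class
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(new J48(), data, 10, new Random(1));
    return getMacroAverage(eval, data);                   // macro-averaged F-measure
}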