List of usage examples for weka.core Instances numClasses
public int numClasses()
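For orientation, here is a minimal, self-contained sketch of the call (the iris.arff path is a placeholder): numClasses() requires the class index to be set first, and returns the number of distinct labels for a nominal class attribute, or 1 if the class is numeric.

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instances;

public class NumClassesDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset (file path is a placeholder)
        Instances data = new Instances(new BufferedReader(new FileReader("iris.arff")));
        // numClasses() requires the class index to be set first
        data.setClassIndex(data.numAttributes() - 1);
        // Number of distinct labels for a nominal class; 1 for a numeric class
        System.out.println("Number of classes: " + data.numClasses());
    }
}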
From source file:ID3Chi.java
License:Open Source License
private void SetNullDistribution(Instances data) {
    m_Attribute = null;
    m_ClassValue = Instance.missingValue();
    // TODO: think if it's better to keep all the distributions equal to 0
    m_Distribution = new double[data.numClasses()];
    for (int i = 0; i < m_Distribution.length; i++) {
        m_Distribution[i] = 1.0 / (double) data.numClasses();
    }
}
From source file:ID3Chi.java
License:Open Source License
/**
 * Computes the Chi-Square statistic for a given subset.
 *
 * @param subset the data for which the statistic is to be computed
 * @param att the attribute
 * @param setClassCounts class counts for the initial set of instances
 * @param setNumInstances number of instances in the initial set
 * @return the chi-square value for the given attribute and data
 */
private double computeChiSquareForSubset(Instances subset, Attribute att,
        double[] setClassCounts, double setNumInstances) {
    double[] subsetClassCounts = GetClassCounts(subset);
    double result = 0;
    double d = subset.numInstances() / setNumInstances;
    for (int j = 0; j < subset.numClasses(); j++) {
        double ciNew = setClassCounts[j] * d;
        if (ciNew > 0) {
            result += Math.pow(subsetClassCounts[j] - ciNew, 2) / ciNew;
        }
    }
    return result;
}
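The loop is the standard chi-square statistic, with the expected count for each class scaled by the subset's share of the parent set:

    chi^2 = sum_j (O_j - E_j)^2 / E_j

where O_j = subsetClassCounts[j] and E_j = setClassCounts[j] * subset.numInstances() / setNumInstances.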
From source file:ID3Chi.java
License:Open Source License
/**
 * Computes the entropy of a dataset.
 *
 * @param data the data for which entropy is to be computed
 * @return the entropy of the data's class distribution
 * @throws Exception if computation fails
 */
private double computeEntropy(Instances data) throws Exception {
    double[] classCounts = GetClassCounts(data);
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        if (classCounts[j] > 0) {
            entropy -= classCounts[j] * Utils.log2(classCounts[j]);
        }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
}
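The return value is an algebraic rearrangement of the usual entropy definition. With class counts c_j, N = data.numInstances(), and p_j = c_j / N:

    -sum_j p_j * log2(p_j) = log2(N) - (1/N) * sum_j c_j * log2(c_j)

which is exactly the accumulated sum divided by N plus log2(N) that the method returns; the rearrangement avoids a division inside the loop.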
From source file:ID3Chi.java
License:Open Source License
private double computeEntropyWithUnknowns(Instances data, Instances unknownData,
        double[] classCountsUnknownData, double ratio) throws Exception {
    double[] classCounts = GetClassCounts(data);
    double entropy = 0;
    for (int j = 0; j < data.numClasses(); j++) {
        double p = classCounts[j] + classCountsUnknownData[j] * ratio;
        if (p > 0) {
            entropy -= p * Utils.log2(p);
        }
    }
    entropy /= (double) data.numInstances();
    return entropy + Utils.log2(data.numInstances());
}
From source file:ID3Chi.java
License:Open Source License
private double[] GetClassCounts(Instances data) {
    double[] classCounts = new double[data.numClasses()];
    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        classCounts[(int) inst.classValue()]++;
    }
    return classCounts;
}
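In newer Weka releases (3.7+), Instances implements Iterable&lt;Instance&gt;, so the same count can be written without the raw Enumeration; a sketch, assuming a nominal class attribute with the class index set:

private double[] getClassCounts(Instances data) {
    double[] classCounts = new double[data.numClasses()];
    // Instances is Iterable<Instance> in Weka 3.7+
    for (Instance inst : data) {
        classCounts[(int) inst.classValue()]++;
    }
    return classCounts;
}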
From source file:MPCKMeans.java
License:Open Source License
public static void testCase() {
    try {
        String dataset = new String("lowd");
        //String dataset = new String("highd");
        if (dataset.equals("lowd")) {
            //////// Low-D data
            // String datafile = "/u/ml/data/bio/arffFromPhylo/ecoli_K12-100.arff";
            // String datafile = "/u/sugato/weka/data/digits-0.1-389.arff";
            String datafile = "/u/sugato/weka/data/iris.arff";
            int numPairs = 200, num = 0;

            // set up the data
            FileReader reader = new FileReader(datafile);
            Instances data = new Instances(reader);

            // Make the last attribute be the class
            int classIndex = data.numAttributes() - 1;
            data.setClassIndex(classIndex); // starts with 0
            System.out.println("ClassIndex is: " + classIndex);

            // Remove the class labels before clustering
            Instances clusterData = new Instances(data);
            clusterData.deleteClassAttribute();

            // create the pairs
            ArrayList labeledPair = InstancePair.getPairs(data, numPairs);
            System.out.println("Finished initializing constraint matrix");

            MPCKMeans mpckmeans = new MPCKMeans();
            mpckmeans.setUseMultipleMetrics(false);
            System.out.println("\nClustering the data using MPCKmeans...\n");

            WeightedEuclidean metric = new WeightedEuclidean();
            WEuclideanLearner metricLearner = new WEuclideanLearner();
            // LearnableMetric metric = new WeightedDotP();
            // MPCKMeansMetricLearner metricLearner = new DotPGDLearner();
            // KL metric = new KL();
            // KLGDLearner metricLearner = new KLGDLearner();
            // ((KL)metric).setUseIDivergence(true);
            // BarHillelMetric metric = new BarHillelMetric();
            // BarHillelMetricMatlab metric = new BarHillelMetricMatlab();
            // XingMetric metric = new XingMetric();
            // WeightedMahalanobis metric = new WeightedMahalanobis();

            mpckmeans.setMetric(metric);
            mpckmeans.setMetricLearner(metricLearner);
            mpckmeans.setVerbose(false);
            mpckmeans.setRegularize(false);
            mpckmeans.setTrainable(new SelectedTag(TRAINING_INTERNAL, TAGS_TRAINING));
            mpckmeans.setSeedable(true);
            mpckmeans.buildClusterer(labeledPair, clusterData, data, data.numClasses(), data.numInstances());
            mpckmeans.getIndexClusters();
            mpckmeans.printIndexClusters();

            SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(mpckmeans.m_TotalTrainWithLabels,
                    mpckmeans.m_TotalTrainWithLabels.numClasses(),
                    mpckmeans.m_TotalTrainWithLabels.numClasses());
            eval.evaluateModel(mpckmeans, mpckmeans.m_TotalTrainWithLabels, mpckmeans.m_Instances);
            System.out.println("MI=" + eval.mutualInformation());
            System.out.print("FM=" + eval.pairwiseFMeasure());
            System.out.print("\tP=" + eval.pairwisePrecision());
            System.out.print("\tR=" + eval.pairwiseRecall());
        } else if (dataset.equals("highd")) {
            //////// Newsgroup data
            String datafile = "/u/ml/users/sugato/groupcode/weka335/data/arffFromCCS/sanitized/different-1000_sanitized.arff";
            //String datafile = "/u/ml/users/sugato/groupcode/weka335/data/20newsgroups/small-newsgroup_fromCCS.arff";
            //String datafile = "/u/ml/users/sugato/groupcode/weka335/data/20newsgroups/same-100_fromCCS.arff";

            // set up the data
            FileReader reader = new FileReader(datafile);
            Instances data = new Instances(reader);

            // Make the last attribute be the class
            int classIndex = data.numAttributes() - 1;
            data.setClassIndex(classIndex); // starts with 0
            System.out.println("ClassIndex is: " + classIndex);

            // Remove the class labels before clustering
            Instances clusterData = new Instances(data);
            clusterData.deleteClassAttribute();

            // create the pairs
            int numPairs = 0, num = 0;
            ArrayList labeledPair = new ArrayList(numPairs);
            Random rand = new Random(42);
            System.out.println("Initializing constraint matrix:");
            while (num < numPairs) {
                int i = (int) (data.numInstances() * rand.nextFloat());
                int j = (int) (data.numInstances() * rand.nextFloat());
                int first = (i < j) ? i : j;
                int second = (i >= j) ? i : j;
                int linkType = (data.instance(first).classValue() == data.instance(second).classValue())
                        ? InstancePair.MUST_LINK
                        : InstancePair.CANNOT_LINK;
                InstancePair pair = new InstancePair(first, second, linkType);
                if (first != second && !labeledPair.contains(pair)) {
                    labeledPair.add(pair);
                    num++;
                }
            }
            System.out.println("Finished initializing constraint matrix");

            MPCKMeans mpckmeans = new MPCKMeans();
            mpckmeans.setUseMultipleMetrics(false);
            System.out.println("\nClustering the highd data using MPCKmeans...\n");

            LearnableMetric metric = new WeightedDotP();
            MPCKMeansMetricLearner metricLearner = new DotPGDLearner();
            // KL metric = new KL();
            // KLGDLearner metricLearner = new KLGDLearner();

            mpckmeans.setMetric(metric);
            mpckmeans.setMetricLearner(metricLearner);
            mpckmeans.setVerbose(false);
            mpckmeans.setRegularize(true);
            mpckmeans.setTrainable(new SelectedTag(TRAINING_INTERNAL, TAGS_TRAINING));
            mpckmeans.setSeedable(true);
            mpckmeans.buildClusterer(labeledPair, clusterData, data, data.numClasses(), data.numInstances());
            mpckmeans.getIndexClusters();

            SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(mpckmeans.m_TotalTrainWithLabels,
                    mpckmeans.m_TotalTrainWithLabels.numClasses(),
                    mpckmeans.m_TotalTrainWithLabels.numClasses());
            mpckmeans.getMetric().resetMetric(); // Vital: reset m_attrWeights to 1 for proper normalization
            eval.evaluateModel(mpckmeans, mpckmeans.m_TotalTrainWithLabels, mpckmeans.m_Instances);
            System.out.println("MI=" + eval.mutualInformation());
            System.out.print("FM=" + eval.pairwiseFMeasure());
            System.out.print("\tP=" + eval.pairwisePrecision());
            System.out.print("\tR=" + eval.pairwiseRecall());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
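Note that MPCKMeans, InstancePair, WeightedEuclidean, and SemiSupClustererEvaluation are not part of stock Weka; they appear to come from the WekaUT semi-supervised clustering extension. In this driver, data.numClasses() plays a double role: it supplies the target number of clusters to buildClusterer() and the class count to the evaluation.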
From source file:MeansClassifier.java
@Override
public void buildClassifier(Instances i) throws Exception {
    // Allocated for a per-class, per-attribute mean, but never filled below
    double[][] means = new double[i.numClasses()][i.numAttributes() - 1];
    // zeros and ones are assumed to be List<Double> fields of this classifier
    for (int j = 0; j < i.size(); j++) {
        if (i.instance(j).classValue() == 0) {
            zeros.add(i.instance(j).value(0));
        }
        if (i.instance(j).classValue() == 1) {
            ones.add(i.instance(j).value(0));
        }
    }
    double zero1 = 0, one1 = 0;
    for (int k = 0; k < zeros.size(); k++) {
        zero1 = zero1 + zeros.get(k);
    }
    for (int l = 0; l < ones.size(); l++) {
        one1 = one1 + ones.get(l);
    }
    System.out.println("For class 0, mean is " + (zero1 / zeros.size()));
    System.out.println("For class 1, mean is " + (one1 / ones.size()));
}
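The means array above is sized per class via i.numClasses() but never filled. A hypothetical sketch of the presumably intended generalization, assuming the class is the last attribute and all other attributes are numeric:

// Hypothetical completion: fill means[class][attribute] for every class
double[][] sums = new double[i.numClasses()][i.numAttributes() - 1];
int[] counts = new int[i.numClasses()];
for (int j = 0; j < i.numInstances(); j++) {
    int c = (int) i.instance(j).classValue();
    counts[c]++;
    for (int a = 0; a < i.numAttributes() - 1; a++) {
        sums[c][a] += i.instance(j).value(a);
    }
}
for (int c = 0; c < i.numClasses(); c++) {
    for (int a = 0; a < i.numAttributes() - 1; a++) {
        // guard against empty classes to avoid division by zero
        means[c][a] = counts[c] > 0 ? sums[c][a] / counts[c] : 0.0;
    }
}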
From source file:ai.BalancedRandomForest.java
License:GNU General Public License
/**
 * Build the Balanced Random Forest.
 */
public void buildClassifier(final Instances data) throws Exception {
    // If the number of features is not set, default to log2(number of attributes) + 1
    if (numFeatures < 1)
        numFeatures = (int) Utils.log2(data.numAttributes()) + 1;

    // Cap the number of random features
    if (numFeatures >= data.numAttributes())
        numFeatures = data.numAttributes() - 1;

    // Initialize array of trees
    tree = new BalancedRandomTree[numTrees];

    // total number of instances and classes
    final int numInstances = data.numInstances();
    final int numClasses = data.numClasses();

    // fill indexSample with the indices of each class
    final ArrayList<Integer>[] indexSample = new ArrayList[numClasses];
    for (int i = 0; i < numClasses; i++)
        indexSample[i] = new ArrayList<Integer>();
    for (int i = 0; i < numInstances; i++) {
        indexSample[(int) data.get(i).classValue()].add(i);
    }

    final Random random = new Random(seed);

    // Executor service to run concurrent trees
    final ExecutorService exe = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
    List<Future<BalancedRandomTree>> futures = new ArrayList<Future<BalancedRandomTree>>(numTrees);

    final boolean[][] inBag = new boolean[numTrees][numInstances];

    try {
        for (int i = 0; i < numTrees; i++) {
            final ArrayList<Integer> bagIndices = new ArrayList<Integer>();
            // Randomly select the indices in a balanced way
            for (int j = 0; j < numInstances; j++) {
                // Select first the class, then a random sample of that class
                final int randomClass = random.nextInt(numClasses);
                final int randomSample = random.nextInt(indexSample[randomClass].size());
                bagIndices.add(indexSample[randomClass].get(randomSample));
                inBag[i][indexSample[randomClass].get(randomSample)] = true;
            }
            // Create random tree
            final Splitter splitter = new Splitter(
                    new GiniFunction(numFeatures, data.getRandomNumberGenerator(random.nextInt())));
            futures.add(exe.submit(new Callable<BalancedRandomTree>() {
                public BalancedRandomTree call() {
                    return new BalancedRandomTree(data, bagIndices, splitter);
                }
            }));
        }

        // Grab all trained trees before proceeding
        for (int treeIdx = 0; treeIdx < numTrees; treeIdx++)
            tree[treeIdx] = futures.get(treeIdx).get();

        // Calculate out-of-bag error
        final boolean numeric = data.classAttribute().isNumeric();
        List<Future<Double>> votes = new ArrayList<Future<Double>>(data.numInstances());
        for (int i = 0; i < data.numInstances(); i++) {
            VotesCollector aCollector = new VotesCollector(tree, i, data, inBag);
            votes.add(exe.submit(aCollector));
        }
        double outOfBagCount = 0.0;
        double errorSum = 0.0;
        for (int i = 0; i < data.numInstances(); i++) {
            double vote = votes.get(i).get();
            // error for instance
            outOfBagCount += data.instance(i).weight();
            if (numeric) {
                errorSum += StrictMath.abs(vote - data.instance(i).classValue()) * data.instance(i).weight();
            } else {
                if (vote != data.instance(i).classValue())
                    errorSum += data.instance(i).weight();
            }
        }
        outOfBagError = errorSum / outOfBagCount;
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        exe.shutdownNow();
    }
}
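Design note: each bag draw first picks a class uniformly at random (random.nextInt(numClasses)) and then an instance uniformly within that class, so every class contributes numInstances / numClasses samples in expectation regardless of the original class skew; this is what makes the bootstrap "balanced".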
From source file:ai.GiniFunction.java
License:GNU General Public License
/**
 * Create the split function based on the Gini coefficient.
 *
 * @param data original data
 * @param indices indices of the samples to use
 */
public void init(Instances data, ArrayList<Integer> indices) {
    if (indices.size() == 0) {
        this.index = 0;
        this.threshold = 0;
        this.allSame = true;
        return;
    }
    final int len = data.numAttributes();
    final int numElements = indices.size();
    final int numClasses = data.numClasses();
    final int classIndex = data.classIndex();

    /** Attribute-class pair comparator (by attribute value, ascending) */
    final Comparator<AttributeClassPair> comp = new Comparator<AttributeClassPair>() {
        public int compare(AttributeClassPair o1, AttributeClassPair o2) {
            final double diff = o2.attributeValue - o1.attributeValue;
            if (diff < 0)
                return 1;
            else if (diff == 0)
                return 0;
            else
                return -1;
        }

        public boolean equals(Object o) {
            return false;
        }
    };

    // Indices of the candidate features (all attributes except the class)
    ArrayList<Integer> allIndices = new ArrayList<Integer>();
    for (int i = 0; i < len; i++)
        if (i != classIndex)
            allIndices.add(i);

    double minimumGini = Double.MAX_VALUE;
    for (int i = 0; i < numOfFeatures; i++) {
        // Select a random feature and remove it to prevent repetitions
        final int index = random.nextInt(allIndices.size());
        final int featureToUse = allIndices.get(index);
        allIndices.remove(index);

        // Create a list of attribute-class pairs for the selected feature
        final ArrayList<AttributeClassPair> list = new ArrayList<AttributeClassPair>();
        for (int j = 0; j < numElements; j++) {
            final Instance ins = data.get(indices.get(j));
            list.add(new AttributeClassPair(ins.value(featureToUse), (int) ins.value(classIndex)));
        }

        // Sort pairs in increasing order of attribute value
        Collections.sort(list, comp);

        final double[] probLeft = new double[numClasses];
        final double[] probRight = new double[numClasses];
        // initial class counts (all samples on the right)
        for (int n = 0; n < list.size(); n++)
            probRight[list.get(n).classValue]++;

        // Try all splitting points, from position 0 to the end
        for (int splitPoint = 0; splitPoint < numElements; splitPoint++) {
            // Calculate the Gini coefficient of each side
            double giniLeft = 0;
            double giniRight = 0;
            final int rightNumElements = numElements - splitPoint;
            for (int nClass = 0; nClass < numClasses; nClass++) {
                // left set: divide by the number of elements to get probabilities
                double prob = probLeft[nClass];
                if (splitPoint != 0)
                    prob /= (double) splitPoint;
                giniLeft += prob * prob;

                // right set
                prob = probRight[nClass];
                if (rightNumElements != 0)
                    prob /= (double) rightNumElements;
                giniRight += prob * prob;
            }

            // Total size-weighted Gini value of the split
            final double gini = ((1.0 - giniLeft) * splitPoint + (1.0 - giniRight) * rightNumElements)
                    / (double) numElements;

            // Save the feature and threshold of the minimum Gini coefficient
            if (gini < minimumGini) {
                minimumGini = gini;
                this.index = featureToUse;
                this.threshold = list.get(splitPoint).attributeValue;
            }

            // update class counts for the next iteration
            probLeft[list.get(splitPoint).classValue]++;
            probRight[list.get(splitPoint).classValue]--;
        }
    }
}
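The quantity being minimized is the size-weighted Gini impurity of the two children. For a child with class probabilities p_c, the impurity is 1 - sum_c p_c^2; the code accumulates sum_c p_c^2 into giniLeft and giniRight, then combines:

    gini_split = (splitPoint * (1 - giniLeft) + rightNumElements * (1 - giniRight)) / numElements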
From source file:algoritmogeneticocluster.Cromossomo.java
private double getMacroAverage(Evaluation eval, Instances data) {
    double macroMeasure;
    double macroPrecision = 0;
    double macroRecall = 0;
    for (int i = 0; i < data.numClasses(); i++) {
        macroPrecision += eval.precision(i);
        macroRecall += eval.recall(i);
    }
    macroPrecision = macroPrecision / data.numClasses();
    macroRecall = macroRecall / data.numClasses();
    macroMeasure = (macroPrecision * macroRecall * 2) / (macroPrecision + macroRecall);
    //System.out.println("macroMeasure: " + macroMeasure);
    return macroMeasure;
}
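For reference, the returned value is the macro-averaged F-measure, F = 2 * P * R / (P + R), where P and R are precision and recall averaged over the data.numClasses() classes. Averaging precision and recall first and then combining, as done here, generally differs from averaging the per-class F1 scores.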