List of usage examples for weka.core Instances numClasses
public int numClasses()
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Creates a binary split on a numeric attribute by scanning every candidate
 * threshold and keeping the one with the highest information gain.
 *
 * Side effects: (re)assigns m_distribution, m_infoGain, m_gainRatio,
 * m_numSubsets and m_splitPoint on this split object.
 *
 * @param trainInstances the training data, assumed sorted on m_attIndex so
 *        that instances with a missing value come last — TODO confirm with caller
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {
    int firstMiss;
    int next = 1;
    int last = 0;
    int index = 0;             // number of candidate splits actually evaluated
    int splitIndex = -1;       // index of the last instance on the "low" side of the best split
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute: two bags (<= split, > split).
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only instances with known values are relevant; everything starts in bag 1.
    // The first missing value terminates the scan (missing are sorted to the end).
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (instance.isMissing(m_attIndex))
            break;
        m_distribution.add(1, instance);
        i++;
    }
    firstMiss = i; // index of first instance with a missing attribute value

    // Compute minimum number of instances required in each subset:
    // 10% of the average weight per class, clamped to [m_minNoObj, 25].
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
        minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25))
        minSplit = 25;

    // Enough instances with known values to form two subsets?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
        return;

    // Entropy before splitting — baseline for the gain computation.
    defaultEnt = m_infoGainCrit.oldEnt(m_distribution);

    // Evaluate every distinct-value boundary as a candidate split point.
    while (next < firstMiss) {
        // Only split between genuinely distinct values (1e-5 guards float noise).
        if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next)
                .value(m_attIndex)) {

            // Move class counts for instances [last, next) from bag 1 to bag 0,
            // i.e. advance the candidate threshold incrementally.
            m_distribution.shiftRange(1, 0, trainInstances, last, next);

            // Only consider splits leaving enough weight in both bags.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
                    && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
                currentInfoGain = m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
                if (Utils.gr(currentInfoGain, m_infoGain)) {
                    m_infoGain = currentInfoGain;
                    splitIndex = next - 1;
                }
                index++;
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split?
    if (index == 0)
        return;

    // C4.5 correction: penalize gain by log2(#candidates)/totalWeight.
    m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
        return;

    // Record the best split: midpoint between the two straddling values.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex)
            + trainInstances.instance(splitIndex).value(m_attIndex)) / 2;

    // In case the midpoint rounds up to the higher value (numerical precision),
    // fall back to the smaller value so "<= splitPoint" keeps the low instance.
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
        m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Rebuild the distribution for the best split (the scan left it at the last
    // candidate, not necessarily the best one).
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for the best split.
    m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}
From source file:j48.C45PruneableClassifierTreeG.java
License:Open Source License
/**
 * Finds new nodes that improve accuracy and grafts them onto the tree
 * (Webb's C4.5 "grafting" post-process). Candidate tests are gathered into a
 * list of GraftSplit tuples, pruned by a binomial-probability criterion, and
 * the survivors are attached as descendents of this leaf via the parent.
 *
 * NOTE(review): this method mutates {@code iindex} in place (it compacts the
 * weights to the front of each row) — callers must not rely on the original
 * layout afterwards.
 *
 * @param fulldata   the instances in the whole trainset
 * @param iindex     row 0: weight of each instance at this leaf; row 1: weight
 *                   in the "atbop" (area traversed by other paths) — presumably;
 *                   TODO confirm against caller
 * @param limits     per-attribute lower/upper limits for numeric attributes
 * @param parent     the node immediately before the current one
 * @param pLaplace   laplace for leaf, calculated by parent (in case leaf empty)
 * @param pLeafClass class of leaf, determined by parent (in case leaf empty)
 */
private void findGraft(Instances fulldata, double[][] iindex, double[][] limits, ClassifierTree parent,
        double pLaplace, int pLeafClass) throws Exception {

    // get the class for this leaf (fall back to parent's answer if empty)
    int leafClass = (m_isEmpty) ? pLeafClass : localModel().distribution().maxClass();

    // get the laplace value for this leaf
    double leafLaplace = (m_isEmpty) ? pLaplace : laplaceLeaf(leafClass);

    // Sort the instances into those at the leaf (l), those in atbop (n);
    // the rest are discarded. Weights in iindex are compacted in step.
    Instances l = new Instances(fulldata, fulldata.numInstances());
    Instances n = new Instances(fulldata, fulldata.numInstances());
    int lcount = 0;
    int acount = 0;
    for (int x = 0; x < fulldata.numInstances(); x++) {
        if (iindex[0][x] <= 0 && iindex[1][x] <= 0)
            continue;
        if (iindex[0][x] != 0) {
            l.add(fulldata.instance(x));
            l.instance(lcount).setWeight(iindex[0][x]);
            // move instance's weight in iindex to same index as in l
            iindex[0][lcount++] = iindex[0][x];
        }
        if (iindex[1][x] > 0) {
            n.add(fulldata.instance(x));
            n.instance(acount).setWeight(iindex[1][x]);
            // move instance's weight in iindex to same index as in n
            iindex[1][acount++] = iindex[1][x];
        }
    }

    // Quick feasibility check: is there any class whose laplace estimate over
    // the atbop could beat this leaf's laplace with significant probability?
    boolean graftPossible = false;
    double[] classDist = new double[n.numClasses()];
    for (int x = 0; x < n.numInstances(); x++) {
        if (iindex[1][x] > 0 && !n.instance(x).classIsMissing())
            classDist[(int) n.instance(x).classValue()] += iindex[1][x];
    }
    for (int cVal = 0; cVal < n.numClasses(); cVal++) {
        double theLaplace = (classDist[cVal] + 1.0) / (classDist[cVal] + 2.0);
        if (cVal != leafClass && (theLaplace > leafLaplace)
                && (biprob(classDist[cVal], classDist[cVal], leafLaplace) > m_BiProbCrit)) {
            graftPossible = true;
            break;
        }
    }
    if (!graftPossible) {
        return;
    }

    // 1. Initialize to {} a set of tuples t containing potential tests
    ArrayList t = new ArrayList();

    // go through each attribute
    for (int a = 0; a < n.numAttributes(); a++) {
        if (a == n.classIndex())
            continue; // skip the class

        // sort instances in atbop by attribute a (missing sorted to the end)
        int[] sorted = sortByAttribute(n, a);

        // 2. For each continuous attribute $a:
        if (n.attribute(a).isNumeric()) {

            // find min and max values for this attribute at the leaf;
            // a missing value on a leafClass case prohibits the attribute.
            boolean prohibited = false;
            double minLeaf = Double.POSITIVE_INFINITY;
            double maxLeaf = Double.NEGATIVE_INFINITY;
            for (int i = 0; i < l.numInstances(); i++) {
                if (l.instance(i).isMissing(a)) {
                    if (l.instance(i).classValue() == leafClass) {
                        prohibited = true;
                        break;
                    }
                }
                double value = l.instance(i).value(a);
                if (!m_relabel || l.instance(i).classValue() == leafClass) {
                    if (value < minLeaf)
                        minLeaf = value;
                    if (value > maxLeaf)
                        maxLeaf = value;
                }
            }
            if (prohibited) {
                continue;
            }

            // (a) find a value $v existing in the atbop with
            //     lowerlim(l,a) < v < minLeaf, and a class $k maximizing the
            //     laplace of {x in cases(n) : value(a,x) <= v}.
            //     (note: error in original paper stated that $v must be
            //     smaller OR EQUAL TO the min value).
            double minBestClass = Double.NaN;
            double minBestLaplace = leafLaplace;
            double minBestVal = Double.NaN;
            double minBestPos = Double.NaN;
            double minBestTotal = Double.NaN;
            double[][] minBestCounts = null;
            double[][] counts = new double[2][n.numClasses()];
            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    break; // missing are sorted to end: no more valid vals

                double theval = n.instance(sorted[x]).value(a);
                if (m_Debug)
                    System.out.println("\t " + theval);

                if (theval <= limits[a][0]) {
                    if (m_Debug)
                        System.out.println("\t <= lowerlim: continuing...");
                    continue;
                }
                // note: error in paper would have this read "theval > minLeaf"
                if (theval >= minLeaf) {
                    if (m_Debug)
                        System.out.println("\t >= minLeaf; breaking...");
                    break;
                }

                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                // absorb all duplicates of theval in one step (x advances too)
                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                // work out the best laplace/class (for <= theval)
                double total = Utils.sum(counts[0]);
                for (int c = 0; c < n.numClasses(); c++) {
                    double temp = (counts[0][c] + 1.0) / (total + 2.0);
                    if (temp > minBestLaplace) {
                        minBestPos = counts[0][c];
                        minBestTotal = total;
                        minBestLaplace = temp;
                        minBestClass = c;
                        minBestCounts = copyCounts(counts);
                        // split value: midpoint to the next distinct value
                        minBestVal = (x == n.numInstances() - 1) ? theval
                                : ((theval + n.instance(sorted[x + 1]).value(a)) / 2.0);
                    }
                }
            }

            // (b) add to t tuple <n,a,v,k,L',"<="> if statistically significant
            if (!Double.isNaN(minBestVal) && biprob(minBestPos, minBestTotal, leafLaplace) > m_BiProbCrit) {
                GraftSplit gsplit = null;
                try {
                    gsplit = new GraftSplit(a, minBestVal, 0, leafClass, minBestCounts);
                } catch (Exception e) {
                    System.err.println("graftsplit error: " + e.getMessage());
                    System.exit(1);
                }
                t.add(gsplit);
            }
            // free space
            minBestCounts = null;

            // (c) symmetric case: find $v in the atbop with
            //     maxLeaf < v <= upperlim(l,a), and a class $k maximizing the
            //     laplace of {x in cases(n) : value(a,x) > v}.
            double maxBestClass = -1;
            double maxBestLaplace = leafLaplace;
            double maxBestVal = Double.NaN;
            double maxBestPos = Double.NaN;
            double maxBestTotal = Double.NaN;
            double[][] maxBestCounts = null;
            for (int c = 0; c < n.numClasses(); c++) {
                // zero the counts
                counts[0][c] = 0;
                counts[1][c] = 0; // shouldn't need to do this ...
            }

            // check smallest val for a in atbop is < upper limit
            if (n.numInstances() >= 1 && n.instance(sorted[0]).value(a) < limits[a][1]) {
                // scan from the high end downwards
                for (int x = n.numInstances() - 1; x >= 0; x--) {
                    if (n.instance(sorted[x]).isMissing(a))
                        continue;
                    double theval = n.instance(sorted[x]).value(a);
                    if (m_Debug)
                        System.out.println("\t " + theval);

                    if (theval > limits[a][1]) {
                        if (m_Debug)
                            System.out.println("\t >= upperlim; continuing...");
                        continue;
                    }
                    if (theval <= maxLeaf) {
                        if (m_Debug)
                            System.out.println("\t < maxLeaf; breaking...");
                        break;
                    }

                    // increment counts
                    counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    // absorb all duplicates of theval in one step (x retreats too)
                    if (x != 0 && !n.instance(sorted[x - 1]).isMissing(a)) {
                        int z = x - 1;
                        while (z >= 0 && n.instance(sorted[z]).value(a) == theval) {
                            z--;
                            x--;
                            counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                        }
                    }

                    // work out best laplace for > theval
                    double total = Utils.sum(counts[1]);
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[1][c] + 1.0) / (total + 2.0);
                        if (temp > maxBestLaplace) {
                            maxBestPos = counts[1][c];
                            maxBestTotal = total;
                            maxBestLaplace = temp;
                            maxBestClass = c;
                            maxBestCounts = copyCounts(counts);
                            maxBestVal = (x == 0) ? theval
                                    : ((theval + n.instance(sorted[x - 1]).value(a)) / 2.0);
                        }
                    }
                }

                // (d) add to t tuple <n,a,v,k,L',">"> if statistically significant
                if (!Double.isNaN(maxBestVal) && biprob(maxBestPos, maxBestTotal, leafLaplace) > m_BiProbCrit) {
                    GraftSplit gsplit = null;
                    try {
                        gsplit = new GraftSplit(a, maxBestVal, 1, leafClass, maxBestCounts);
                    } catch (Exception e) {
                        System.err.println("graftsplit error:" + e.getMessage());
                        System.exit(1);
                    }
                    t.add(gsplit);
                }
            }
        } else { // must be a nominal attribute

            // 3. for each discrete attribute a for which there is no
            //    test at an ancestor of l
            // skip if this attribute has already been used
            if (limits[a][1] == 1) {
                continue;
            }

            // a value is prohibited if a leaf case (of leafClass, when
            // relabelling) has that value or a missing value for a.
            boolean[] prohibit = new boolean[l.attribute(a).numValues()];
            for (int aval = 0; aval < n.attribute(a).numValues(); aval++) {
                for (int x = 0; x < l.numInstances(); x++) {
                    if ((l.instance(x).isMissing(a) || l.instance(x).value(a) == aval)
                            && (!m_relabel || (l.instance(x).classValue() == leafClass))) {
                        prohibit[aval] = true;
                        break;
                    }
                }
            }

            // (a) find a value $v for $a and a class $k maximizing the
            //     laplace of {x in cases(n) : value(a,x) = v}.
            double bestVal = Double.NaN;
            double bestClass = Double.NaN;
            double bestLaplace = leafLaplace;
            double[][] bestCounts = null;
            double[][] counts = new double[2][n.numClasses()];
            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    continue;

                // zero the counts for this value group
                for (int c = 0; c < n.numClasses(); c++)
                    counts[0][c] = 0;

                double theval = n.instance(sorted[x]).value(a);
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                // absorb all instances sharing this value (x advances too)
                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                if (!prohibit[(int) theval]) {
                    // work out best laplace for this value
                    double total = Utils.sum(counts[0]);
                    bestLaplace = leafLaplace;
                    bestClass = Double.NaN;
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[0][c] + 1.0) / (total + 2.0);
                        if (temp > bestLaplace && biprob(counts[0][c], total, leafLaplace) > m_BiProbCrit) {
                            bestLaplace = temp;
                            bestClass = c;
                            bestVal = theval;
                            bestCounts = copyCounts(counts);
                        }
                    }
                    // (b) add to t tuple <n,a,v,k,L',"="> — significance already checked
                    if (!Double.isNaN(bestClass)) {
                        GraftSplit gsplit = null;
                        try {
                            gsplit = new GraftSplit(a, bestVal, 2, leafClass, bestCounts);
                        } catch (Exception e) {
                            System.err.println("graftsplit error: " + e.getMessage());
                            System.exit(1);
                        }
                        t.add(gsplit);
                    }
                }
            }
        }
    }

    // 4. remove from t all tuples <n,a,v,c,L,x> such that L <=
    //    Laplace(cases(l),c) or prob(x,n,Laplace(cases(l),c)) <= 0.05
    //    -- checked this constraint prior to adding a tuple --

    // *** step six done before step five for efficiency ***
    // 6. order the tuples on L (this actually orders lowest to highest)
    Collections.sort(t);

    // 5. remove from t all tuples <n,a,v,c,L,x> such that there is
    //    no tuple <n',a',v',k',L',x'> such that k' != c & L' < L.
    for (int x = 0; x < t.size(); x++) {
        GraftSplit gs = (GraftSplit) t.get(x);
        if (gs.maxClassForSubsetOfInterest() != leafClass) {
            break; // reached a graft with class != leafClass, so stop deleting
        } else {
            t.remove(x);
            x--;
        }
    }

    // if no potential grafts were found, do nothing and return
    if (t.size() < 1) {
        return;
    }

    // create the distributions for each graft
    for (int x = t.size() - 1; x >= 0; x--) {
        GraftSplit gs = (GraftSplit) t.get(x);
        try {
            gs.buildClassifier(l);
            gs.deleteGraftedCases(l); // so they don't go down the other branch
        } catch (Exception e) {
            System.err.println("graftsplit build error: " + e.getMessage());
        }
    }

    // add this stuff to the tree
    ((C45PruneableClassifierTreeG) parent).setDescendents(t, this);
}
From source file:j48.C45Split.java
License:Open Source License
/** * Creates split on enumerated attribute. * /*from w w w. ja v a 2 s .co m*/ * @exception Exception * if something goes wrong */ private void handleEnumeratedAttribute(Instances trainInstances) throws Exception { Instance instance; m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses()); // Only Instances with known values are relevant. Enumeration enu = trainInstances.enumerateInstances(); while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (!instance.isMissing(m_attIndex)) m_distribution.add((int) instance.value(m_attIndex), instance); } // Check if minimum number of Instances in at least two // subsets. if (m_distribution.check(m_minNoObj)) { m_numSubsets = m_complexityIndex; m_infoGain = infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights); m_gainRatio = gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain); } }
From source file:j48.C45Split.java
License:Open Source License
/** * Creates split on numeric attribute./*from w w w . j a v a 2 s . c om*/ * * @exception Exception * if something goes wrong */ private void handleNumericAttribute(Instances trainInstances) throws Exception { int firstMiss; int next = 1; int last = 0; int splitIndex = -1; double currentInfoGain; double defaultEnt; double minSplit; Instance instance; int i; // Current attribute is a numeric attribute. m_distribution = new Distribution(2, trainInstances.numClasses()); // Only Instances with known values are relevant. Enumeration enu = trainInstances.enumerateInstances(); i = 0; while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (instance.isMissing(m_attIndex)) break; m_distribution.add(1, instance); i++; } firstMiss = i; // Compute minimum number of Instances required in each // subset. minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses()); if (Utils.smOrEq(minSplit, m_minNoObj)) minSplit = m_minNoObj; else if (Utils.gr(minSplit, 25)) minSplit = 25; // Enough Instances with known values? if (Utils.sm((double) firstMiss, 2 * minSplit)) return; // Compute values of criteria for all possible split // indices. defaultEnt = infoGainCrit.oldEnt(m_distribution); while (next < firstMiss) { if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next) .value(m_attIndex)) { // Move class values for all Instances up to next // possible split point. m_distribution.shiftRange(1, 0, trainInstances, last, next); // Check if enough Instances in each subset and compute // values for criteria. if (Utils.grOrEq(m_distribution.perBag(0), minSplit) && Utils.grOrEq(m_distribution.perBag(1), minSplit)) { currentInfoGain = infoGainCrit.splitCritValue1(m_distribution, m_sumOfWeights, defaultEnt, rrrrr); if (Utils.gr(currentInfoGain, m_infoGain)) { m_infoGain = currentInfoGain; splitIndex = next - 1; } m_index++; } last = next; } next++; } // Was there any useful split? 
if (m_index == 0) return; // Compute modified information gain for best split. m_infoGain = m_infoGain - (Utils.log2(m_index) / m_sumOfWeights); if (Utils.smOrEq(m_infoGain, 0)) return; // Set instance variables' values to values for // best split. m_numSubsets = 2; m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex) + trainInstances.instance(splitIndex).value(m_attIndex)) / 2; // In case we have a numerical precision problem we need to choose the // smaller value if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) { m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex); } // Restore distributioN for best split. m_distribution = new Distribution(2, trainInstances.numClasses()); m_distribution.addRange(0, trainInstances, 0, splitIndex + 1); m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss); // Compute modified gain ratio for best split. m_gainRatio = gainRatioCrit.splitCritValue1(m_distribution, m_sumOfWeights, m_infoGain, lllll); }
From source file:j48.Distribution.java
License:Open Source License
/**
 * Creates a single-bag distribution holding every instance in the given
 * dataset (per-class weights accumulated into bag 0).
 *
 * @param source the instances to summarize
 * @exception Exception if something goes wrong
 */
public Distribution(Instances source) throws Exception {
    final int numClasses = source.numClasses();

    m_perClassPerBag = new double[1][0];
    m_perBag = new double[1];
    totaL = 0;
    m_perClass = new double[numClasses];
    m_perClassPerBag[0] = new double[numClasses];

    // Every instance lands in the single bag, bag 0.
    for (Enumeration enu = source.enumerateInstances(); enu.hasMoreElements();) {
        add(0, (Instance) enu.nextElement());
    }
}
From source file:j48.Distribution.java
License:Open Source License
/**
 * Creates a distribution whose bags mirror the subsets of the given split
 * model. Instances the model cannot place ({@code whichSubset == -1}) are
 * spread across bags using the model's weights.
 *
 * @param source     the instances to distribute
 * @param modelToUse the split model defining the subsets
 * @exception Exception if something goes wrong
 */
public Distribution(Instances source, ClassifierSplitModel modelToUse) throws Exception {
    final int numSubsets = modelToUse.numSubsets();
    final int numClasses = source.numClasses();

    m_perClassPerBag = new double[numSubsets][0];
    m_perBag = new double[numSubsets];
    totaL = 0;
    m_perClass = new double[numClasses];
    for (int bag = 0; bag < numSubsets; bag++) {
        m_perClassPerBag[bag] = new double[numClasses];
    }

    for (Enumeration enu = source.enumerateInstances(); enu.hasMoreElements();) {
        Instance current = (Instance) enu.nextElement();
        int bag = modelToUse.whichSubset(current);
        if (bag == -1) {
            // Subset undetermined (e.g. missing split value): spread the
            // instance across bags according to the model's weights.
            addWeights(current, modelToUse.weights(current));
        } else {
            add(bag, current);
        }
    }
}
From source file:j48.GraftSplit.java
License:Open Source License
/** * builds m_graftdistro using the passed data * * @param data the instances to use when creating the distribution */// w w w. j a v a 2 s . co m public void buildClassifier(Instances data) throws Exception { // distribution for the graft, not counting cases in atbop, only orig leaf m_graftdistro = new Distribution(2, data.numClasses()); // which subset are we looking at for the graft? int subset = subsetOfInterest(); // this is the subset for m_leaf double thisNodeCount = 0; double knownCases = 0; boolean allKnown = true; // populate distribution for (int x = 0; x < data.numInstances(); x++) { Instance instance = data.instance(x); if (instance.isMissing(m_attIndex)) { allKnown = false; continue; } knownCases += instance.weight(); int subst = whichSubset(instance); if (subst == -1) continue; m_graftdistro.add(subst, instance); if (subst == subset) { // instance belongs at m_leaf thisNodeCount += instance.weight(); } } double factor = (knownCases == 0) ? (1.0 / (double) 2.0) : (thisNodeCount / knownCases); if (!allKnown) { for (int x = 0; x < data.numInstances(); x++) { if (data.instance(x).isMissing(m_attIndex)) { Instance instance = data.instance(x); int subst = whichSubset(instance); if (subst == -1) continue; instance.setWeight(instance.weight() * factor); m_graftdistro.add(subst, instance); } } } // if there are no cases at the leaf, make sure the desired // class is chosen, by setting counts to 0.01 if (m_graftdistro.perBag(subset) == 0) { double[] counts = new double[data.numClasses()]; counts[m_maxClass] = 0.01; m_graftdistro.add(subset, counts); } if (m_graftdistro.perBag((subset == 0) ? 1 : 0) == 0) { double[] counts = new double[data.numClasses()]; counts[(int) m_otherLeafMaxClass] = 0.01; m_graftdistro.add((subset == 0) ? 1 : 0, counts); } }
From source file:liac.igmn.loader.DataLoader.java
License:Open Source License
/** * Carrega dataset a partir de arquivo ARFF e binariza os atributos nominais. * Assume que a classe seja o ultimo atributo. * /*from w w w . j a va 2s . com*/ * @param filename path do arquivo * @return dataset * @throws DataLoaderException lancado quando o arquivo nao e encontrado * ou quando ocorre algum erro de IO */ public static Dataset loadARFF(String filename) throws DataLoaderException { Dataset dataset = new Dataset(); try { ArffLoader loader = new ArffLoader(); loader.setSource(new File(filename)); Instances data = loader.getDataSet(); Instances m_Intances = new Instances(data); data.setClassIndex(data.numAttributes() - 1); String[] classes = new String[data.numClasses()]; for (int i = 0; i < data.numClasses(); i++) classes[i] = data.classAttribute().value(i); dataset.setClassesNames(classes); NominalToBinary filter = new NominalToBinary(); filter.setInputFormat(m_Intances); filter.setOptions(new String[] { "-A" }); m_Intances = Filter.useFilter(m_Intances, filter); int inputSize = m_Intances.numAttributes() - data.numClasses(); dataset.setInputSize(inputSize); dataset.setNumClasses(data.numClasses()); dataset.setWekaDataset(m_Intances); } catch (IOException e) { throw new DataLoaderException("Arquivo no encontrado", e.getCause()); } catch (Exception e) { throw new DataLoaderException("Falha na converso do arquivo", e.getCause()); } return dataset; }
From source file:LogReg.Logistic.java
License:Open Source License
/**
 * Builds the multinomial logistic regression classifier: filters the data
 * (missing values, useless attributes, nominal-to-binary), normalizes the
 * predictors, fits K-1 sets of coefficients by quasi-Newton optimization, and
 * converts the coefficients back to the original attribute units.
 *
 * @param train the training data to be used for generating the boosted
 *        classifier
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances train) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(train);

    // remove instances with missing class (work on a copy)
    train = new Instances(train);
    train.deleteWithMissingClass();

    // Replace missing values
    m_ReplaceMissingValues = new ReplaceMissingValues();
    m_ReplaceMissingValues.setInputFormat(train);
    train = Filter.useFilter(train, m_ReplaceMissingValues);

    // Remove useless attributes
    m_AttFilter = new RemoveUseless();
    m_AttFilter.setInputFormat(train);
    train = Filter.useFilter(train, m_AttFilter);

    // Transform attributes (nominal -> binary indicators)
    m_NominalToBinary = new NominalToBinary();
    m_NominalToBinary.setInputFormat(train);
    train = Filter.useFilter(train, m_NominalToBinary);

    // Save the structure for printing the model
    m_structure = new Instances(train, 0);

    // Extract data
    m_ClassIndex = train.classIndex();
    m_NumClasses = train.numClasses();

    int nK = m_NumClasses - 1; // Only K-1 class labels needed
    int nR = m_NumPredictors = train.numAttributes() - 1;
    int nC = train.numInstances();

    m_Data = new double[nC][nR + 1];     // Data values (column 0 is the intercept)
    int[] Y = new int[nC];               // Class labels
    double[] xMean = new double[nR + 1]; // Attribute means
    xSD = new double[nR + 1];            // Attribute stddev's (field, used later)
    double[] sY = new double[nK + 1];    // Per-class case counts
    double[] weights = new double[nC];   // Weights of instances
    double totWeights = 0;               // Total weights of the instances
    m_Par = new double[nR + 1][nK];      // Optimized parameter values

    if (m_Debug) {
        System.out.println("Extracting data...");
    }

    // First pass: copy data, accumulate weighted sums and sums of squares.
    for (int i = 0; i < nC; i++) {
        // initialize X[][]
        Instance current = train.instance(i);
        Y[i] = (int) current.classValue(); // Class value starts from 0
        weights[i] = current.weight();     // Dealing with weights
        totWeights += weights[i];

        m_Data[i][0] = 1; // intercept column
        int j = 1;
        for (int k = 0; k <= nR; k++) {
            if (k != m_ClassIndex) {
                double x = current.value(k);
                m_Data[i][j] = x;
                xMean[j] += weights[i] * x;
                xSD[j] += weights[i] * x * x;
                j++;
            }
        }

        // Class count
        sY[Y[i]]++;
    }

    if ((totWeights <= 1) && (nC > 1))
        throw new Exception("Sum of weights of instances less than 1, please reweight!");

    // Finish mean/stddev: weighted sample statistics (intercept column fixed).
    xMean[0] = 0;
    xSD[0] = 1;
    for (int j = 1; j <= nR; j++) {
        xMean[j] = xMean[j] / totWeights;
        if (totWeights > 1)
            xSD[j] = Math.sqrt(Math.abs(xSD[j] - totWeights * xMean[j] * xMean[j]) / (totWeights - 1));
        else
            xSD[j] = 0;
    }

    if (m_Debug) {
        // Output stats about input data
        System.out.println("Descriptives...");
        for (int m = 0; m <= nK; m++)
            System.out.println(sY[m] + " cases have class " + m);
        System.out.println("\n Variable Avg SD ");
        for (int j = 1; j <= nR; j++)
            System.out.println(Utils.doubleToString(j, 8, 4) + Utils.doubleToString(xMean[j], 10, 4)
                    + Utils.doubleToString(xSD[j], 10, 4));
    }

    // Normalise input data (z-score; constant columns left untouched).
    for (int i = 0; i < nC; i++) {
        for (int j = 0; j <= nR; j++) {
            if (xSD[j] != 0) {
                m_Data[i][j] = (m_Data[i][j] - xMean[j]) / xSD[j];
            }
        }
    }

    if (m_Debug) {
        System.out.println("\nIteration History...");
    }

    // Flattened coefficient vector: nK blocks of (nR+1) parameters each.
    double x[] = new double[(nR + 1) * nK];
    double[][] b = new double[2][x.length]; // Boundary constraints, N/A here

    // Initialize intercepts from the log-odds of the null model; slopes at 0.
    for (int p = 0; p < nK; p++) {
        int offset = p * (nR + 1);
        x[offset] = Math.log(sY[p] + 1.0) - Math.log(sY[nK] + 1.0); // Null model
        b[0][offset] = Double.NaN; // NaN == unconstrained
        b[1][offset] = Double.NaN;
        for (int q = 1; q <= nR; q++) {
            x[offset + q] = 0.0;
            b[0][offset + q] = Double.NaN;
            b[1][offset + q] = Double.NaN;
        }
    }

    OptEng opt = new OptEng();
    opt.setDebug(m_Debug);
    opt.setWeights(weights);
    opt.setClassLabels(Y);

    if (m_MaxIts == -1) {
        // Search until convergence; null return means "not converged yet",
        // so resume from the current values.
        x = opt.findArgmin(x, b);
        while (x == null) {
            x = opt.getVarbValues();
            if (m_Debug)
                System.out.println("200 iterations finished, not enough!");
            x = opt.findArgmin(x, b);
        }
        if (m_Debug)
            System.out.println(" -------------<Converged>--------------");
    } else {
        opt.setMaxIteration(m_MaxIts);
        x = opt.findArgmin(x, b);
        if (x == null) // Not enough, but use the current value
            x = opt.getVarbValues();
    }

    m_LL = -opt.getMinFunction(); // Log-likelihood

    // Don't need data matrix anymore
    m_Data = null;

    // Convert coefficients back to non-normalized attribute units.
    for (int i = 0; i < nK; i++) {
        m_Par[0][i] = x[i * (nR + 1)];
        for (int j = 1; j <= nR; j++) {
            m_Par[j][i] = x[i * (nR + 1) + j];
            if (xSD[j] != 0) {
                m_Par[j][i] /= xSD[j];
                m_Par[0][i] -= m_Par[j][i] * xMean[j];
            }
        }
    }
}
From source file:machinelearninglabs.OENaiveBayesClassifier.java
public int[][] attributeCounts(Instances data, int att) { int numberOfPossibleValuesForAttribute = data.firstInstance().attribute(att).numValues(); int[][] result = new int[data.numClasses()][numberOfPossibleValuesForAttribute]; // for each class for (Instance eachInstance : data) { double classValue = eachInstance.value(eachInstance.classIndex()); result[(int) classValue][(int) eachInstance.value(att)]++; }/*from ww w. j a v a2s . com*/ //printIntMatrix(result); return result; }