List of usage examples for weka.core Instances attribute
public Attribute attribute(String name)
From source file:j48.BinC45Split.java
License:Open Source License
/** * Prints the condition satisfied by instances in a subset. * * @param index of subset and training set. *///ww w. j a v a 2 s .c om public final String rightSide(int index, Instances data) { StringBuffer text; text = new StringBuffer(); if (data.attribute(m_attIndex).isNominal()) { if (index == 0) text.append(" = " + data.attribute(m_attIndex).value((int) m_splitPoint)); else text.append(" != " + data.attribute(m_attIndex).value((int) m_splitPoint)); } else if (index == 0) text.append(" <= " + m_splitPoint); else text.append(" > " + m_splitPoint); return text.toString(); }
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Produces a Java source fragment equivalent to the test made at this
 * node. In the generated code the instance under test is named "i".
 *
 * @param index subset selector; a negative value yields a null check,
 *              0 the first branch of the binary test, anything else
 *              the second branch
 * @param data  the data containing instance structure info
 * @return the generated expression as a {@code String}
 */
public final String sourceExpression(int index, Instances data) {
    if (index < 0) {
        return "i[" + m_attIndex + "] == null";
    }

    StringBuilder code = new StringBuilder();
    if (data.attribute(m_attIndex).isNominal()) {
        // Second branch of a nominal test is the negated equality.
        code.append(index == 0 ? "i[" : "!i[");
        code.append(m_attIndex);
        code.append("]");
        code.append(".equals(\"");
        code.append(data.attribute(m_attIndex).value((int) m_splitPoint));
        code.append("\")");
    } else {
        code.append("((Double) i[");
        code.append(m_attIndex);
        code.append("])");
        code.append(index == 0 ? ".doubleValue() <= " : ".doubleValue() > ");
        code.append(m_splitPoint);
    }
    return code.toString();
}
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Moves the split point to the greatest value found in the given data
 * that is smaller than or equal to the current split point.
 * (C4.5 does this for some strange reason.)
 *
 * Nothing happens when the split attribute is nominal or when the
 * split has at most one subset. If no non-missing value qualifies,
 * the split point ends up as {@code -Double.MAX_VALUE}.
 *
 * @param allInstances the data that is scanned for candidate values
 */
public final void setSplitPoint(Instances allInstances) {
    if (allInstances.attribute(m_attIndex).isNominal() || !(m_numSubsets > 1)) {
        return;
    }

    double best = -Double.MAX_VALUE;
    for (Enumeration all = allInstances.enumerateInstances(); all.hasMoreElements();) {
        Instance current = (Instance) all.nextElement();
        if (current.isMissing(m_attIndex)) {
            continue;
        }
        double candidate = current.value(m_attIndex);
        if (Utils.gr(candidate, best) && Utils.smOrEq(candidate, m_splitPoint)) {
            best = candidate;
        }
    }
    m_splitPoint = best;
}
From source file:j48.C45ModelSelection.java
License:Open Source License
/** * Selects C4.5-type split for the given dataset. *//*ww w .j av a 2 s . c o m*/ public final ClassifierSplitModel selectModel(Instances data) { double minResult; double currentResult; C45Split[] currentModel; C45Split bestModel = null; NoSplit noSplitModel = null; double averageInfoGain = 0; int validModels = 0; boolean multiVal = true; Distribution checkDistribution; Attribute attribute; double sumOfWeights; int i; try { // Check if all Instances belong to one class or if not // enough Instances to split. checkDistribution = new Distribution(data); noSplitModel = new NoSplit(checkDistribution); if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(), checkDistribution.perClass(checkDistribution.maxClass()))) return noSplitModel; // Check if all attributes are nominal and have a // lot of values. if (m_allData != null) { Enumeration enu = data.enumerateAttributes(); while (enu.hasMoreElements()) { attribute = (Attribute) enu.nextElement(); if ((attribute.isNumeric()) || (Utils.sm((double) attribute.numValues(), (0.3 * (double) m_allData.numInstances())))) { multiVal = false; break; } } } currentModel = new j48.C45Split[data.numAttributes()]; sumOfWeights = data.sumOfWeights(); // For each attribute. for (i = 0; i < data.numAttributes(); i++) { // Apart from class attribute. if (i != (data).classIndex()) { // Get models for current attribute. currentModel[i] = new j48.C45Split(i, m_minNoObj, sumOfWeights); currentModel[i].buildClassifier(data); // Check if useful split for current attribute // exists and check for enumerated attributes with // a lot of values. 
if (currentModel[i].checkModel()) if (m_allData != null) { if ((data.attribute(i).isNumeric()) || (multiVal || Utils.sm((double) data.attribute(i).numValues(), (0.3 * (double) m_allData.numInstances())))) { averageInfoGain = averageInfoGain + currentModel[i].infoGain(); validModels++; } } else { averageInfoGain = averageInfoGain + currentModel[i].infoGain(); validModels++; } } else currentModel[i] = null; } // Check if any useful split was found. if (validModels == 0) return noSplitModel; averageInfoGain = averageInfoGain / (double) validModels; // Find "best" attribute to split on. minResult = 0; for (i = 0; i < data.numAttributes(); i++) { if ((i != (data).classIndex()) && (currentModel[i].checkModel())) // Use 1E-3 here to get a closer approximation to the // original // implementation. if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3)) && Utils.gr(currentModel[i].gainRatio(), minResult)) { bestModel = currentModel[i]; minResult = currentModel[i].gainRatio(); } } // Check if useful split was found. if (Utils.eq(minResult, 0)) return noSplitModel; // Add all Instances with unknown values for the corresponding // attribute to the distribution for the model, so that // the complete distribution is stored with the model. bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex()); // Set the split point analogue to C45 if attribute numeric. if (m_allData != null) bestModel.setSplitPoint(m_allData); return bestModel; } catch (Exception e) { e.printStackTrace(); } return null; }
From source file:j48.C45PruneableClassifierTreeG.java
License:Open Source License
/**
 * Finds new nodes that improve accuracy and grafts them onto the tree.
 *
 * The method first splits {@code fulldata} into the instances at this
 * leaf ({@code l}, weighted by row 0 of {@code iindex}) and the
 * instances in "atbop" ({@code n}, weighted by row 1 of {@code iindex}).
 * It then collects candidate {@code GraftSplit} tests per attribute,
 * sorts them, drops leading candidates whose class equals the leaf
 * class, builds the remaining ones on {@code l} and hands them to the
 * parent via {@code setDescendents}.
 *
 * Note that {@code iindex} is modified in place: kept weights are moved
 * forward so that they share the index of the instance in {@code l}
 * (row 0) or {@code n} (row 1).
 *
 * @param fulldata the instances in whole trainset
 * @param iindex records num tests each instance has failed up to this node
 * @param limits the upper/lower limits for numeric attributes
 *               ({@code limits[a][0]} is read as lower, {@code limits[a][1]}
 *               as upper limit; for nominal attributes {@code limits[a][1] == 1}
 *               is treated as "already used")
 * @param parent the node immediately before the current one; it is cast
 *               to {@code C45PruneableClassifierTreeG}
 * @param pLaplace laplace for leaf, calculated by parent (in case leaf empty)
 * @param pLeafClass class of leaf, determined by parent (in case leaf empty)
 * @throws Exception declared by the signature; presumably propagated
 *                   from the helper calls made here -- TODO confirm
 */
private void findGraft(Instances fulldata, double[][] iindex, double[][] limits, ClassifierTree parent,
        double pLaplace, int pLeafClass) throws Exception {

    // get the class for this leaf
    int leafClass = (m_isEmpty) ? pLeafClass : localModel().distribution().maxClass();

    // get the laplace value for this leaf
    double leafLaplace = (m_isEmpty) ? pLaplace : laplaceLeaf(leafClass);

    // sort the instances into those at the leaf, those in atbop, and discarded
    Instances l = new Instances(fulldata, fulldata.numInstances());
    Instances n = new Instances(fulldata, fulldata.numInstances());
    int lcount = 0;
    int acount = 0;
    for (int x = 0; x < fulldata.numInstances(); x++) {
        // instances without positive weight in either row are discarded
        if (iindex[0][x] <= 0 && iindex[1][x] <= 0)
            continue;
        if (iindex[0][x] != 0) {
            l.add(fulldata.instance(x));
            l.instance(lcount).setWeight(iindex[0][x]);
            // move instance's weight in iindex to same index as in l
            iindex[0][lcount++] = iindex[0][x];
        }
        if (iindex[1][x] > 0) {
            n.add(fulldata.instance(x));
            n.instance(acount).setWeight(iindex[1][x]);
            // move instance's weight in iindex to same index as in n
            iindex[1][acount++] = iindex[1][x];
        }
    }

    // quick check: is there any class other than the leaf class whose
    // laplace estimate in atbop beats the leaf's?
    boolean graftPossible = false;
    double[] classDist = new double[n.numClasses()];
    for (int x = 0; x < n.numInstances(); x++) {
        if (iindex[1][x] > 0 && !n.instance(x).classIsMissing())
            classDist[(int) n.instance(x).classValue()] += iindex[1][x];
    }

    for (int cVal = 0; cVal < n.numClasses(); cVal++) {
        double theLaplace = (classDist[cVal] + 1.0) / (classDist[cVal] + 2.0);
        // NOTE(review): biprob receives classDist[cVal] for both of its
        // first two arguments here -- confirm this is intended.
        if (cVal != leafClass && (theLaplace > leafLaplace)
                && (biprob(classDist[cVal], classDist[cVal], leafLaplace) > m_BiProbCrit)) {
            graftPossible = true;
            break;
        }
    }

    if (!graftPossible) {
        return;
    }

    // 1. Initialize to {} a set of tuples t containing potential tests
    ArrayList t = new ArrayList();

    // go through each attribute
    for (int a = 0; a < n.numAttributes(); a++) {
        if (a == n.classIndex())
            continue; // skip the class

        // sort instances in atbop by $a
        int[] sorted = sortByAttribute(n, a);

        // 2. For each continuous attribute $a:
        if (n.attribute(a).isNumeric()) {

            // find min and max values for this attribute at the leaf
            boolean prohibited = false;
            double minLeaf = Double.POSITIVE_INFINITY;
            double maxLeaf = Double.NEGATIVE_INFINITY;
            for (int i = 0; i < l.numInstances(); i++) {
                // a leaf-class instance with a missing value for $a
                // rules this attribute out entirely
                if (l.instance(i).isMissing(a)) {
                    if (l.instance(i).classValue() == leafClass) {
                        prohibited = true;
                        break;
                    }
                }
                double value = l.instance(i).value(a);
                if (!m_relabel || l.instance(i).classValue() == leafClass) {
                    if (value < minLeaf)
                        minLeaf = value;
                    if (value > maxLeaf)
                        maxLeaf = value;
                }
            }
            if (prohibited) {
                continue;
            }

            // (a) find values of
            // $n: instances in atbop (already have that, actually)
            // $v: a value for $a that exists for a case in the atbop, where
            // $v is < the min value for $a for a case at the leaf which
            // has the class $c, and $v is > the lowerlimit of $a at
            // the leaf.
            // (note: error in original paper stated that $v must be
            // smaller OR EQUAL TO the min value).
            // $k: $k is a class
            // that maximize L' = Laplace({$x: $x contained in cases($n)
            // & value($a,$x) <= $v & value($a,$x) > lowerlim($l,$a)}, $k).
            // NOTE(review): minBestClass is assigned below but not read
            // anywhere in this method.
            double minBestClass = Double.NaN;
            double minBestLaplace = leafLaplace;
            double minBestVal = Double.NaN;
            double minBestPos = Double.NaN;
            double minBestTotal = Double.NaN;
            double[][] minBestCounts = null;
            double[][] counts = new double[2][n.numClasses()];
            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    break; // missing are sorted to end: no more valid vals

                double theval = n.instance(sorted[x]).value(a);
                if (m_Debug)
                    System.out.println("\t " + theval);

                if (theval <= limits[a][0]) {
                    if (m_Debug)
                        System.out.println("\t <= lowerlim: continuing...");
                    continue;
                }
                // note: error in paper would have this read "theVal > minLeaf)
                if (theval >= minLeaf) {
                    if (m_Debug)
                        System.out.println("\t >= minLeaf; breaking...");
                    break;
                }
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                // also consume all following instances with the same
                // value; this advances the outer loop index x
                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                // work out the best laplace/class (for <= theval)
                double total = Utils.sum(counts[0]);
                for (int c = 0; c < n.numClasses(); c++) {
                    double temp = (counts[0][c] + 1.0) / (total + 2.0);
                    if (temp > minBestLaplace) {
                        minBestPos = counts[0][c];
                        minBestTotal = total;
                        minBestLaplace = temp;
                        minBestClass = c;
                        minBestCounts = copyCounts(counts);
                        // cut halfway to the next value, unless this is
                        // the last instance
                        minBestVal = (x == n.numInstances() - 1) ? theval
                                : ((theval + n.instance(sorted[x + 1]).value(a)) / 2.0);
                    }
                }
            }

            // (b) add to t tuple <n,a,v,k,L',"<=">
            if (!Double.isNaN(minBestVal) && biprob(minBestPos, minBestTotal, leafLaplace) > m_BiProbCrit) {
                GraftSplit gsplit = null;
                try {
                    gsplit = new GraftSplit(a, minBestVal, 0, leafClass, minBestCounts);
                } catch (Exception e) {
                    // NOTE(review): terminates the whole JVM on failure
                    System.err.println("graftsplit error: " + e.getMessage());
                    System.exit(1);
                }
                t.add(gsplit);
            }
            // free space
            minBestCounts = null;

            // (c) find values of
            // n: instances in atbop (already have that, actually)
            // $v: a value for $a that exists for a case in the atbop, where
            // $v is > the max value for $a for a case at the leaf which
            // has the class $c, and $v is <= the upperlimit of $a at
            // the leaf.
            // k: k is a class
            // that maximize L' = Laplace({x: x contained in cases(n)
            // & value(a,x) > v & value(a,x) <= upperlim(l,a)}, k).
            // NOTE(review): maxBestClass is assigned below but not read
            // anywhere in this method.
            double maxBestClass = -1;
            double maxBestLaplace = leafLaplace;
            double maxBestVal = Double.NaN;
            double maxBestPos = Double.NaN;
            double maxBestTotal = Double.NaN;
            double[][] maxBestCounts = null;
            for (int c = 0; c < n.numClasses(); c++) { // zero the counts
                counts[0][c] = 0;
                counts[1][c] = 0; // shouldn't need to do this ...
            }

            // check smallest val for a in atbop is < upper limit
            if (n.numInstances() >= 1 && n.instance(sorted[0]).value(a) < limits[a][1]) {
                // walk the sorted instances from the largest value down
                for (int x = n.numInstances() - 1; x >= 0; x--) {
                    if (n.instance(sorted[x]).isMissing(a))
                        continue;

                    double theval = n.instance(sorted[x]).value(a);
                    if (m_Debug)
                        System.out.println("\t " + theval);

                    // NOTE(review): the test is a strict ">" although the
                    // debug message says ">="
                    if (theval > limits[a][1]) {
                        if (m_Debug)
                            System.out.println("\t >= upperlim; continuing...");
                        continue;
                    }
                    // NOTE(review): the test is "<=" although the debug
                    // message says "<"
                    if (theval <= maxLeaf) {
                        if (m_Debug)
                            System.out.println("\t < maxLeaf; breaking...");
                        break;
                    }

                    // increment counts
                    counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                    // also consume all preceding instances with the same
                    // value; this moves the outer loop index x downwards
                    if (x != 0 && !n.instance(sorted[x - 1]).isMissing(a)) {
                        int z = x - 1;
                        while (z >= 0 && n.instance(sorted[z]).value(a) == theval) {
                            z--;
                            x--;
                            counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                        }
                    }

                    // work out best laplace for > theval
                    double total = Utils.sum(counts[1]);
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[1][c] + 1.0) / (total + 2.0);
                        if (temp > maxBestLaplace) {
                            maxBestPos = counts[1][c];
                            maxBestTotal = total;
                            maxBestLaplace = temp;
                            maxBestClass = c;
                            maxBestCounts = copyCounts(counts);
                            // cut halfway to the next smaller value,
                            // unless this is the first instance
                            maxBestVal = (x == 0) ? theval
                                    : ((theval + n.instance(sorted[x - 1]).value(a)) / 2.0);
                        }
                    }
                }

                // (d) add to t tuple <n,a,v,k,L',">">
                if (!Double.isNaN(maxBestVal) && biprob(maxBestPos, maxBestTotal, leafLaplace) > m_BiProbCrit) {
                    GraftSplit gsplit = null;
                    try {
                        gsplit = new GraftSplit(a, maxBestVal, 1, leafClass, maxBestCounts);
                    } catch (Exception e) {
                        // NOTE(review): terminates the whole JVM on failure
                        System.err.println("graftsplit error:" + e.getMessage());
                        System.exit(1);
                    }
                    t.add(gsplit);
                }
            }
        } else { // must be a nominal attribute

            // 3. for each discrete attribute a for which there is no
            // test at an ancestor of l

            // skip if this attribute has already been used
            if (limits[a][1] == 1) {
                continue;
            }

            // a value is prohibited if some instance at the leaf has it
            // (or has a missing value) and, when relabelling, that
            // instance is of the leaf class
            boolean[] prohibit = new boolean[l.attribute(a).numValues()];
            for (int aval = 0; aval < n.attribute(a).numValues(); aval++) {
                for (int x = 0; x < l.numInstances(); x++) {
                    if ((l.instance(x).isMissing(a) || l.instance(x).value(a) == aval)
                            && (!m_relabel || (l.instance(x).classValue() == leafClass))) {
                        prohibit[aval] = true;
                        break;
                    }
                }
            }

            // (a) find values of
            // $n: instances in atbop (already have that, actually)
            // $v: $v is a value for $a
            // $k: $k is a class
            // that maximize L' = Laplace({$x: $x contained in cases($n)
            // & value($a,$x) = $v}, $k).
            double bestVal = Double.NaN;
            double bestClass = Double.NaN;
            double bestLaplace = leafLaplace;
            double[][] bestCounts = null;
            double[][] counts = new double[2][n.numClasses()];

            for (int x = 0; x < n.numInstances(); x++) {
                if (n.instance(sorted[x]).isMissing(a))
                    continue;

                // zero the counts
                for (int c = 0; c < n.numClasses(); c++)
                    counts[0][c] = 0;

                double theval = n.instance(sorted[x]).value(a);
                counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];

                // gather all instances sharing this nominal value; this
                // advances the outer loop index x
                if (x != n.numInstances() - 1) {
                    int z = x + 1;
                    while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) {
                        z++;
                        x++;
                        counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]];
                    }
                }

                if (!prohibit[(int) theval]) {
                    // work out best laplace for > theval
                    double total = Utils.sum(counts[0]);
                    bestLaplace = leafLaplace;
                    bestClass = Double.NaN;
                    for (int c = 0; c < n.numClasses(); c++) {
                        double temp = (counts[0][c] + 1.0) / (total + 2.0);
                        if (temp > bestLaplace && biprob(counts[0][c], total, leafLaplace) > m_BiProbCrit) {
                            bestLaplace = temp;
                            bestClass = c;
                            bestVal = theval;
                            bestCounts = copyCounts(counts);
                        }
                    }
                    // add to graft list
                    if (!Double.isNaN(bestClass)) {
                        GraftSplit gsplit = null;
                        try {
                            gsplit = new GraftSplit(a, bestVal, 2, leafClass, bestCounts);
                        } catch (Exception e) {
                            // NOTE(review): terminates the whole JVM on failure
                            System.err.println("graftsplit error: " + e.getMessage());
                            System.exit(1);
                        }
                        t.add(gsplit);
                    }
                }
            }
            // (b) add to t tuple <n,a,v,k,L',"=">
            // done this already
        }
    }

    // 4. remove from t all tuples <n,a,v,c,L,x> such that L <=
    // Laplace(cases(l),c) or prob(x,n,Laplace(cases(l),c) <= 0.05
    // -- checked this constraint prior to adding a tuple --

    // *** step six done before step five for efficiency ***
    // 6. for each <n,a,v,k,L,x> in t ordered on L from highest to lowest
    // order the tuples from highest to lowest laplace
    // (this actually orders lowest to highest)
    Collections.sort(t);

    // 5. remove from t all tuples <n,a,v,c,L,x> such that there is
    // no tuple <n',a',v',k',L',x'> such that k' != c & L' < L.
    for (int x = 0; x < t.size(); x++) {
        GraftSplit gs = (GraftSplit) t.get(x);
        if (gs.maxClassForSubsetOfInterest() != leafClass) {
            break; // reached a graft with class != leafClass, so stop deleting
        } else {
            // removing shifts the remaining elements down, so step the
            // index back to revisit the same position
            t.remove(x);
            x--;
        }
    }

    // if no potential grafts were found, do nothing and return
    if (t.size() < 1) {
        return;
    }

    // create the distributions for each graft
    for (int x = t.size() - 1; x >= 0; x--) {
        GraftSplit gs = (GraftSplit) t.get(x);
        try {
            gs.buildClassifier(l);
            gs.deleteGraftedCases(l); // so they don't go down the other branch
        } catch (Exception e) {
            // a failing graft is only reported; processing continues
            System.err.println("graftsplit build error: " + e.getMessage());
        }
    }

    // add this stuff to the tree
    ((C45PruneableClassifierTreeG) parent).setDescendents(t, this);
}
From source file:j48.C45Split.java
License:Open Source License
public void buildClassifier(Instances trainInstances) throws Exception { // Initialize the remaining instance variables. m_numSubsets = 0;/*from w w w. ja v a 2 s. c o m*/ m_splitPoint = Double.MAX_VALUE; m_infoGain = 0; m_gainRatio = 0; // Different treatment for enumerated and numeric // attributes. if (trainInstances.attribute(m_attIndex).isNominal()) { m_complexityIndex = trainInstances.attribute(m_attIndex).numValues(); m_index = m_complexityIndex; handleEnumeratedAttribute(trainInstances); } else { m_complexityIndex = 2; m_index = 0; trainInstances.sort(trainInstances.attribute(m_attIndex)); // /////////////////////////////////////////////////////////////////////////////////////// double stdDev = trainInstances.attributeStats(m_attIndex).numericStats.stdDev; if (stdDev > 200) { // rrrrr = stdDev/200; // System.out.println(stdDev+" "); rrrrr = Math.log10(stdDev) / 1.2; // rrrrr = 1.1; // lllll = stdDev/2000; // lllll = 0.3; lllll = Math.log10(stdDev) / 8; } else { lllll = Math.log10(stdDev) / 1.2; // lllll = stdDev/200; // lllll = 1.1; // rrrrr = stdDev/2000; // rrrrr = 0.3; rrrrr = Math.log10(stdDev) / 8; } handleNumericAttribute(trainInstances); } }
From source file:j48.C45Split.java
License:Open Source License
/**
 * Builds the right-hand side of the test that instances of one subset
 * satisfy.
 *
 * @param index index of the subset; for a nominal attribute it selects
 *              the attribute value, otherwise 0 yields {@code <=} and
 *              any other value {@code >}
 * @param data  the training set whose attribute information is used
 * @return the condition text, starting with a blank
 */
public final String rightSide(int index, Instances data) {
    if (data.attribute(m_attIndex).isNominal()) {
        return " = " + data.attribute(m_attIndex).value(index);
    }

    // Non-nominal attribute: split point printed via doubleToString
    // with 6 as its second argument.
    String relation = (index == 0) ? " <= " : " > ";
    return relation + Utils.doubleToString(m_splitPoint, 6);
}
From source file:j48.C45Split.java
License:Open Source License
/**
 * Produces a Java source fragment equivalent to the test made at this
 * node. In the generated code the instance under test is named "i".
 *
 * @param index index of the nominal value tested; a negative value
 *              yields a null check, and for non-nominal attributes 0
 *              selects {@code <=} and anything else {@code >}
 * @param data  the data containing instance structure info
 * @return the generated expression as a {@code String}
 */
public final String sourceExpression(int index, Instances data) {
    if (index < 0) {
        return "i[" + m_attIndex + "] == null";
    }

    StringBuilder code = new StringBuilder();
    if (data.attribute(m_attIndex).isNominal()) {
        code.append("i[");
        code.append(m_attIndex);
        code.append("]");
        code.append(".equals(\"");
        code.append(data.attribute(m_attIndex).value(index));
        code.append("\")");
    } else {
        code.append("((Double) i[");
        code.append(m_attIndex);
        code.append("])");
        code.append(index == 0 ? ".doubleValue() <= " : ".doubleValue() > ");
        code.append(m_splitPoint);
    }
    return code.toString();
}
From source file:j48.C45Split.java
License:Open Source License
/** * Sets split point to greatest value in given data smaller or equal to old * split point. (C4.5 does this for some strange reason). *///from w w w . ja v a 2 s .c o m public final void setSplitPoint(Instances allInstances) { double newSplitPoint = -Double.MAX_VALUE; double tempValue; Instance instance; if ((allInstances.attribute(m_attIndex).isNumeric()) && (m_numSubsets > 1)) { Enumeration enu = allInstances.enumerateInstances(); while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (!instance.isMissing(m_attIndex)) { tempValue = instance.value(m_attIndex); if (Utils.gr(tempValue, newSplitPoint) && Utils.smOrEq(tempValue, m_splitPoint)) newSplitPoint = tempValue; } } m_splitPoint = newSplitPoint; } }
From source file:j48.C45Split.java
License:Open Source License
/**
 * Returns the minsAndMaxs of the index.th subset.
 *
 * The given limits are copied into a fresh array with one
 * two-element row per attribute. In the row of the split attribute,
 * element 1 is set to 1 if that attribute is nominal; otherwise
 * element {@code 1 - index} is set to the split point.
 *
 * @param data        the data providing attribute count and type
 * @param minsAndMaxs the current limits, one row per attribute
 * @param index       index of the subset
 * @return the new limits array; the input array is not modified
 */
public final double[][] minsAndMaxs(Instances data, double[][] minsAndMaxs, int index) {
    double[][] updated = new double[data.numAttributes()][2];

    for (int att = 0; att < data.numAttributes(); att++) {
        double[] source = minsAndMaxs[att];
        double[] target = updated[att];
        target[0] = source[0];
        target[1] = source[1];

        if (att != m_attIndex) {
            continue;
        }
        // Row of the split attribute: apply this split's restriction.
        if (data.attribute(m_attIndex).isNominal()) {
            target[1] = 1;
        } else {
            target[1 - index] = m_splitPoint;
        }
    }
    return updated;
}