List of usage examples for weka.core.Instances.sumOfWeights()

public double sumOfWeights()
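The entries below are scraped from open-source projects. As orientation before them, here is a minimal self-contained sketch of the call itself; the class name SumOfWeightsDemo and the attribute name "x" are invented for illustration, and the ArrayList-based Instances constructor assumes the Weka 3.7+ API:

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

// Minimal sketch: sumOfWeights() returns the total weight of all instances,
// which equals the instance count while every weight is the default 1.0.
public class SumOfWeightsDemo {
  public static void main(String[] args) {
    // Build a tiny single-attribute dataset in memory.
    ArrayList<Attribute> atts = new ArrayList<Attribute>();
    atts.add(new Attribute("x"));
    Instances data = new Instances("demo", atts, 3);
    for (int i = 0; i < 3; i++) {
      data.add(new DenseInstance(1.0, new double[] { i }));
    }
    System.out.println(data.sumOfWeights()); // 3.0

    // Halve the weight of the first instance; the total drops accordingly.
    data.instance(0).setWeight(0.5);
    System.out.println(data.sumOfWeights()); // 2.5
  }
}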
From source file: Pair.java
License: Open Source License

private void doCV(Instances targetData) throws Exception {
  System.out.println();
  System.out.flush();
  int numSourceInstances = m_SourceInstances.numInstances();
  int numInstances = targetData.numInstances() + numSourceInstances;
  numTargetInstances = numInstances - numSourceInstances;
  double weightSource, weightTarget;
  double initialSourceFraction;
  double[] weights = new double[numInstances];
  Random randomInstance = new Random(1);

  Instances data = new Instances(m_SourceInstances, 0, numSourceInstances);
  // Now add the target data, shallow copying the instances as they are added
  // so it doesn't mess up the weights for anyone else
  Enumeration enumer = targetData.enumerateInstances();
  while (enumer.hasMoreElements()) {
    Instance instance = (Instance) enumer.nextElement();
    data.add(instance);
  }

  if (sourceRatio < 0) { // weight all equally
    weightSource = weightTarget = 1.0/*/numInstances*/;
    initialSourceFraction = numSourceInstances / (double) numInstances;
  } else {
    double totalWeight = 1 + sourceRatio;
    weightSource = sourceRatio / totalWeight/*/numSourceInstances*/;
    weightTarget = 1.0 / totalWeight/*/numTargetInstances*/;
    initialSourceFraction = weightSource;
  }
  for (int j = 0; j < numInstances; j++) {
    Instance instance = data.instance(j);
    if (j < numSourceInstances)
      instance.setWeight(weightSource);
    else
      instance.setWeight(weightTarget);
  }

  if (doFraction) {
    for (int it = 0; it < sourceIterations/*m_NumIterations*/; it++) {
      sourceFraction = (1 - (it / (double) m_NumIterations)) * initialSourceFraction; // [same weights as regular]
      if (sourceFraction > .995)
        sourceFraction = .995;
      // double sourceWeight = (sourceFraction * numInstances) / numSourceInstances;
      double sourceWeight = (sourceFraction * numTargetInstances)
          / (numSourceInstances * (1 - sourceFraction));
      for (int j = 0; j < numInstances; j++) {
        Instance instance = data.instance(j);
        if (j < numSourceInstances)
          instance.setWeight(sourceWeight);
        else
          instance.setWeight(1);
      }
      buildClassifierWithWeights(data);
      System.out.println("Iteration " + it + ":" + getTestError());
    }
  } else {
    for (int i = 0; i < numInstances; i++)
      weights[i] = data.instance(i).weight();
    buildClassifierWithWeights(data);
    System.out.println("Iteration -1:" + getTestError());
    for (int i = 0; i < numInstances; i++)
      data.instance(i).setWeight(weights[i]);

    for (int it = 0; it < sourceIterations; it++) {
      Instances sample = null;
      if (!resample || m_NumIterationsPerformed == 0) {
        sample = data;
      } else {
        // Normalize the current weights to a distribution, then resample.
        double sum = data.sumOfWeights();
        double[] sweights = new double[data.numInstances()];
        for (int i = 0; i < sweights.length; i++) {
          sweights[i] = data.instance(i).weight() / sum;
        }
        sample = data.resampleWithWeights(randomInstance, sweights);
      }
      try {
        m_Classifiers[it].buildClassifier(sample);
      } catch (Exception e) {
        e.printStackTrace();
        System.out.println("E: " + e);
      }
      sourceFraction = initialSourceFraction * (1 - (it + 1) / (double) m_NumIterations);
      setWeights(data, m_Classifiers[it], sourceFraction, numSourceInstances, false);
      for (int i = 0; i < numInstances; i++)
        weights[i] = data.instance(i).weight();
      buildClassifierWithWeights(data);
      System.out.println("Iteration " + it + ":" + getTestError());
      for (int i = 0; i < numInstances; i++)
        data.instance(i).setWeight(weights[i]);
    }
  }
}
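A detail worth noting in the resampling branch above: dividing each instance weight by data.sumOfWeights() turns the raw weights into a sampling distribution over the dataset before it is handed to resampleWithWeights, and because that call returns a fresh dataset, the weights stored on data itself remain available for the subsequent reweighting step.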
From source file: Pair.java
License: Open Source License

/**
 * Boosting method. Boosts any classifier that can handle weighted
 * instances.
 *
 * @param data the training data to be used for generating the
 *             boosted classifier.
 * @exception Exception if the classifier could not be built successfully
 */
protected void buildClassifierWithWeights(Instances data) throws Exception {
  Random randomInstance = new Random(0);
  double epsilon, reweight, beta = 0;
  Evaluation evaluation;
  Instances sample;

  // Initialize data
  m_Betas = new double[m_Classifiers.length];
  m_NumIterationsPerformed = 0;
  int numSourceInstances = m_SourceInstances.numInstances();

  // Do bootstrap iterations
  for (m_NumIterationsPerformed = 0; m_NumIterationsPerformed < m_Classifiers.length;
      m_NumIterationsPerformed++) {
    // Build the classifier
    sample = null;
    if (!resample || m_NumIterationsPerformed == 0) {
      sample = data;
    } else {
      double sum = data.sumOfWeights();
      double[] weights = new double[data.numInstances()];
      for (int i = 0; i < weights.length; i++) {
        weights[i] = data.instance(i).weight() / sum;
      }
      sample = data.resampleWithWeights(randomInstance, weights);
      if (doSampleSize) {
        int effectiveInstances = (int) (sourceFraction * weights.length + numTargetInstances);
        if (effectiveInstances > numSourceInstances + numTargetInstances)
          effectiveInstances = numSourceInstances + numTargetInstances;
        // System.out.println(effectiveInstances);
        sample.randomize(randomInstance);
        Instances q = new Instances(sample, 0, effectiveInstances);
        sample = q;
      }
    }
    try {
      m_Classifiers[m_NumIterationsPerformed].buildClassifier(sample);
    } catch (Exception e) {
      e.printStackTrace();
      System.out.println("E: " + e);
    }
    if (doBagging)
      beta = 0.4 / .6; // always same beta
    else
      beta = setWeights(data, m_Classifiers[m_NumIterationsPerformed], -1, numSourceInstances, true);

    // Stop if error too small or error too big and ignore this model
    if (beta < 0) { // setWeights indicates a problem with negative beta
      if (m_NumIterationsPerformed == 0) {
        m_NumIterationsPerformed = 1; // If we're the first we have to use it
      }
      break;
    }

    // Determine the weight to assign to this model
    m_Betas[m_NumIterationsPerformed] = Math.log(1 / beta);
  }

  betaSum = 0;
  for (int i = 0; i < m_NumIterationsPerformed; i++)
    betaSum += m_Betas[i];
}
From source file: MultiClassClassifier.java
License: Open Source License

/**
 * Builds the classifiers.
 *
 * @param insts the training data.
 * @throws Exception if a classifier can't be built
 */
public void buildClassifier(Instances insts) throws Exception {
  Instances newInsts;

  // can classifier handle the data?
  getCapabilities().testWithFail(insts);

  // remove instances with missing class
  insts = new Instances(insts);
  insts.deleteWithMissingClass();

  if (m_Classifier == null) {
    throw new Exception("No base classifier has been set!");
  }
  m_ZeroR = new ZeroR();
  m_ZeroR.buildClassifier(insts);

  m_TwoClassDataset = null;

  int numClassifiers = insts.numClasses();
  if (numClassifiers <= 2) {
    m_Classifiers = Classifier.makeCopies(m_Classifier, 1);
    m_Classifiers[0].buildClassifier(insts);
    m_ClassFilters = null;
  } else if (m_Method == METHOD_1_AGAINST_1) {
    // generate fastvector of pairs
    FastVector pairs = new FastVector();
    for (int i = 0; i < insts.numClasses(); i++) {
      for (int j = 0; j < insts.numClasses(); j++) {
        if (j <= i)
          continue;
        int[] pair = new int[2];
        pair[0] = i;
        pair[1] = j;
        pairs.addElement(pair);
      }
    }
    numClassifiers = pairs.size();
    m_Classifiers = Classifier.makeCopies(m_Classifier, numClassifiers);
    m_ClassFilters = new Filter[numClassifiers];
    m_SumOfWeights = new double[numClassifiers];

    // generate the classifiers
    for (int i = 0; i < numClassifiers; i++) {
      RemoveWithValues classFilter = new RemoveWithValues();
      classFilter.setAttributeIndex("" + (insts.classIndex() + 1));
      classFilter.setModifyHeader(true);
      classFilter.setInvertSelection(true);
      classFilter.setNominalIndicesArr((int[]) pairs.elementAt(i));
      Instances tempInstances = new Instances(insts, 0);
      tempInstances.setClassIndex(-1);
      classFilter.setInputFormat(tempInstances);
      newInsts = Filter.useFilter(insts, classFilter);
      if (newInsts.numInstances() > 0) {
        newInsts.setClassIndex(insts.classIndex());
        m_Classifiers[i].buildClassifier(newInsts);
        m_ClassFilters[i] = classFilter;
        m_SumOfWeights[i] = newInsts.sumOfWeights();
      } else {
        m_Classifiers[i] = null;
        m_ClassFilters[i] = null;
      }
    }

    // construct a two-class header version of the dataset
    m_TwoClassDataset = new Instances(insts, 0);
    int classIndex = m_TwoClassDataset.classIndex();
    m_TwoClassDataset.setClassIndex(-1);
    m_TwoClassDataset.deleteAttributeAt(classIndex);
    FastVector classLabels = new FastVector();
    classLabels.addElement("class0");
    classLabels.addElement("class1");
    m_TwoClassDataset.insertAttributeAt(new Attribute("class", classLabels), classIndex);
    m_TwoClassDataset.setClassIndex(classIndex);
  } else {
    // use error correcting code style methods
    Code code = null;
    switch (m_Method) {
    case METHOD_ERROR_EXHAUSTIVE:
      code = new ExhaustiveCode(numClassifiers);
      break;
    case METHOD_ERROR_RANDOM:
      code = new RandomCode(numClassifiers, (int) (numClassifiers * m_RandomWidthFactor), insts);
      break;
    case METHOD_1_AGAINST_ALL:
      code = new StandardCode(numClassifiers);
      break;
    default:
      throw new Exception("Unrecognized correction code type");
    }
    numClassifiers = code.size();
    m_Classifiers = Classifier.makeCopies(m_Classifier, numClassifiers);
    m_ClassFilters = new MakeIndicator[numClassifiers];
    for (int i = 0; i < m_Classifiers.length; i++) {
      m_ClassFilters[i] = new MakeIndicator();
      MakeIndicator classFilter = (MakeIndicator) m_ClassFilters[i];
      classFilter.setAttributeIndex("" + (insts.classIndex() + 1));
      classFilter.setValueIndices(code.getIndices(i));
      classFilter.setNumeric(false);
      classFilter.setInputFormat(insts);
      newInsts = Filter.useFilter(insts, m_ClassFilters[i]);
      m_Classifiers[i].buildClassifier(newInsts);
    }
  }
  m_ClassAttribute = insts.classAttribute();
}
From source file: clusterer.SimpleKMeansWithSilhouette.java
License: Open Source License

/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {
  m_canopyClusters = null;

  // can clusterer handle the data?
  getCapabilities().testWithFail(data);

  m_Iterations = 0;
  m_ReplaceMissingFilter = new ReplaceMissingValues();
  Instances instances = new Instances(data);

  instances.setClassIndex(-1);
  if (!m_dontReplaceMissing) {
    m_ReplaceMissingFilter.setInputFormat(instances);
    instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
  }

  m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][];
  m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
  if (m_displayStdDevs) {
    m_FullStdDevs = instances.variances();
  }

  m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false);

  m_FullMissingCounts = m_ClusterMissingCounts[0];
  m_FullNominalCounts = m_ClusterNominalCounts[0];
  double sumOfWeights = instances.sumOfWeights();
  for (int i = 0; i < instances.numAttributes(); i++) {
    if (instances.attribute(i).isNumeric()) {
      if (m_displayStdDevs) {
        m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]);
      }
      if (m_FullMissingCounts[i] == sumOfWeights) {
        m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
      }
    } else {
      if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
        m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
      }
    }
  }

  m_ClusterCentroids = new Instances(instances, m_NumClusters);
  int[] clusterAssignments = new int[instances.numInstances()];

  if (m_PreserveOrder) {
    m_Assignments = clusterAssignments;
  }

  m_DistanceFunction.setInstances(instances);

  Random RandomO = new Random(getSeed());
  int instIndex;
  HashMap<DecisionTableHashKey, Integer> initC = new HashMap<DecisionTableHashKey, Integer>();
  DecisionTableHashKey hk = null;

  Instances initInstances = null;
  if (m_PreserveOrder) {
    initInstances = new Instances(instances);
  } else {
    initInstances = instances;
  }

  if (m_speedUpDistanceCompWithCanopies) {
    m_canopyClusters = new Canopy();
    m_canopyClusters.setNumClusters(m_NumClusters);
    m_canopyClusters.setSeed(getSeed());
    m_canopyClusters.setT2(getCanopyT2());
    m_canopyClusters.setT1(getCanopyT1());
    m_canopyClusters.setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
    m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
    m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
    m_canopyClusters.setDebug(getDebug());
    m_canopyClusters.buildClusterer(initInstances);
    // System.err.println(m_canopyClusters);
    m_centroidCanopyAssignments = new ArrayList<long[]>();
    m_dataPointCanopyAssignments = new ArrayList<long[]>();
  }

  if (m_initializationMethod == KMEANS_PLUS_PLUS) {
    kMeansPlusPlusInit(initInstances);
    m_initialStartPoints = new Instances(m_ClusterCentroids);
  } else if (m_initializationMethod == CANOPY) {
    canopyInit(initInstances);
    m_initialStartPoints = new Instances(m_canopyClusters.getCanopies());
  } else if (m_initializationMethod == FARTHEST_FIRST) {
    farthestFirstInit(initInstances);
    m_initialStartPoints = new Instances(m_ClusterCentroids);
  } else {
    // random
    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
      instIndex = RandomO.nextInt(j + 1);
      hk = new DecisionTableHashKey(initInstances.instance(instIndex),
          initInstances.numAttributes(), true);
      if (!initC.containsKey(hk)) {
        m_ClusterCentroids.add(initInstances.instance(instIndex));
        initC.put(hk, null);
      }
      initInstances.swap(j, instIndex);

      if (m_ClusterCentroids.numInstances() == m_NumClusters) {
        break;
      }
    }
    m_initialStartPoints = new Instances(m_ClusterCentroids);
  }

  if (m_speedUpDistanceCompWithCanopies) {
    // assign canopies to training data
    for (int i = 0; i < instances.numInstances(); i++) {
      m_dataPointCanopyAssignments.add(m_canopyClusters.assignCanopies(instances.instance(i)));
    }
  }

  m_NumClusters = m_ClusterCentroids.numInstances();

  // removing reference
  initInstances = null;

  int i;
  boolean converged = false;
  int emptyClusterCount;
  Instances[] tempI = new Instances[m_NumClusters];
  m_squaredErrors = new double[m_NumClusters];
  m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
  m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
  startExecutorPool();

  while (!converged) {
    if (m_speedUpDistanceCompWithCanopies) {
      // re-assign canopies to the current cluster centers
      m_centroidCanopyAssignments.clear();
      for (int kk = 0; kk < m_ClusterCentroids.numInstances(); kk++) {
        m_centroidCanopyAssignments
            .add(m_canopyClusters.assignCanopies(m_ClusterCentroids.instance(kk)));
      }
    }

    emptyClusterCount = 0;
    m_Iterations++;
    converged = true;

    if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
      for (i = 0; i < instances.numInstances(); i++) {
        Instance toCluster = instances.instance(i);
        int newC = clusterProcessedInstance(toCluster, false, true,
            m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments.get(i) : null);
        if (newC != clusterAssignments[i]) {
          converged = false;
        }
        clusterAssignments[i] = newC;
      }
    } else {
      converged = launchAssignToClusters(instances, clusterAssignments);
    }

    // update centroids
    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    for (i = 0; i < m_NumClusters; i++) {
      tempI[i] = new Instances(instances, 0);
    }
    for (i = 0; i < instances.numInstances(); i++) {
      tempI[clusterAssignments[i]].add(instances.instance(i));
    }
    if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
      for (i = 0; i < m_NumClusters; i++) {
        if (tempI[i].numInstances() == 0) {
          // empty cluster
          emptyClusterCount++;
        } else {
          moveCentroid(i, tempI[i], true, true);
        }
      }
    } else {
      emptyClusterCount = launchMoveCentroids(tempI);
    }

    if (m_Iterations == m_MaxIterations) {
      converged = true;
    }

    if (emptyClusterCount > 0) {
      m_NumClusters -= emptyClusterCount;
      if (converged) {
        Instances[] t = new Instances[m_NumClusters];
        int index = 0;
        for (int k = 0; k < tempI.length; k++) {
          if (tempI[k].numInstances() > 0) {
            t[index] = tempI[k];
            for (i = 0; i < tempI[k].numAttributes(); i++) {
              m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
            }
            index++;
          }
        }
        tempI = t;
      } else {
        tempI = new Instances[m_NumClusters];
      }
    }

    if (!converged) {
      m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
    }
  }

  // calculate errors
  if (!m_FastDistanceCalc) {
    for (i = 0; i < instances.numInstances(); i++) {
      clusterProcessedInstance(instances.instance(i), true, false, null);
    }
  }

  if (m_displayStdDevs) {
    m_ClusterStdDevs = new Instances(instances, m_NumClusters);
  }
  m_ClusterSizes = new double[m_NumClusters];
  for (i = 0; i < m_NumClusters; i++) {
    if (m_displayStdDevs) {
      double[] vals2 = tempI[i].variances();
      for (int j = 0; j < instances.numAttributes(); j++) {
        if (instances.attribute(j).isNumeric()) {
          vals2[j] = Math.sqrt(vals2[j]);
        } else {
          vals2[j] = Utils.missingValue();
        }
      }
      m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
    }
    m_ClusterSizes[i] = tempI[i].sumOfWeights();
  }

  m_executorPool.shutdown();

  // save memory!
  m_DistanceFunction.clean();

  // Calculate Silhouette Coefficient
  SilCoeff = new double[instances.numInstances()];
  AvgSilCoeff = 0;
  for (int z = 0; z < instances.numInstances(); z++) {
    double[] distance = new double[m_NumClusters];
    Arrays.fill(distance, 0.0);
    // Sum
    for (int y = 0; y < instances.numInstances(); y++) {
      distance[clusterAssignments[y]] += m_DistanceFunction.distance(instances.get(z), instances.get(y));
    }
    // Average
    for (int x = 0; x < m_NumClusters; x++) {
      distance[x] = distance[x] / m_ClusterSizes[x];
    }
    double a = distance[clusterAssignments[z]];
    distance[clusterAssignments[z]] = Double.MAX_VALUE;
    Arrays.sort(distance);
    double b = distance[0];
    SilCoeff[z] = (b - a) / Math.max(a, b);
    AvgSilCoeff += SilCoeff[z];
  }
  AvgSilCoeff = AvgSilCoeff / instances.numInstances();
  // System.out.println("AvgSilCoeff: " + AvgSilCoeff);
}
From source file: gyc.OverBoostM1.java
License: Open Source License

/**
 * Sets the weights for the next iteration.
 *
 * @param training the training instances
 * @param reweight the reweighting factor
 * @throws Exception if something goes wrong
 */
protected void setWeights(Instances training, double reweight) throws Exception {
  double oldSumOfWeights, newSumOfWeights;

  oldSumOfWeights = training.sumOfWeights();
  Enumeration enu = training.enumerateInstances();
  while (enu.hasMoreElements()) {
    Instance instance = (Instance) enu.nextElement();
    if (!Utils.eq(m_Classifiers[m_NumIterationsPerformed].classifyInstance(instance),
        instance.classValue()))
      instance.setWeight(instance.weight() * reweight);
  }

  // Renormalize weights
  newSumOfWeights = training.sumOfWeights();
  enu = training.enumerateInstances();
  while (enu.hasMoreElements()) {
    Instance instance = (Instance) enu.nextElement();
    instance.setWeight(instance.weight() * oldSumOfWeights / newSumOfWeights);
  }
}
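The two enumeration passes bracket the reweighting with a pair of sumOfWeights() calls: the total weight is recorded first, misclassified instances are then scaled up by the reweight factor, and finally every weight is multiplied by oldSumOfWeights / newSumOfWeights. This restores the dataset's original total weight mass, the standard renormalization step in AdaBoost.M1-style boosting.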
From source file: j48.BinC45ModelSelection.java
License: Open Source License

/**
 * Selects C4.5-type split for the given dataset.
 */
public final ClassifierSplitModel selectModel(Instances data) {
  double minResult;
  double currentResult;
  BinC45Split[] currentModel;
  BinC45Split bestModel = null;
  NoSplit noSplitModel = null;
  double averageInfoGain = 0;
  int validModels = 0;
  boolean multiVal = true;
  Distribution checkDistribution;
  double sumOfWeights;
  int i;

  try {
    // Check if all Instances belong to one class or if not
    // enough Instances to split.
    checkDistribution = new Distribution(data);
    noSplitModel = new NoSplit(checkDistribution);
    if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj)
        || Utils.eq(checkDistribution.total(), checkDistribution.perClass(checkDistribution.maxClass())))
      return noSplitModel;

    // Check if all attributes are nominal and have a
    // lot of values.
    Enumeration enu = data.enumerateAttributes();
    while (enu.hasMoreElements()) {
      Attribute attribute = (Attribute) enu.nextElement();
      if ((attribute.isNumeric())
          || (Utils.sm((double) attribute.numValues(), (0.3 * (double) m_allData.numInstances())))) {
        multiVal = false;
        break;
      }
    }

    currentModel = new BinC45Split[data.numAttributes()];
    sumOfWeights = data.sumOfWeights();

    // For each attribute.
    for (i = 0; i < data.numAttributes(); i++) {
      // Apart from class attribute.
      if (i != (data).classIndex()) {
        // Get models for current attribute.
        currentModel[i] = new BinC45Split(i, m_minNoObj, sumOfWeights);
        currentModel[i].buildClassifier(data);

        // Check if useful split for current attribute
        // exists and check for enumerated attributes with
        // a lot of values.
        if (currentModel[i].checkModel())
          if ((data.attribute(i).isNumeric())
              || (multiVal || Utils.sm((double) data.attribute(i).numValues(),
                  (0.3 * (double) m_allData.numInstances())))) {
            averageInfoGain = averageInfoGain + currentModel[i].infoGain();
            validModels++;
          }
      } else
        currentModel[i] = null;
    }

    // Check if any useful split was found.
    if (validModels == 0)
      return noSplitModel;
    averageInfoGain = averageInfoGain / (double) validModels;

    // Find "best" attribute to split on.
    minResult = 0;
    for (i = 0; i < data.numAttributes(); i++) {
      if ((i != (data).classIndex()) && (currentModel[i].checkModel()))
        // Use 1E-3 here to get a closer approximation to the
        // original implementation.
        if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
            && Utils.gr(currentModel[i].gainRatio(), minResult)) {
          bestModel = currentModel[i];
          minResult = currentModel[i].gainRatio();
        }
    }

    // Check if useful split was found.
    if (Utils.eq(minResult, 0))
      return noSplitModel;

    // Add all Instances with unknown values for the corresponding
    // attribute to the distribution for the model, so that
    // the complete distribution is stored with the model.
    bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());

    // Set the split point analogue to C45 if attribute numeric.
    bestModel.setSplitPoint(m_allData);
    return bestModel;
  } catch (Exception e) {
    e.printStackTrace();
  }
  return null;
}
From source file: j48.C45ModelSelection.java
License: Open Source License

/**
 * Selects C4.5-type split for the given dataset.
 */
public final ClassifierSplitModel selectModel(Instances data) {
  double minResult;
  double currentResult;
  C45Split[] currentModel;
  C45Split bestModel = null;
  NoSplit noSplitModel = null;
  double averageInfoGain = 0;
  int validModels = 0;
  boolean multiVal = true;
  Distribution checkDistribution;
  Attribute attribute;
  double sumOfWeights;
  int i;

  try {
    // Check if all Instances belong to one class or if not
    // enough Instances to split.
    checkDistribution = new Distribution(data);
    noSplitModel = new NoSplit(checkDistribution);
    if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj)
        || Utils.eq(checkDistribution.total(), checkDistribution.perClass(checkDistribution.maxClass())))
      return noSplitModel;

    // Check if all attributes are nominal and have a
    // lot of values.
    if (m_allData != null) {
      Enumeration enu = data.enumerateAttributes();
      while (enu.hasMoreElements()) {
        attribute = (Attribute) enu.nextElement();
        if ((attribute.isNumeric())
            || (Utils.sm((double) attribute.numValues(), (0.3 * (double) m_allData.numInstances())))) {
          multiVal = false;
          break;
        }
      }
    }

    currentModel = new j48.C45Split[data.numAttributes()];
    sumOfWeights = data.sumOfWeights();

    // For each attribute.
    for (i = 0; i < data.numAttributes(); i++) {
      // Apart from class attribute.
      if (i != (data).classIndex()) {
        // Get models for current attribute.
        currentModel[i] = new j48.C45Split(i, m_minNoObj, sumOfWeights);
        currentModel[i].buildClassifier(data);

        // Check if useful split for current attribute
        // exists and check for enumerated attributes with
        // a lot of values.
        if (currentModel[i].checkModel())
          if (m_allData != null) {
            if ((data.attribute(i).isNumeric())
                || (multiVal || Utils.sm((double) data.attribute(i).numValues(),
                    (0.3 * (double) m_allData.numInstances())))) {
              averageInfoGain = averageInfoGain + currentModel[i].infoGain();
              validModels++;
            }
          } else {
            averageInfoGain = averageInfoGain + currentModel[i].infoGain();
            validModels++;
          }
      } else
        currentModel[i] = null;
    }

    // Check if any useful split was found.
    if (validModels == 0)
      return noSplitModel;
    averageInfoGain = averageInfoGain / (double) validModels;

    // Find "best" attribute to split on.
    minResult = 0;
    for (i = 0; i < data.numAttributes(); i++) {
      if ((i != (data).classIndex()) && (currentModel[i].checkModel()))
        // Use 1E-3 here to get a closer approximation to the
        // original implementation.
        if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
            && Utils.gr(currentModel[i].gainRatio(), minResult)) {
          bestModel = currentModel[i];
          minResult = currentModel[i].gainRatio();
        }
    }

    // Check if useful split was found.
    if (Utils.eq(minResult, 0))
      return noSplitModel;

    // Add all Instances with unknown values for the corresponding
    // attribute to the distribution for the model, so that
    // the complete distribution is stored with the model.
    bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());

    // Set the split point analogue to C45 if attribute numeric.
    if (m_allData != null)
      bestModel.setSplitPoint(m_allData);
    return bestModel;
  } catch (Exception e) {
    e.printStackTrace();
  }
  return null;
}
From source file: j48.C45PruneableClassifierTree.java
License: Open Source License

/**
 * Computes new distributions of instances for nodes
 * in tree.
 *
 * @param data the data to compute the distributions for
 * @throws Exception if something goes wrong
 */
private void newDistribution(Instances data) throws Exception {
  Instances[] localInstances;

  localModel().resetDistribution(data);
  m_train = data;
  if (!m_isLeaf) {
    localInstances = (Instances[]) localModel().split(data);
    for (int i = 0; i < m_sons.length; i++)
      son(i).newDistribution(localInstances[i]);
  } else {
    // Check whether there are some instances at the leaf now!
    if (!Utils.eq(data.sumOfWeights(), 0)) {
      m_isEmpty = false;
    }
  }
}
From source file: j48.ClassifierTree.java
License: Open Source License

/**
 * Builds the tree structure.
 *
 * @param data the data for which the tree structure is to be
 *             generated.
 * @param keepData is training data to be kept?
 * @throws Exception if something goes wrong
 */
public void buildTree(Instances data, boolean keepData) throws Exception {
  Instances[] localInstances;

  if (keepData) {
    m_train = data;
  }
  m_test = null;
  m_isLeaf = false;
  m_isEmpty = false;
  m_sons = null;
  m_localModel = m_toSelectModel.selectModel(data);
  if (m_localModel.numSubsets() > 1) {
    localInstances = m_localModel.split(data);
    data = null;
    m_sons = new ClassifierTree[m_localModel.numSubsets()];
    for (int i = 0; i < m_sons.length; i++) {
      m_sons[i] = getNewTree(localInstances[i]);
      localInstances[i] = null;
    }
  } else {
    m_isLeaf = true;
    if (Utils.eq(data.sumOfWeights(), 0))
      m_isEmpty = true;
    data = null;
  }
}
From source file: j48.ClassifierTree.java
License: Open Source License

/**
 * Builds the tree structure with hold out set
 *
 * @param train the data for which the tree structure is to be
 *              generated.
 * @param test the test data for potential pruning
 * @param keepData is training data to be kept?
 * @throws Exception if something goes wrong
 */
public void buildTree(Instances train, Instances test, boolean keepData) throws Exception {
  Instances[] localTrain, localTest;
  int i;

  if (keepData) {
    m_train = train;
  }
  m_isLeaf = false;
  m_isEmpty = false;
  m_sons = null;
  m_localModel = m_toSelectModel.selectModel(train, test);
  m_test = new Distribution(test, m_localModel);
  if (m_localModel.numSubsets() > 1) {
    localTrain = m_localModel.split(train);
    localTest = m_localModel.split(test);
    train = test = null;
    m_sons = new ClassifierTree[m_localModel.numSubsets()];
    for (i = 0; i < m_sons.length; i++) {
      m_sons[i] = getNewTree(localTrain[i], localTest[i]);
      localTrain[i] = null;
      localTest[i] = null;
    }
  } else {
    m_isLeaf = true;
    if (Utils.eq(train.sumOfWeights(), 0))
      m_isEmpty = true;
    train = test = null;
  }
}