List of usage examples for weka.core Instances numInstances
publicint numInstances()
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/** * <p>/*from ww w . j a v a 2 s . c o m*/ * Calculates the distributional characteristics of the distances of a single attribute the * instances within a data set have to each other. * </p> * * @param data * data for which the instances are characterized * @param index * attribute for which the distances are characterized * @return characteristics */ public static DistChar attributeDistance(Instances data, int index) { double distance; double sumAll = 0.0; double sumAllQ = 0.0; double min = Double.MAX_VALUE; double max = Double.MIN_VALUE; int numCmp = 0; double value1, value2; for (int i = 0; i < data.numInstances(); i++) { value1 = data.instance(i).value(index); for (int j = 0; j < data.numInstances(); j++) { if (j != i) { value2 = data.instance(j).value(index); distance = Math.abs(value1 - value2); sumAll += distance; sumAllQ += distance * distance; numCmp++; if (distance < min) { min = distance; } if (distance > max) { max = distance; } } } } double mean = sumAll / numCmp; double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1))); return new DistChar(mean, std, min, max, data.numInstances()); }
From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java
License:Apache License
/** * Copies the Instances from the source Instances object to a new one, which only contains the * currently tested features.//from ww w.ja v a2s . c om * * @param source The Instances object containing all the Instance objects from the source file. * @param targetStructure The list of {@link AbstractFeatureExtractor}s which is currently * being tested. * @return An instances object consisting of all Instance objects from the source file. */ private Instances copyInstances(Instances source, ArrayList<Attribute> targetStructure) { Instances target = new Instances("ACResolution", targetStructure, 0); for (int i = 0; i < source.numInstances(); i++) { double[] vals = new double[targetStructure.size()]; for (int z = 0; z < targetStructure.size(); z++) { vals[z] = getAttributeValue(source.instance(i), targetStructure.get(z).name()); } Instance in = new DenseInstance(1.0, vals); target.add(in); } return target; }
From source file:de.uniheidelberg.cl.swp.mlprocess.WEKARunner.java
License:Apache License
/** * Predicts unknown labels of an Instances. * /*from ww w. j a v a2s . com*/ * @param unkIns Instances with unknown attributes. * @return Instances with the formerly unknown instances, now labeled. * @throws If the Instances couldn't be labeled. */ public Instances labelUnknownInstances(Instances unkIns) throws Exception { Instances testcpy = new Instances(unkIns); for (int i = 0; i < unkIns.numInstances(); i++) { double clsLabel = classifier.classifyInstance(unkIns.instance(i)); testcpy.instance(i).setClassValue(clsLabel); } return testcpy; }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/** * Generates a clusterer. Has to initialize all fields of the clusterer that * are not being set via options./* w ww . jav a2 s. com*/ * * @param data set of instances serving as training data * @throws Exception if the clusterer has not been generated successfully */ @Override public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_FullMissingCounts = new int[instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = new double[instances.numAttributes()]; } m_FullNominalCounts = new int[instances.numAttributes()][0]; m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false); for (int i = 0; i < instances.numAttributes(); i++) { m_FullMissingCounts[i] = instances.attributeStats(i).missingCount; if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(instances.variance(i)); } if (m_FullMissingCounts[i] == instances.numInstances()) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts; if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common // value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) { m_Assignments = clusterAssignments; } m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) { initInstances = new Instances(instances); } else { initInstances = instances; } for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } m_NumClusters = m_ClusterCentroids.numInstances(); // removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()]; while (!converged) { emptyClusterCount = 0; m_Iterations++; converged = true; for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, true); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true); } } if (m_Iterations == m_MaxIterations) { converged = true; } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index] = tempI[k]; for (i = 0; i < tempI[k].numAttributes(); i++) { m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i]; } index++; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (!converged) { m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new int[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = new double[instances.numAttributes()]; for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(tempI[i].variance(j)); } else { vals2[j] = Instance.missingValue(); } } m_ClusterStdDevs.add(new Instance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].numInstances(); } // Save memory!! m_DistanceFunction.clean(); }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * //from w ww .j av a 2s. c om * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; // used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; if (m_DistanceFunction instanceof ManhattanDistance || m_DistanceFunction instanceof CustomPairWiseDistance) { middle = (members.numInstances() - 1) / 2; dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } } for (int j = 0; j < members.numAttributes(); j++) { // in case of Euclidian distance the centroid is the mean point // in case of Manhattan distance the centroid is the median point // in both cases, if the attribute is nominal, the centroid is the mode if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) { vals[j] = members.meanOrMode(j); } else if (m_DistanceFunction instanceof ManhattanDistance || m_DistanceFunction instanceof CustomPairWiseDistance) { // singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { vals[j] = sortedMembers.kthSmallestValue(j, middle + 1); if (dataIsEven) { vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2; } } } if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Instance.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Instance.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) { m_ClusterCentroids.add(new Instance(1.0, vals)); } return vals; }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedoids.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * /*from w w w . ja v a2 s . co m*/ * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; if (!updateClusterInfo) { vals[0] = 100D; return vals; } double smallestError = Double.MAX_VALUE; Instance currentCentroid = null; for (int j = 0; j < members.numInstances(); j++) { Instance currentInstance = members.instance(j); double distanceError = 0D; for (int i = 0; i < members.numInstances(); i++) { distanceError += m_DistanceFunction.distance(currentInstance, members.instance(i)); } if (distanceError < smallestError) { smallestError = distanceError; currentCentroid = currentInstance; } } vals[0] = currentCentroid.valueSparse(0); for (int j = 0; j < members.numAttributes(); j++) { if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Instance.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Instance.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) { m_ClusterCentroids.add(new Instance(1.0, vals)); } return vals; }
From source file:decisiontree.ID3tree.java
/** * * @param inst//from w ww .j a v a 2 s .c om * @throws Exception */ @Override public void buildClassifier(Instances inst) throws Exception { mInstances = inst; mNumInst = inst.numInstances(); int splitNum; splitNum = calculateSplit(mInstances); }
From source file:decisiontree.ID3tree.java
private int calculateSplit(Instances inst) { Instance tempInst;/*from w w w .j a va2 s . com*/ ArrayList<Instance> subset; subset = new ArrayList(); double[] entropy; int numAttr = inst.numAttributes(); entropy = new double[numAttr]; double tempEnt; int numInst = inst.numInstances(); int splitVal = 5; for (int i = 0; i < numInst; i++) { tempInst = inst.instance(i); subset.add(tempInst); } for (int j = 0; j < numAttr - 1; j++) { tempEnt = calculateEntropy(subset, numAttr, j); entropy[j] = tempEnt; } double temp = 5.0; for (int k = 0; k < numAttr - 1; k++) { if (temp > entropy[k]) { temp = entropy[k]; splitVal = k; } } return splitVal; }
From source file:decisiontree.MyC45.java
/** * Method for building an C45 tree./*from www. ja v a2 s . com*/ * * @param instances the training data * @exception Exception if decision tree can't be built successfully */ private void makeTree(Instances instances) throws Exception { // Check if no instances have reached this node. if (instances.numInstances() == 0) { m_Attribute = null; m_ClassValue = Instance.missingValue(); m_Distribution = new double[instances.numClasses()]; return; } // Compute attribute with maximum gain ratio. double[] gainRatios = new double[instances.numAttributes()]; Enumeration attrEnum = instances.enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); if (attr.isNominal()) { gainRatios[attr.index()] = computeGainRatio(instances, attr); } else if (attr.isNumeric()) { gainRatios[attr.index()] = computeGainRatio(instances, attr, computeThreshold(instances, attr)); } } m_Attribute = instances.attribute(Utils.maxIndex(gainRatios)); // Make leaf if gain ratio is zero. // Otherwise create successors. if (Utils.eq(gainRatios[m_Attribute.index()], 0)) { m_Attribute = null; m_Distribution = new double[instances.numClasses()]; Enumeration instEnum = instances.enumerateInstances(); while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); m_Distribution[(int) inst.classValue()]++; } Utils.normalize(m_Distribution); m_ClassValue = Utils.maxIndex(m_Distribution); m_ClassAttribute = instances.classAttribute(); } else { Instances[] splitData = null; int child = 0; if (m_Attribute.isNominal()) { child = m_Attribute.numValues(); splitData = splitData(instances, m_Attribute); } else if (m_Attribute.isNumeric()) { child = 2; splitData = splitData(instances, m_Attribute, computeThreshold(instances, m_Attribute)); } m_Successors = new MyC45[child]; for (int j = 0; j < child; j++) { m_Successors[j] = new MyC45(); m_Successors[j].makeTree(splitData[j]); } } }
From source file:decisiontree.MyC45.java
/** * Splits a dataset according to the values of a nominal attribute. * * @param data the data which is to be split * @param att the attribute to be used for splitting * @return the sets of instances produced by the split *//*from www . j a va 2 s.c o m*/ private Instances[] splitData(Instances data, Attribute att) { Instances[] splitData = new Instances[att.numValues()]; for (int j = 0; j < att.numValues(); j++) { splitData[j] = new Instances(data, data.numInstances()); } Enumeration instEnum = data.enumerateInstances(); while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); splitData[(int) inst.value(att)].add(inst); } for (int i = 0; i < splitData.length; i++) { splitData[i].compactify(); } return splitData; }