List of usage examples for weka.core Instances numAttributes
public int numAttributes()
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/**
 * Generates the clusterer: replaces missing values (unless disabled), gathers
 * full-data statistics, picks initial centroids by reservoir-style random
 * sampling of distinct instances, then iterates assign/update steps until the
 * assignments stop changing or {@code m_MaxIterations} is reached.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;
    m_ReplaceMissingFilter = new ReplaceMissingValues();

    // Work on a copy with no class attribute set.
    Instances instances = new Instances(data);
    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    // Full-dataset statistics (missing counts, std-devs, nominal counts,
    // and the global centroid used for reporting).
    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];
    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];
    if (m_PreserveOrder) {
        m_Assignments = clusterAssignments;
    }
    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    // NOTE(review): raw HashMap used as a set of DecisionTableHashKey to
    // de-duplicate candidate centroids; values are always null.
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    // When preserving order, shuffle a copy so the caller's ordering survives.
    Instances initInstances = null;
    if (m_PreserveOrder) {
        initInstances = new Instances(instances);
    } else {
        initInstances = instances;
    }
    // Fisher-Yates-style pass from the back; each swapped-in instance is a
    // candidate centroid, accepted only if not seen before.
    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
        instIndex = RandomO.nextInt(j + 1);
        hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
        if (!initC.containsKey(hk)) {
            m_ClusterCentroids.add(initInstances.instance(instIndex));
            initC.put(hk, null);
        }
        initInstances.swap(j, instIndex);
        if (m_ClusterCentroids.numInstances() == m_NumClusters) {
            break;
        }
    }
    // Fewer distinct instances than requested clusters shrinks m_NumClusters.
    m_NumClusters = m_ClusterCentroids.numInstances();
    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];

    // Main assign/update loop. Converged means no assignment changed this pass.
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;

        // Assignment step.
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            int newC = clusterProcessedInstance(toCluster, true);
            if (newC != clusterAssignments[i]) {
                converged = false;
            }
            clusterAssignments[i] = newC;
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                moveCentroid(i, tempI[i], true);
            }
        }

        if (m_Iterations == m_MaxIterations) {
            converged = true;
        }

        // Drop empty clusters; if already converged, compact the bookkeeping
        // arrays in place, otherwise just resize for the next pass.
        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index] = tempI[k];
                        for (i = 0; i < tempI[k].numAttributes(); i++) {
                            m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
                        }
                        index++;
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            m_squaredErrors = new double[m_NumClusters];
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }

    // Per-cluster reporting statistics.
    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Instance.missingValue();
                }
            }
            m_ClusterStdDevs.add(new Instance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }

    // Save memory!!
    m_DistanceFunction.clean();
}
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/**
 * Move the centroid to its new coordinates. Generates the centroid
 * coordinates from its members (objects assigned to the cluster of the
 * centroid) and the distance function being used: mean for Euclidean
 * distance, median for Manhattan / CustomPairWiseDistance, and mode for
 * nominal attributes in either case.
 *
 * @param centroidIndex index of the centroid whose coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays and append the centroid to m_ClusterCentroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance
            || m_DistanceFunction instanceof CustomPairWiseDistance) {
        // middle is the (0-based) lower median position; for an even number
        // of members the median is averaged with the next order statistic.
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {
        // in case of Euclidian distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance
                || m_DistanceFunction instanceof CustomPairWiseDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                // kthSmallestValue is 1-based, hence middle + 1.
                vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                if (dataIsEven) {
                    vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                // More missing values than the most frequent nominal value:
                // the mode is not trustworthy.
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Instance.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Instance.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo) {
        m_ClusterCentroids.add(new Instance(1.0, vals));
    }
    return vals;
}
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedoids.java
License:Open Source License
/**
 * Move the centroid to its new coordinates. In this k-medoids variant the
 * centroid is the member that minimises the summed distance to all other
 * members (the medoid), found by an exhaustive O(n^2) pairwise scan.
 *
 * @param centroidIndex index of the centroid whose coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays; when false only a placeholder vector is returned
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    // Fast path: caller only wants a vector of the right length; 100D is an
    // arbitrary placeholder value, not a real coordinate.
    if (!updateClusterInfo) {
        vals[0] = 100D;
        return vals;
    }

    // Exhaustive medoid search: pick the member with the smallest total
    // distance to every other member.
    // NOTE(review): if members is empty, currentCentroid stays null and
    // valueSparse(0) below throws NPE — presumably callers never pass an
    // empty cluster; confirm against buildClusterer.
    double smallestError = Double.MAX_VALUE;
    Instance currentCentroid = null;
    for (int j = 0; j < members.numInstances(); j++) {
        Instance currentInstance = members.instance(j);
        double distanceError = 0D;
        for (int i = 0; i < members.numInstances(); i++) {
            distanceError += m_DistanceFunction.distance(currentInstance, members.instance(i));
        }
        if (distanceError < smallestError) {
            smallestError = distanceError;
            currentCentroid = currentInstance;
        }
    }
    // Only the first (sparse) value of the medoid is recorded in vals.
    vals[0] = currentCentroid.valueSparse(0);

    for (int j = 0; j < members.numAttributes(); j++) {
        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                // More missing values than the most frequent nominal value.
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Instance.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Instance.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo) {
        m_ClusterCentroids.add(new Instance(1.0, vals));
    }
    return vals;
}
From source file:de.upb.timok.oneclassclassifier.WekaSvmClassifier.java
License:Open Source License
@Override public void train(List<double[]> trainingSamples) { Instances data = DatasetTransformationUtils.trainingSetToInstances(trainingSamples); // setting class attribute if the data format does not provide this information // For example, the XRFF format saves the class attribute information as well try {/* w w w . j a v a 2 s . com*/ if (filter != null) { filter.setInputFormat(data); data = Filter.useFilter(data, filter); } if (data.classIndex() == -1) { data.setClassIndex(data.numAttributes() - 1); } wekaSvm.buildClassifier(data); } catch (final Exception e) { logger.error("Unexpected exception", e); } }
From source file:decisiontree.ID3tree.java
/**
 * Selects the attribute to split on: the non-class attribute (the last
 * attribute is assumed to be the class) with the smallest entropy as
 * computed by {@code calculateEntropy}.
 *
 * @param inst the instances that have reached this node
 * @return index of the attribute with minimal entropy (0 if none beats the
 *         sentinel, i.e. the instance set has a single attribute)
 */
private int calculateSplit(Instances inst) {
    int numAttr = inst.numAttributes();
    int numInst = inst.numInstances();

    // calculateEntropy expects a list view of the instances.
    ArrayList<Instance> subset = new ArrayList<>(numInst);
    for (int i = 0; i < numInst; i++) {
        subset.add(inst.instance(i));
    }

    // Scan all attributes except the class attribute for minimal entropy.
    // Fix: the original used magic sentinels (temp = 5.0, splitVal = 5),
    // so whenever every entropy was >= 5.0 it returned the arbitrary —
    // and possibly out-of-range — index 5. +infinity guarantees the first
    // attribute always replaces the sentinel.
    int splitVal = 0;
    double bestEntropy = Double.POSITIVE_INFINITY;
    for (int j = 0; j < numAttr - 1; j++) {
        double entropy = calculateEntropy(subset, numAttr, j);
        if (entropy < bestEntropy) {
            bestEntropy = entropy;
            splitVal = j;
        }
    }
    return splitVal;
}
From source file:decisiontree.MyC45.java
/**
 * Method for building a C4.5 tree. Recursively chooses the attribute with
 * the maximum gain ratio (using a computed threshold for numeric
 * attributes), turns the node into a leaf when no attribute gains anything,
 * and otherwise splits the data and recurses into the successors.
 *
 * @param instances the training data
 * @exception Exception if decision tree can't be built successfully
 */
private void makeTree(Instances instances) throws Exception {
    // Check if no instances have reached this node.
    if (instances.numInstances() == 0) {
        m_Attribute = null;
        m_ClassValue = Instance.missingValue();
        m_Distribution = new double[instances.numClasses()];
        return;
    }

    // Compute attribute with maximum gain ratio.
    double[] gainRatios = new double[instances.numAttributes()];
    Enumeration attrEnum = instances.enumerateAttributes();
    while (attrEnum.hasMoreElements()) {
        Attribute attr = (Attribute) attrEnum.nextElement();
        if (attr.isNominal()) {
            gainRatios[attr.index()] = computeGainRatio(instances, attr);
        } else if (attr.isNumeric()) {
            // Numeric attributes are evaluated at their best binary threshold.
            gainRatios[attr.index()] = computeGainRatio(instances, attr, computeThreshold(instances, attr));
        }
    }
    m_Attribute = instances.attribute(Utils.maxIndex(gainRatios));

    // Make leaf if gain ratio is zero.
    // Otherwise create successors.
    if (Utils.eq(gainRatios[m_Attribute.index()], 0)) {
        // Leaf: store the normalized class distribution and majority class.
        m_Attribute = null;
        m_Distribution = new double[instances.numClasses()];
        Enumeration instEnum = instances.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            m_Distribution[(int) inst.classValue()]++;
        }
        Utils.normalize(m_Distribution);
        m_ClassValue = Utils.maxIndex(m_Distribution);
        m_ClassAttribute = instances.classAttribute();
    } else {
        // Internal node: nominal attributes get one child per value,
        // numeric attributes get a binary threshold split.
        Instances[] splitData = null;
        int child = 0;
        if (m_Attribute.isNominal()) {
            child = m_Attribute.numValues();
            splitData = splitData(instances, m_Attribute);
        } else if (m_Attribute.isNumeric()) {
            child = 2;
            splitData = splitData(instances, m_Attribute, computeThreshold(instances, m_Attribute));
        }
        m_Successors = new MyC45[child];
        for (int j = 0; j < child; j++) {
            m_Successors[j] = new MyC45();
            m_Successors[j].makeTree(splitData[j]);
        }
    }
}
From source file:decisiontree.MyID3.java
/**
 * Recursively builds an ID3 tree: picks the attribute with maximum
 * information gain, makes a leaf when the node is empty, pure, or no
 * attribute gains anything, and otherwise splits on the chosen nominal
 * attribute and recurses into one child per attribute value.
 *
 * @param data the training data reaching this node
 */
private void makeTree(Instances data) {
    // Check if no instances have reached this node.
    if (data.numInstances() == 0) {
        splitAttr = null;
        leafValue = Double.NaN;
        leafDist = new double[data.numClasses()];
        return;
    }

    // Pure node: all instances share the same class value.
    if (data.numDistinctValues(data.classIndex()) == 1) {
        leafValue = data.firstInstance().classValue();
        return;
    }

    // Compute attribute with maximum information gain.
    double[] infoGains = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        infoGains[att.index()] = computeInfoGain(data, att);
    }
    splitAttr = data.attribute(maxIndex(infoGains));

    // Make leaf if information gain is zero.
    // Otherwise create successors.
    if (Utils.eq(infoGains[splitAttr.index()], 0)) {
        // Leaf: store the normalized class distribution and majority class.
        splitAttr = null;
        leafDist = new double[data.numClasses()];
        Enumeration instEnum = data.enumerateInstances();
        while (instEnum.hasMoreElements()) {
            Instance inst = (Instance) instEnum.nextElement();
            leafDist[(int) inst.classValue()]++;
        }
        normalize(leafDist);
        leafValue = Utils.maxIndex(leafDist);
        classAttr = data.classAttribute();
    } else {
        // Internal node: one child per value of the split attribute.
        Instances[] splitData = splitData(data, splitAttr);
        child = new MyID3[splitAttr.numValues()];
        for (int j = 0; j < splitAttr.numValues(); j++) {
            child[j] = new MyID3();
            child[j].makeTree(splitData[j]);
        }
    }
}
From source file:decisiontreeclassifier.ITree2.java
/**
 * Replaces every missing attribute value with 0.0, in place. For the voting
 * data set this should be sufficient, 0.0 being more or less neutral there.
 *
 * @param iToFix the instances to repair (mutated in place)
 * @return the same {@code Instances} object, for call chaining
 */
public Instances fixMissingData(Instances iToFix) {
    final int rows = iToFix.numInstances();
    final int cols = iToFix.numAttributes();
    for (int row = 0; row < rows; row++) {
        Instance current = iToFix.instance(row);
        for (int col = 0; col < cols; col++) {
            if (current.isMissing(col)) {
                current.setValue(col, 0.0);
            }
        }
    }
    return iToFix;
}
From source file:demo.Demo.java
License:Open Source License
/**
 * Downloads the UCI mushroom CSV data set to a temporary file, loads it with
 * all attributes treated as nominal, runs Chordalysis model selection, and
 * prints the selected decomposable model.
 *
 * @param args unused
 * @throws IOException if the download or the temp-file write fails
 */
public static void main(String[] args) throws IOException {
    CSVLoader loader = new CSVLoader();
    System.out.println("Downloading dataset...");
    URL oracle = new URL("http://repository.seasr.org/Datasets/UCI/csv/mushroom.csv");
    File csvFile = File.createTempFile("data-", ".csv");
    // try-with-resources closes both streams even when the copy fails
    // (the original leaked them on any I/O error) and flushes the writer.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(oracle.openStream()));
            PrintWriter out = new PrintWriter(new BufferedOutputStream(new FileOutputStream(csvFile)))) {
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            out.println(inputLine);
        }
    }
    System.out.println("Dataset written to: " + csvFile.getAbsolutePath());

    loader.setFile(csvFile);
    loader.setNominalAttributes("first-last");
    Instances instances = loader.getDataSet();

    // Column names are needed to render the learned model.
    String[] variablesNames = new String[instances.numAttributes()];
    for (int i = 0; i < variablesNames.length; i++) {
        variablesNames[i] = instances.attribute(i).name();
    }

    ChordalysisModelling modeller = new ChordalysisModelling(0.05);
    System.out.println("Learning...");
    modeller.buildModel(instances);
    DecomposableModel bestModel = modeller.getModel();
    bestModel.display(variablesNames);
    System.out.println("The model selected is:");
    System.out.println(bestModel.toString(variablesNames));
    bestModel.display(variablesNames);
}
From source file:demo.DemoInference.java
License:Open Source License
/** * @param args/*from w w w . ja va2 s . c o m*/ * @throws IOException */ public static void main(String[] args) throws IOException { CSVLoader loader = new CSVLoader(); System.out.println("Downloading dataset..."); URL oracle = new URL("http://repository.seasr.org/Datasets/UCI/csv/mushroom.csv"); File csvFile = File.createTempFile("data-", ".csv"); BufferedReader in = new BufferedReader(new InputStreamReader(oracle.openStream())); PrintWriter out = new PrintWriter(new BufferedOutputStream(new FileOutputStream(csvFile))); String inputLine; while ((inputLine = in.readLine()) != null) { out.println(inputLine); } in.close(); out.close(); System.out.println("Dataset written to: " + csvFile.getAbsolutePath()); loader.setFile(csvFile); loader.setNominalAttributes("first-last"); Instances instances = loader.getDataSet(); String[] variablesNames = new String[instances.numAttributes()]; String[][] outcomes = new String[instances.numAttributes()][]; for (int i = 0; i < variablesNames.length; i++) { variablesNames[i] = instances.attribute(i).name(); outcomes[i] = new String[instances.attribute(i).numValues() + 1];//+1 for missing for (int j = 0; j < outcomes[i].length - 1; j++) { outcomes[i][j] = instances.attribute(i).value(j); } outcomes[i][outcomes[i].length - 1] = "missing"; System.out.println("Dom(" + variablesNames[i] + ") = " + Arrays.toString(outcomes[i])); } ChordalysisModelling modeller = new ChordalysisModelling(0.05); System.out.println("Learning..."); modeller.buildModel(instances); DecomposableModel bestModel = modeller.getModel(); // bestModel.display(variablesNames); System.out.println("The model selected is:"); System.out.println(bestModel.toString(variablesNames)); Inference inference = new Inference(bestModel, variablesNames, outcomes); inference.setProbabilities(modeller.getLattice()); String targetVariable = "population"; System.out.println("initial beliefs on " + targetVariable + " " + Arrays.toString(inference.getBelief(targetVariable))); 
System.out.println("adding evidence poisonous and convex shape"); inference.addEvidence("class", "e"); inference.addEvidence("cap-shape", "x"); inference.recordEvidence(); System.out.println( "beliefs on " + targetVariable + " " + Arrays.toString(inference.getBelief(targetVariable))); inference.clearEvidences(); System.out.println("reset beliefs"); System.out.println( "reset beliefs on " + targetVariable + " " + Arrays.toString(inference.getBelief(targetVariable))); }