List of usage examples for weka.core.Instances#attributeStats(int)
public AttributeStats attributeStats(int index)
From source file:mlda.labelsDistribution.MeanEntropy.java
License:Open Source License
/** * Calculate metric value/*from w w w. j a v a2s . com*/ * * @param mlData Multi-label dataset to which calculate the metric * @return Value of the metric */ public double calculate(MultiLabelInstances mlData) { Instances instances = mlData.getDataSet(); int nLabels = mlData.getNumLabels(); int[] labels = mlData.getLabelIndices(); double[] entropies = new double[nLabels]; for (int i = 0; i < nLabels; i++) { AttributeStats attStats = instances.attributeStats(labels[i]); if (attStats.nominalCounts != null) { entropies[i] = Utils.entropy(attStats.nominalCounts); } } double meanEntropy = 0; for (double e : entropies) { meanEntropy += e; } meanEntropy /= entropies.length; this.value = meanEntropy; return value; }
From source file:mlda.labelsDistribution.MinEntropy.java
License:Open Source License
/** * Calculate metric value/* w w w . ja v a2 s . c o m*/ * * @param mlData Multi-label dataset to which calculate the metric * @return Value of the metric */ public double calculate(MultiLabelInstances mlData) { Instances instances = mlData.getDataSet(); int nLabels = mlData.getNumLabels(); int[] labels = mlData.getLabelIndices(); double[] entropies = new double[nLabels]; for (int i = 0; i < nLabels; i++) { AttributeStats attStats = instances.attributeStats(labels[i]); if (attStats.nominalCounts != null) { entropies[i] = Utils.entropy(attStats.nominalCounts); } } double minEntropy = Double.MAX_VALUE; for (double e : entropies) { if (e < minEntropy) { minEntropy = e; } } this.value = minEntropy; return value; }
From source file:motaz.CODB.java
License:Open Source License
/** * Returns a new Class-Instance of the specified database * @param database_distanceType String of the specified distance-type * @param instance The original instance that needs to hold by this DataObject * @param key Key for this DataObject/*from w w w .ja v a2 s . c o m*/ * @param database Link to the database * @return DataObject New constructed DataObject */ public List Pcl(Instances insts) { int tc = insts.attributeStats(insts.numAttributes() - 2).totalCount; int[] nomC = insts.attributeStats(insts.numAttributes() - 2).nominalCounts; // For Station double freq = 0.0; List l = new ArrayList(); for (int i = 0; i < nomC.length; i++) { freq = nomC[i] / (double) tc; l.add(freq); } return l; }
From source file:mulan.classifier.neural.NormalizationFilter.java
License:Open Source License
/**
 * Caches the [min, max] range of every numeric feature attribute in
 * {@code attStats}, keyed by attribute index, for later normalization.
 *
 * @param mlData multi-label dataset whose feature ranges are collected
 */
private void Initialize(MultiLabelInstances mlData) {
    Instances data = mlData.getDataSet();
    for (int index : mlData.getFeatureIndices()) {
        if (!data.attribute(index).isNumeric()) {
            continue; // only numeric features need a normalization range
        }
        Stats numericStats = data.attributeStats(index).numericStats;
        attStats.put(index, new double[] { numericStats.min, numericStats.max });
    }
}
/**
 * Builds the classifier. For each numeric attribute it sorts the data and,
 * at every class boundary between adjacent instances, derives a candidate
 * binary split at the midpoint via {@code convertInstances}; then it fills
 * in missing values (numeric: attribute mean; nominal: index of the most
 * frequent value) before growing the tree with {@code makeMyC45Tree}.
 *
 * @param data training instances; a defensive copy is made first
 * @throws Exception if the data violates the classifier's capabilities
 */
@Override
public void buildClassifier(Instances data) throws Exception {
    getCapabilities().testWithFail(data);
    // Work on a copy so the caller's dataset is untouched.
    data = new Instances(data);
    data.deleteWithMissingClass();
    Enumeration enumAtt = data.enumerateAttributes();
    while (enumAtt.hasMoreElements()) {
        Attribute attr = (Attribute) enumAtt.nextElement();
        if (attr.isNumeric()) {
            // NOTE(review): mid, savedData, temp and max are dead — their only
            // uses are in the commented-out info-gain selection below.
            ArrayList<Double> mid = new ArrayList<Double>();
            Instances savedData = null;
            double temp, max = Double.NEGATIVE_INFINITY;
            // TODO: split nominal
            data.sort(attr);
            for (int i = 0; i < data.numInstances() - 1; i++) {
                // A candidate cut point exists only where the class changes
                // between two consecutive (sorted) instances.
                if (data.instance(i).classValue() != data.instance(i + 1).classValue()) {
                    // Skip midpoints for which a derived attribute was already added
                    // (the derived attribute's name encodes the cut value).
                    if (data.attribute(attr.name() + " "
                            + (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2) == null) {
                        // convertInstances appends a binary attribute splitting at the midpoint;
                        // note it reassigns `data`, so subsequent iterations see the new attribute.
                        data = convertInstances(data, attr,
                                (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2);
                        //temp = computeInfoGainRatio(newData, newData.attribute(newData.numAttributes()-1));
                        //System.out.println("attribute "+newData.attribute(newData.numAttributes()-1).name());
                        //if (temp > max) {
                        //    max = temp;
                        //    savedData = newData;
                        //}
                    }
                }
            }
            // Missing-value handling (numeric attribute)
            AttributeStats attributeStats = data.attributeStats(attr.index());
            double mean = attributeStats.numericStats.mean;
            // mean is NaN when every value is missing; fall back to 0.
            if (Double.isNaN(mean))
                mean = 0;
            // Replace missing value with mean
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), mean);
                }
            }
            //data = new Instances(savedData);
        } else {
            // Missing-value handling (nominal attribute): find the modal value index.
            AttributeStats attributeStats = data.attributeStats(attr.index());
            int maxIndex = 0;
            for (int i = 1; i < attr.numValues(); i++) {
                if (attributeStats.nominalCounts[maxIndex] < attributeStats.nominalCounts[i]) {
                    maxIndex = i;
                }
            }
            // Replace missing value with max index
            // (setValue with an int-valued double stores the internal value index
            //  of the mode for a nominal attribute — confirm against Weka's API)
            Enumeration instEnumerate = data.enumerateInstances();
            while (instEnumerate.hasMoreElements()) {
                Instance instance = (Instance) instEnumerate.nextElement();
                if (instance.isMissing(attr.index())) {
                    instance.setValue(attr.index(), maxIndex);
                }
            }
        }
    }
    makeMyC45Tree(data);
}
From source file:org.isep.simizer.example.policy.utils.IterativeSimpleKMeans.java
License:Open Source License
/**
 * Generates the clusterer: computes full-dataset statistics, initializes
 * centroids (reusing any already-set centroids — a local modification to
 * Weka's SimpleKMeans), then iterates assign/update steps until the
 * assignments stop changing or {@code m_MaxIterations} is reached.
 * Has to initialize all fields of the clusterer that are not being set
 * via options.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);
    m_Iterations = 0;
    m_ReplaceMissingFilter = new ReplaceMissingValues();
    // Work on a copy with no class attribute (clustering is unsupervised).
    Instances instances = new Instances(data);
    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }
    // Full-dataset statistics, reported alongside the clustering result.
    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];
    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            // If "missing" outnumbers the most frequent real value, treat missing as the mode.
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }
    // Modified to account for already set centroids
    if (m_ClusterCentroids == null) {
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
    }
    int[] clusterAssignments = new int[instances.numInstances()];
    if (m_PreserveOrder) {
        m_Assignments = clusterAssignments;
    }
    m_DistanceFunction.setInstances(instances);
    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;
    Instances initInstances = null;
    // When order must be preserved, work on a copy because the init loop swaps rows.
    if (m_PreserveOrder) {
        initInstances = new Instances(instances);
    } else {
        initInstances = instances;
    }
    // Modified to account for already set centroids
    if (m_ClusterCentroids.numInstances() > 0) {
        initC = this.centersMap;
        for (int i = 0; i < m_NumClusters; i++)
            initInstances.add(m_ClusterCentroids.instance(i));
    } else {
        // Walks from the end of the data set and swaps each picked centre with the
        // last unvisited element (random selection without replacement);
        // DecisionTableHashKey de-duplicates identical instances.
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(),
                    true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);
            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }
    }
    // Fewer distinct instances than requested clusters shrinks m_NumClusters.
    m_NumClusters = m_ClusterCentroids.numInstances();
    //removing reference
    initInstances = null;
    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    // Main k-means loop: converged when no assignment changes in a full pass.
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        // Assignment step.
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            int newC = clusterProcessedInstance(toCluster, true);
            if (newC != clusterAssignments[i]) {
                converged = false;
            }
            clusterAssignments[i] = newC;
        }
        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                moveCentroid(i, tempI[i], true);
            }
        }
        if (m_Iterations == m_MaxIterations) {
            converged = true;
        }
        // Drop empty clusters and compact the per-cluster arrays.
        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index++] = tempI[k];
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }
        if (!converged) {
            m_squaredErrors = new double[m_NumClusters];
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }
    // Final per-cluster reporting: sizes and (optionally) std devs.
    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Instance.missingValue(); // std dev undefined for nominal
                }
            }
            m_ClusterStdDevs.add(new Instance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }
}
From source file:org.isep.simizer.example.policy.utils.IterativeSimpleKMeans.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * * @param centroidIndex index of the centroid which the coordinates will be * computed//from w w w . j a v a 2 s . c o m * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the * m_Cluster arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; //used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; if (m_DistanceFunction instanceof ManhattanDistance) { middle = (members.numInstances() - 1) / 2; dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } } for (int j = 0; j < members.numAttributes(); j++) { //in case of Euclidian distance the centroid is the mean point //in case of Manhattan distance the centroid is the median point //in both cases, if the attribute is nominal, the centroid is the mode if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) { vals[j] = members.meanOrMode(j); } else if (m_DistanceFunction instanceof ManhattanDistance) { //singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { vals[j] = sortedMembers.kthSmallestValue(j, middle + 1); if (dataIsEven) { vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2; } } } if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if 
(m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Instance.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Instance.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) { m_ClusterCentroids.add(new Instance(1.0, vals)); } return vals; }
From source file:org.openml.webapplication.fantail.dc.statistical.AttributeType.java
License:Open Source License
/**
 * Computes attribute-type meta-features: counts and ratios of nominal,
 * numeric and binary (two-valued nominal) attributes.
 *
 * @param instances dataset to characterize
 * @return map from quality id to value, in the order of {@code ids}
 */
@Override
public Map<String, Double> characterize(Instances instances) {
    // The last attribute (presumably the class) is excluded from the raw counts.
    int attribCount = instances.numAttributes() - 1;
    int nominalCount = 0;
    int numericCount = 0;
    int binCount = 0;
    for (int i = 0; i < attribCount; i++) {
        if (!instances.attribute(i).isNominal()) {
            numericCount++;
            continue;
        }
        nominalCount++;
        // A nominal attribute with exactly two distinct observed values is binary.
        if (instances.attributeStats(i).distinctCount == 2) {
            binCount++;
        }
    }
    Map<String, Double> qualities = new HashMap<String, Double>();
    qualities.put(ids[0], nominalCount * 1.0);
    qualities.put(ids[1], numericCount * 1.0);
    // NOTE(review): the ratios divide by ALL attributes (class included) while the
    // counts exclude the last one — preserved as-is to keep existing behaviour.
    qualities.put(ids[2], 1.0 * nominalCount / instances.numAttributes());
    qualities.put(ids[3], 1.0 * numericCount / instances.numAttributes());
    qualities.put(ids[4], binCount * 1.0);
    qualities.put(ids[5], 1.0 * binCount / instances.numAttributes());
    return qualities;
}
From source file:org.openml.webapplication.features.ExtractFeatures.java
License:Open Source License
public static List<Feature> getFeatures(Instances dataset, String defaultClass) { if (defaultClass != null) { dataset.setClass(dataset.attribute(defaultClass)); } else {//from www .j a v a 2 s.c o m dataset.setClassIndex(dataset.numAttributes() - 1); } final ArrayList<Feature> resultFeatures = new ArrayList<Feature>(); for (int i = 0; i < dataset.numAttributes(); i++) { Attribute att = dataset.attribute(i); int numValues = dataset.classAttribute().isNominal() ? dataset.classAttribute().numValues() : 0; AttributeStatistics attributeStats = new AttributeStatistics(dataset.attribute(i), numValues); for (int j = 0; j < dataset.numInstances(); ++j) { attributeStats.addValue(dataset.get(j).value(i), dataset.get(j).classValue()); } String data_type = null; Integer numberOfDistinctValues = null; Integer numberOfUniqueValues = null; Integer numberOfMissingValues = null; Integer numberOfIntegerValues = null; Integer numberOfRealValues = null; Integer numberOfNominalValues = null; Integer numberOfValues = null; Double maximumValue = null; Double minimumValue = null; Double meanValue = null; Double standardDeviation = null; AttributeStats as = dataset.attributeStats(i); numberOfDistinctValues = as.distinctCount; numberOfUniqueValues = as.uniqueCount; numberOfMissingValues = as.missingCount; numberOfIntegerValues = as.intCount; numberOfRealValues = as.realCount; numberOfMissingValues = as.missingCount; if (att.isNominal()) { numberOfNominalValues = att.numValues(); } numberOfValues = attributeStats.getTotalObservations(); if (att.isNumeric()) { maximumValue = attributeStats.getMaximum(); minimumValue = attributeStats.getMinimum(); meanValue = attributeStats.getMean(); standardDeviation = 0.0; try { standardDeviation = attributeStats.getStandardDeviation(); } catch (Exception e) { Conversion.log("WARNING", "StdDev", "Could not compute standard deviation of feature " + att.name() + ": " + e.getMessage()); } } if (att.type() == 0) { data_type = "numeric"; } else if (att.type() == 
1) { data_type = "nominal"; } else if (att.type() == 2) { data_type = "string"; } else { data_type = "unknown"; } resultFeatures.add(new Feature(att.index(), att.name(), data_type, att.index() == dataset.classIndex(), numberOfDistinctValues, numberOfUniqueValues, numberOfMissingValues, numberOfIntegerValues, numberOfRealValues, numberOfNominalValues, numberOfValues, maximumValue, minimumValue, meanValue, standardDeviation, attributeStats.getClassDistribution())); } return resultFeatures; }
From source file:org.openml.webapplication.predictionCounter.FoldsPredictionCounter.java
License:Open Source License
@SuppressWarnings("unchecked") public FoldsPredictionCounter(Instances splits, String type, String shadowType) throws Exception { ATT_SPLITS_TYPE = InstancesHelper.getRowIndex("type", splits); ATT_SPLITS_ROWID = InstancesHelper.getRowIndex(new String[] { "rowid", "row_id" }, splits); ATT_SPLITS_REPEAT = InstancesHelper.getRowIndex(new String[] { "repeat", "repeat_nr" }, splits); ATT_SPLITS_FOLD = InstancesHelper.getRowIndex(new String[] { "fold", "fold_nr" }, splits); int att_splits_sample; try {/* www.j a va 2s . c o m*/ att_splits_sample = InstancesHelper.getRowIndex(new String[] { "sample", "sample_nr" }, splits); } catch (Exception e) { att_splits_sample = -1; } ATT_SPLITS_SAMPLE = att_splits_sample; NR_OF_REPEATS = splits.attribute("repeat") == null ? 1 : (int) splits.attributeStats(ATT_SPLITS_REPEAT).numericStats.max + 1; NR_OF_FOLDS = splits.attribute("fold") == null ? 1 : (int) splits.attributeStats(ATT_SPLITS_FOLD).numericStats.max + 1; NR_OF_SAMPLES = splits.attribute("sample") == null ? 1 : (int) splits.attributeStats(ATT_SPLITS_SAMPLE).numericStats.max + 1; expectedTotal = 0; expected = new ArrayList[NR_OF_REPEATS][NR_OF_FOLDS][NR_OF_SAMPLES]; actual = new ArrayList[NR_OF_REPEATS][NR_OF_FOLDS][NR_OF_SAMPLES]; shadowTypeSize = new int[NR_OF_REPEATS][NR_OF_FOLDS][NR_OF_SAMPLES]; for (int i = 0; i < NR_OF_REPEATS; i++) for (int j = 0; j < NR_OF_FOLDS; j++) for (int k = 0; k < NR_OF_SAMPLES; k++) { expected[i][j][k] = new ArrayList<Integer>(); actual[i][j][k] = new ArrayList<Integer>(); } for (int i = 0; i < splits.numInstances(); i++) { Instance instance = splits.instance(i); if (instance.value(ATT_SPLITS_TYPE) == splits.attribute(ATT_SPLITS_TYPE).indexOfValue(type)) { int repeat = (int) instance.value(ATT_SPLITS_REPEAT); int fold = (int) instance.value(ATT_SPLITS_FOLD); int sample = ATT_SPLITS_SAMPLE < 0 ? 0 : (int) instance.value(ATT_SPLITS_SAMPLE); int rowid = (int) instance.value(ATT_SPLITS_ROWID); //TODO: maybe we need instance.stringValue() ... 
expected[repeat][fold][sample].add(rowid); expectedTotal++; } else if (instance.value(ATT_SPLITS_TYPE) == splits.attribute(ATT_SPLITS_TYPE) .indexOfValue(shadowType)) { int repeat = (int) instance.value(ATT_SPLITS_REPEAT); int fold = (int) instance.value(ATT_SPLITS_FOLD); int sample = ATT_SPLITS_SAMPLE < 0 ? 0 : (int) instance.value(ATT_SPLITS_SAMPLE); shadowTypeSize[repeat][fold][sample]++; } } for (int i = 0; i < NR_OF_REPEATS; i++) { for (int j = 0; j < NR_OF_FOLDS; j++) { for (int k = 0; k < NR_OF_SAMPLES; k++) { Collections.sort(expected[i][j][k]); } } } error_message = ""; }