List of usage examples for weka.core Instances instance
public Instance instance(int index)
From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java
License:Apache License
/**
 * Loads the file-level metrics of a project and enriches them with network metrics and
 * bug labels.
 * <p>
 * The project name is the part of the metrics file name before the first underscore. Two
 * sibling files are read from the same directory as the metrics file:
 * {@code <project>_bugs_per_file.csv} and {@code <project>_network_metrics.csv}.
 * </p>
 * <p>
 * Processing steps:
 * </p>
 * <ol>
 * <li>Nominal and string columns of the network metrics (from column 2 onwards) are turned
 * into numeric columns; values that cannot be parsed (e.g. "NA") and missing values become
 * 0.0.</li>
 * <li>The network metric columns are appended to the file metrics, matching rows by the file
 * name (column 0 of the metrics file, column 1 of the network file).</li>
 * <li>A nominal class attribute "bug" with the values "0" and "1" is appended; it is set to
 * "1" for every file whose bug count (column 2 of the bugs file) is greater than zero.</li>
 * <li>The file name column and, if present, the "eigenvector" attribute are removed and the
 * last attribute becomes the class attribute.</li>
 * <li>All remaining missing values are set to 0.</li>
 * </ol>
 *
 * @param fileMetricsFile
 *            CSV file with the file-level metrics
 * @return the merged data, or {@code null} if one of the files could not be read
 */
@Override
public Instances load(File fileMetricsFile) {
    // Resolve against the absolute file: for a bare relative name getParentFile() is null,
    // which previously caused a NullPointerException.
    File directory = fileMetricsFile.getAbsoluteFile().getParentFile();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(directory, project + "_bugs_per_file.csv");
    File networkMetrics = new File(directory, project + "_network_metrics.csv");

    Instances metricsData = null;
    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // Columns that contain non-numeric tokens are read as nominal or string attributes;
        // convert them to numeric ones. A converted column is numeric afterwards, so one
        // pass covers both cases.
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isNominal() || networkData.attribute(j).isString()) {
                replaceWithNumericAttribute(networkData, j);
            }
        }

        // file name -> row index in the metrics data
        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }

        // merge with network data; files without a matching row keep a missing value
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex)
                        .setValue(attributeIndex, networkData.instance(i).value(j));
                }
            }
        }

        // add bug information; files without bugs stay missing here and become "0" below
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);

        // remove the eigenvector metric, if the data contains it
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            metricsData.deleteAttributeAt(eigenvector.index());
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    }
    catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}

/**
 * Replaces the attribute at the given position with a numeric attribute of the same name.
 * Values that are missing or cannot be parsed as a double become 0.0.
 *
 * @param data
 *            data whose attribute is replaced (modified in place)
 * @param attributeIndex
 *            position of the attribute to replace
 */
private static void replaceWithNumericAttribute(Instances data, int attributeIndex) {
    String attributeName = data.attribute(attributeIndex).name();

    // Parse the values first; they are lost once the attribute is deleted.
    // Entries that are never assigned keep the array default of 0.0.
    double[] numericValues = new double[data.size()];
    for (int i = 0; i < data.size(); i++) {
        Instance inst = data.instance(i);
        if (!inst.isMissing(attributeIndex)) {
            try {
                numericValues[i] = Double.parseDouble(inst.stringValue(attributeIndex));
            }
            catch (NumberFormatException ignored) {
                // not a number (e.g. "NA"), using 0.0
                numericValues[i] = 0.0;
            }
        }
    }

    data.deleteAttributeAt(attributeIndex);
    data.insertAttributeAt(new Attribute(attributeName), attributeIndex);
    for (int i = 0; i < data.size(); i++) {
        data.instance(i).setValue(attributeIndex, numericValues[i]);
    }
}
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Calculates the distributional characteristics of the pairwise Euclidean distances between
 * the instances of a data set. The class attribute, if one is set, is not part of the
 * distance.
 * </p>
 * <p>
 * Every ordered pair (i, j) with i != j is evaluated, i.e., each unordered pair contributes
 * twice. If the data contains fewer than two instances, no comparison takes place and both
 * mean and standard deviation are NaN.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @return characteristics (mean, standard deviation, minimum, maximum, number of instances)
 */
public static DistChar datasetDistance(Instances data) {
    final int numInstances = data.numInstances();
    final int numAttributes = data.numAttributes();
    final int classIndex = data.classIndex();
    // Without a class attribute (negative class index) every attribute is a feature.
    final int numFeatures = (classIndex >= 0) ? numAttributes - 1 : numAttributes;

    // Extract the feature vectors once instead of once per compared pair.
    double[][] features = new double[numInstances][numFeatures];
    for (int i = 0; i < numInstances; i++) {
        int l = 0;
        for (int k = 0; k < numAttributes; k++) {
            if (k != classIndex) {
                // The write position must advance; otherwise every value overwrites slot 0.
                features[i][l++] = data.instance(i).value(k);
            }
        }
    }

    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    // Distances are never negative, so 0.0 is a valid lower bound. Double.MIN_VALUE is the
    // smallest positive double and would be reported if all distances were 0.
    double max = 0.0;
    int numCmp = 0;
    EuclideanDistance euclideanDistance = new EuclideanDistance();

    for (int i = 0; i < numInstances; i++) {
        for (int j = 0; j < numInstances; j++) {
            if (j != i) {
                distance = euclideanDistance.compute(features[i], features[j]);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }

    double mean = sumAll / numCmp;
    double variance = (sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1));
    // Rounding can push the variance of (nearly) identical distances slightly below zero;
    // clamp it so that the square root does not turn into NaN. NaN itself is preserved.
    double std = Math.sqrt(Math.max(0.0d, variance));
    return new DistChar(mean, std, min, max, data.numInstances());
}
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Calculates the distributional characteristics of the pairwise absolute differences of a
 * single attribute between the instances of a data set.
 * </p>
 * <p>
 * Every ordered pair (i, j) with i != j is evaluated, i.e., each unordered pair contributes
 * twice. If the data contains fewer than two instances, no comparison takes place and both
 * mean and standard deviation are NaN.
 * </p>
 *
 * @param data
 *            data for which the instances are characterized
 * @param index
 *            attribute for which the distances are characterized
 * @return characteristics (mean, standard deviation, minimum, maximum, number of instances)
 */
public static DistChar attributeDistance(Instances data, int index) {
    final int numInstances = data.numInstances();

    // Read the attribute values once instead of once per compared pair.
    double[] values = new double[numInstances];
    for (int i = 0; i < numInstances; i++) {
        values[i] = data.instance(i).value(index);
    }

    double distance;
    double sumAll = 0.0;
    double sumAllQ = 0.0;
    double min = Double.MAX_VALUE;
    // Absolute differences are never negative, so 0.0 is a valid lower bound.
    // Double.MIN_VALUE is the smallest positive double and would be reported if all
    // differences were 0.
    double max = 0.0;
    int numCmp = 0;

    for (int i = 0; i < numInstances; i++) {
        for (int j = 0; j < numInstances; j++) {
            if (j != i) {
                distance = Math.abs(values[i] - values[j]);
                sumAll += distance;
                sumAllQ += distance * distance;
                numCmp++;
                if (distance < min) {
                    min = distance;
                }
                if (distance > max) {
                    max = distance;
                }
            }
        }
    }

    double mean = sumAll / numCmp;
    double variance = (sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1));
    // Rounding can push the variance of (nearly) identical differences slightly below zero;
    // clamp it so that the square root does not turn into NaN. NaN itself is preserved.
    double std = Math.sqrt(Math.max(0.0d, variance));
    return new DistChar(mean, std, min, max, data.numInstances());
}
From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java
License:Apache License
/**
 * Builds a reduced copy of the source data that only contains the features which are
 * currently being tested.
 *
 * @param source The Instances object containing all the Instance objects from the source
 *            file.
 * @param targetStructure The attributes of the features which are currently being tested;
 *            they define the columns of the result.
 * @return A new Instances object named "ACResolution" with one row per source instance,
 *         holding the values looked up by attribute name via {@code getAttributeValue}.
 */
private Instances copyInstances(Instances source, ArrayList<Attribute> targetStructure) {
    final Instances reduced = new Instances("ACResolution", targetStructure, 0);
    final int featureCount = targetStructure.size();
    final int rowCount = source.numInstances();

    for (int row = 0; row < rowCount; row++) {
        final double[] featureValues = new double[featureCount];
        int column = 0;
        for (Attribute feature : targetStructure) {
            featureValues[column++] = getAttributeValue(source.instance(row), feature.name());
        }
        reduced.add(new DenseInstance(1.0, featureValues));
    }
    return reduced;
}
From source file:de.uniheidelberg.cl.swp.mlprocess.WEKARunner.java
License:Apache License
/**
 * Predicts the class labels of unlabeled instances.
 * <p>
 * The input is left untouched; the predictions of the classifier are written into a copy.
 * </p>
 *
 * @param unkIns Instances whose class values are unknown.
 * @return A copy of the given instances in which every instance carries the predicted
 *         class value.
 * @throws Exception If the instances couldn't be labeled.
 */
public Instances labelUnknownInstances(Instances unkIns) throws Exception {
    final Instances labeled = new Instances(unkIns);
    final int total = unkIns.numInstances();

    int position = 0;
    while (position < total) {
        final double predictedLabel = classifier.classifyInstance(unkIns.instance(position));
        labeled.instance(position).setClassValue(predictedLabel);
        position++;
    }
    return labeled;
}
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options.
 * <p>
 * Outline of the steps performed below:
 * </p>
 * <ol>
 * <li>The capabilities are tested against the data, the data is copied and the class index
 * of the copy is unset. Unless m_dontReplaceMissing is set, the copy is passed through a
 * ReplaceMissingValues filter.</li>
 * <li>Statistics over the full data are collected: missing counts, nominal counts, standard
 * deviations (only if m_displayStdDevs is set) and the result of
 * moveCentroid(0, instances, false), stored in m_FullMeansOrMediansOrModes.</li>
 * <li>Initial centroids are drawn at random (Random seeded with getSeed()); an instance is
 * only accepted if its DecisionTableHashKey has not been seen before. Afterwards
 * m_NumClusters is set to the number of centroids that were actually found.</li>
 * <li>The main loop assigns every instance to a cluster via clusterProcessedInstance and
 * recomputes the centroids of all non-empty clusters via moveCentroid. It ends when no
 * assignment changed or when m_Iterations equals m_MaxIterations. Empty clusters reduce
 * m_NumClusters.</li>
 * <li>Finally the cluster sizes and, if m_displayStdDevs is set, the per-cluster standard
 * deviations are stored and the distance function is cleaned.</li>
 * </ol>
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override public void buildClusterer(Instances data) throws Exception { // can clusterer handle the data? getCapabilities().testWithFail(data); m_Iterations = 0; m_ReplaceMissingFilter = new ReplaceMissingValues(); Instances instances = new Instances(data); instances.setClassIndex(-1); if (!m_dontReplaceMissing) { m_ReplaceMissingFilter.setInputFormat(instances); instances = Filter.useFilter(instances, m_ReplaceMissingFilter); } m_FullMissingCounts = new int[instances.numAttributes()]; if (m_displayStdDevs) { m_FullStdDevs = new double[instances.numAttributes()]; } m_FullNominalCounts = new int[instances.numAttributes()][0]; m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false); for (int i = 0; i < instances.numAttributes(); i++) { m_FullMissingCounts[i] = instances.attributeStats(i).missingCount; if (instances.attribute(i).isNumeric()) { if (m_displayStdDevs) { m_FullStdDevs[i] = Math.sqrt(instances.variance(i)); } if (m_FullMissingCounts[i] == instances.numInstances()) { m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean } } else { m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts; if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) { m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common // value } } } m_ClusterCentroids = new Instances(instances, m_NumClusters); int[] clusterAssignments = new int[instances.numInstances()]; if (m_PreserveOrder) { m_Assignments = clusterAssignments; } m_DistanceFunction.setInstances(instances); Random RandomO = new Random(getSeed()); int instIndex; HashMap initC = new HashMap(); DecisionTableHashKey hk = null; Instances initInstances = null; if (m_PreserveOrder) 
{ initInstances = new Instances(instances); } else { initInstances = instances; } for (int j = initInstances.numInstances() - 1; j >= 0; j--) { instIndex = RandomO.nextInt(j + 1); hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true); if (!initC.containsKey(hk)) { m_ClusterCentroids.add(initInstances.instance(instIndex)); initC.put(hk, null); } initInstances.swap(j, instIndex); if (m_ClusterCentroids.numInstances() == m_NumClusters) { break; } } m_NumClusters = m_ClusterCentroids.numInstances(); // removing reference initInstances = null; int i; boolean converged = false; int emptyClusterCount; Instances[] tempI = new Instances[m_NumClusters]; m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()]; while (!converged) { emptyClusterCount = 0; m_Iterations++; converged = true; for (i = 0; i < instances.numInstances(); i++) { Instance toCluster = instances.instance(i); int newC = clusterProcessedInstance(toCluster, true); if (newC != clusterAssignments[i]) { converged = false; } clusterAssignments[i] = newC; } // update centroids m_ClusterCentroids = new Instances(instances, m_NumClusters); for (i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(instances, 0); } for (i = 0; i < instances.numInstances(); i++) { tempI[clusterAssignments[i]].add(instances.instance(i)); } for (i = 0; i < m_NumClusters; i++) { if (tempI[i].numInstances() == 0) { // empty cluster emptyClusterCount++; } else { moveCentroid(i, tempI[i], true); } } if (m_Iterations == m_MaxIterations) { converged = true; } if (emptyClusterCount > 0) { m_NumClusters -= emptyClusterCount; if (converged) { Instances[] t = new Instances[m_NumClusters]; int index = 0; for (int k = 0; k < tempI.length; k++) { if (tempI[k].numInstances() > 0) { t[index] = tempI[k]; for (i = 0; i < tempI[k].numAttributes(); i++) { 
m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i]; } index++; } } tempI = t; } else { tempI = new Instances[m_NumClusters]; } } if (!converged) { m_squaredErrors = new double[m_NumClusters]; m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0]; } } if (m_displayStdDevs) { m_ClusterStdDevs = new Instances(instances, m_NumClusters); } m_ClusterSizes = new int[m_NumClusters]; for (i = 0; i < m_NumClusters; i++) { if (m_displayStdDevs) { double[] vals2 = new double[instances.numAttributes()]; for (int j = 0; j < instances.numAttributes(); j++) { if (instances.attribute(j).isNumeric()) { vals2[j] = Math.sqrt(tempI[i].variance(j)); } else { vals2[j] = Instance.missingValue(); } } m_ClusterStdDevs.add(new Instance(1.0, vals2)); } m_ClusterSizes[i] = tempI[i].numInstances(); } // Save memory!! m_DistanceFunction.clean(); }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * /*from w w w. j av a 2 s . c om*/ * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; // used only for Manhattan Distance Instances sortedMembers = null; int middle = 0; boolean dataIsEven = false; if (m_DistanceFunction instanceof ManhattanDistance || m_DistanceFunction instanceof CustomPairWiseDistance) { middle = (members.numInstances() - 1) / 2; dataIsEven = ((members.numInstances() % 2) == 0); if (m_PreserveOrder) { sortedMembers = members; } else { sortedMembers = new Instances(members); } } for (int j = 0; j < members.numAttributes(); j++) { // in case of Euclidian distance the centroid is the mean point // in case of Manhattan distance the centroid is the median point // in both cases, if the attribute is nominal, the centroid is the mode if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) { vals[j] = members.meanOrMode(j); } else if (m_DistanceFunction instanceof ManhattanDistance || m_DistanceFunction instanceof CustomPairWiseDistance) { // singleton special case if (members.numInstances() == 1) { vals[j] = members.instance(0).value(j); } else { vals[j] = sortedMembers.kthSmallestValue(j, middle + 1); if (dataIsEven) { vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2; } } } if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; 
m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Instance.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Instance.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) { m_ClusterCentroids.add(new Instance(1.0, vals)); } return vals; }
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedoids.java
License:Open Source License
/** * Move the centroid to it's new coordinates. Generate the centroid * coordinates based on it's members (objects assigned to the cluster of the * centroid) and the distance function being used. * /* w w w . j a va2s . c om*/ * @param centroidIndex index of the centroid which the coordinates will be * computed * @param members the objects that are assigned to the cluster of this * centroid * @param updateClusterInfo if the method is supposed to update the m_Cluster * arrays * @return the centroid coordinates */ protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) { double[] vals = new double[members.numAttributes()]; if (!updateClusterInfo) { vals[0] = 100D; return vals; } double smallestError = Double.MAX_VALUE; Instance currentCentroid = null; for (int j = 0; j < members.numInstances(); j++) { Instance currentInstance = members.instance(j); double distanceError = 0D; for (int i = 0; i < members.numInstances(); i++) { distanceError += m_DistanceFunction.distance(currentInstance, members.instance(i)); } if (distanceError < smallestError) { smallestError = distanceError; currentCentroid = currentInstance; } } vals[0] = currentCentroid.valueSparse(0); for (int j = 0; j < members.numAttributes(); j++) { if (updateClusterInfo) { m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount; m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts; if (members.attribute(j).isNominal()) { if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) { vals[j] = Instance.missingValue(); // mark mode as missing } } else { if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) { vals[j] = Instance.missingValue(); // mark mean as missing } } } } if (updateClusterInfo) { m_ClusterCentroids.add(new Instance(1.0, vals)); } return vals; }
From source file:decisiontree.ID3tree.java
private int calculateSplit(Instances inst) { Instance tempInst;//from w w w. j a v a 2 s. c o m ArrayList<Instance> subset; subset = new ArrayList(); double[] entropy; int numAttr = inst.numAttributes(); entropy = new double[numAttr]; double tempEnt; int numInst = inst.numInstances(); int splitVal = 5; for (int i = 0; i < numInst; i++) { tempInst = inst.instance(i); subset.add(tempInst); } for (int j = 0; j < numAttr - 1; j++) { tempEnt = calculateEntropy(subset, numAttr, j); entropy[j] = tempEnt; } double temp = 5.0; for (int k = 0; k < numAttr - 1; k++) { if (temp > entropy[k]) { temp = entropy[k]; splitVal = k; } } return splitVal; }
From source file:decisiontree.MyC45.java
private Instances handleMissingValues(Instances data) throws Exception { Instances newData = data; Enumeration attrEnum = newData.enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); AttributeStats attrStats = newData.attributeStats(attr.index()); if (attr.isNominal()) { int maxIdx = 0; for (int i = 0; i < attr.numValues(); i++) { if (attrStats.nominalCounts[i] > attrStats.nominalCounts[maxIdx]) { maxIdx = i;//from www. java2 s .c om } } for (int i = 0; i < newData.numInstances(); i++) { if (newData.instance(i).isMissing(attr.index())) { newData.instance(i).setValue(attr.index(), maxIdx); } } } else if (attr.isNumeric()) { double mean = attrStats.numericStats.mean; for (int i = 0; i < newData.numInstances(); i++) { if (newData.instance(i).isMissing(attr.index())) { newData.instance(i).setValue(attr.index(), mean); } } } } return newData; }