List of usage examples for weka.core Instances numInstances
public int numInstances()
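Before the examples taken from real projects, here is a minimal, self-contained sketch of the typical numInstances() loop (the file name "data.arff" is a placeholder for illustration):

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NumInstancesExample {
    public static void main(String[] args) throws Exception {
        // Load a dataset; "data.arff" is an assumed path.
        Instances instances = DataSource.read("data.arff");
        // numInstances() returns the number of rows in the dataset,
        // so it is the standard loop bound for per-instance processing.
        for (int i = 0; i < instances.numInstances(); i++) {
            Instance instance = instances.instance(i);
            System.out.println(instance);
        }
    }
}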
From source file:Clustering.WekaKMeansClustererWrapper.java
public ArrayList<String>[] classify(HashMap<String, List> data, boolean clearData) {
    ArrayList<String>[] clusterResult;
    try {
        File arff = m_ArffExporter.getArff(data);
        int nSize = data.size();
        if (arff == null)
            return null;
        if (clearData)
            data.clear();

        FileInputStream is = new FileInputStream(arff.getAbsolutePath());
        Instances instances = ConverterUtils.DataSource.read(is);
        is.close();

        String[] keys = new String[instances.numInstances()];
        for (int i = 0; i < instances.numInstances(); ++i) {
            Instance instance = instances.instance(i);
            keys[i] = instance.stringValue(0); // assume that the 0th attribute is the key string
        }
        instances.deleteStringAttributes();

        SimpleKMeans cl = new SimpleKMeans();
        int numClusters = m_NumberOfClusters < nSize ? m_NumberOfClusters : nSize;
        String[] options = new String[5];
        options[0] = "-O";
        options[1] = "-N";
        options[2] = Integer.toString(numClusters);
        options[3] = "-A";
        options[4] = m_DistanceFunction;
        cl.setOptions(options);

        //System.out.println( "Clustering" );
        cl.buildClusterer(instances);

        //System.out.println( "Create ArrayList" );
        clusterResult = new ArrayList[m_NumberOfClusters];
        for (int i = 0; i < m_NumberOfClusters; ++i) {
            clusterResult[i] = new ArrayList<>();
        }

        //System.out.println( "Assigning" );
        int[] assignment = cl.getAssignments();
        for (int i = 0; i < assignment.length; ++i) {
            clusterResult[assignment[i]].add(keys[i]);
        }

        //System.out.println( "Done" );
        if (!arff.delete())
            arff.deleteOnExit();
    } catch (Exception ex) {
        //System.out.println( "[EXCEPTION] " + ex.getMessage() );
        m_LastErrorMessage = ex.getMessage();
        return null;
    }
    return clusterResult;
}
From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer
 * that are not being set via options.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder)
        m_Assignments = clusterAssignments;

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder)
        initInstances = new Instances(instances);
    else
        initInstances = instances;

    if (m_initializeWithKMeansPlusPlus) {
        kMeansPlusPlusInit(initInstances);
    } else {
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);

            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            int newC = clusterProcessedInstance(toCluster, false, true);
            if (newC != clusterAssignments[i]) {
                converged = false;
            }
            clusterAssignments[i] = newC;
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                moveCentroid(i, tempI[i], true);
            }
        }

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index++] = tempI[k];
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (m_Iterations == m_MaxIterations)
            converged = true;

        if (!converged) {
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }

    // calculate errors
    if (!m_FastDistanceCalc) {
        for (i = 0; i < instances.numInstances(); i++) {
            clusterProcessedInstance(instances.instance(i), true, false);
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Utils.missingValue();
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }
}
From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java
License:Open Source License
protected void kMeansPlusPlusInit(Instances data) throws Exception {
    Random randomO = new Random(getSeed());
    HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>();

    // choose initial center uniformly at random
    int index = randomO.nextInt(data.numInstances());
    m_ClusterCentroids.add(data.instance(index));
    DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true);
    initC.put(hk, null);

    int iteration = 0;
    int remainingInstances = data.numInstances() - 1;
    if (m_NumClusters > 1) {
        // proceed with selecting the rest

        // distances to the initial randomly chosen center
        double[] distances = new double[data.numInstances()];
        double[] cumProbs = new double[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            distances[i] = m_DistanceFunction.distance(data.instance(i), m_ClusterCentroids.instance(iteration));
        }

        // now choose the remaining cluster centers
        for (int i = 1; i < m_NumClusters; i++) {
            // distances converted to probabilities
            double[] weights = new double[data.numInstances()];
            System.arraycopy(distances, 0, weights, 0, distances.length);
            Utils.normalize(weights);

            double sumOfProbs = 0;
            for (int k = 0; k < data.numInstances(); k++) {
                sumOfProbs += weights[k];
                cumProbs[k] = sumOfProbs;
            }
            cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no rounding issues

            // choose a random instance
            double prob = randomO.nextDouble();
            for (int k = 0; k < cumProbs.length; k++) {
                if (prob < cumProbs[k]) {
                    Instance candidateCenter = data.instance(k);
                    hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true);
                    if (!initC.containsKey(hk)) {
                        initC.put(hk, null);
                        m_ClusterCentroids.add(candidateCenter);
                    } else {
                        // we shouldn't get here because any instance that is a duplicate of
                        // an already chosen cluster center should have zero distance (and hence
                        // zero probability of getting chosen) to that center.
                        System.err.println("We shouldn't get here....");
                    }
                    remainingInstances--;
                    break;
                }
            }
            iteration++;

            if (remainingInstances == 0) {
                break;
            }

            // prepare to choose the next cluster center:
            // check distances against the new cluster center to see if it is closer
            for (int k = 0; k < data.numInstances(); k++) {
                if (distances[k] > 0) {
                    double newDist = m_DistanceFunction.distance(data.instance(k),
                            m_ClusterCentroids.instance(iteration));
                    if (newDist < distances[k]) {
                        distances[k] = newDist;
                    }
                }
            }
        }
    }
}
From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java
License:Open Source License
/**
 * Move the centroid to its new coordinates. Generate the centroid coordinates based
 * on its members (objects assigned to the cluster of the centroid) and the distance
 * function being used.
 *
 * @param centroidIndex index of the centroid whose coordinates will be computed
 * @param members the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {
        // in case of Euclidean distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                sortedMembers.kthSmallestValue(j, middle + 1);
                vals[j] = sortedMembers.instance(middle).value(j);
                if (dataIsEven) {
                    sortedMembers.kthSmallestValue(j, middle + 2);
                    vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo)
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    return vals;
}
From source file:cn.ict.zyq.bestConf.bestConf.BestConf.java
License:Open Source License
public static Instance findBestPerf(Instances data) {
    int idx = data.numAttributes() - 1;
    double bestPerf = data.attributeStats(idx).numericStats.max;
    for (int i = 0; i < data.numInstances(); i++)
        if (data.get(i).value(idx) == bestPerf)
            return data.get(i);
    return null; // should never return null
}
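The == comparison on doubles works here because bestPerf is read back unchanged from the same column it was computed from. If the best value were recomputed rather than copied, a tolerance-based variant would be safer; a minimal sketch, where the EPS constant is an assumption and not part of the original source:

// Hypothetical tolerance-based variant of findBestPerf.
private static final double EPS = 1e-9; // assumed tolerance

public static Instance findBestPerfTolerant(Instances data) {
    int idx = data.numAttributes() - 1;
    double bestPerf = data.attributeStats(idx).numericStats.max;
    for (int i = 0; i < data.numInstances(); i++)
        if (Math.abs(data.get(i).value(idx) - bestPerf) < EPS)
            return data.get(i);
    return null;
}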
From source file:cn.ict.zyq.bestConf.bestConf.BestConf.java
License:Open Source License
public static int findBestPerfIndex(Instances data) {
    int idx = data.numAttributes() - 1;
    double bestPerf = data.attributeStats(idx).numericStats.max;
    for (int i = 0; i < data.numInstances(); i++)
        if (data.get(i).value(idx) == bestPerf)
            return i;
    return -1; // should never return -1
}
From source file:cn.ict.zyq.bestConf.bestConf.BestConf.java
License:Open Source License
public static void testCOMT2() throws Exception {
    BestConf bestconf = new BestConf();
    Instances trainingSet = DataIOFile.loadDataFromArffFile("data/trainingBestConf0.arff");
    trainingSet.setClassIndex(trainingSet.numAttributes() - 1);

    Instances samplePoints = LHSInitializer.getMultiDimContinuous(bestconf.getAttributes(),
            InitialSampleSetSize, false);
    samplePoints.insertAttributeAt(trainingSet.classAttribute(), samplePoints.numAttributes());
    samplePoints.setClassIndex(samplePoints.numAttributes() - 1);

    COMT2 comt = new COMT2(samplePoints, COMT2Iteration);

    comt.buildClassifier(trainingSet);

    Evaluation eval = new Evaluation(trainingSet);
    eval.evaluateModel(comt, trainingSet);
    System.err.println(eval.toSummaryString());

    Instance best = comt.getInstanceWithPossibleMaxY(samplePoints.firstInstance());
    Instances bestInstances = new Instances(trainingSet, 2);
    bestInstances.add(best);
    DataIOFile.saveDataToXrffFile("data/trainingBestConf_COMT2.arff", bestInstances);

    // now we output the training set with the class value updated as the predicted value
    Instances output = new Instances(trainingSet, trainingSet.numInstances());
    Enumeration<Instance> enu = trainingSet.enumerateInstances();
    while (enu.hasMoreElements()) {
        Instance ins = enu.nextElement();
        double[] values = ins.toDoubleArray();
        values[values.length - 1] = comt.classifyInstance(ins);
        output.add(ins.copy(values));
    }
    DataIOFile.saveDataToXrffFile("data/trainingBestConf0_predict.xrff", output);
}
From source file:cn.ict.zyq.bestConf.cluster.Main.AutoTestAdjust.java
License:Open Source License
public Instances runExp(Instances samplePoints, String perfAttName) {
    Instances retVal = null;
    if (samplePoints.attribute(perfAttName) == null) {
        Attribute performance = new Attribute(perfAttName);
        samplePoints.insertAttributeAt(performance, samplePoints.numAttributes());
    }
    int pos = samplePoints.numInstances();
    int count = 0;
    for (int i = 0; i < pos; i++) {
        Instance ins = samplePoints.get(i);
        HashMap hm = new HashMap();
        int tot = 0;
        for (int j = 0; j < ins.numAttributes(); j++) {
            hm.put(ins.attribute(j).name(), ins.value(ins.attribute(j)));
        }

        boolean testRet;
        if (Double.isNaN(ins.value(ins.attribute(ins.numAttributes() - 1)))) {
            testRet = this.startTest(hm, i, isInterrupt);
            double y = 0;
            if (!testRet) { // the setting does not work, we skip it
                y = -1;
                count++;
                if (count >= targetTestErrorNum) {
                    System.out.println(
                            "There must be something wrong with the system. Please check and restart.....");
                    System.exit(1);
                }
            } else {
                y = getPerformanceByType(performanceType);
                count = 0;
            }

            ins.setValue(samplePoints.numAttributes() - 1, y);
            writePerfstoFile(ins);
        } else {
            continue;
        }
    }
    retVal = samplePoints;
    retVal.setClassIndex(retVal.numAttributes() - 1);
    return retVal;
}
From source file:com.actelion.research.orbit.imageAnalysis.imaging.TMAPoints.java
License:Open Source License
private int guessNumClusters(EM clusterer, Instances instances, int start, int end) throws Exception {
    ClusterEvaluation eval = new ClusterEvaluation();
    int bestNum = start;
    double best = Double.POSITIVE_INFINITY;
    double bic;
    for (int c = start; c <= end; c++) {
        clusterer.setNumClusters(c);
        clusterer.buildClusterer(instances);
        eval.setClusterer(clusterer);
        eval.evaluateClusterer(instances);
        bic = bic(eval.getLogLikelihood(), c, instances.numInstances());
        logger.trace("numCluster " + c + " -> BIC: " + bic);
        if (bic < best) {
            best = bic;
            bestNum = c;
            logger.trace("bestNum: " + bestNum);
        }
    }
    return bestNum;
}
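The bic(...) helper is not shown in this snippet. A minimal sketch under the textbook definition BIC = k*ln(n) - 2*ln(L), using the cluster count as a stand-in for the free-parameter count k (an assumption; the real source may count mixture parameters differently, and Weka's ClusterEvaluation reports a per-instance log likelihood that a faithful reconstruction might first scale by n):

// Hypothetical reconstruction of the bic(...) helper; lower is better,
// matching the "bic < best" minimization in guessNumClusters above.
private double bic(double logLikelihood, int numClusters, int numInstances) {
    // BIC = k * ln(n) - 2 * ln(L); k is approximated here by the cluster count.
    return numClusters * Math.log(numInstances) - 2.0 * logLikelihood;
}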
From source file:com.github.r351574nc3.amex.assignment2.App.java
License:Open Source License
/**
 * Labels a data set using a previously trained and evaluated model.
 *
 * @param inputName path to the unlabeled data set to load
 * @param outputName path to the file where results will be stored
 */
public void predict(final String inputName, final String outputName) throws Exception {
    final Instances input = load(inputName);
    final Instances labeled = new Instances(input);

    for (int i = 0; i < input.numInstances(); i++) {
        final Double clsLabel = getClassifier().classifyInstance(input.instance(i));
        labeled.instance(i).setClassValue(clsLabel);
    }

    boolean isLocal = true;
    if (outputName.contains(File.separator)) {
        isLocal = false;
    }
    final File pwd = isLocal ? new File(System.getProperty(USER_DIR_KEY))
            : new File(outputName).getParentFile();
    if (pwd.exists() && pwd.isDirectory()) {
        DataSink.write(outputName, labeled);
    } else {
        throw new FileNotFoundException("Cannot write to " + outputName);
    }
}
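The load(...) helper is not shown in this snippet. A plausible sketch, assuming it mirrors how the other examples on this page read data files via ConverterUtils.DataSource and that the class attribute is the last one:

// Hypothetical load(...) helper; DataSource.read is the standard Weka entry point.
private Instances load(final String name) throws Exception {
    final Instances data = ConverterUtils.DataSource.read(name);
    if (data.classIndex() == -1) {
        data.setClassIndex(data.numAttributes() - 1); // assumption: class is the last attribute
    }
    return data;
}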