List of usage examples for weka.core.Instances
public Instances(Instances dataset)
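The copy constructor duplicates the dataset header and all of its instances. Because Weka's DenseInstance deep-copies its value array when setValue is called, edits to the copy leave the original data untouched, which is exactly what several of the examples below rely on. A minimal, self-contained sketch (the dataset name and attribute are invented for illustration; assumes Weka 3.7+):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class CopyConstructorDemo {
    public static void main(String[] args) {
        // Tiny one-attribute dataset, made up for this demo.
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("x"));
        Instances original = new Instances("demo", atts, 1);
        original.add(new DenseInstance(1.0, new double[] { 1.0 }));

        // Copy and modify; the original keeps its value.
        Instances copy = new Instances(original);
        copy.get(0).setValue(0, 42.0);

        System.out.println(original.get(0).value(0)); // 1.0
        System.out.println(copy.get(0).value(0));     // 42.0
    }
}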
From source file:de.ugoe.cs.cpdp.util.WekaUtils.java
License:Apache License
/**
 * <p>
 * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
 * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
 * </p>
 *
 * @param traindata
 *            data from which the attribute is upscaled.
 * @param attributeIndex
 *            index of the attribute
 * @return data with upscaled attribute
 */
public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
    Instances traindataCopy = new Instances(traindata);
    for (int i = 0; i < traindata.size(); i++) {
        traindataCopy.get(i).setValue(attributeIndex,
                traindata.get(i).value(attributeIndex) * SCALER);
    }
    return traindataCopy;
}
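A hypothetical call site for the helper above. SCALER is a constant of WekaUtils that the excerpt does not show, so its value is an assumption:

// Upscale attribute 3; traindata itself is unchanged, because
// upscaleAttribute works on the copy created via new Instances(traindata).
Instances scaled = WekaUtils.upscaleAttribute(traindata, 3);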
From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java
License:Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    Instances centroids = clusterer.getClusterCentroids();

    //        Add addFilter = new Add();
    //        addFilter.setAttributeIndex(new Integer(numTestLabels + i + 1).toString());
    //        addFilter.setNominalLabels("0,1");
    //        addFilter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
    //        addFilter.setInputFormat(testData);

    trainData.clear();

    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();

        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();

            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }

        // add selected instance to instances
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath()
            + "/" + ARFF_FILENAME, trainData);
}
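The snippet calls a distance(...) helper that is not part of the excerpt. A plausible sketch (an assumption, not the project's actual implementation) delegates to weka.core.EuclideanDistance, taking the attribute ranges from the dataset the training instance belongs to:

// Assumed helper: Euclidean distance between a (virtual) centroid and a
// real training instance.
private double distance(Instance centroidInstance, Instance trainInstance) {
    weka.core.EuclideanDistance d = new weka.core.EuclideanDistance();
    d.setInstances(trainInstance.dataset()); // header and ranges from the training data
    return d.distance(centroidInstance, trainInstance);
}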
From source file:de.unidue.langtech.grading.tc.ClusteringTask.java
License:Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments =
            new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {

            // get instance ID from instance
            Instance instance = copyTrainData.get(offset);

            Double classOffset = new Double(instance.value(copyTrainData.classAttribute()));
            String label = (String) trainOutcomeValues.get(classOffset.intValue());

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(
                clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString);
    }
    System.out.println();
}
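getClusterMap(...) is used here and in the next example but is not included in the excerpts. A plausible sketch (assumed, not the project's code) assigns each instance offset to the cluster the trained clusterer predicts for it:

// Assumed helper: maps clusterId -> set of instance offsets in the ARFF.
private Map<Integer, Set<Integer>> getClusterMap(Instances data, Clusterer clusterer) throws Exception {
    Map<Integer, Set<Integer>> clusterMap = new HashMap<Integer, Set<Integer>>();
    for (int offset = 0; offset < data.numInstances(); offset++) {
        int clusterId = clusterer.clusterInstance(data.instance(offset));
        if (!clusterMap.containsKey(clusterId)) {
            clusterMap.put(clusterId, new HashSet<Integer>());
        }
        clusterMap.get(clusterId).add(offset);
    }
    return clusterMap;
}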
From source file:de.unidue.langtech.grading.tc.ClusterTrainTask.java
License:Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath()
                    + "/" + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd =
            getClusterCfd(clusterMap, copyTrainData, trainOutcomeValues);

    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // attention - cannot simply use RMSE here - as smaller values are better unlike with purity
        // double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster
    double avgPurity = 0.0;
    int n = 0;

    for (Integer clusterId : sortedClusters.keySet()) {
        // we need to take as many clusters until we have seen at least each class once
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //   // do not use clusters of single responses, as they always have purity of 1
        //   if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //       continue;
        //   }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath()
            + "/" + ARFF_FILENAME, copyTrainData);
}
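The ValueComparator that orders the TreeMap is not shown either. A common implementation of this sort-a-map-by-value pattern (an assumption about the project's code) looks like the sketch below; note that it never returns 0, because a TreeMap silently drops keys its comparator considers equal:

import java.util.Comparator;
import java.util.Map;

class ValueComparator implements Comparator<Integer> {
    private final Map<Integer, Double> base;

    ValueComparator(Map<Integer, Double> base) {
        this.base = base;
    }

    // Orders cluster IDs by descending score. This comparator is inconsistent
    // with equals, so the resulting TreeMap is only good for iteration, not lookups.
    @Override
    public int compare(Integer a, Integer b) {
        return base.get(a) >= base.get(b) ? -1 : 1;
    }
}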
From source file:de.unidue.langtech.grading.tc.LearningCurveTask.java
License:Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    boolean multiLabel = false;

    for (Integer numberInstances : NUMBER_OF_TRAINING_INSTANCES) {
        for (int iteration = 0; iteration < ITERATIONS; iteration++) {
            File arffFileTrain = new File(
                    aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY)
                            .getPath() + "/" + TRAINING_DATA_FILENAME);
            File arffFileTest = new File(
                    aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TEST_DATA, AccessMode.READONLY).getPath()
                            + "/" + TRAINING_DATA_FILENAME);

            Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);
            Instances testData = TaskUtils.getInstances(arffFileTest, multiLabel);

            if (numberInstances > trainData.size()) {
                continue;
            }

            Classifier cl = AbstractClassifier.forName(classificationArguments.get(0),
                    classificationArguments.subList(1, classificationArguments.size()).toArray(new String[0]));

            Instances copyTestData = new Instances(testData);
            trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
            testData = WekaUtils.removeOutcomeId(testData, multiLabel);

            Random generator = new Random();
            generator.setSeed(System.nanoTime());

            trainData.randomize(generator);

            // remove fraction of training data that should not be used for training
            for (int i = trainData.size() - 1; i >= numberInstances; i--) {
                trainData.delete(i);
            }

            // file to hold prediction results
            File evalOutput = new File(
                    aContext.getStorageLocation(TEST_TASK_OUTPUT_KEY, AccessMode.READWRITE).getPath() + "/"
                            + EVALUATION_DATA_FILENAME + "_" + numberInstances + "_" + iteration);

            // train the classifier on the train set split - not necessary in multilabel setup, but
            // in single label setup
            cl.buildClassifier(trainData);

            weka.core.SerializationHelper.write(evalOutput.getAbsolutePath(),
                    WekaUtils.getEvaluationSinglelabel(cl, trainData, testData));
            testData = WekaUtils.getPredictionInstancesSingleLabel(testData, cl);
            testData = WekaUtils.addOutcomeId(testData, copyTestData, false);

            //   // Write out the predictions
            //   DataSink.write(aContext.getStorageLocation(TEST_TASK_OUTPUT_KEY, AccessMode.READWRITE)
            //           .getAbsolutePath() + "/" + PREDICTIONS_FILENAME + "_" + trainPercent, testData);
        }
    }
}
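Deleting instances one by one from the end works, but Instances also offers a copy-range constructor, Instances(Instances source, int first, int toCopy), that trims in a single step. A sketch of the equivalent truncation:

// After randomizing, keep only the first numberInstances rows.
trainData.randomize(generator);
trainData = new Instances(trainData, 0, numberInstances);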
From source file:de.uniheidelberg.cl.swp.mlprocess.AblationTesting.java
License:Apache License
/**
 * Creates an Instance object for the specified List of Features.
 * <br>
 * Extracts the Instance objects from a source file and suppresses all features but the ones
 * specified.
 *
 * @param fileName File to the training results in ARFF format.
 * @param features List of {@link AbstractFeatureExtractor}s which are currently being tested.
 * @return Instances object consisting of the desired attribute structure.
 * @throws Exception If the ARFF file couldn't be read, an exception is thrown.
 */
public Instances createInstances(String fileName, List<AbstractFeatureExtractor> features) throws Exception {
    final Instances train = new Instances(new BufferedReader(new FileReader(fileName)));

    ArrayList<Attribute> newAttributes = new ArrayList<Attribute>();

    for (int i = 0; i < train.numAttributes(); i++) {
        for (AbstractFeatureExtractor feature : features) {
            if (train.attribute(i).name().equals(feature.getName())) {
                newAttributes.add(train.attribute(i));
                continue;
            }
        }
    }
    /*
     * add the last two features (ACR-System + correct/false predictions) as those
     * are no features gathered by a FeatureExtractor.
     */
    newAttributes.add(train.attribute(train.numAttributes() - 2));
    newAttributes.add(train.attribute(train.numAttributes() - 1));

    Instances trainCopy = copyInstances(train, newAttributes);
    trainCopy.setClassIndex(trainCopy.numAttributes() - 1);
    return trainCopy;
}
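copyInstances(train, newAttributes) is not included in the excerpt. A hypothetical sketch that projects the data onto the selected attributes, matched by name (assumptions: every selected attribute exists in train, attributes are numeric or nominal, and the "ablation" relation name is invented):

// Assumed helper: copies train into a new dataset restricted to newAttributes.
// Attribute copies are used so the new header does not re-index train's attributes.
private Instances copyInstances(Instances train, ArrayList<Attribute> newAttributes) {
    ArrayList<Attribute> header = new ArrayList<Attribute>();
    for (Attribute a : newAttributes) {
        header.add((Attribute) a.copy());
    }
    Instances result = new Instances("ablation", header, train.numInstances());
    for (int i = 0; i < train.numInstances(); i++) {
        double[] vals = new double[newAttributes.size()];
        for (int j = 0; j < newAttributes.size(); j++) {
            vals[j] = train.instance(i).value(train.attribute(newAttributes.get(j).name()));
        }
        result.add(new DenseInstance(1.0, vals));
    }
    return result;
}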
From source file:de.uniheidelberg.cl.swp.mlprocess.WEKARunner.java
License:Apache License
/**
 * Creates a WEKA interface with a local ARFF file for training.
 *
 * @param trainArff Local ARFF file for training.
 * @throws Exception If WEKA couldn't be initialized.
 */
public WEKARunner(String trainArff) throws Exception {
    train = new Instances(new BufferedReader(new FileReader(trainArff)));
    train.setClassIndex(train.numAttributes() - 1);
}
From source file:de.uniheidelberg.cl.swp.mlprocess.WEKARunner.java
License:Apache License
/**
 * Evaluates our classifier with a test set.
 * <br>
 * Not used yet.
 *
 * @param testArff ARFF file to evaluate against.
 * @throws Exception If the evaluation couldn't be initialized.
 */
public void buildEvaluation(String testArff) throws Exception {
    Instances evalIns = new Instances(new BufferedReader(new FileReader(testArff)));
    evalIns.setClassIndex(evalIns.numAttributes() - 1);
    evaluation = new Evaluation(train);
}
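As the comment notes, this method is unused, and the excerpt never feeds evalIns into the Evaluation object. The standard Weka continuation (a sketch, assuming the class's classifier field has been trained) would be:

// Evaluate the trained classifier on the held-out set and print a summary.
evaluation.evaluateModel(classifier, evalIns);
System.out.println(evaluation.toSummaryString());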
From source file:de.uniheidelberg.cl.swp.mlprocess.WEKARunner.java
License:Apache License
/**
 * Predicts unknown labels of an Instances.
 *
 * @param unkIns Instances with unknown attributes.
 * @return Instances with the formerly unknown instances, now labeled.
 * @throws Exception If the Instances couldn't be labeled.
 */
public Instances labelUnknownInstances(Instances unkIns) throws Exception {
    Instances testcpy = new Instances(unkIns);

    for (int i = 0; i < unkIns.numInstances(); i++) {
        double clsLabel = classifier.classifyInstance(unkIns.instance(i));
        testcpy.instance(i).setClassValue(clsLabel);
    }
    return testcpy;
}
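A hypothetical end-to-end use of WEKARunner, assuming the class also trains its classifier field on train (that step is not part of these excerpts) and that the file names are placeholders:

WEKARunner runner = new WEKARunner("train.arff"); // hypothetical path
Instances unlabeled = new Instances(new BufferedReader(new FileReader("unlabeled.arff"))); // hypothetical path
unlabeled.setClassIndex(unlabeled.numAttributes() - 1);
Instances labeled = runner.labelUnknownInstances(unlabeled);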
From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder) {
        m_Assignments = clusterAssignments;
    }

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder) {
        initInstances = new Instances(instances);
    } else {
        initInstances = instances;
    }

    for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
        instIndex = RandomO.nextInt(j + 1);
        hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
        if (!initC.containsKey(hk)) {
            m_ClusterCentroids.add(initInstances.instance(instIndex));
            initC.put(hk, null);
        }
        initInstances.swap(j, instIndex);

        if (m_ClusterCentroids.numInstances() == m_NumClusters) {
            break;
        }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            int newC = clusterProcessedInstance(toCluster, true);
            if (newC != clusterAssignments[i]) {
                converged = false;
            }
            clusterAssignments[i] = newC;
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                moveCentroid(i, tempI[i], true);
            }
        }

        if (m_Iterations == m_MaxIterations) {
            converged = true;
        }

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index] = tempI[k];
                        for (i = 0; i < tempI[k].numAttributes(); i++) {
                            m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
                        }
                        index++;
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            m_squaredErrors = new double[m_NumClusters];
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Instance.missingValue();
                }
            }
            m_ClusterStdDevs.add(new Instance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }

    // Save memory!!
    m_DistanceFunction.clean();
}
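This last example targets the old Weka 3.6 API, where Instance was a concrete class. On Weka 3.7 and later, Instance is an interface, so the standard-deviation block near the end would instead read (a sketch of the API migration, not part of the original source):

// Weka >= 3.7 equivalents of the 3.6 calls used above:
vals2[j] = weka.core.Utils.missingValue();                     // was: Instance.missingValue()
m_ClusterStdDevs.add(new weka.core.DenseInstance(1.0, vals2)); // was: new Instance(1.0, vals2)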