List of usage examples for weka.clusterers AbstractClusterer forName
public static Clusterer forName(String clustererName, String[] options) throws Exception
From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java
License:Open Source License
/**
 * Parses the options for this object.
 * <p/>
 *
 * <!-- options-start --> Valid options are:
 * <p/>
 *
 * <pre> -output-debug-info
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console</pre>
 *
 * <pre> -W &lt;clusterer&gt;
 *  Full name of clusterer.
 *  (default: weka.clusterers.SimpleKMeans)</pre>
 *
 * <pre> -label-all-clusters
 *  If set, all clusters are labeled probabilistically instead of just
 *  those ones that best fit a class</pre>
 *
 * <pre> Options specific to clusterer weka.clusterers.SimpleKMeans:</pre>
 *
 * <pre> -N &lt;num&gt;  number of clusters. (default 2).</pre>
 * <pre> -V  Display std. deviations for centroids.</pre>
 * <pre> -M  Replace missing values with mean/mode.</pre>
 * <pre> -S &lt;num&gt;  Random number seed. (default 10)</pre>
 *
 * <!-- options-end -->
 *
 * @param options the options to use
 * @throws Exception if setting of options fails
 */
@Override
public void setOptions(String[] options) throws Exception {
    setLabelAllClusters(Utils.getFlag("label-all-clusters", options));

    // -W selects the clusterer class; anything after "--" is forwarded to it.
    String tmpStr = Utils.getOption('W', options);
    String clustererName = (tmpStr.length() > 0) ? tmpStr : defaultClustererString();

    // FIX: the original constructed the clusterer twice (first with null
    // options, then again with the partitioned options) and immediately
    // discarded the first instance. Only the effective call is kept.
    setClusterer(AbstractClusterer.forName(clustererName, Utils.partitionOptions(options)));

    super.setOptions(options);
    Utils.checkForRemainingOptions(options);
}
From source file:core.ClusterEvaluationEX.java
License:Open Source License
/**
 * Main method for testing this class from the command line.
 *
 * <p>The first argument must be the fully-qualified clusterer class name;
 * it is blanked out afterwards so the remaining elements are treated as
 * evaluation options.
 *
 * @param args the options; {@code args[0]} is the clusterer class name
 */
public static void main(String[] args) {
    try {
        if (args.length == 0) {
            throw new Exception("The first argument must be the name of a " + "clusterer");
        }
        String clustererString = args[0];
        args[0] = ""; // consume the class name so it is not parsed as an option
        Clusterer newClusterer = AbstractClusterer.forName(clustererString, null);
        System.out.println(evaluateClusterer(newClusterer, args));
    } catch (Exception e) {
        // FIX: e.getMessage() can be null (e.g. NullPointerException), which
        // printed the literal "null" with no clue what failed. Fall back to
        // the exception's toString(), which always includes the class name.
        String msg = e.getMessage();
        System.out.println(msg != null ? msg : e.toString());
    }
}
From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java
License:Open Source License
/**
 * Replaces the training data with one exemplar per cluster: runs
 * SimpleKMeans on the class-free training instances, then for each centroid
 * keeps the single closest real training instance, and writes the reduced
 * set out for the downstream test task.
 *
 * @param aContext task context providing input/output storage locations
 * @throws Exception if reading, clustering, or writing fails
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // First clustering argument is the clusterer class name, the rest are its options.
    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    // NOTE(review): this cast fails with a ClassCastException if any clusterer
    // other than SimpleKMeans is configured — getClusterCentroids() below is
    // SimpleKMeans-specific.
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    Instances centroids = clusterer.getClusterCentroids();

    //        Add addFilter = new Add();
    //        addFilter.setAttributeIndex(new Integer(numTestLabels + i + 1).toString());
    //        addFilter.setNominalLabels("0,1");
    //        addFilter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
    //        addFilter.setInputFormat(testData);

    // trainData is emptied and refilled below with one exemplar per centroid;
    // copyTrainData retains the originals (including class values) for lookup.
    trainData.clear();

    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();

        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        // Linear scan over all training instances per centroid; offsets in
        // clusterTrainData line up with copyTrainData because Remove only
        // drops an attribute, never reorders instances.
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();

            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }

        // add selected instance to instances
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}
From source file:de.unidue.langtech.grading.tc.ClusteringTask.java
License:Open Source License
@Override public void execute(TaskContext aContext) throws Exception { if (learningMode.equals(Constants.LM_MULTI_LABEL)) { throw new IllegalArgumentException("Cannot use multi-label setup in clustering."); }// ww w . j av a 2s. c o m boolean multiLabel = false; File arffFileTrain = new File( aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/" + TRAINING_DATA_FILENAME); Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel); // get number of outcomes List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel); Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0), clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0])); Instances copyTrainData = new Instances(trainData); trainData = WekaUtils.removeOutcomeId(trainData, multiLabel); // generate data for clusterer (w/o class) Remove filter = new Remove(); filter.setAttributeIndices("" + (trainData.classIndex() + 1)); filter.setInputFormat(trainData); Instances clusterTrainData = Filter.useFilter(trainData, filter); clusterer.buildClusterer(clusterTrainData); // get a mapping from clusterIDs to instance offsets in the ARFF Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer); Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext); ConditionalFrequencyDistribution<Integer, String> clusterAssignments = new ConditionalFrequencyDistribution<Integer, String>(); for (Integer clusterId : clusterMap.keySet()) { System.out.println("CLUSTER: " + clusterId); for (Integer offset : clusterMap.get(clusterId)) { // get instance ID from instance Instance instance = copyTrainData.get(offset); Double classOffset = new Double(instance.value(copyTrainData.classAttribute())); String label = (String) trainOutcomeValues.get(classOffset.intValue()); clusterAssignments.addSample(clusterId, label); String instanceId = instance 
.stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index()); System.out.println(label + "\t" + instanceId2TextMap.get(instanceId)); } System.out.println(); } System.out.println("ID\tSIZE\tPURITY\tRMSE"); for (Integer clusterId : clusterMap.keySet()) { FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId); double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN(); String purityString = String.format("%.2f", purity); double rmse = getRMSE(fd, trainOutcomeValues); String rmseString = String.format("%.2f", rmse); System.out.println( clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString); } System.out.println(); }
From source file:de.unidue.langtech.grading.tc.ClusterTrainTask.java
License:Open Source License
/**
 * Relabels training instances with the majority class of their cluster:
 * clusters the class-free training data, scores each cluster by purity,
 * walks clusters from the best score down, overwrites each member's class
 * with the cluster's most frequent label, and writes the relabeled data
 * out for the downstream test task.
 *
 * @param aContext task context providing input/output storage locations
 * @throws Exception if reading, clustering, or writing fails
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    // First clustering argument is the clusterer class name, the rest are its options.
    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // copyTrainData keeps the class values and is the set that gets relabeled
    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd = getClusterCfd(clusterMap, copyTrainData,
            trainOutcomeValues);

    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // attention - cannot simply use RMSE here - as smaller values are better unlike with purity
        //            double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    // NOTE(review): a TreeMap ordered by a comparator over VALUES treats keys
    // with equal scores as duplicates unless ValueComparator breaks ties by
    // key — if it doesn't, putAll silently drops clusters with equal purity.
    // Verify ValueComparator's tie-breaking behavior.
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster

    double avgPurity = 0.0;
    int n = 0;

    for (Integer clusterId : sortedClusters.keySet()) {
        // we need to take as many clusters until we have seen at least each class once
        // (trainOutcomeValues shrinks via remove() below, so it acts as the
        // set of classes not yet covered)
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //            // do not use clusters of single responses, as they always have purity of 1
        //            if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //                continue;
        //            }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    // NOTE(review): if the loop body never ran (n == 0) this divides by zero
    // and prints NaN — harmless for the log line, but worth guarding.
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, copyTrainData);
}