Example usage for weka.clusterers AbstractClusterer forName

List of usage examples for weka.clusterers AbstractClusterer forName

Introduction

In this page you can find the example usage for weka.clusterers AbstractClusterer forName.

Prototype

public static Clusterer forName(String clustererName, String[] options) throws Exception 

Source Link

Document

Creates a new instance of a clusterer given its class name and (optional) arguments to pass to its setOptions method.

Usage

From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java

License:Open Source License

/**
 * Parses the options for this object.
 * <p/>
 * 
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 * -output-debug-info
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 * </pre>
 * 
 * <pre>
 * -W &lt;clusterer&gt;
 *  Full name of clusterer.
 *  (default: weka.clusterers.SimpleKMeans)
 * </pre>
 *
 * <pre>
 * -label-all-clusters
 *  If set, all clusters are labeled probabilistically instead of just those ones that best fit a class
 * </pre>
 *
 * <pre>
 * Options specific to clusterer weka.clusterers.SimpleKMeans:
 * </pre>
 * 
 * <pre>
 * -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).
 * </pre>
 * 
 * <pre>
 * -V
 *  Display std. deviations for centroids.
 * </pre>
 * 
 * <pre>
 * -M
 *  Replace missing values with mean/mode.
 * </pre>
 * 
 * <pre>
 * -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * @param options the options to use
 * @throws Exception if setting of options fails
 */
@Override
public void setOptions(String[] options) throws Exception {

    setLabelAllClusters(Utils.getFlag("label-all-clusters", options));
    String tmpStr = Utils.getOption('W', options);
    if (tmpStr.length() > 0) {
        // The first call deliberately instantiates the clusterer with default
        // options, presumably so that a valid clusterer is in place even if
        // parsing the clusterer-specific options below fails (this mirrors the
        // pattern in Weka's own ClassificationViaClustering); the second call
        // then applies the partitioned (post "--") options.
        setClusterer(AbstractClusterer.forName(tmpStr, null));
        setClusterer(AbstractClusterer.forName(tmpStr, Utils.partitionOptions(options)));
    } else {
        // No -W given: fall back to the default clusterer, using the same
        // two-step instantiation pattern as above.
        setClusterer(AbstractClusterer.forName(defaultClustererString(), null));
        setClusterer(AbstractClusterer.forName(defaultClustererString(), Utils.partitionOptions(options)));
    }

    super.setOptions(options);

    // Fail loudly if any options were not consumed.
    Utils.checkForRemainingOptions(options);
}

From source file:core.ClusterEvaluationEX.java

License:Open Source License

/**
 * Command-line entry point for testing this class: the first argument names
 * the clusterer class to instantiate, the remaining arguments are passed to
 * the evaluation.
 *
 * @param args the options; args[0] must be a clusterer class name
 */
public static void main(String[] args) {
    try {
        if (args.length == 0) {
            throw new Exception("The first argument must be the name of a clusterer");
        }

        // Consume the clusterer name, then blank it out so it is not
        // re-interpreted as an evaluation option.
        String clustererName = args[0];
        args[0] = "";

        Clusterer clusterer = AbstractClusterer.forName(clustererName, null);
        System.out.println(evaluateClusterer(clusterer, args));
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}

From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java

License:Open Source License

// Builds a clusterer on the training data, then replaces the training set with
// one exemplar instance per cluster (the real instance closest to each centroid).
// NOTE(review): hard-wired to SimpleKMeans via the cast below.
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    // Locate the ARFF training file produced by the upstream task.
    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // Instantiate the clusterer: first argument is the class name, the rest
    // are forwarded as its options.
    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);
    Instances centroids = clusterer.getClusterCentroids();

    //        Add addFilter = new Add();
    //        addFilter.setAttributeIndex(new Integer(numTestLabels + i + 1).toString());
    //        addFilter.setNominalLabels("0,1");
    //        addFilter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
    //        addFilter.setInputFormat(testData);

    // Empty out trainData; it will be refilled with one exemplar per centroid.
    trainData.clear();

    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();

        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        // (linear scan over all training instances per centroid)
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();

            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }

        // add selected instance to instances
        // (offsets in clusterTrainData and copyTrainData line up because the
        // Remove filter preserves instance order)
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}

From source file:de.unidue.langtech.grading.tc.ClusteringTask.java

License:Open Source License

/**
 * Clusters the (class-free) training data and prints diagnostics: for every
 * cluster, the gold label and text of each member, followed by per-cluster
 * size, purity, and RMSE.
 *
 * @param aContext the task context providing access to the stored training data
 * @throws Exception if reading, filtering, or clustering fails
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    // Locate the ARFF training file produced by the upstream task.
    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    // Instantiate the clusterer: first argument is the class name, the rest
    // are forwarded as its options.
    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments = new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {

            // get instance ID from instance
            Instance instance = copyTrainData.get(offset);

            // The class value is a nominal index stored as a double; cast it
            // directly instead of boxing through the deprecated Double ctor.
            int classOffset = (int) instance.value(copyTrainData.classAttribute());
            String label = trainOutcomeValues.get(classOffset);

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    // Per-cluster statistics: purity = share of the majority label.
    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(
                clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString);
    }
    System.out.println();
}

From source file:de.unidue.langtech.grading.tc.ClusterTrainTask.java

License:Open Source License

// Clusters the training data, relabels each instance with the most frequent
// gold class of its cluster (taking the purest clusters first), and writes the
// relabeled data out for the downstream test task.
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    // Locate the ARFF training file produced by the upstream task.
    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    // Instantiate the clusterer: first argument is the class name, the rest
    // are forwarded as its options.
    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd = getClusterCfd(clusterMap, copyTrainData,
            trainOutcomeValues);

    // Per cluster: the majority class and a purity score in (0, 1].
    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // attention - cannot simply use RMSE here - as smaller values are better unlike with purity
        //           double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster

    double avgPurity = 0.0;
    int n = 0;
    for (Integer clusterId : sortedClusters.keySet()) {
        // we need to take as many clusters until we have seen at least each class once
        // (trainOutcomeValues shrinks below as majority classes are removed)
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //           // do not use clusters of single responses, as they always have purity of 1
        //           if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //              continue;
        //           }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        // Overwrite each member's class with the cluster's majority class.
        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    // NOTE(review): if the loop body never runs (n == 0) this is 0/0 -> NaN.
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, copyTrainData);
}