Example usage for weka.clusterers AbstractClusterer forName

List of usage examples for weka.clusterers AbstractClusterer forName

Introduction

In this page you can find the example usage for weka.clusterers AbstractClusterer forName.

Prototype

public static Clusterer forName(String clustererName, String[] options) throws Exception 

Source Link

Document

Creates a new instance of a clusterer given its class name and (optional) arguments to pass to its setOptions method.

Usage

From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java

License:Open Source License

/**
 * Parses the options for this object.
 * <p/>
 * 
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 * -output-debug-info
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 * </pre>
 * 
 * <pre>
 * -W &lt;clusterer&gt;
 *  Full name of clusterer.
 *  (default: weka.clusterers.SimpleKMeans)
 * </pre>
 *
 * <pre>
 * -label-all-clusters
 *  If set, all clusters are labeled probabilistically instead of just those ones that best fit a class
 * </pre>
 *
 * <pre>
 * Options specific to clusterer weka.clusterers.SimpleKMeans:
 * </pre>
 * 
 * <pre>
 * -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).
 * </pre>
 * 
 * <pre>
 * -V
 *  Display std. deviations for centroids.
 * </pre>
 * 
 * <pre>
 * -M
 *  Replace missing values with mean/mode.
 * </pre>
 * 
 * <pre>
 * -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * @param options the options to use
 * @throws Exception if setting of options fails
 */
@Override
public void setOptions(String[] options) throws Exception {

    setLabelAllClusters(Utils.getFlag("label-all-clusters", options));
    String tmpStr = Utils.getOption('W', options);
    if (tmpStr.length() > 0) {
        // The first call deliberately instantiates the clusterer with default
        // options, presumably so that a valid clusterer is in place even if
        // parsing the clusterer-specific options below fails (this mirrors the
        // pattern in Weka's own ClassificationViaClustering); the second call
        // then applies the partitioned (post "--") options.
        setClusterer(AbstractClusterer.forName(tmpStr, null));
        setClusterer(AbstractClusterer.forName(tmpStr, Utils.partitionOptions(options)));
    } else {
        // No -W given: fall back to the default clusterer, using the same
        // two-step instantiation pattern as above.
        setClusterer(AbstractClusterer.forName(defaultClustererString(), null));
        setClusterer(AbstractClusterer.forName(defaultClustererString(), Utils.partitionOptions(options)));
    }

    super.setOptions(options);

    // Fail loudly if any options were not consumed.
    Utils.checkForRemainingOptions(options);
}

From source file:core.ClusterEvaluationEX.java

License:Open Source License

/**
 * Command-line entry point for testing this class: the first argument names
 * the clusterer class to instantiate, the remaining arguments are passed to
 * the evaluation.
 *
 * @param args the options; args[0] must be a clusterer class name
 */
public static void main(String[] args) {
    try {
        if (args.length == 0) {
            throw new Exception("The first argument must be the name of a clusterer");
        }

        // Consume the clusterer name, then blank it out so it is not
        // re-interpreted as an evaluation option.
        String clustererName = args[0];
        args[0] = "";

        Clusterer clusterer = AbstractClusterer.forName(clustererName, null);
        System.out.println(evaluateClusterer(clusterer, args));
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}

From source file:de.unidue.langtech.grading.tc.ClusterExemplarTask.java

License:Open Source License

// Builds a clusterer on the training data, then replaces the training set with
// one exemplar instance per cluster (the real instance closest to each centroid).
// NOTE(review): hard-wired to SimpleKMeans via the cast below.
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    // Locate the ARFF training file produced by the upstream task.
    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // Instantiate the clusterer: first argument is the class name, the rest
    // are forwarded as its options.
    Clusterer abstractClusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    // we assume that only this method has been used - breaks modularity, but need results fast ... :/
    SimpleKMeans clusterer = (SimpleKMeans) abstractClusterer;

    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);
    Instances copyTrainData = new Instances(trainData);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);
    Instances centroids = clusterer.getClusterCentroids();

    //        Add addFilter = new Add();
    //        addFilter.setAttributeIndex(new Integer(numTestLabels + i + 1).toString());
    //        addFilter.setNominalLabels("0,1");
    //        addFilter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
    //        addFilter.setInputFormat(testData);

    // Empty out trainData; it will be refilled with one exemplar per centroid.
    trainData.clear();

    Enumeration<Instance> centroidInstances = centroids.enumerateInstances();
    while (centroidInstances.hasMoreElements()) {
        Instance centroidInstance = centroidInstances.nextElement();

        // centroidInstance is usually not a real instance, but a virtual centroid
        // we need to find the closest point in the training data
        // (linear scan over all training instances per centroid)
        double minDistance = Double.POSITIVE_INFINITY;
        int offset = 0;
        int minOffset = 0;
        Enumeration<Instance> trainInstances = clusterTrainData.enumerateInstances();
        while (trainInstances.hasMoreElements()) {
            Instance trainInstance = trainInstances.nextElement();

            double dist = distance(centroidInstance, trainInstance);
            if (dist < minDistance) {
                minDistance = dist;
                minOffset = offset;
            }
            offset++;
        }

        // add selected instance to instances
        // (offsets in clusterTrainData and copyTrainData line up because the
        // Remove filter preserves instance order)
        trainData.add(copyTrainData.get(minOffset));
    }

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, trainData);
}

From source file:de.unidue.langtech.grading.tc.ClusteringTask.java

License:Open Source License

/**
 * Clusters the (class-free) training data and prints diagnostics: for every
 * cluster, the gold label and text of each member, followed by per-cluster
 * size, purity, and RMSE.
 *
 * @param aContext the task context providing access to the stored training data
 * @throws Exception if reading, filtering, or clustering fails
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    // Locate the ARFF training file produced by the upstream task.
    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    // Instantiate the clusterer: first argument is the class name, the rest
    // are forwarded as its options.
    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments = new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {

            // get instance ID from instance
            Instance instance = copyTrainData.get(offset);

            // The class value is a nominal index stored as a double; cast it
            // directly instead of boxing through the deprecated Double ctor.
            int classOffset = (int) instance.value(copyTrainData.classAttribute());
            String label = trainOutcomeValues.get(classOffset);

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    // Per-cluster statistics: purity = share of the majority label.
    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(
                clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString);
    }
    System.out.println();
}

From source file:de.unidue.langtech.grading.tc.ClusterTrainTask.java

License:Open Source License

// Clusters the training data, relabels each instance with the most frequent
// gold class of its cluster (taking the purest clusters first), and writes the
// relabeled data out for the downstream test task.
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    // Locate the ARFF training file produced by the upstream task.
    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    // Instantiate the clusterer: first argument is the class name, the rest
    // are forwarded as its options.
    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd = getClusterCfd(clusterMap, copyTrainData,
            trainOutcomeValues);

    // Per cluster: the majority class and a purity score in (0, 1].
    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // attention - cannot simply use RMSE here - as smaller values are better unlike with purity
        //           double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster

    double avgPurity = 0.0;
    int n = 0;
    for (Integer clusterId : sortedClusters.keySet()) {
        // we need to take as many clusters until we have seen at least each class once
        // (trainOutcomeValues shrinks below as majority classes are removed)
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //           // do not use clusters of single responses, as they always have purity of 1
        //           if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //              continue;
        //           }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        // Overwrite each member's class with the cluster's majority class.
        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    // NOTE(review): if the loop body never runs (n == 0) this is 0/0 -> NaN.
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, copyTrainData);
}