Example usage for weka.clusterers Clusterer buildClusterer

Introduction

This page collects example usages of weka.clusterers.Clusterer.buildClusterer from open-source projects.

Prototype

void buildClusterer(Instances data) throws Exception;

Document

Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via options.
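
To make the prototype concrete, here is a minimal, self-contained sketch of the call. The data file name and the choice of SimpleKMeans are illustrative assumptions, not part of the API:

import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class BuildClustererExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(-1); // clustering is unsupervised: no class attribute

        SimpleKMeans clusterer = new SimpleKMeans();
        clusterer.setNumClusters(3); // illustrative setting
        clusterer.buildClusterer(data); // the call documented on this page

        // assign each training instance to a cluster
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println(i + " -> cluster " + clusterer.clusterInstance(data.instance(i)));
        }
    }
}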

Usage

From source file: adams.flow.transformer.WekaTrainClusterer.java

License: Open Source License

/**
 * Executes the flow item.
 *
 * @return      null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
    String result;
    Instances data;
    Instance inst;
    weka.clusterers.Clusterer cls;
    WekaModelContainer cont;

    result = null;

    try {
        cls = null;
        if ((m_InputToken != null) && (m_InputToken.getPayload() instanceof Instances)) {
            cls = getClustererInstance();
            data = (Instances) m_InputToken.getPayload();
            cls.buildClusterer(data);
            cont = new WekaModelContainer(cls, new Instances(data, 0), data);
            cont = m_PostProcessor.postProcess(cont);
            m_OutputToken = new Token(cont);
        } else if ((m_InputToken != null) && (m_InputToken.getPayload() instanceof Instance)) {
            if (m_IncrementalClusterer == null) {
                cls = getClustererInstance();
                if (!(cls instanceof UpdateableClusterer))
                    result = m_Clusterer + "/" + cls.getClass().getName() + " is not an incremental clusterer!";
            }
            if (result == null) {
                inst = (Instance) m_InputToken.getPayload();
                if (m_IncrementalClusterer == null) {
                    m_IncrementalClusterer = cls;
                    data = new Instances(inst.dataset(), 1);
                    data.add((Instance) inst.copy());
                    m_IncrementalClusterer.buildClusterer(data);
                } else {
                    ((UpdateableClusterer) m_IncrementalClusterer).updateClusterer(inst);
                    ((UpdateableClusterer) m_IncrementalClusterer).updateFinished();
                }
                m_OutputToken = new Token(
                        new WekaModelContainer(m_IncrementalClusterer, new Instances(inst.dataset(), 0)));
            }
        }
    } catch (Exception e) {
        m_OutputToken = null;
        result = handleException("Failed to process input: " + m_InputToken.getPayload(), e);
    }

    if (m_OutputToken != null)
        updateProvenance(m_OutputToken);

    return result;
}
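
The incremental branch above depends on the UpdateableClusterer interface. As a hedged, standalone sketch of that pattern, assuming data holds the training Instances (Cobweb is chosen because it is one of the Weka clusterers that implements UpdateableClusterer):

weka.clusterers.Cobweb cw = new weka.clusterers.Cobweb();
cw.buildClusterer(new Instances(data, 0)); // initialize with the header only
for (int i = 0; i < data.numInstances(); i++) {
    cw.updateClusterer(data.instance(i)); // feed instances one at a time
}
cw.updateFinished(); // signal that no more updates will follow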

From source file: adams.flow.transformer.WekaTrainTestSetClustererEvaluator.java

License: Open Source License

/**
 * Executes the flow item.
 *
 * @return      null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
    String result;
    Instances train;
    Instances test;
    weka.clusterers.Clusterer cls;
    ClusterEvaluation eval;
    WekaTrainTestSetContainer cont;

    result = null;

    try {
        // build the clusterer and evaluate it on the test set
        cls = getClustererInstance();
        if (cls == null)
            throw new IllegalStateException("Clusterer '" + getClusterer() + "' not found!");

        cont = (WekaTrainTestSetContainer) m_InputToken.getPayload();
        train = (Instances) cont.getValue(WekaTrainTestSetContainer.VALUE_TRAIN);
        test = (Instances) cont.getValue(WekaTrainTestSetContainer.VALUE_TEST);
        cls.buildClusterer(train);
        eval = new ClusterEvaluation();
        eval.setClusterer(cls);
        eval.evaluateClusterer(test, null, m_OutputModel);

        // broadcast result
        m_OutputToken = new Token(new WekaClusterEvaluationContainer(eval, cls));
    } catch (Exception e) {
        m_OutputToken = null;
        result = handleException("Failed to evaluate: ", e);
    }

    if (m_OutputToken != null)
        updateProvenance(m_OutputToken);

    return result;
}
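
Stripped of the flow plumbing, the evaluation pattern above reduces to a few lines. A minimal sketch, assuming train and test hold compatible Instances and EM stands in for an arbitrary clusterer:

weka.clusterers.EM em = new weka.clusterers.EM();
em.buildClusterer(train); // train on the training split only
ClusterEvaluation eval = new ClusterEvaluation();
eval.setClusterer(em);
eval.evaluateClusterer(test); // compute cluster statistics on the test split
System.out.println(eval.clusterResultsToString());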

From source file: adams.ml.model.clustering.WekaClusterer.java

License: Open Source License

/**
 * Builds a model from the data.
 *
 * @param data   the data to use for building the model
 * @return      the generated model
 * @throws Exception   if the build fails
 */
@Override
protected ClusteringModel doBuildModel(Dataset data) throws Exception {
    Instances inst;
    weka.clusterers.Clusterer clusterer;

    inst = WekaConverter.toInstances(data);
    clusterer = (weka.clusterers.Clusterer) OptionUtils.shallowCopy(m_Clusterer);
    if (clusterer == null)
        throw new Exception(
                "Failed to create shallow copy of clusterer: " + OptionUtils.getCommandLine(m_Clusterer));

    clusterer.buildClusterer(inst);

    return new WekaClusteringModel(clusterer, data, inst);
}

From source file: com.actelion.research.orbit.imageAnalysis.tasks.TrainWorker.java

License: Open Source License

private void createClusterer() {
    int MAX_TILES_CLUSTERING = 50;
    if (iFrames == null || iFrames.size() < 1) {
        logger.error("cannot build clusterer, no open image frames.");
        return;
    }
    if (modelToBuild != null && modelToBuild.getClassifier() != null)
        modelToBuild.getClassifier().setBuild(false);

    int windowSize = modelToBuild.getFeatureDescription().getWindowSize();

    List<double[]> trainData = new ArrayList<double[]>();

    for (ImageFrame iFrame : iFrames) {
        PlanarImage image = iFrame.recognitionFrame.bimg.getImage();
        TissueFeatures tissueFeatures = new TissueFeatures(modelToBuild.getFeatureDescription(),
                iFrame.recognitionFrame.bimg);
        Point[] tileArr = image.getTileIndices(null);
        if (tileArr.length > MAX_TILES_CLUSTERING) {
            logger.trace("number of tiles for clustering: " + tileArr.length);
            List<Point> pList = new ArrayList<Point>(tileArr.length);
            for (Point p : tileArr)
                pList.add(p);
            Collections.shuffle(pList);
            pList = pList.subList(0, MAX_TILES_CLUSTERING);
            tileArr = pList.toArray(new Point[0]);
            logger.trace("number of tiles after tile limit: " + tileArr.length);
        }

        for (Point tileNum : tileArr) {
            Raster r = image.getTile(tileNum.x, tileNum.y);
            for (int x = image.tileXToX(tileNum.x); x < Math
                    .min(image.tileXToX(tileNum.x) + image.getTileWidth(), image.getWidth()); x++) {
                for (int y = image.tileYToY(tileNum.y); y < Math
                        .min(image.tileYToY(tileNum.y) + image.getTileHeight(), image.getHeight()); y++) {
                    if ((x < r.getMinX() + windowSize) || (y < r.getMinY() + windowSize)
                            || (x > r.getMinX() + r.getWidth() - windowSize - 1)
                            || (y > r.getMinY() + r.getHeight() - windowSize - 1))
                        continue;

                    double[] feats = null;
                    try {
                        feats = tissueFeatures.buildFeatures(r, x, y, Double.NaN);
                    } catch (Throwable t) {
                        logger.error("feature computation failed: " + t.getMessage(), t);
                    }
                    // skip this pixel if feature computation failed, so that no
                    // null rows end up in the training data
                    if (feats != null)
                        trainData.add(feats);

                } // y
                checkPaused();
                if (isCancelled()) {
                    cleanUp();
                    return;
                }
            } // x

        } // tileNum
    } // iFrames

    timeEst = 1000 * 60L;
    setProgress(20);

    // trainData -> instances
    checkPaused();
    if (isCancelled()) {
        cleanUp();
        return;
    }
    trainSet = null;
    Attribute classAttr = null;
    // create a new trainSet the first time; all further trainings append new instances.
    if (trainSet == null) {
        // build traindata header
        double[] firstRow = trainData.get(0);
        ArrayList<Attribute> attrInfo = new ArrayList<Attribute>(firstRow.length);
        for (int a = 0; a < firstRow.length - 1; a++) {
            Attribute attr = new Attribute("a" + a);
            // if (a<firstRow.length-2) attr.setWeight(0.1d); else attr.setWeight(1.0d);
            attrInfo.add(attr);
        }
        List<String> classValues = new ArrayList<String>(
                iFrames.get(0).recognitionFrame.getClassShapes().size());
        for (int i = 0; i < iFrames.get(0).recognitionFrame.getClassShapes().size(); i++) {
            classValues.add((i + 1) + ".0"); // "1.0", "2.0", ...
        }
        classAttr = new Attribute("class", classValues);
        attrInfo.add(classAttr);

        trainSet = new Instances("trainSet pattern classes", attrInfo, trainData.size());
        trainSet.setClassIndex(firstRow.length - 1);
    } else
        classAttr = trainSet.attribute("class");

    timeEst = 1000 * 45L;
    setProgress(25);

    // add instances
    checkPaused();
    if (isCancelled()) {
        cleanUp();
        return;
    }
    for (double[] vals : trainData) {
        double classV = Double.NaN;
        vals[vals.length - 1] = classV;
        Instance inst = new DenseInstance(1.0d, vals);
        trainSet.add(inst);
    }
    trainSet = trainSet.resample(rand);
    trainSet.setClassIndex(-1);
    Instances ts = new Instances(trainSet, 0);
    ts.addAll(trainSet.subList(0, Math.min(MAX_CLUSTERING_EXAMPLES, trainSet.size() - 1)));
    trainSet = null;
    trainSet = ts;
    logger.debug("trainSet contains " + trainSet.numInstances() + " instances, class Attribute: "
            + trainSet.classIndex());
    logger.info("start building clusterer...");

    timeEst = 1000 * 40L;
    setProgress(30);

    // build clusterer
    checkPaused();
    if (isCancelled()) {
        cleanUp();
        return;
    }
    // Clusterer clusterer = new weka.clusterers.SimpleKMeans();
    //Clusterer clusterer = new MakeDensityBasedClusterer(new SimpleKMeans());
    Clusterer clusterer = new EM();
    try {
        //((weka.clusterers.SimpleKMeans)clusterer).setNumClusters(iFrames.get(0).recognitionFrame.getClassShapes().size());
        // ((MakeDensityBasedClusterer)clusterer).setNumClusters(iFrames.get(0).recognitionFrame.getClassShapes().size());
        ((EM) clusterer).setNumClusters(iFrames.get(0).recognitionFrame.getClassShapes().size());
        clusterer.buildClusterer(trainSet);
    } catch (Exception e) {
        logger.error(
                "cannot build clusterer or cannot set number of clusters (classShapes not correctly initialized?)");
        e.printStackTrace();
    }
    logger.info(
            "done. (clusterer is densityBasedClusterer: " + (clusterer instanceof DensityBasedClusterer) + ")");

    // sort class labels according to priors

    classifier = new ClassifierWrapper(clusterer);
    classifier.setBuild(true);
    this.trainSet = trainSet.stringFreeStructure();
    modelToBuild.setClassifier(classifier);
    modelToBuild.setStructure(trainSet.stringFreeStructure());
}

From source file: com.rapidminer.operator.learner.clustering.clusterer.GenericWekaClusteringAdaptor.java

License: Open Source License

public ClusterModel createClusterModel(ExampleSet exampleSet) throws OperatorException {
    weka.clusterers.Clusterer clusterer = getWekaClusterer(
            WekaTools.getWekaParametersFromTypes(this, wekaParameters));
    weka.core.Instances instances = WekaTools.toWekaInstances(exampleSet, "ClusterInstances",
            WekaInstancesAdaptor.CLUSTERING);
    try {
        clusterer.buildClusterer(instances);
        WekaCluster wekaCluster = new WekaCluster(exampleSet, clusterer);
        exampleSet = wekaCluster.apply(exampleSet);
    } catch (Exception e) {
        throw new UserError(this, e, 905, new Object[] { getOperatorClassName(), e });
    }
    ClusterModel clusterModel = createWekaBasedClusterModel(exampleSet);
    return clusterModel;
}

From source file: core.ClusterEvaluationEX.java

License: Open Source License

/**
 * Evaluates a clusterer with the options given in an array of
 * strings. It takes the string indicated by "-t" as training file, the
 * string indicated by "-T" as test file.
 * If the test file is missing, a stratified ten-fold
 * cross-validation is performed (distribution clusterers only).
 * Using "-x" you can change the number of
 * folds to be used, and using "-s" the random seed.
 * If the "-p" option is present it outputs the classification for
 * each test instance. If you provide the name of an object file using
 * "-l", a clusterer will be loaded from the given file. If you provide the
 * name of an object file using "-d", the clusterer built from the
 * training data will be saved to the given file.
 *
 * @param clusterer machine learning clusterer
 * @param options the array of string containing the options
 * @throws Exception if model could not be evaluated successfully
 * @return a string describing the results 
 */
public static String evaluateClusterer(Clusterer clusterer, String[] options) throws Exception {

    int seed = 1, folds = 10;
    boolean doXval = false;
    Instances train = null;
    Random random;
    String trainFileName, testFileName, seedString, foldsString;
    String objectInputFileName, objectOutputFileName, attributeRangeString;
    String graphFileName;
    String[] savedOptions = null;
    boolean printClusterAssignments = false;
    Range attributesToOutput = null;
    StringBuffer text = new StringBuffer();
    int theClass = -1; // class based evaluation of clustering
    boolean updateable = (clusterer instanceof UpdateableClusterer);
    DataSource source = null;
    Instance inst;

    if (Utils.getFlag('h', options) || Utils.getFlag("help", options)) {

        // global info requested as well?
        boolean globalInfo = Utils.getFlag("synopsis", options) || Utils.getFlag("info", options);

        throw new Exception("Help requested." + makeOptionString(clusterer, globalInfo));
    }

    try {
        // Get basic options (options that are the same for all clusterers)
        //printClusterAssignments = Utils.getFlag('p', options);
        objectInputFileName = Utils.getOption('l', options);
        objectOutputFileName = Utils.getOption('d', options);
        trainFileName = Utils.getOption('t', options);
        testFileName = Utils.getOption('T', options);
        graphFileName = Utils.getOption('g', options);

        // Check -p option
        try {
            attributeRangeString = Utils.getOption('p', options);
        } catch (Exception e) {
            throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. "
                    + "It now expects a parameter specifying a range of attributes "
                    + "to list with the predictions. Use '-p 0' for none.");
        }
        if (attributeRangeString.length() != 0) {
            printClusterAssignments = true;
            if (!attributeRangeString.equals("0"))
                attributesToOutput = new Range(attributeRangeString);
        }

        if (trainFileName.length() == 0) {
            if (objectInputFileName.length() == 0) {
                throw new Exception("No training file and no object " + "input file given.");
            }

            if (testFileName.length() == 0) {
                throw new Exception("No training file and no test file given.");
            }
        } else {
            if ((objectInputFileName.length() != 0) && (printClusterAssignments == false)) {
                throw new Exception("Can't use both train and model file " + "unless -p specified.");
            }
        }

        seedString = Utils.getOption('s', options);

        if (seedString.length() != 0) {
            seed = Integer.parseInt(seedString);
        }

        foldsString = Utils.getOption('x', options);

        if (foldsString.length() != 0) {
            folds = Integer.parseInt(foldsString);
            doXval = true;
        }
    } catch (Exception e) {
        throw new Exception('\n' + e.getMessage() + makeOptionString(clusterer, false));
    }

    try {
        if (trainFileName.length() != 0) {
            source = new DataSource(trainFileName);
            train = source.getStructure();

            String classString = Utils.getOption('c', options);
            if (classString.length() != 0) {
                if (classString.compareTo("last") == 0)
                    theClass = train.numAttributes();
                else if (classString.compareTo("first") == 0)
                    theClass = 1;
                else
                    theClass = Integer.parseInt(classString);

                if (theClass != -1) {
                    if (doXval || testFileName.length() != 0)
                        throw new Exception("Can only do class based evaluation on the " + "training data");

                    if (objectInputFileName.length() != 0)
                        throw new Exception("Can't load a clusterer and do class based " + "evaluation");

                    if (objectOutputFileName.length() != 0)
                        throw new Exception("Can't do class based evaluation and save clusterer");
                }
            } else {
                // if the dataset defines a class attribute, use it
                if (train.classIndex() != -1) {
                    theClass = train.classIndex() + 1;
                    System.err
                            .println("Note: using class attribute from dataset, i.e., attribute #" + theClass);
                }
            }

            if (theClass != -1) {
                if (theClass < 1 || theClass > train.numAttributes())
                    throw new Exception("Class is out of range!");

                if (!train.attribute(theClass - 1).isNominal())
                    throw new Exception("Class must be nominal!");

                train.setClassIndex(theClass - 1);
            }
        }
    } catch (Exception e) {
        throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
    }

    // Save options
    if (options != null) {
        savedOptions = new String[options.length];
        System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0)
        Utils.checkForRemainingOptions(options);

    // Set options for clusterer
    if (clusterer instanceof OptionHandler)
        ((OptionHandler) clusterer).setOptions(options);

    Utils.checkForRemainingOptions(options);

    Instances trainHeader = train;
    if (objectInputFileName.length() != 0) {
        // Load the clusterer from file
        //      clusterer = (Clusterer) SerializationHelper.read(objectInputFileName);
        java.io.ObjectInputStream ois = new java.io.ObjectInputStream(
                new java.io.BufferedInputStream(new java.io.FileInputStream(objectInputFileName)));
        clusterer = (Clusterer) ois.readObject();
        // try and get the training header
        try {
            trainHeader = (Instances) ois.readObject();
        } catch (Exception ex) {
            // don't moan if we can't
        }
    } else {
        // Build the clusterer if no object file provided
        if (theClass == -1) {
            if (updateable) {
                clusterer.buildClusterer(source.getStructure());
                while (source.hasMoreElements(train)) {
                    inst = source.nextElement(train);
                    ((UpdateableClusterer) clusterer).updateClusterer(inst);
                }
                ((UpdateableClusterer) clusterer).updateFinished();
            } else {
                clusterer.buildClusterer(source.getDataSet());
            }
        } else {
            Remove removeClass = new Remove();
            removeClass.setAttributeIndices("" + theClass);
            removeClass.setInvertSelection(false);
            removeClass.setInputFormat(train);
            if (updateable) {
                Instances clusterTrain = Filter.useFilter(train, removeClass);
                clusterer.buildClusterer(clusterTrain);
                trainHeader = clusterTrain;
                while (source.hasMoreElements(train)) {
                    inst = source.nextElement(train);
                    removeClass.input(inst);
                    removeClass.batchFinished();
                    Instance clusterTrainInst = removeClass.output();
                    ((UpdateableClusterer) clusterer).updateClusterer(clusterTrainInst);
                }
                ((UpdateableClusterer) clusterer).updateFinished();
            } else {
                Instances clusterTrain = Filter.useFilter(source.getDataSet(), removeClass);
                clusterer.buildClusterer(clusterTrain);
                trainHeader = clusterTrain;
            }
            ClusterEvaluationEX ce = new ClusterEvaluationEX();
            ce.setClusterer(clusterer);
            ce.evaluateClusterer(train, trainFileName);

            return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
        }
    }

    /* Output cluster predictions only (for the test data if specified,
       otherwise for the training data) */
    if (printClusterAssignments) {
        return printClusterings(clusterer, trainFileName, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append(
            "\n\n=== Clustering stats for training data ===\n\n" + printClusterStats(clusterer, trainFileName));

    if (testFileName.length() != 0) {
        // check header compatibility
        DataSource test = new DataSource(testFileName);
        Instances testStructure = test.getStructure();
        if (!trainHeader.equalHeaders(testStructure)) {
            throw new Exception("Training and testing data are not compatible\n");
        }

        text.append("\n\n=== Clustering stats for testing data ===\n\n"
                + printClusterStats(clusterer, testFileName));
    }

    if ((clusterer instanceof DensityBasedClusterer) && (doXval == true) && (testFileName.length() == 0)
            && (objectInputFileName.length() == 0)) {
        // cross validate the log likelihood on the training data
        random = new Random(seed);
        train = source.getDataSet();
        train.randomize(random);
        text.append(crossValidateModel(clusterer.getClass().getName(), train, folds, savedOptions, random));
    }

    // Save the clusterer if an object output file is provided
    if (objectOutputFileName.length() != 0) {
        //SerializationHelper.write(objectOutputFileName, clusterer);
        saveClusterer(objectOutputFileName, clusterer, trainHeader);
    }

    // If classifier is drawable output string describing graph
    if ((clusterer instanceof Drawable) && (graphFileName.length() != 0)) {
        BufferedWriter writer = new BufferedWriter(new FileWriter(graphFileName));
        writer.write(((Drawable) clusterer).graph());
        writer.newLine();
        writer.flush();
        writer.close();
    }

    return text.toString();
}
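
The Javadoc above lists the command-line style options this method parses. A hedged example invocation, with placeholder file names:

// train on train.arff, evaluate on test.arff, random seed 42
String[] options = { "-t", "train.arff", "-T", "test.arff", "-s", "42" };
String report = ClusterEvaluationEX.evaluateClusterer(new weka.clusterers.EM(), options);
System.out.println(report);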

From source file: de.unidue.langtech.grading.tc.ClusteringTask.java

License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments = new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {

            // get instance ID from instance
            Instance instance = copyTrainData.get(offset);

            int classOffset = (int) instance.value(copyTrainData.classAttribute());
            String label = trainOutcomeValues.get(classOffset);

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(
                clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString);
    }
    System.out.println();
}
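
getClusterMap is a project-specific helper whose body is not shown on this page. A hypothetical sketch of such a mapping, built only from the public Clusterer API, could look like this:

// hypothetical stand-in for the getClusterMap helper used above:
// maps each cluster ID to the offsets of the instances assigned to it
Map<Integer, Set<Integer>> clusterMap = new HashMap<Integer, Set<Integer>>();
for (int i = 0; i < clusterTrainData.numInstances(); i++) {
    int clusterId = clusterer.clusterInstance(clusterTrainData.instance(i));
    Set<Integer> offsets = clusterMap.get(clusterId);
    if (offsets == null) {
        offsets = new HashSet<Integer>();
        clusterMap.put(clusterId, offsets);
    }
    offsets.add(i);
}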

From source file: de.unidue.langtech.grading.tc.ClusterTrainTask.java

License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd = getClusterCfd(clusterMap, copyTrainData,
            trainOutcomeValues);

    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // note: RMSE cannot be used here as-is, since smaller RMSE values are
        // better, unlike purity
        //           double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster

    double avgPurity = 0.0;
    int n = 0;
    for (Integer clusterId : sortedClusters.keySet()) {
        // take clusters until each class has been seen at least once
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //           // do not use clusters of single responses, as they always have purity of 1
        //           if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //              continue;
        //           }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, copyTrainData);
}

From source file: guineu.modules.dataanalysis.clustering.em.EMClusterer.java

License: Open Source License

public List<Integer> getClusterGroups(Instances dataset) {
    List<Integer> clusters = new ArrayList<Integer>();
    String[] options = new String[2];
    Clusterer clusterer = new EM();

    int numberOfIterations = parameters.getParameter(EMClustererParameters.numberOfIterations).getValue();
    options[0] = "-I";
    options[1] = String.valueOf(numberOfIterations);

    try {
        ((EM) clusterer).setOptions(options);
        clusterer.buildClusterer(dataset);
        Enumeration e = dataset.enumerateInstances();
        while (e.hasMoreElements()) {
            clusters.add(clusterer.clusterInstance((Instance) e.nextElement()));
        }
        this.numberOfGroups = clusterer.numberOfClusters();
    } catch (Exception ex) {
        Logger.getLogger(EMClusterer.class.getName()).log(Level.SEVERE, null, ex);
    }
    return clusters;
}

From source file: guineu.modules.dataanalysis.clustering.farthestfirst.FarthestFirstClusterer.java

License: Open Source License

public List<Integer> getClusterGroups(Instances dataset) {
    List<Integer> clusters = new ArrayList<Integer>();
    String[] options = new String[2];
    Clusterer clusterer = new FarthestFirst();

    int numberOfGroups = parameters.getParameter(FarthestFirstClustererParameters.numberOfGroups).getValue();
    options[0] = "-N";
    options[1] = String.valueOf(numberOfGroups);

    try {
        ((FarthestFirst) clusterer).setOptions(options);
        clusterer.buildClusterer(dataset);
        Enumeration e = dataset.enumerateInstances();
        while (e.hasMoreElements()) {
            clusters.add(clusterer.clusterInstance((Instance) e.nextElement()));
        }
        this.numberOfGroups = clusterer.numberOfClusters();
    } catch (Exception ex) {
        Logger.getLogger(FarthestFirstClusterer.class.getName()).log(Level.SEVERE, null, ex);
    }
    return clusters;
}
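
The two Guineu examples above configure their clusterers through raw option strings. The same settings can also be applied through typed setters; a minimal sketch, with illustrative parameter values:

// equivalent to passing "-I 100" as an option string
EM em = new EM();
em.setMaxIterations(100);
em.buildClusterer(dataset);

// equivalent to passing "-N 5" as an option string
FarthestFirst ff = new FarthestFirst();
ff.setNumClusters(5);
ff.buildClusterer(dataset);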