Example usage for weka.clusterers Clusterer buildClusterer

Introduction

This page collects example usages of weka.clusterers.Clusterer.buildClusterer from open-source projects.

Prototype

void buildClusterer(Instances data) throws Exception;

Document

Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via options.
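
To make the prototype concrete, here is a minimal, self-contained sketch of the call. The data file name and the choice of SimpleKMeans are illustrative assumptions, not part of the API:

import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class BuildClustererExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(-1); // clustering is unsupervised: no class attribute

        SimpleKMeans clusterer = new SimpleKMeans();
        clusterer.setNumClusters(3); // illustrative setting
        clusterer.buildClusterer(data); // the call documented on this page

        // assign each training instance to a cluster
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println(i + " -> cluster " + clusterer.clusterInstance(data.instance(i)));
        }
    }
}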

Usage

From source file: adams.flow.transformer.WekaTrainClusterer.java

License: Open Source License

/**
 * Executes the flow item.
 *
 * @return      null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
    String result;
    Instances data;
    Instance inst;
    weka.clusterers.Clusterer cls;
    WekaModelContainer cont;

    result = null;

    try {
        cls = null;
        if ((m_InputToken != null) && (m_InputToken.getPayload() instanceof Instances)) {
            cls = getClustererInstance();
            data = (Instances) m_InputToken.getPayload();
            cls.buildClusterer(data);
            cont = new WekaModelContainer(cls, new Instances(data, 0), data);
            cont = m_PostProcessor.postProcess(cont);
            m_OutputToken = new Token(cont);
        } else if ((m_InputToken != null) && (m_InputToken.getPayload() instanceof Instance)) {
            if (m_IncrementalClusterer == null) {
                cls = getClustererInstance();
                if (!(cls instanceof UpdateableClusterer))
                    result = m_Clusterer + "/" + cls.getClass().getName() + " is not an incremental clusterer!";
            }
            if (result == null) {
                inst = (Instance) m_InputToken.getPayload();
                if (m_IncrementalClusterer == null) {
                    m_IncrementalClusterer = cls;
                    data = new Instances(inst.dataset(), 1);
                    data.add((Instance) inst.copy());
                    m_IncrementalClusterer.buildClusterer(data);
                } else {
                    ((UpdateableClusterer) m_IncrementalClusterer).updateClusterer(inst);
                    ((UpdateableClusterer) m_IncrementalClusterer).updateFinished();
                }
                m_OutputToken = new Token(
                        new WekaModelContainer(m_IncrementalClusterer, new Instances(inst.dataset(), 0)));
            }
        }
    } catch (Exception e) {
        m_OutputToken = null;
        result = handleException("Failed to process input: " + m_InputToken.getPayload(), e);
    }

    if (m_OutputToken != null)
        updateProvenance(m_OutputToken);

    return result;
}
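
The incremental branch above depends on the UpdateableClusterer interface. As a hedged, standalone sketch of that pattern, assuming data holds the training Instances (Cobweb is chosen because it is one of the Weka clusterers that implements UpdateableClusterer):

weka.clusterers.Cobweb cw = new weka.clusterers.Cobweb();
cw.buildClusterer(new Instances(data, 0)); // initialize with the header only
for (int i = 0; i < data.numInstances(); i++) {
    cw.updateClusterer(data.instance(i)); // feed instances one at a time
}
cw.updateFinished(); // signal that no more updates will follow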

From source file: adams.flow.transformer.WekaTrainTestSetClustererEvaluator.java

License: Open Source License

/**
 * Executes the flow item.
 *
 * @return      null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
    String result;
    Instances train;
    Instances test;
    weka.clusterers.Clusterer cls;
    ClusterEvaluation eval;
    WekaTrainTestSetContainer cont;

    result = null;

    try {
        // build the clusterer and evaluate it on the test set
        cls = getClustererInstance();
        if (cls == null)
            throw new IllegalStateException("Clusterer '" + getClusterer() + "' not found!");

        cont = (WekaTrainTestSetContainer) m_InputToken.getPayload();
        train = (Instances) cont.getValue(WekaTrainTestSetContainer.VALUE_TRAIN);
        test = (Instances) cont.getValue(WekaTrainTestSetContainer.VALUE_TEST);
        cls.buildClusterer(train);
        eval = new ClusterEvaluation();
        eval.setClusterer(cls);
        eval.evaluateClusterer(test, null, m_OutputModel);

        // broadcast result
        m_OutputToken = new Token(new WekaClusterEvaluationContainer(eval, cls));
    } catch (Exception e) {
        m_OutputToken = null;
        result = handleException("Failed to evaluate: ", e);
    }

    if (m_OutputToken != null)
        updateProvenance(m_OutputToken);

    return result;
}
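
Stripped of the flow plumbing, the evaluation pattern above reduces to a few lines. A minimal sketch, assuming train and test hold compatible Instances and EM stands in for an arbitrary clusterer:

weka.clusterers.EM em = new weka.clusterers.EM();
em.buildClusterer(train); // train on the training split only
ClusterEvaluation eval = new ClusterEvaluation();
eval.setClusterer(em);
eval.evaluateClusterer(test); // compute cluster statistics on the test split
System.out.println(eval.clusterResultsToString());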

From source file: adams.ml.model.clustering.WekaClusterer.java

License: Open Source License

/**
 * Builds a model from the data.
 *
 * @param data   the data to use for building the model
 * @return      the generated model
 * @throws Exception   if the build fails
 */
@Override
protected ClusteringModel doBuildModel(Dataset data) throws Exception {
    Instances inst;
    weka.clusterers.Clusterer clusterer;

    inst = WekaConverter.toInstances(data);
    clusterer = (weka.clusterers.Clusterer) OptionUtils.shallowCopy(m_Clusterer);
    if (clusterer == null)
        throw new Exception(
                "Failed to create shallow copy of clusterer: " + OptionUtils.getCommandLine(m_Clusterer));

    clusterer.buildClusterer(inst);

    return new WekaClusteringModel(clusterer, data, inst);
}

From source file: com.actelion.research.orbit.imageAnalysis.tasks.TrainWorker.java

License: Open Source License

private void createClusterer() {
    int MAX_TILES_CLUSTERING = 50;
    if (iFrames == null || iFrames.size() < 1) {
        logger.error("cannot build clusterer, no open image frames.");
        return;
    }
    if (modelToBuild != null && modelToBuild.getClassifier() != null)
        modelToBuild.getClassifier().setBuild(false);

    int windowSize = modelToBuild.getFeatureDescription().getWindowSize();

    List<double[]> trainData = new ArrayList<double[]>();

    for (ImageFrame iFrame : iFrames) {
        PlanarImage image = iFrame.recognitionFrame.bimg.getImage();
        TissueFeatures tissueFeatures = new TissueFeatures(modelToBuild.getFeatureDescription(),
                iFrame.recognitionFrame.bimg);
        Point[] tileArr = image.getTileIndices(null);
        if (tileArr.length > MAX_TILES_CLUSTERING) {
            logger.trace("number of tiles for clustering: " + tileArr.length);
            List<Point> pList = new ArrayList<Point>(tileArr.length);
            for (Point p : tileArr)
                pList.add(p);
            Collections.shuffle(pList);
            pList = pList.subList(0, MAX_TILES_CLUSTERING);
            tileArr = pList.toArray(new Point[0]);
            logger.trace("number of tiles after tile limit: " + tileArr.length);
        }

        for (Point tileNum : tileArr) {
            Raster r = image.getTile(tileNum.x, tileNum.y);
            for (int x = image.tileXToX(tileNum.x); x < Math
                    .min(image.tileXToX(tileNum.x) + image.getTileWidth(), image.getWidth()); x++) {
                for (int y = image.tileYToY(tileNum.y); y < Math
                        .min(image.tileYToY(tileNum.y) + image.getTileHeight(), image.getHeight()); y++) {
                    if ((x < r.getMinX() + windowSize) || (y < r.getMinY() + windowSize)
                            || (x > r.getMinX() + r.getWidth() - windowSize - 1)
                            || (y > r.getMinY() + r.getHeight() - windowSize - 1))
                        continue;

                    double[] feats = null;
                    try {
                        feats = tissueFeatures.buildFeatures(r, x, y, Double.NaN);
                    } catch (Throwable t) {
                        logger.error("feature computation failed: " + t.getMessage(), t);
                    }
                    // skip this pixel if feature computation failed, so that no
                    // null rows end up in the training data
                    if (feats != null)
                        trainData.add(feats);

                } // y
                checkPaused();
                if (isCancelled()) {
                    cleanUp();
                    return;
                }
            } // x

        } // tileNum
    } // iFrames

    timeEst = 1000 * 60L;
    setProgress(20);

    // trainData -> instances
    checkPaused();
    if (isCancelled()) {
        cleanUp();
        return;
    }
    trainSet = null;
    Attribute classAttr = null;
    // create a new trainSet the first time; all further trainings append new instances.
    if (trainSet == null) {
        // build traindata header
        double[] firstRow = trainData.get(0);
        ArrayList<Attribute> attrInfo = new ArrayList<Attribute>(firstRow.length);
        for (int a = 0; a < firstRow.length - 1; a++) {
            Attribute attr = new Attribute("a" + a);
            // if (a<firstRow.length-2) attr.setWeight(0.1d); else attr.setWeight(1.0d);
            attrInfo.add(attr);
        }
        List<String> classValues = new ArrayList<String>(
                iFrames.get(0).recognitionFrame.getClassShapes().size());
        for (int i = 0; i < iFrames.get(0).recognitionFrame.getClassShapes().size(); i++) {
            classValues.add((i + 1) + ".0"); // "1.0", "2.0", ...
        }
        classAttr = new Attribute("class", classValues);
        attrInfo.add(classAttr);

        trainSet = new Instances("trainSet pattern classes", attrInfo, trainData.size());
        trainSet.setClassIndex(firstRow.length - 1);
    } else
        classAttr = trainSet.attribute("class");

    timeEst = 1000 * 45L;
    setProgress(25);

    // add instances
    checkPaused();
    if (isCancelled()) {
        cleanUp();
        return;
    }
    for (double[] vals : trainData) {
        double classV = Double.NaN;
        vals[vals.length - 1] = classV;
        Instance inst = new DenseInstance(1.0d, vals);
        trainSet.add(inst);
    }
    trainSet = trainSet.resample(rand);
    trainSet.setClassIndex(-1);
    Instances ts = new Instances(trainSet, 0);
    ts.addAll(trainSet.subList(0, Math.min(MAX_CLUSTERING_EXAMPLES, trainSet.size() - 1)));
    trainSet = null;
    trainSet = ts;
    logger.debug("trainSet contains " + trainSet.numInstances() + " instances, class Attribute: "
            + trainSet.classIndex());
    logger.info("start building clusterer...");

    timeEst = 1000 * 40L;
    setProgress(30);

    // build clusterer
    checkPaused();
    if (isCancelled()) {
        cleanUp();
        return;
    }
    // Clusterer clusterer = new weka.clusterers.SimpleKMeans();
    //Clusterer clusterer = new MakeDensityBasedClusterer(new SimpleKMeans());
    Clusterer clusterer = new EM();
    try {
        //((weka.clusterers.SimpleKMeans)clusterer).setNumClusters(iFrames.get(0).recognitionFrame.getClassShapes().size());
        // ((MakeDensityBasedClusterer)clusterer).setNumClusters(iFrames.get(0).recognitionFrame.getClassShapes().size());
        ((EM) clusterer).setNumClusters(iFrames.get(0).recognitionFrame.getClassShapes().size());
        clusterer.buildClusterer(trainSet);
    } catch (Exception e) {
        logger.error(
                "cannot build clusterer or cannot set number of clusters (classShapes not correctly initialized?)");
        e.printStackTrace();
    }
    logger.info(
            "done. (clusterer is densityBasedClusterer: " + (clusterer instanceof DensityBasedClusterer) + ")");

    // sort class labels according to priors

    classifier = new ClassifierWrapper(clusterer);
    classifier.setBuild(true);
    this.trainSet = trainSet.stringFreeStructure();
    modelToBuild.setClassifier(classifier);
    modelToBuild.setStructure(trainSet.stringFreeStructure());
}

From source file: com.rapidminer.operator.learner.clustering.clusterer.GenericWekaClusteringAdaptor.java

License: Open Source License

public ClusterModel createClusterModel(ExampleSet exampleSet) throws OperatorException {
    weka.clusterers.Clusterer clusterer = getWekaClusterer(
            WekaTools.getWekaParametersFromTypes(this, wekaParameters));
    weka.core.Instances instances = WekaTools.toWekaInstances(exampleSet, "ClusterInstances",
            WekaInstancesAdaptor.CLUSTERING);
    try {
        clusterer.buildClusterer(instances);
        WekaCluster wekaCluster = new WekaCluster(exampleSet, clusterer);
        exampleSet = wekaCluster.apply(exampleSet);
    } catch (Exception e) {
        throw new UserError(this, e, 905, new Object[] { getOperatorClassName(), e });
    }
    ClusterModel clusterModel = createWekaBasedClusterModel(exampleSet);
    return clusterModel;
}

From source file: core.ClusterEvaluationEX.java

License: Open Source License

/**
 * Evaluates a clusterer with the options given in an array of
 * strings. It takes the string indicated by "-t" as training file, the
 * string indicated by "-T" as test file.
 * If the test file is missing, a stratified ten-fold
 * cross-validation is performed (distribution clusterers only).
 * Using "-x" you can change the number of
 * folds to be used, and using "-s" the random seed.
 * If the "-p" option is present it outputs the classification for
 * each test instance. If you provide the name of an object file using
 * "-l", a clusterer will be loaded from the given file. If you provide the
 * name of an object file using "-d", the clusterer built from the
 * training data will be saved to the given file.
 *
 * @param clusterer machine learning clusterer
 * @param options the array of string containing the options
 * @throws Exception if model could not be evaluated successfully
 * @return a string describing the results 
 */
public static String evaluateClusterer(Clusterer clusterer, String[] options) throws Exception {

    int seed = 1, folds = 10;
    boolean doXval = false;
    Instances train = null;
    Random random;
    String trainFileName, testFileName, seedString, foldsString;
    String objectInputFileName, objectOutputFileName, attributeRangeString;
    String graphFileName;
    String[] savedOptions = null;
    boolean printClusterAssignments = false;
    Range attributesToOutput = null;
    StringBuffer text = new StringBuffer();
    int theClass = -1; // class based evaluation of clustering
    boolean updateable = (clusterer instanceof UpdateableClusterer);
    DataSource source = null;
    Instance inst;

    if (Utils.getFlag('h', options) || Utils.getFlag("help", options)) {

        // global info requested as well?
        boolean globalInfo = Utils.getFlag("synopsis", options) || Utils.getFlag("info", options);

        throw new Exception("Help requested." + makeOptionString(clusterer, globalInfo));
    }

    try {
        // Get basic options (options that are the same for all clusterers)
        //printClusterAssignments = Utils.getFlag('p', options);
        objectInputFileName = Utils.getOption('l', options);
        objectOutputFileName = Utils.getOption('d', options);
        trainFileName = Utils.getOption('t', options);
        testFileName = Utils.getOption('T', options);
        graphFileName = Utils.getOption('g', options);

        // Check -p option
        try {
            attributeRangeString = Utils.getOption('p', options);
        } catch (Exception e) {
            throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. "
                    + "It now expects a parameter specifying a range of attributes "
                    + "to list with the predictions. Use '-p 0' for none.");
        }
        if (attributeRangeString.length() != 0) {
            printClusterAssignments = true;
            if (!attributeRangeString.equals("0"))
                attributesToOutput = new Range(attributeRangeString);
        }

        if (trainFileName.length() == 0) {
            if (objectInputFileName.length() == 0) {
                throw new Exception("No training file and no object " + "input file given.");
            }

            if (testFileName.length() == 0) {
                throw new Exception("No training file and no test file given.");
            }
        } else {
            if ((objectInputFileName.length() != 0) && (printClusterAssignments == false)) {
                throw new Exception("Can't use both train and model file " + "unless -p specified.");
            }
        }

        seedString = Utils.getOption('s', options);

        if (seedString.length() != 0) {
            seed = Integer.parseInt(seedString);
        }

        foldsString = Utils.getOption('x', options);

        if (foldsString.length() != 0) {
            folds = Integer.parseInt(foldsString);
            doXval = true;
        }
    } catch (Exception e) {
        throw new Exception('\n' + e.getMessage() + makeOptionString(clusterer, false));
    }

    try {
        if (trainFileName.length() != 0) {
            source = new DataSource(trainFileName);
            train = source.getStructure();

            String classString = Utils.getOption('c', options);
            if (classString.length() != 0) {
                if (classString.compareTo("last") == 0)
                    theClass = train.numAttributes();
                else if (classString.compareTo("first") == 0)
                    theClass = 1;
                else
                    theClass = Integer.parseInt(classString);

                if (theClass != -1) {
                    if (doXval || testFileName.length() != 0)
                        throw new Exception("Can only do class based evaluation on the " + "training data");

                    if (objectInputFileName.length() != 0)
                        throw new Exception("Can't load a clusterer and do class based " + "evaluation");

                    if (objectOutputFileName.length() != 0)
                        throw new Exception("Can't do class based evaluation and save clusterer");
                }
            } else {
                // if the dataset defines a class attribute, use it
                if (train.classIndex() != -1) {
                    theClass = train.classIndex() + 1;
                    System.err
                            .println("Note: using class attribute from dataset, i.e., attribute #" + theClass);
                }
            }

            if (theClass != -1) {
                if (theClass < 1 || theClass > train.numAttributes())
                    throw new Exception("Class is out of range!");

                if (!train.attribute(theClass - 1).isNominal())
                    throw new Exception("Class must be nominal!");

                train.setClassIndex(theClass - 1);
            }
        }
    } catch (Exception e) {
        throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
    }

    // Save options
    if (options != null) {
        savedOptions = new String[options.length];
        System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0)
        Utils.checkForRemainingOptions(options);

    // Set options for clusterer
    if (clusterer instanceof OptionHandler)
        ((OptionHandler) clusterer).setOptions(options);

    Utils.checkForRemainingOptions(options);

    Instances trainHeader = train;
    if (objectInputFileName.length() != 0) {
        // Load the clusterer from file
        //      clusterer = (Clusterer) SerializationHelper.read(objectInputFileName);
        java.io.ObjectInputStream ois = new java.io.ObjectInputStream(
                new java.io.BufferedInputStream(new java.io.FileInputStream(objectInputFileName)));
        clusterer = (Clusterer) ois.readObject();
        // try and get the training header
        try {
            trainHeader = (Instances) ois.readObject();
        } catch (Exception ex) {
            // don't moan if we can't
        }
    } else {
        // Build the clusterer if no object file provided
        if (theClass == -1) {
            if (updateable) {
                clusterer.buildClusterer(source.getStructure());
                while (source.hasMoreElements(train)) {
                    inst = source.nextElement(train);
                    ((UpdateableClusterer) clusterer).updateClusterer(inst);
                }
                ((UpdateableClusterer) clusterer).updateFinished();
            } else {
                clusterer.buildClusterer(source.getDataSet());
            }
        } else {
            Remove removeClass = new Remove();
            removeClass.setAttributeIndices("" + theClass);
            removeClass.setInvertSelection(false);
            removeClass.setInputFormat(train);
            if (updateable) {
                Instances clusterTrain = Filter.useFilter(train, removeClass);
                clusterer.buildClusterer(clusterTrain);
                trainHeader = clusterTrain;
                while (source.hasMoreElements(train)) {
                    inst = source.nextElement(train);
                    removeClass.input(inst);
                    removeClass.batchFinished();
                    Instance clusterTrainInst = removeClass.output();
                    ((UpdateableClusterer) clusterer).updateClusterer(clusterTrainInst);
                }
                ((UpdateableClusterer) clusterer).updateFinished();
            } else {
                Instances clusterTrain = Filter.useFilter(source.getDataSet(), removeClass);
                clusterer.buildClusterer(clusterTrain);
                trainHeader = clusterTrain;
            }
            ClusterEvaluationEX ce = new ClusterEvaluationEX();
            ce.setClusterer(clusterer);
            ce.evaluateClusterer(train, trainFileName);

            return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
        }
    }

    /* Output cluster predictions only (for the test data if specified,
       otherwise for the training data) */
    if (printClusterAssignments) {
        return printClusterings(clusterer, trainFileName, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append(
            "\n\n=== Clustering stats for training data ===\n\n" + printClusterStats(clusterer, trainFileName));

    if (testFileName.length() != 0) {
        // check header compatibility
        DataSource test = new DataSource(testFileName);
        Instances testStructure = test.getStructure();
        if (!trainHeader.equalHeaders(testStructure)) {
            throw new Exception("Training and testing data are not compatible\n");
        }

        text.append("\n\n=== Clustering stats for testing data ===\n\n"
                + printClusterStats(clusterer, testFileName));
    }

    if ((clusterer instanceof DensityBasedClusterer) && (doXval == true) && (testFileName.length() == 0)
            && (objectInputFileName.length() == 0)) {
        // cross validate the log likelihood on the training data
        random = new Random(seed);
        train = source.getDataSet();
        train.randomize(random);
        text.append(crossValidateModel(clusterer.getClass().getName(), train, folds, savedOptions, random));
    }

    // Save the clusterer if an object output file is provided
    if (objectOutputFileName.length() != 0) {
        //SerializationHelper.write(objectOutputFileName, clusterer);
        saveClusterer(objectOutputFileName, clusterer, trainHeader);
    }

    // If classifier is drawable output string describing graph
    if ((clusterer instanceof Drawable) && (graphFileName.length() != 0)) {
        BufferedWriter writer = new BufferedWriter(new FileWriter(graphFileName));
        writer.write(((Drawable) clusterer).graph());
        writer.newLine();
        writer.flush();
        writer.close();
    }

    return text.toString();
}
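
The Javadoc above lists the command-line style options this method parses. A hedged example invocation, with placeholder file names:

// train on train.arff, evaluate on test.arff, random seed 42
String[] options = { "-t", "train.arff", "-T", "test.arff", "-s", "42" };
String report = ClusterEvaluationEX.evaluateClusterer(new weka.clusterers.EM(), options);
System.out.println(report);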

From source file: de.unidue.langtech.grading.tc.ClusteringTask.java

License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments = new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {

            // get instance ID from instance
            Instance instance = copyTrainData.get(offset);

            int classOffset = (int) instance.value(copyTrainData.classAttribute());
            String label = trainOutcomeValues.get(classOffset);

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(
                clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString);
    }
    System.out.println();
}
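
getClusterMap is a project-specific helper whose body is not shown on this page. A hypothetical sketch of such a mapping, built only from the public Clusterer API, could look like this:

// hypothetical stand-in for the getClusterMap helper used above:
// maps each cluster ID to the offsets of the instances assigned to it
Map<Integer, Set<Integer>> clusterMap = new HashMap<Integer, Set<Integer>>();
for (int i = 0; i < clusterTrainData.numInstances(); i++) {
    int clusterId = clusterer.clusterInstance(clusterTrainData.instance(i));
    Set<Integer> offsets = clusterMap.get(clusterId);
    if (offsets == null) {
        offsets = new HashSet<Integer>();
        clusterMap.put(clusterId, offsets);
    }
    offsets.add(i);
}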

From source file: de.unidue.langtech.grading.tc.ClusterTrainTask.java

License: Open Source License

@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);

    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    // get a CFD that stores the number of outcomes for each class indexed by the clusterID
    ConditionalFrequencyDistribution<Integer, String> clusterCfd = getClusterCfd(clusterMap, copyTrainData,
            trainOutcomeValues);

    Map<Integer, String> mostFrequentClassPerCluster = new HashMap<Integer, String>();
    Map<Integer, Double> clusterScoreMap = new HashMap<Integer, Double>();
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterCfd.getFrequencyDistribution(clusterId);
        mostFrequentClassPerCluster.put(clusterId, fd.getSampleWithMaxFreq());

        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        // note: RMSE cannot be used here as-is, since smaller RMSE values are
        // better, unlike purity
        //           double rmse = getRMSE(fd, trainOutcomeValues);
        clusterScoreMap.put(clusterId, purity);
    }

    // sort clusters by score
    Map<Integer, Double> sortedClusters = new TreeMap<Integer, Double>(new ValueComparator(clusterScoreMap));
    sortedClusters.putAll(clusterScoreMap);

    // change the outcome values of instances according to the most frequent class in its cluster

    double avgPurity = 0.0;
    int n = 0;
    for (Integer clusterId : sortedClusters.keySet()) {
        // take clusters until each class has been seen at least once
        if (onlyPureClusters && trainOutcomeValues.size() == 0) {
            break;
        }

        //           // do not use clusters of single responses, as they always have purity of 1
        //           if (clusterCfd.getFrequencyDistribution(clusterId).getN() == 1) {
        //              continue;
        //           }

        n++;
        avgPurity += clusterScoreMap.get(clusterId);

        String mostFrequentClass = mostFrequentClassPerCluster.get(clusterId);
        trainOutcomeValues.remove(mostFrequentClass);

        for (Integer instanceOffset : clusterMap.get(clusterId)) {
            copyTrainData.get(instanceOffset).setValue(copyTrainData.classIndex(), mostFrequentClass);
        }
    }
    avgPurity = avgPurity / n;
    System.out.println("Average cluster purity: " + avgPurity);

    // write the new training data (that will be used by the test task instead of the original one)                
    DataSink.write(aContext.getStorageLocation(ADAPTED_TRAINING_DATA, AccessMode.READWRITE).getPath() + "/"
            + ARFF_FILENAME, copyTrainData);
}

From source file: guineu.modules.dataanalysis.clustering.em.EMClusterer.java

License: Open Source License

public List<Integer> getClusterGroups(Instances dataset) {
    List<Integer> clusters = new ArrayList<Integer>();
    String[] options = new String[2];
    Clusterer clusterer = new EM();

    int numberOfIterations = parameters.getParameter(EMClustererParameters.numberOfIterations).getValue();
    options[0] = "-I";
    options[1] = String.valueOf(numberOfIterations);

    try {
        ((EM) clusterer).setOptions(options);
        clusterer.buildClusterer(dataset);
        Enumeration e = dataset.enumerateInstances();
        while (e.hasMoreElements()) {
            clusters.add(clusterer.clusterInstance((Instance) e.nextElement()));
        }
        this.numberOfGroups = clusterer.numberOfClusters();
    } catch (Exception ex) {
        Logger.getLogger(EMClusterer.class.getName()).log(Level.SEVERE, null, ex);
    }
    return clusters;
}

From source file: guineu.modules.dataanalysis.clustering.farthestfirst.FarthestFirstClusterer.java

License: Open Source License

public List<Integer> getClusterGroups(Instances dataset) {
    List<Integer> clusters = new ArrayList<Integer>();
    String[] options = new String[2];
    Clusterer clusterer = new FarthestFirst();

    int numberOfGroups = parameters.getParameter(FarthestFirstClustererParameters.numberOfGroups).getValue();
    options[0] = "-N";
    options[1] = String.valueOf(numberOfGroups);

    try {
        ((FarthestFirst) clusterer).setOptions(options);
        clusterer.buildClusterer(dataset);
        Enumeration e = dataset.enumerateInstances();
        while (e.hasMoreElements()) {
            clusters.add(clusterer.clusterInstance((Instance) e.nextElement()));
        }
        this.numberOfGroups = clusterer.numberOfClusters();
    } catch (Exception ex) {
        Logger.getLogger(FarthestFirstClusterer.class.getName()).log(Level.SEVERE, null, ex);
    }
    return clusters;
}
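
The two Guineu examples above configure their clusterers through raw option strings. The same settings can also be applied through typed setters; a minimal sketch, with illustrative parameter values:

// equivalent to passing "-I 100" as an option string
EM em = new EM();
em.setMaxIterations(100);
em.buildClusterer(dataset);

// equivalent to passing "-N 5" as an option string
FarthestFirst ff = new FarthestFirst();
ff.setNumClusters(5);
ff.buildClusterer(dataset);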