List of usage examples for weka.core Instances numInstances
public int numInstances()
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
/**
 * The number of instances is a multiple of the number of folds.
 * From a set of instances, it returns a chunk whose length is instances.numInstances() / numberOfFolds,
 * with index = index. Index starts at zero.
 *
 * In essence, this is used when splitting a partition of instances into a train and a test set.
 * One chunk is the test set and the rest is the train set.
 * We provide the index for the test set and the rest automatically becomes the train set;
 * see splitPartitionIntoFolds.
 *
 *  _____
 * |_6_| index = 0
 * |_6_| 1
 * |_6_| 2
 * |_6_| 3
 * |_6_| 4
 * |_6_| 5
 * |_6_| 6
 * |_6_| 7
 * |_6_| 8
 * |_6_| 9
 */
public static Instances getPartitionSegment(Instances instances, int index, int numberOfFolds) {
    if (instances.numInstances() % numberOfFolds != 0) {
        System.out.println("Number of instances not a multiple of " + numberOfFolds);
        return null;
    }
    int numberOfInstancesToGet = instances.numInstances() / numberOfFolds;
    Instances segment = new Instances(instances, numberOfInstancesToGet);
    for (int i = index * numberOfInstancesToGet; i < (index + 1) * numberOfInstancesToGet; i++) {
        segment.add(instances.instance(i));
    }
    return segment;
}
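A minimal sketch of how getPartitionSegment might be driven. The toy dataset built here (a single numeric attribute, 60 rows, 10 folds) is an assumption for illustration, and it presumes the InstancesUtility class shown above is on the classpath:

import java.util.ArrayList;
import gr.auth.ee.lcs.utilities.InstancesUtility;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class GetPartitionSegmentDemo {
    public static void main(String[] args) {
        // Build a toy dataset of 60 instances with one numeric attribute (illustrative data).
        ArrayList<Attribute> attrs = new ArrayList<Attribute>();
        attrs.add(new Attribute("x"));
        Instances data = new Instances("toy", attrs, 60);
        for (int i = 0; i < 60; i++) {
            data.add(new DenseInstance(1.0, new double[] { i }));
        }
        // 60 instances split into 10 folds: each segment holds data.numInstances() / 10 = 6 rows.
        Instances testFold = InstancesUtility.getPartitionSegment(data, 0, 10);
        System.out.println(testFold.numInstances() + " of " + data.numInstances() + " instances in segment 0");
    }
}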
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
/**
 * Splits the .arff input dataset into |number-of-distinct-label-combinations| Instances,
 * which are stored in the partitions[] array.
 * Called by initializePopulation() as a preparatory step to clustering.
 *
 * @throws Exception
 */
public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs, final String filename)
        throws Exception {

    // Open .arff
    final Instances set = InstancesUtility.openInstance(filename);
    if (set.classIndex() < 0) {
        set.setClassIndex(set.numAttributes() - 1);
    }
    //set.randomize(new Random());

    int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1);

    // the partitions vector holds the indices
    String stringsArray[] = new String[lcs.instances.length];
    int indicesArray[] = new int[lcs.instances.length];

    // convert each instance's labelset into a string and store it in the stringsArray array
    for (int i = 0; i < set.numInstances(); i++) {
        stringsArray[i] = "";
        indicesArray[i] = i;
        for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
            stringsArray[i] += (int) set.instance(i).value(j);
        }
    }

    // contains the indicesVector(s)
    Vector<Vector> mothershipVector = new Vector<Vector>();

    String baseString = "";
    for (int i = 0; i < set.numInstances(); i++) {
        baseString = stringsArray[i];
        if (baseString.equals(""))
            continue;
        Vector<Integer> indicesVector = new Vector<Integer>();
        for (int j = 0; j < set.numInstances(); j++) {
            if (baseString.equals(stringsArray[j])) {
                stringsArray[j] = "";
                indicesVector.add(j);
            }
        }
        mothershipVector.add(indicesVector);
    }

    Instances[] partitions = new Instances[mothershipVector.size()];
    for (int i = 0; i < mothershipVector.size(); i++) {
        partitions[i] = new Instances(set, mothershipVector.elementAt(i).size());
        for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) {
            Instance instanceToAdd = set.instance((Integer) mothershipVector.elementAt(i).elementAt(j));
            partitions[i].add(instanceToAdd);
        }
    }
    /*
     * Up to here, the partitions array has been formed. It contains the dataset split by label combinations.
     * It holds both the attributes and the labels, but for clustering the input should only be the attributes,
     * so we need to delete the labels. This is taken care of by initializePopulation().
     */
    return partitions;
}
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs, final Instances trainSet)
        throws Exception {

    // Open .arff
    final Instances set = trainSet;
    if (set.classIndex() < 0) {
        set.setClassIndex(set.numAttributes() - 1);
    }
    //set.randomize(new Random());

    int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1);

    // the partitions vector holds the indices
    String stringsArray[] = new String[trainSet.numInstances()];
    int indicesArray[] = new int[trainSet.numInstances()];

    // convert each instance's labelset into a string and store it in the stringsArray array
    for (int i = 0; i < set.numInstances(); i++) {
        stringsArray[i] = "";
        indicesArray[i] = i;
        for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
            stringsArray[i] += (int) set.instance(i).value(j);
        }
    }

    // contains the indicesVector(s)
    Vector<Vector> mothershipVector = new Vector<Vector>();

    String baseString = "";
    for (int i = 0; i < set.numInstances(); i++) {
        baseString = stringsArray[i];
        if (baseString.equals(""))
            continue;
        Vector<Integer> indicesVector = new Vector<Integer>();
        for (int j = 0; j < set.numInstances(); j++) {
            if (baseString.equals(stringsArray[j])) {
                stringsArray[j] = "";
                indicesVector.add(j);
            }
        }
        mothershipVector.add(indicesVector);
    }

    Instances[] partitions = new Instances[mothershipVector.size()];
    for (int i = 0; i < mothershipVector.size(); i++) {
        partitions[i] = new Instances(set, mothershipVector.elementAt(i).size());
        for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) {
            Instance instanceToAdd = set.instance((Integer) mothershipVector.elementAt(i).elementAt(j));
            partitions[i].add(instanceToAdd);
        }
    }
    /*
     * Up to here, the partitions array has been formed. It contains the dataset split by label combinations.
     * It holds both the attributes and the labels, but for clustering the input should only be the attributes,
     * so we need to delete the labels. This is taken care of by initializePopulation().
     */
    return partitions;
}
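The two overloads above depend on the LCS framework (AbstractLearningClassifierSystem, SettingsLoader). Below is a dependency-free sketch of the same grouping idea — bucketing instances by their label-combination string using only the Weka API. The file name and the assumption that the last four attributes are labels are illustrative, not from the original source:

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Map;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class LabelCombinationGrouping {
    public static void main(String[] args) throws Exception {
        Instances set = DataSource.read("multilabel.arff"); // hypothetical multi-label ARFF
        int numberOfLabels = 4;                              // assumed: labels are the last 4 attributes

        // Group instance indices by their label-combination string, as partitionInstances() does.
        Map<String, ArrayList<Integer>> groups = new LinkedHashMap<String, ArrayList<Integer>>();
        for (int i = 0; i < set.numInstances(); i++) {
            StringBuilder key = new StringBuilder();
            for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
                key.append((int) set.instance(i).value(j));
            }
            groups.computeIfAbsent(key.toString(), k -> new ArrayList<Integer>()).add(i);
        }

        // One Instances partition per distinct label combination.
        Instances[] partitions = new Instances[groups.size()];
        int p = 0;
        for (ArrayList<Integer> indices : groups.values()) {
            partitions[p] = new Instances(set, indices.size());
            for (int idx : indices) {
                partitions[p].add(set.instance(idx));
            }
            p++;
        }
        System.out.println(partitions.length + " label combinations in " + set.numInstances() + " instances");
    }
}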
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
public static void splitDatasetIntoFolds(final AbstractLearningClassifierSystem lcs, final Instances dataset,
        final int numberOfFolds) throws Exception {

    Instances[] partitions = InstancesUtility.partitionInstances(lcs, dataset);

    testInstances.setSize(partitions.length);
    trainInstances.setSize(partitions.length);

    int lowerBound = (int) Math.floor((double) dataset.numInstances() / (double) numberOfFolds);
    int upperBound = (int) Math.ceil((double) dataset.numInstances() / (double) numberOfFolds);
    // we demand lowerBound <= numberOfTestInstancesPerFold[i] <= upperBound
    int[] numberOfTestInstancesPerFold = new int[numberOfFolds];

    /*
     * Let X partitions have partitions[i].numInstances() > numberOfFolds.
     * Then the vectors testInstances and trainInstances, after the call of splitPartitionIntoFolds(),
     * will hold X arrays, meaning X elements.
     */
    Vector<Integer> vectorOfPartitionIndices = new Vector<Integer>();

    for (int i = 0; i < partitions.length; i++) {
        if (partitions[i].numInstances() > numberOfFolds) {
            InstancesUtility.splitPartitionIntoFolds(partitions[i], numberOfFolds, i);
            vectorOfPartitionIndices.add(i);
        } else {
            Instances[] emptyArrayTest = new Instances[numberOfFolds];
            Instances[] emptyArrayTrain = new Instances[numberOfFolds];
            for (int j = 0; j < numberOfFolds; j++) {
                emptyArrayTest[j] = new Instances(partitions[0], partitions[i].numInstances());
                emptyArrayTrain[j] = new Instances(partitions[0], partitions[i].numInstances());
            }
            // placeholders
            InstancesUtility.testInstances.add(i, emptyArrayTest);
            InstancesUtility.trainInstances.add(i, emptyArrayTrain);
        }
    }

    /*
     * At this point all partitions with numInstances > numFolds have successfully been split.
     * What is left is splitting the leftovers: first from the above partitions and second from the ones
     * that originally had numInstances < numFolds.
     */
    for (int i = 0; i < numberOfFolds; i++) {
        int instancesSum = 0;
        for (int j = 0; j < vectorOfPartitionIndices.size(); j++) {
            instancesSum += InstancesUtility.testInstances.elementAt(vectorOfPartitionIndices.elementAt(j))[i]
                    .numInstances();
        }
        // initial number of instances in test set per fold
        numberOfTestInstancesPerFold[i] = instancesSum;
    }

    /*
     * i = 0 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 1 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 2 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 3 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 4 |_0|_0|_0|_0|_0|_0|_0|_0|_0|_0|
     * i = 5 |_1|_1|_1|_1|_1|_1|_1|_1|_1|_1|
     * i = 6 |_3|_3|_3|_3|_3|_3|_3|_3|_3|_3|
     * i = 7 |_6|_6|_6|_6|_6|_6|_6|_6|_6|_6|
     */
    for (int i = 0; i < partitions.length; i++) {
        int numberOfLeftoverInstances = partitions[i].numInstances() % numberOfFolds; // e.g. 64 % 10 = 4
        Instances leftoverInstances = new Instances(partitions[i], numberOfLeftoverInstances);

        if (numberOfLeftoverInstances > 0) {
            /*
             * Starting from the end. Anyhow, they are the last {numberOfLeftoverInstances} instances in each
             * partition that splitPartitionIntoFolds() has been called on.
             */
            for (int k = partitions[i].numInstances() - 1; k >= partitions[i].numInstances()
                    - numberOfLeftoverInstances; k--) {
                leftoverInstances.add(partitions[i].instance(k));
            }

            /*
             * For each partition, randomize the folds. Leftover instances will be placed in the first
             * {numberOfLeftoverInstances} folds, which are already randomly distributed. If the first folds
             * were not randomly distributed, there would be an uneven distribution, meaning that the first
             * ones would hold instances of the first partition and so on.
             */
            ArrayList<Integer> folds = new ArrayList<Integer>();
            for (int k = 0; k < numberOfFolds; k++) {
                folds.add(k);
            }
            Collections.shuffle(folds);

            int j = 0;
            while (leftoverInstances.numInstances() > 0) {
                int foldIndex = folds.get(j);
                if (numberOfTestInstancesPerFold[foldIndex] < upperBound) {
                    Instance toBeAdded = leftoverInstances.instance(0);
                    // place the first leftover instance in a test set
                    testInstances.elementAt(i)[foldIndex].add(toBeAdded);
                    numberOfTestInstancesPerFold[foldIndex]++;
                    // the instance placed in the test set for the current fold needs to be put in the train
                    // set for all the other folds, except of course for the current one
                    for (int k = 0; k < numberOfFolds; k++) {
                        if (k != foldIndex) {
                            trainInstances.elementAt(i)[k].add(toBeAdded);
                        }
                    }
                    // remove the instance placed in the test set
                    leftoverInstances.delete(0);
                }
                j++;
                // if j hits the roof, reset it; there may still exist folds that have not reached
                // their upper limit, and they must not be abandoned
                if (j == numberOfFolds)
                    j = 0;
            }
        }
    }
}
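For comparison, stock Weka can produce folds directly with Instances.trainCV()/testCV(). A short sketch (the ARFF file name is hypothetical) that uses numInstances() to verify the per-fold sizes:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class FoldSizeCheck {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("dataset.arff"); // hypothetical file name
        data.setClassIndex(data.numAttributes() - 1);
        int numberOfFolds = 10;
        for (int fold = 0; fold < numberOfFolds; fold++) {
            Instances train = data.trainCV(numberOfFolds, fold);
            Instances test = data.testCV(numberOfFolds, fold);
            // Every instance ends up in exactly one of the two sets for a given fold.
            System.out.println("fold " + fold + ": train=" + train.numInstances()
                    + " test=" + test.numInstances()
                    + " total=" + (train.numInstances() + test.numInstances()));
        }
    }
}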
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
/**
 * Splits a partition (a collection of instances that belong to the same label combination) into train and
 * test sets, leaving leftover instances. It presupposes that partition.numInstances() > numberOfFolds.
 *
 * Leftover instances should be distributed in a way that each test set holds
 *
 * floor(totalNumInstances / numberOfFolds) <= testSetNumInstances <= ceil(totalNumInstances / numberOfFolds)
 */
public static void splitPartitionIntoFolds(Instances partition, int numberOfFolds, int partitionIndex) {

    int numberOfTestInstancesPerFold = partition.numInstances() / numberOfFolds; // e.g. 64 / 10 = 6
    int numberOfLeftoverInstances = partition.numInstances() % numberOfFolds; // e.g. 64 % 10 = 4
    int numberOfTrainInstancesPerFold = partition.numInstances() - numberOfTestInstancesPerFold
            - numberOfLeftoverInstances; // e.g. 64 - 6 - 4 = 54

    Instances[] testArrayPerPartition = new Instances[numberOfFolds];
    Instances[] trainArrayPerPartition = new Instances[numberOfFolds];

    Instances bulk = new Instances(partition, partition.numInstances() - numberOfLeftoverInstances);
    /*
     * E.g. I will split 64 total instances into 6 for testing, 54 for training, and the rest (4) will be
     * leftovers. 6 + 54 = 60 ~ 10.
     * The first 60 instances will be temporarily placed in the bulk set.
     */
    for (int i = 0; i < partition.numInstances() - numberOfLeftoverInstances; i++) {
        bulk.add(partition.instance(i));
    }

    for (int i = 0; i < numberOfFolds; i++) {
        testArrayPerPartition[i] = InstancesUtility.getPartitionSegment(bulk, i, numberOfFolds);
        trainArrayPerPartition[i] = new Instances(bulk, numberOfFolds);
        for (int j = 0; j < numberOfFolds; j++) {
            if (j != i) {
                for (int k = 0; k < numberOfTestInstancesPerFold; k++) {
                    Instance kthInstance = InstancesUtility.getPartitionSegment(bulk, j, numberOfFolds)
                            .instance(k);
                    trainArrayPerPartition[i].add(kthInstance);
                }
            }
        }
    }

    /*
     * In total, there will be partitions.length additions.
     * Place each array in its respective place, depending on the partition index.
     */
    InstancesUtility.testInstances.add(partitionIndex, testArrayPerPartition);
    InstancesUtility.trainInstances.add(partitionIndex, trainArrayPerPartition);
}
From source file:gr.demokritos.iit.cpgislanddetection.CpGIslandDetection.java
License:Apache License
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, ParseException, Exception {

    // String sFileNameArgs = args[0];
    // String[] fileNames = null;

    // Read file
    //IGenomicSequenceFileReader reader = new SequenceListFileReader();

    // String seq = "GCTCTTGACTTTCAGACTTCCTGAAAACAACGTTCTGGTAAGGACAAGGGTT";
    //
    // CpGIslandIdentification iClass = new CpGIslandIdentification();
    // boolean b = iClass.identify(seq);
    // System.out.println("This sequence is a CpG island: " + b);

    // SequenceListFileReader s = new SequenceListFileReader();
    // ArrayList<BaseSequence> alRes = new ArrayList<>();
    //
    // alRes = s.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
    // for (int i = 0; i < alRes.size(); i++)
    //     System.out.println("alRes = " + i + alRes.get(i));

    // VectorAnalyzer vA = new VectorAnalyzer();
    // List<Vector<Integer>> listVector = new ArrayList<>();
    // Vector<Vector<Integer>> list =
    // listVector = vA.analyze(alRes);
    // for (int i = 0; i < listVector.size(); i++)
    //     System.out.println(i + " " + listVector.get(i));

    // IGenomicSequenceFileReader reader = new FASTAFileReader();

    // If no input file has been given
    /*
    if (args.length == 0) { // Use default
        fileNames[0] = "C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt";
        fileNames[1] = "C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt";
        fileNames[2] = "C:\\Users\\Xenia\\Desktop\\files\\newsamples.txt";
    } else { // else use the provided one
        fileNames = sFileNameArgs.split(";");
    }
    */

    //-----------------VECTOR ANALYSIS STARTS HERE--------------------------------------

    // read sequences from txt files
    SequenceListFileReader reader = new SequenceListFileReader();
    ArrayList<BaseSequence> lSeqs1 = new ArrayList<>();
    ArrayList<BaseSequence> lSeqs2 = new ArrayList<>();
    lSeqs1 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
    lSeqs2 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt");

    // create vectors for every sequence
    List<Vector<Integer>> listVectorForPositiveSamples = new ArrayList<>();
    List<Vector<Integer>> listVectorForNegativeSamples = new ArrayList<>();
    VectorAnalyzer v = new VectorAnalyzer();
    listVectorForPositiveSamples = v.analyze(lSeqs1);
    listVectorForNegativeSamples = v.analyze(lSeqs2);

    // create ARFF files for positive and negative samples
    FileCreatorARFF fc = new FileCreatorARFF();
    Instances positiveInstances = fc.createARFF(listVectorForPositiveSamples, "yes");
    Instances negativeInstances = fc.createARFF(listVectorForNegativeSamples, "no");
    //System.out.println(positiveInstances);

    // build and train classifier
    // setting class attribute
    positiveInstances.setClassIndex(positiveInstances.numAttributes() - 1);
    negativeInstances.setClassIndex(negativeInstances.numAttributes() - 1);

    // train NaiveBayes
    NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
    nb.buildClassifier(positiveInstances);
    nb.buildClassifier(negativeInstances);
    Instance current;
    for (int i = 0; i < positiveInstances.numInstances(); i++) {
        current = positiveInstances.instance(i);
        nb.updateClassifier(current);
    }

    // Test the model
    Evaluation eTest = new Evaluation(positiveInstances);
    Instances isTestingSet = fc.createARFF(listVectorForNegativeSamples, "?");
    isTestingSet.setClassIndex(isTestingSet.numAttributes() - 1);
    eTest.evaluateModel(nb, isTestingSet);

    //------------------VECTOR ANALYSIS ENDS HERE---------------------------------------

    //----------------------------HMM CLASSIFIER STARTS HERE----------------------------------
    // Init classifier
    /*
    ISequenceClassifier<List<ObservationDiscrete<HmmSequence.Packet>>> classifier = new HmmClassifier();
    */
    // WARNING: Remember to change when you have normal data!!!
    // Obfuscation in negative training file?
    // final boolean bObfuscateNeg = true;
    // FASTAObfuscatorReader r = new FASTAObfuscatorReader();

    // for each file do the same work: train
    // for (int i = 0; i < 3; i++) {
    // Read the sequences
    // If obfuscation is on and we are dealing with the negative training file
    /*
    if ((i == 2) && (bObfuscateNeg)) {
        //FASTAObfuscatorReader r = new FASTAObfuscatorReader();
        lSeqs = r.getSequencesFromFile(fileNames[i]);
        fileNames[1] = "Not" + fileNames[1]; // Update to indicate different class
    } else {
        // else read normally
        lSeqs = reader.getSequencesFromFile(fileNames[i]);
    }
    System.out.println("lSeqs size=" + lSeqs.size());
    */

    // Create HMM sequences
    /*
    ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
    List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);

    // Train classifier with the observations
    classifier.train(lHmmSeqs, new File(fileNames[i]).getName());
    }

    // Classify the test file
    // First: Read the sequences
    lSeqs = r.getSequencesFromFile(fileNames[2]);
    //System.out.println("file name= " + fileNames[2]);

    // Then: Create HMM sequences
    ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
    List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);
    */
    //-------------------------------HMM CLASSIFIER ENDS HERE-----------------------------------------

    /*
    //----------------------------HMM EVALUATION STARTS-----------------------------------------------
    //System.out.println("size of lHmmSeqs=" + lHmmSeqs.size());
    String str = null;
    String[] savedResults = new String[lHmmSeqs.size()];

    // create a 2x2 array to store successes and failures for each class
    int[][] matrix = new int[2][2];
    int successForCpG = 0, failForCpG = 0, successForNotCpG = 0, failForNotCpG = 0;

    // Init identifier
    // CpGIslandIdentification identifier = new CpGIslandIdentification();
    CpGIslandIdentification identifier = new CpGIslandIdentificationByList("CpG_hg18.fa");

    for (int i = 0; i < lHmmSeqs.size(); i++) {
        // DEBUG
        System.err.print(".");
        if (i % 10 == 0)
            System.err.println();
        ////////
        str = classifier.classify(lHmmSeqs.get(i));
        // System.out.println("i=" + i);
        System.out.println("Determined class:" + str);
        // savedResults[i] = str;
        // call a function that checks whether the sequence satisfies the CpG criteria
        if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[0])) {
            // Success for CpG class
            successForCpG++;
            System.out.println("successForCpG" + successForCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[1])) {
            // fail for CpG class
            failForCpG++;
            System.out.println("failForCpG" + failForCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[1])) {
            //System.out.println(i);
            // Success for Not CpG class
            successForNotCpG++;
            System.out.println("successForNotCpG" + successForNotCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[0])) {
            // fail for Not CpG class
            failForNotCpG++;
            System.out.println("failForNotCpG" + failForNotCpG);
        }
    }

    // Evaluation: calculation of classification rate and accuracy
    double totalAccuracy = (successForNotCpG + successForCpG)
            / (successForCpG + failForCpG + failForNotCpG + successForNotCpG);

    // misclassification rate for CpG class
    double rate1 = (failForCpG + successForCpG) != 0 ? failForCpG / (failForCpG + successForCpG) : 0.0;
    // misclassification rate for Not CpG class
    double rate2 = (failForNotCpG + successForNotCpG) != 0
            ? failForNotCpG / (failForNotCpG + successForNotCpG) : 0.0;
    System.out.println(totalAccuracy + " " + rate1 + " " + rate2);

    NGramGraphClassifier nGramGraphClassifier = new NGramGraphClassifier();
    List<List<DocumentNGramGraph>> representation;
    NGramGraphAnalyzer myAnalyst = new NGramGraphAnalyzer();
    representation = myAnalyst.analyze(lSeqs);
    for (int i = 0; i < representation.size(); i++)
        nGramGraphClassifier.classify(representation.get(i));
    */
}
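The active part of this example follows Weka's standard incremental-learning pattern: build the updateable classifier on a header, then feed instances one by one up to numInstances(). A self-contained sketch of that pattern; the ARFF file names are assumptions:

import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class IncrementalNaiveBayes {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("train.arff"); // hypothetical files
        Instances test = DataSource.read("test.arff");
        train.setClassIndex(train.numAttributes() - 1);
        test.setClassIndex(test.numAttributes() - 1);

        // buildClassifier() only fixes the header; instances are fed one by one afterwards.
        NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
        nb.buildClassifier(new Instances(train, 0));
        for (int i = 0; i < train.numInstances(); i++) {
            nb.updateClassifier(train.instance(i));
        }

        Evaluation eval = new Evaluation(train);
        eval.evaluateModel(nb, test);
        System.out.println(eval.toSummaryString());
    }
}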
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via options.
 *
 * @param data
 *            set of instances serving as training data
 * @throws Exception
 *             if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder)
        m_Assignments = clusterAssignments;

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder)
        initInstances = new Instances(instances);
    else
        initInstances = instances;

    if (m_initializeWithKMeansPlusPlus) {
        kMeansPlusPlusInit(initInstances);
    } else {
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);
            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    startExecutorPool();

    long start = System.currentTimeMillis();
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        System.out.print(new Date() + ": " + "Iter " + m_Iterations + " ");

        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < instances.numInstances(); i++) {
                Instance toCluster = instances.instance(i);
                int newC = clusterProcessedInstance(toCluster, true, true);
                if (newC != clusterAssignments[i]) {
                    converged = false;
                }
                clusterAssignments[i] = newC;
            }
        } else {
            converged = launchAssignToClusters(instances, clusterAssignments);
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < m_NumClusters; i++) {
                if (tempI[i].numInstances() == 0) {
                    // empty cluster
                    emptyClusterCount++;
                } else {
                    moveCentroid(i, tempI[i], true, true);
                }
            }
        } else {
            emptyClusterCount = launchMoveCentroids(tempI);
        }

        if (m_Iterations == m_MaxIterations)
            converged = true;

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index++] = tempI[k];
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
        System.out.println("Sum of within cluster distances: " + Utils.sum(m_squaredErrors));
        // reset errors to zero
        m_squaredErrors = new double[m_NumClusters];
    }
    long end = System.currentTimeMillis();
    System.out.println("\nClustering completed in " + (end - start) + " ms and converged in " + m_Iterations
            + " iterations");

    // calculate errors
    if (!m_FastDistanceCalc) {
        for (i = 0; i < instances.numInstances(); i++) {
            clusterProcessedInstance(instances.instance(i), true, false);
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Utils.missingValue();
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }

    m_executorPool.shutdown();
}
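This buildClusterer() is a modified copy of Weka's SimpleKMeans with extra console output. A minimal sketch of driving the stock SimpleKMeans the same way — the feature file, cluster count, and the choice to preserve instance order are assumptions:

import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class KMeansDemo {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("features.arff"); // hypothetical file
        SimpleKMeans km = new SimpleKMeans();
        km.setNumClusters(8);
        km.setPreserveInstancesOrder(true);
        km.buildClusterer(data); // clusterers expect no class attribute to be set
        // numInstances() bounds the assignment loop, exactly as in the code above.
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println(i + " -> cluster " + km.clusterInstance(data.instance(i)));
        }
        System.out.println("Centroids:\n" + km.getClusterCentroids());
    }
}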
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
protected void kMeansPlusPlusInit(Instances data) throws Exception {
    Random randomO = new Random(getSeed());
    HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>();

    // choose initial center uniformly at random
    int index = randomO.nextInt(data.numInstances());
    m_ClusterCentroids.add(data.instance(index));
    DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true);
    initC.put(hk, null);

    int iteration = 0;
    int remainingInstances = data.numInstances() - 1;
    if (m_NumClusters > 1) {
        // proceed with selecting the rest

        // distances to the initial randomly chosen center
        double[] distances = new double[data.numInstances()];
        double[] cumProbs = new double[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            distances[i] = m_DistanceFunction.distance(data.instance(i), m_ClusterCentroids.instance(iteration));
        }

        // now choose the remaining cluster centers
        for (int i = 1; i < m_NumClusters; i++) {
            // distances converted to probabilities
            double[] weights = new double[data.numInstances()];
            System.arraycopy(distances, 0, weights, 0, distances.length);
            Utils.normalize(weights);

            double sumOfProbs = 0;
            for (int k = 0; k < data.numInstances(); k++) {
                sumOfProbs += weights[k];
                cumProbs[k] = sumOfProbs;
            }
            cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no rounding issues

            // choose a random instance
            double prob = randomO.nextDouble();
            for (int k = 0; k < cumProbs.length; k++) {
                if (prob < cumProbs[k]) {
                    Instance candidateCenter = data.instance(k);
                    hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true);
                    if (!initC.containsKey(hk)) {
                        initC.put(hk, null);
                        m_ClusterCentroids.add(candidateCenter);
                    } else {
                        // we shouldn't get here because any instance that is a duplicate of
                        // an already chosen cluster center should have zero distance (and hence
                        // zero probability of getting chosen) to that center.
                        System.err.println("We shouldn't get here....");
                    }
                    remainingInstances--;
                    break;
                }
            }
            iteration++;

            if (remainingInstances == 0) {
                break;
            }

            // prepare to choose the next cluster center.
            // check distances against the new cluster center to see if it is closer
            for (int k = 0; k < data.numInstances(); k++) {
                if (distances[k] > 0) {
                    double newDist = m_DistanceFunction.distance(data.instance(k),
                            m_ClusterCentroids.instance(iteration));
                    if (newDist < distances[k]) {
                        distances[k] = newDist;
                    }
                }
            }
        }
    }
}
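The core of kMeansPlusPlusInit() is distance-proportional sampling via a cumulative-probability array. A tiny plain-Java illustration of just that sampling step, with made-up distances:

import java.util.Random;

public class WeightedPick {
    // Pick an index with probability proportional to its weight (here, a distance),
    // using the same cumulative-probability trick as kMeansPlusPlusInit above.
    static int pick(double[] distances, Random rnd) {
        double total = 0;
        for (double d : distances) total += d;
        double[] cumProbs = new double[distances.length];
        double sum = 0;
        for (int k = 0; k < distances.length; k++) {
            sum += distances[k] / total;
            cumProbs[k] = sum;
        }
        cumProbs[distances.length - 1] = 1.0; // guard against rounding issues
        double prob = rnd.nextDouble();
        for (int k = 0; k < cumProbs.length; k++) {
            if (prob < cumProbs[k]) return k;
        }
        return cumProbs.length - 1;
    }

    public static void main(String[] args) {
        double[] distances = { 0.0, 2.0, 6.0, 2.0 }; // index 2 is picked about 60% of the time
        System.out.println(pick(distances, new Random(42)));
    }
}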
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
/**
 * Move the centroid to its new coordinates. Generate the centroid coordinates based on its members
 * (objects assigned to the cluster of the centroid) and the distance function being used.
 *
 * @param centroidIndex
 *            index of the centroid whose coordinates will be computed
 * @param members
 *            the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo
 *            if the method is supposed to update the m_Cluster arrays
 * @param addToCentroidInstances
 *            true if the method is to add the computed coordinates to the Instances holding the centroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo,
        boolean addToCentroidInstances) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {
        // in case of Euclidean distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                if (dataIsEven) {
                    vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (addToCentroidInstances) {
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    }
    return vals;
}
From source file:gr.uoc.nlp.opinion.analysis.suggestion.AnalyzeSuggestions.java
/**
 * @param classifier
 * @param unclassified
 * @return
 */
public Instances classify(Classifier classifier, Instances unclassified) {

    unclassified.setClassIndex(unclassified.numAttributes() - 1);

    // new set which will contain the classified instances
    Instances classified = new Instances(unclassified);

    double clsLabel;
    try {
        for (int i = 0; i < unclassified.numInstances(); i++) {
            // for each unclassified instance, classify it
            clsLabel = classifier.classifyInstance(unclassified.instance(i));
            // append the result to the final set
            classified.instance(i).setClassValue(clsLabel);
        }
    } catch (Exception ex) {
        Logger.getLogger(AnalyzeArguments.class.getName()).log(Level.SEVERE, null, ex);
    }
    return classified;
}
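A hedged sketch of how a helper like classify() is typically used: train on a labeled ARFF, then assign class values to an unlabeled copy. The J48 classifier and the file names are assumptions, not taken from AnalyzeSuggestions:

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class LabelUnclassified {
    public static void main(String[] args) throws Exception {
        Instances train = DataSource.read("labeled.arff");       // hypothetical file
        Instances unlabeled = DataSource.read("unlabeled.arff"); // same header, '?' class values
        train.setClassIndex(train.numAttributes() - 1);

        J48 tree = new J48();
        tree.buildClassifier(train);

        // Same pattern as classify() above: iterate up to numInstances() and set each class value.
        unlabeled.setClassIndex(unlabeled.numAttributes() - 1);
        Instances labeled = new Instances(unlabeled);
        for (int i = 0; i < unlabeled.numInstances(); i++) {
            double clsLabel = tree.classifyInstance(unlabeled.instance(i));
            labeled.instance(i).setClassValue(clsLabel);
        }
        System.out.println(labeled);
    }
}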