Example usage for weka.core Instances numInstances

List of usage examples for weka.core Instances numInstances

Introduction

On this page you can find example usage for weka.core Instances numInstances.

Prototype


public int numInstances()

Document

Returns the number of instances in the dataset.
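
For orientation, the following minimal sketch shows the pattern the examples below rely on: numInstances() is used as the loop bound when iterating over a dataset. The class name NumInstancesDemo and the file name "iris.arff" are illustrative placeholders, not taken from the examples.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NumInstancesDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset with Weka's converter utilities (the path is a placeholder).
        Instances data = DataSource.read("iris.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // numInstances() returns the number of rows in the dataset.
        System.out.println("Dataset contains " + data.numInstances() + " instances");

        // The usual iteration pattern: numInstances() as the loop bound, instance(i) for access.
        for (int i = 0; i < data.numInstances(); i++) {
            System.out.println(data.instance(i));
        }
    }
}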

Usage

From source file: entity.NoiseInjectionManager.java

License: Open Source License

/**
 * Increments fp and fn by the specified percentages.
 * Randomizes the order of instances and modifies them until the noise quota is reached.
 * Then randomizes the instances again.
 * NOTE: It modifies the given dataset, because it is passed by reference.
 *
 * @param origDataset
 * @param fpPercentage
 * @param fnPercentage
 * @return Instances noisyDataset
 */
public Instances addNoiseToDataset(Instances origDataset, BigDecimal fpPercentage, BigDecimal fnPercentage) {

    // exits if no noise must be added
    if (fnPercentage.equals(BigDecimal.ZERO) && fpPercentage.equals(BigDecimal.ZERO)) {
        if (verbose)
            System.out.println("[NoiseManager , addNoiseToDataset] nessun errore da aggiungere");
        return origDataset;
    }

    // total instances in dataset
    int numInstances = origDataset.numInstances();

    // counts positive (buggy) and negative (non-buggy) instances
    int numOfPositives = 0;
    int numOfNegatives = 0;

    for (int j = 0; j < numInstances; j++) {

        if (origDataset.instance(j).stringValue(origDataset.classIndex()).equals(Settings.buggyLabel)) {
            numOfPositives++;
        }
        // this is a redundant control, but better safe than sorry
        else if (origDataset.instance(j).stringValue(origDataset.classIndex()).equals(Settings.nonbuggyLabel)) {
            numOfNegatives++;
        }
    }

    // calculates the number of false positives to insert
    int fpToInsert = (int) Math.round(numOfNegatives * fpPercentage.doubleValue() / 100);
    int fpInserted = 0;
    if (verbose)
        System.out.println("\n\n[NoiseManager , addNoiseToDataset] fpToInsert= " + fpToInsert
                + ", totIntances= " + origDataset.numInstances() + " true negatives= " + numOfNegatives
                + " %fp= " + fpPercentage);

    // calculates the number of false negatives to insert
    int fnToInsert = (int) Math.round(numOfPositives * fnPercentage.doubleValue() / 100);
    int fnInserted = 0;
    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] fnToInsert= " + fnToInsert + ", totIntances= "
                + origDataset.numInstances() + " true positives= " + numOfPositives + " %fn= " + fnPercentage);

    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] buggy label: " + Settings.buggyLabel
                + " - nonbuggy label: " + Settings.nonbuggyLabel);

    // randomize order of instances
    origDataset.randomize(RandomizationManager.randomGenerator);

    for (int i = 0; i < origDataset.numInstances(); i++) {
        if (verbose)
            System.out.print("\nORIGINAL VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));

        // gets the classification attribute (it HAS to be the last)
        Attribute att = origDataset.instance(i).attribute(origDataset.classIndex());

        // if there are fn left to add and this is a positive instance, turn it into a negative, making it a fn
        if ((fnInserted < fnToInsert) && (origDataset.instance(i).stringValue(origDataset.classIndex())
                .equals(Settings.buggyLabel))) {

            origDataset.instance(i).setValue(att, Settings.nonbuggyLabel);
            fnInserted++;
            if (verbose)
                System.out.print(" - added FN, added " + fnInserted + " of " + fnToInsert + " ");
        }

        // if there are fp left to add and this is a negative instance, turn it into a positive, making it a fp
        else if ((fpInserted < fpToInsert) && (origDataset.instance(i).stringValue(origDataset.classIndex())
                .equals(Settings.nonbuggyLabel))) {

            origDataset.instance(i).setValue(att, Settings.buggyLabel);
            fpInserted++;
            if (verbose)
                System.out.print(" - added FP, added " + fpInserted + " of " + fpToInsert + " ");

        }

        if (verbose)
            System.out.print(" FINAL ELEMENT VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));
    }

    // randomize order of instances
    origDataset.randomize(RandomizationManager.randomGenerator);
    return origDataset;
}

From source file: entity.NoiseInjectionManager.java

License: Open Source License

/**
 * Increments fp and fn in combination by a specified percentage.
 * Randomizes the order of instances and modifies them until the noise quota is reached.
 * Then randomizes the instances again.
 * NOTE: It modifies the given dataset, because it is passed by reference.
 *
 * @param origDataset
 * @param combinedFpFnPercentage
 * @return noisydata
 */
public Instances addNoiseToDataset(Instances origDataset, BigDecimal combinedFpFnPercentage) {

    // exits if no noise must be added
    if (combinedFpFnPercentage.equals(BigDecimal.ZERO)) {
        if (verbose)
            System.out.println("[NoiseManager , addNoiseToDataset] nessun errore da aggiungere");
        return origDataset;
    }

    // total instances in dataset
    int numInstances = origDataset.numInstances();

    // calculates the combined number of false positives and false negatives to insert
    int fpAndFnToInsert = (int) Math.round(numInstances * combinedFpFnPercentage.doubleValue() / 100);
    int fpAndFnInserted = 0;
    if (verbose)
        System.out.println("\n\n[NoiseManager , addNoiseToDataset] fpAndFnToInsert= " + fpAndFnToInsert
                + ", totIntances= " + origDataset.numInstances());

    if (verbose)
        System.out.println("[NoiseManager , addNoiseToDataset] buggy label: " + Settings.buggyLabel
                + " - nonbuggy label: " + Settings.nonbuggyLabel);

    // randomize order of instances
    origDataset.randomize(RandomizationManager.randomGenerator);

    for (int i = 0; i < origDataset.numInstances(); i++) {
        if (verbose)
            System.out.print("\nORIGINAL VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));

        // gets the classification attribute (it HAS to be the last)
        Attribute att = origDataset.instance(i).attribute(origDataset.classIndex());

        // if there are fn or fp to add  
        if (fpAndFnInserted < fpAndFnToInsert) {

            // if this is a positive instance, turn it into a negative, making it a fn
            if (origDataset.instance(i).stringValue(origDataset.classIndex()).equals(Settings.buggyLabel)) {

                if (verbose)
                    System.out.print(" - added FN, added " + fpAndFnInserted + " of " + fpAndFnToInsert + " ");
                origDataset.instance(i).setValue(att, Settings.nonbuggyLabel);
                fpAndFnInserted++;
            }

            // if this is a negative instance, turn it into a positive, making it a fp
            else if (origDataset.instance(i).stringValue(origDataset.classIndex())
                    .equals(Settings.nonbuggyLabel)) {

                if (verbose)
                    System.out.print(" - added FP, added " + fpAndFnInserted + " of " + fpAndFnToInsert + " ");
                origDataset.instance(i).setValue(att, Settings.buggyLabel);
                fpAndFnInserted++;
            }
        }

        if (verbose)
            System.out.print(" FINAL ELEMENT VALUES: "
                    + origDataset.instance(i).value(origDataset.attribute(origDataset.classIndex())) + " - "
                    + origDataset.instance(i).stringValue(origDataset.classIndex()));
    }

    // randomize order of instances
    origDataset.randomize(RandomizationManager.randomGenerator);
    return origDataset;
}

From source file: ergasia2pkg.LP_ROS.java

/**
 * Groups instances by their label sets
 *
 * @param mlData labeled multi-label instances
 * @return HashMap<String, List<Instance>> with the grouping
 */
public HashMap groupByLabelSet(MultiLabelInstances mlData) {

    Instances inst = mlData.getDataSet();

    Set<Attribute> atts = mlData.getLabelAttributes();
    HashMap LabelSetGroups = new HashMap<String, List<Instance>>();

    for (int i = 0; i < inst.numInstances(); i++) {
        Instance in = inst.get(i);
        String labelsetName = "";
        for (Attribute att : atts) {
            if (in.value(att) != 0) {
                labelsetName = labelsetName + att.name();
            }

        }
        if (LabelSetGroups.containsKey(labelsetName)) {
            List myList = (List) LabelSetGroups.get(labelsetName);
            myList.add(in);
            LabelSetGroups.put(labelsetName, myList);
        } else {
            List<Instance> myList = new ArrayList<Instance>();
            myList.add(in);
            LabelSetGroups.put(labelsetName, myList);
        }

    }

    return LabelSetGroups;
}

From source file: es.jarias.FMC.ClassCompoundTransformation.java

License: Open Source License

/**
 * @param mlData
 * @return the transformed instances
 * @throws Exception
 */
public Instances transformInstances(MultiLabelInstances mlData) throws Exception {
    data = mlData.getDataSet();
    numLabels = mlData.getNumLabels();
    labelIndices = mlData.getLabelIndices();

    Instances newData = null;

    // This must be different in order to combine ALL class states, not only existing ones.
    // gather distinct label combinations
    // ASSUME CLASSES ARE BINARY

    ArrayList<LabelSet> labelSets = new ArrayList<LabelSet>();

    double[] dblLabels = new double[numLabels];
    double nCombinations = Math.pow(2, numLabels);

    for (int i = 0; i < nCombinations; i++) {
        for (int l = 0; l < numLabels; l++) {
            int digit = (int) Math.pow(2, numLabels - 1 - l);
            dblLabels[l] = (digit & i) / digit;
        }

        LabelSet labelSet = new LabelSet(dblLabels);
        labelSets.add(labelSet);
    }

    //        for (int i = 0; i < numInstances; i++) {
    //            // construct labelset
    //            double[] dblLabels = new double[numLabels];
    //            for (int j = 0; j < numLabels; j++) {
    //                int index = labelIndices[j];
    //                dblLabels[j] = Double.parseDouble(data.attribute(index).value((int) data.instance(i).value(index)));
    //            }
    //            LabelSet labelSet = new LabelSet(dblLabels);
    //
    //            // add labelset if not already present
    //            labelSets.add(labelSet);
    //        }

    // create class attribute
    ArrayList<String> classValues = new ArrayList<String>(labelSets.size());
    for (LabelSet subset : labelSets) {
        classValues.add(subset.toBitString());
    }
    newClass = new Attribute("class", classValues);

    //        for (String s : classValues)
    //        {
    //           System.out.print(s+", ");
    //           
    //        }
    //        System.out.println();

    // remove all labels
    newData = RemoveAllLabels.transformInstances(data, labelIndices);

    // add new class attribute
    newData.insertAttributeAt(newClass, newData.numAttributes());
    newData.setClassIndex(newData.numAttributes() - 1);

    // add class values
    for (int i = 0; i < newData.numInstances(); i++) {
        //System.out.println(newData.instance(i).toString());
        String strClass = "";
        for (int j = 0; j < numLabels; j++) {
            int index = labelIndices[j];
            strClass = strClass + data.attribute(index).value((int) data.instance(i).value(index));
        }
        //System.out.println(strClass);
        newData.instance(i).setClassValue(strClass);
    }
    transformedFormat = new Instances(newData, 0);
    return newData;
}

From source file: es.jarias.FMC.FMC.java

License: Open Source License

public static double[][] mutualInfo(Instances data, int[] indexes) {

    double[][] m_counts = new double[indexes.length][];
    double[][][] m_2counts = new double[indexes.length][indexes.length][];

    double[] nValues = new double[indexes.length];

    double[][] I = new double[indexes.length][indexes.length];

    for (int i = 0; i < indexes.length; i++) {
        nValues[i] = data.attribute(indexes[i]).numValues();
        m_counts[i] = new double[(int) nValues[i]];
    }

    for (int i = 0; i < indexes.length; i++) {
        for (int j = 0; j < indexes.length; j++) {
            if (i != j) {
                double cardinality = nValues[i] * nValues[j];
                m_2counts[i][j] = new double[(int) cardinality];
            }
        }
    }

    // Compute counts:
    for (Instance d : data) {
        for (int i = 0; i < indexes.length; i++) {
            m_counts[i][(int) d.value(indexes[i])]++;
            for (int j = 0; j < indexes.length; j++) {
                if (i != j) {
                    int index = (int) (d.value(indexes[j]) * nValues[i] + d.value(indexes[i]));
                    m_2counts[i][j][index]++;
                }
            }
        }
    }

    // Calculate MI(X_i; X_j)
    for (int i = 0; i < indexes.length; i++) {
        for (int j = 0; j < indexes.length; j++) {
            if (i != j) {
                double mi = 0.0;
                for (int v_i = 0; v_i < nValues[i]; v_i++) {
                    for (int v_j = 0; v_j < nValues[j]; v_j++) {

                        if ((1.0 * data.numInstances() * m_2counts[i][j][(int) (v_j * nValues[i] + v_i)])
                                / (1.0 * m_counts[i][v_i] * m_counts[j][v_j]) > 0)
                            mi += m_2counts[i][j][(int) (v_j * nValues[i] + v_i)] * Math.log((1.0
                                    * data.numInstances() * m_2counts[i][j][(int) (v_j * nValues[i] + v_i)])
                                    / (1.0 * m_counts[i][v_i] * m_counts[j][v_j]));
                    }
                }
                I[i][j] = mi / data.numInstances();
            }
        }
    }

    return I;
}

From source file: es.upm.dit.gsi.barmas.dataset.utils.DatasetSplitter.java

License: Open Source License

/**
 * @param folds
 * @param minAgents
 * @param maxAgents
 * @param originalDatasetPath
 * @param outputDir
 * @param scenario
 * @param logger
 */
public void splitDataset(int folds, int minAgents, int maxAgents, String originalDatasetPath, String outputDir,
        String scenario, Logger logger) {

    int ratioint = (int) ((1 / (double) folds) * 100);
    double roundedratio = ((double) ratioint) / 100;

    // Look for essentials
    List<String[]> essentials = this.getEssentials(originalDatasetPath, logger);

    for (int fold = 0; fold < folds; fold++) {
        String outputDirWithRatio = outputDir + "/" + roundedratio + "testRatio/iteration-" + fold;
        File dir = new File(outputDirWithRatio);
        if (!dir.exists() || !dir.isDirectory()) {
            dir.mkdirs();
        }

        logger.finer("--> splitDataset()");
        logger.fine("Creating experiment.info...");

        try {

            Instances originalData = this.getDataFromCSV(originalDatasetPath);

            originalData.randomize(new Random());
            originalData.stratify(folds);

            // TestDataSet
            Instances testData = originalData.testCV(folds, fold);
            CSVSaver saver = new CSVSaver();
            ArffSaver arffsaver = new ArffSaver();
            File file = new File(outputDirWithRatio + File.separator + "test-dataset.csv");
            if (!file.exists()) {
                saver.resetOptions();
                saver.setInstances(testData);
                saver.setFile(file);
                saver.writeBatch();
            }

            file = new File(outputDirWithRatio + File.separator + "test-dataset.arff");
            if (!file.exists()) {
                arffsaver.resetOptions();
                arffsaver.setInstances(testData);
                arffsaver.setFile(file);
                arffsaver.writeBatch();
            }

            // BayesCentralDataset
            Instances trainData = originalData.trainCV(folds, fold);
            file = new File(outputDirWithRatio + File.separator + "bayes-central-dataset.csv");
            if (!file.exists()) {
                saver.resetOptions();
                saver.setInstances(trainData);
                saver.setFile(file);
                saver.writeBatch();
                this.copyFileUsingApacheCommonsIO(file,
                        new File(
                                outputDirWithRatio + File.separator + "bayes-central-dataset-noEssentials.csv"),
                        logger);
                CsvWriter w = new CsvWriter(new FileWriter(file, true), ',');
                for (String[] essential : essentials) {
                    w.writeRecord(essential);
                }
                w.close();
            }
            file = new File(outputDirWithRatio + File.separator + "bayes-central-dataset.arff");
            if (!file.exists()) {
                arffsaver.resetOptions();
                arffsaver.setInstances(trainData);
                arffsaver.setFile(file);
                arffsaver.writeBatch();
                this.copyFileUsingApacheCommonsIO(file, new File(
                        outputDirWithRatio + File.separator + "bayes-central-dataset-noEssentials.arff"),
                        logger);
                CsvWriter w = new CsvWriter(new FileWriter(file, true), ',');
                for (String[] essential : essentials) {
                    w.writeRecord(essential);
                }
                w.close();
            }

            // Agent datasets
            CsvReader csvreader = new CsvReader(new FileReader(new File(originalDatasetPath)));
            csvreader.readHeaders();
            String[] headers = csvreader.getHeaders();
            csvreader.close();

            for (int agents = minAgents; agents <= maxAgents; agents++) {
                this.createExperimentInfoFile(folds, agents, originalDatasetPath, outputDirWithRatio, scenario,
                        logger);
                HashMap<String, CsvWriter> writers = new HashMap<String, CsvWriter>();
                String agentsDatasetsDir = outputDirWithRatio + File.separator + agents + "agents";
                HashMap<String, CsvWriter> arffWriters = new HashMap<String, CsvWriter>();
                File f = new File(agentsDatasetsDir);
                if (!f.isDirectory()) {
                    f.mkdirs();
                }
                Instances copy = new Instances(trainData);
                copy.delete();
                for (int i = 0; i < agents; i++) {
                    String fileName = agentsDatasetsDir + File.separator + "agent-" + i + "-dataset.csv";
                    file = new File(fileName);
                    if (!file.exists()) {
                        CsvWriter writer = new CsvWriter(new FileWriter(fileName), ',');
                        writer.writeRecord(headers);
                        writers.put("AGENT" + i, writer);
                    }
                    fileName = agentsDatasetsDir + File.separator + "agent-" + i + "-dataset.arff";
                    file = new File(fileName);
                    if (!file.exists()) {
                        arffsaver.resetOptions();
                        arffsaver.setInstances(copy);
                        arffsaver.setFile(new File(fileName));
                        arffsaver.writeBatch();
                        CsvWriter arffwriter = new CsvWriter(new FileWriter(fileName, true), ',');
                        arffWriters.put("AGENT" + i, arffwriter);
                    }

                    logger.fine("AGENT" + i + " dataset created in csv and arff formats.");
                }
                // Append essentials to all
                for (String[] essential : essentials) {
                    for (CsvWriter wr : writers.values()) {
                        wr.writeRecord(essential);
                    }
                    for (CsvWriter arffwr : arffWriters.values()) {
                        arffwr.writeRecord(essential);
                    }
                }

                int agentCounter = 0;
                for (int j = 0; j < trainData.numInstances(); j++) {
                    Instance instance = trainData.instance(j);
                    CsvWriter writer = writers.get("AGENT" + agentCounter);
                    CsvWriter arffwriter = arffWriters.get("AGENT" + agentCounter);
                    String[] row = new String[instance.numAttributes()];
                    for (int a = 0; a < instance.numAttributes(); a++) {
                        row[a] = instance.stringValue(a);
                    }
                    if (writer != null) {
                        writer.writeRecord(row);
                    }
                    if (arffwriter != null) {
                        arffwriter.writeRecord(row);
                    }
                    agentCounter++;
                    if (agentCounter == agents) {
                        agentCounter = 0;
                    }
                }

                for (CsvWriter wr : writers.values()) {
                    wr.close();
                }
                for (CsvWriter arffwr : arffWriters.values()) {
                    arffwr.close();
                }
            }

        } catch (Exception e) {
            logger.severe("Exception while splitting dataset. ->");
            logger.severe(e.getMessage());
            System.exit(1);
        }

        logger.finest("Dataset for fold " + fold + " created.");
    }

    logger.finer("<-- splitDataset()");

}

From source file: es.upm.dit.gsi.barmas.launcher.WekaClassifiersValidator.java

License: Open Source License

/**
 * @param cls
 * @param trainingData
 * @param testData
 * @param leba
 * @return [0] = pctCorrect, [1] = pctIncorrect
 * @throws Exception
 */
public double[] getValidation(Classifier cls, Instances trainingData, Instances testData, int leba)
        throws Exception {

    Instances testDataWithLEBA = new Instances(testData);

    for (int j = 0; j < leba; j++) {
        if (j < testDataWithLEBA.numAttributes() - 1) {
            for (int i = 0; i < testDataWithLEBA.numInstances(); i++) {
                testDataWithLEBA.instance(i).setMissing(j);
            }
        }
    }

    Evaluation eval;
    try {
        eval = new Evaluation(trainingData);
        logger.fine("Evaluating model with leba: " + leba);
        eval.evaluateModel(cls, testDataWithLEBA);

        double[] results = new double[2];
        results[0] = eval.pctCorrect() / 100;
        results[1] = eval.pctIncorrect() / 100;
        return results;
    } catch (Exception e) {
        logger.severe("Problems evaluating model for " + cls.getClass().getSimpleName());
        logger.severe(e.getMessage());
        e.printStackTrace();
        throw e;
    }
}

From source file: etc.aloe.filters.StringToDictionaryVector.java

License: Open Source License

private int[] determineDictionary(Instances instances) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = instances.classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = instances.attribute(classInd).numValues();
    }

    HashMap<String, Integer> termIndices = new HashMap<String, Integer>();
    for (int i = 0; i < termList.size(); i++) {
        termIndices.put(termList.get(i), i);
    }

    //Create the trie for matching terms
    Trie termTrie = new Trie(termList);

    //Initialize the dictionary/count map
    ArrayList<HashMap<Integer, Count>> termCounts = new ArrayList<HashMap<Integer, Count>>();
    for (int z = 0; z < values; z++) {
        termCounts.add(new HashMap<Integer, Count>());
    }

    //Go through all the instances and count the emoticons
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        //Get the string attribute to examine
        String stringValue = instance.stringValue(stringAttributeIndex);

        HashMap<Integer, Count> termCountsForClass = termCounts.get(vInd);

        HashMap<String, Integer> termMatches = termTrie.countNonoverlappingMatches(stringValue);
        for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
            String term = entry.getKey();
            int termIdx = termIndices.get(term);

            int matches = entry.getValue();

            Count count = termCountsForClass.get(termIdx);
            if (count == null) {
                count = new Count(0);
                termCountsForClass.put(termIdx, count);
            }

            if (matches > 0) {
                count.docCount += 1;
                count.count += matches;
            }
        }
    }

    // Figure out the minimum required word frequency
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);

        int array[] = new int[termCountsForClass.size()];
        int pos = 0;
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            array[pos] = entry.getValue().count;
            pos++;
        }

        // sort the array
        sortArray(array);

        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to
            // minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    HashSet<String> selectedTerms = new HashSet<String>();
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);

        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            int termIndex = entry.getKey();
            String term = termList.get(termIndex);
            Count count = entry.getValue();
            if (count.count >= prune[z]) {
                selectedTerms.add(term);
            }
        }
    }

    //Save the selected terms as a list
    this.m_selectedTerms = new ArrayList<String>(selectedTerms);
    this.m_selectedTermsTrie = new Trie(this.m_selectedTerms);
    this.m_NumInstances = instances.size();

    //Construct the selected terms to index map
    this.m_selectedTermIndices = new HashMap<String, Integer>();
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        m_selectedTermIndices.put(m_selectedTerms.get(i), i);
    }

    // Compute document frequencies, organized by selected term index (not original term index)
    int[] docsCounts = new int[m_selectedTerms.size()];
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        String term = m_selectedTerms.get(i);
        int termIndex = termIndices.get(term);
        int docsCount = 0;
        for (int z = 0; z < values; z++) {
            HashMap<Integer, Count> termCountsForClass = termCounts.get(z);

            Count count = termCountsForClass.get(termIndex);
            if (count != null) {
                docsCount += count.docCount;
            }
        }
        docsCounts[i] = docsCount;
    }
    return docsCounts;
}

From source file: eu.cassandra.server.mongo.csn.MongoCluster.java

License: Apache License

/**
 * @param message
 * @param graph_id
 * @param clusterBasedOn
 * @param numberOfClusters
 * @param httpHeaders
 * @return
 */
private DBObject clusterKmeans(String message, String graph_id, String run_id, String clusterBasedOn,
        int numberOfClusters, String name, String clusterbasedon) {
    try {
        Instances instances = getInstances(clusterBasedOn, graph_id);
        if (instances.numInstances() < 2) {
            return new JSONtoReturn().createJSONError(message, new Exception("Number of CSN Nodes is < 2"));
        }

        SimpleKMeans kmeans = new SimpleKMeans();
        kmeans.setSeed((int) Calendar.getInstance().getTimeInMillis());
        // This is the important parameter to set
        kmeans.setPreserveInstancesOrder(true);
        kmeans.setNumClusters(numberOfClusters);
        kmeans.buildClusterer(instances);

        // This array returns the cluster number (starting with 0) for each instance
        // The array has as many elements as the number of instances
        int[] assignments = kmeans.getAssignments();

        int i = 0;
        HashMap<Integer, Vector<String>> clusters = new HashMap<Integer, Vector<String>>();
        for (int clusterNum : assignments) {
            if (clusters.containsKey(clusterNum)) {
                Vector<String> cluster = clusters.get(clusterNum);
                cluster.add(nodeIDs.get(i));
                clusters.put(clusterNum, cluster);
            } else {
                Vector<String> cluster = new Vector<String>();
                cluster.add(nodeIDs.get(i));
                clusters.put(clusterNum, cluster);
            }
            i++;
        }
        nodeIDs.clear();
        return saveClusters(graph_id, run_id, "kmeans", clusters, null, name, clusterbasedon);
    } catch (Exception e) {
        e.printStackTrace();
        return new JSONtoReturn().createJSONError(message, e);
    }
}

From source file: eu.cassandra.server.mongo.csn.MongoCluster.java

License: Apache License

public DBObject clusterHierarchical(String message, String graph_id, String run_id, String clusterBasedOn,
        int numberOfClusters, String name, String clusterbasedon) {
    try {
        Instances instances = getInstances(clusterBasedOn, graph_id);
        if (instances.numInstances() < 2) {
            return new JSONtoReturn().createJSONError(message, new Exception("Number of CSN Nodes is < 2"));
        }

        HierarchicalClusterer h = new HierarchicalClusterer();
        h.setOptions(new String[] { "-L", "AVERAGE" });
        h.setDistanceFunction(new EuclideanDistance());
        if (numberOfClusters > 0)
            h.setNumClusters(numberOfClusters);
        h.buildClusterer(instances);

        HashMap<Integer, Vector<String>> clusters = new HashMap<Integer, Vector<String>>();
        double[] arr;
        for (int i = 0; i < instances.numInstances(); i++) {
            String nodeId = nodeIDs.get(i);
            arr = h.distributionForInstance(instances.instance(i));
            for (int j = 0; j < arr.length; j++) {
                if (arr[j] == 1.0) {
                    if (!clusters.containsKey(j)) {
                        Vector<String> nodes = new Vector<String>();
                        nodes.add(nodeId);
                        clusters.put(j, nodes);
                    } else {
                        Vector<String> nodes = clusters.get(j);
                        nodes.add(nodeId);
                        clusters.put(j, nodes);
                    }
                }
            }
        }
        return saveClusters(graph_id, run_id, "hierarchical", clusters, null, name, clusterbasedon);
    } catch (Exception e) {
        e.printStackTrace();
        return new JSONtoReturn().createJSONError(message, e);
    }
}