Example usage for weka.core Instances numAttributes

List of usage examples for weka.core Instances numAttributes

Introduction

In this page you can find the example usage for weka.core Instances numAttributes.

Prototype


publicint numAttributes() 

Source Link

Document

Returns the number of attributes.

Usage

From source file:classifiers.mlp.MultilayerPerceptronCustom.java

License:Open Source License

/**
 * Call this function to build and train a neural network for the training
 * data provided./*from  w ww  . j  a va2s  .c  om*/
 * @param i The training data.
 * @throws Exception if can't build classification properly.
 */
public void buildClassifier(Instances i) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(i);

    // remove instances with missing class
    i = new Instances(i);
    i.deleteWithMissingClass();

    m_ZeroR = new weka.classifiers.rules.ZeroR();
    m_ZeroR.buildClassifier(i);
    // only class? -> use ZeroR model
    if (i.numAttributes() == 1) {
        System.err.println(
                "Cannot build model (only class attribute present in data!), " + "using ZeroR model instead!");
        m_useDefaultModel = true;
        return;
    } else {
        m_useDefaultModel = false;
    }

    m_epoch = 0;
    m_error = 0;
    m_instances = null;
    m_currentInstance = null;
    m_controlPanel = null;
    m_nodePanel = null;

    m_outputs = new NeuralEnd[0];
    m_inputs = new NeuralEnd[0];
    m_numAttributes = 0;
    m_numClasses = 0;
    m_neuralNodes = new NeuralConnection[0];

    m_selected = new FastVector(4);
    m_graphers = new FastVector(2);
    m_nextId = 0;
    m_stopIt = true;
    m_stopped = true;
    m_accepted = false;
    m_instances = new Instances(i);
    m_random = new Random(m_randomSeed);
    m_instances.randomize(m_random);

    if (m_useNomToBin) {
        m_nominalToBinaryFilter = new NominalToBinary();
        m_nominalToBinaryFilter.setInputFormat(m_instances);
        m_instances = Filter.useFilter(m_instances, m_nominalToBinaryFilter);
    }
    m_numAttributes = m_instances.numAttributes() - 1;
    m_numClasses = m_instances.numClasses();

    setClassType(m_instances);

    //this sets up the validation set.
    Instances valSet = null;
    //numinval is needed later
    int numInVal = (int) (m_valSize / 100.0 * m_instances.numInstances());
    if (m_valSize > 0) {
        if (numInVal == 0) {
            numInVal = 1;
        }
        valSet = new Instances(m_instances, 0, numInVal);
    }
    ///////////

    setupInputs();

    setupOutputs();
    if (m_autoBuild) {
        setupHiddenLayer();
    }

    /////////////////////////////
    //this sets up the gui for usage
    if (m_gui) {
        m_win = new JFrame();

        m_win.addWindowListener(new WindowAdapter() {
            public void windowClosing(WindowEvent e) {
                boolean k = m_stopIt;
                m_stopIt = true;
                int well = JOptionPane
                        .showConfirmDialog(m_win,
                                "Are You Sure...\n" + "Click Yes To Accept" + " The Neural Network"
                                        + "\n Click No To Return",
                                "Accept Neural Network", JOptionPane.YES_NO_OPTION);

                if (well == 0) {
                    m_win.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE);
                    m_accepted = true;
                    blocker(false);
                } else {
                    m_win.setDefaultCloseOperation(JFrame.DO_NOTHING_ON_CLOSE);
                }
                m_stopIt = k;
            }
        });

        m_win.getContentPane().setLayout(new BorderLayout());
        m_win.setTitle("Neural Network");
        m_nodePanel = new NodePanel();
        // without the following two lines, the NodePanel.paintComponents(Graphics) 
        // method will go berserk if the network doesn't fit completely: it will
        // get called on a constant basis, using 100% of the CPU
        // see the following forum thread:
        // http://forum.java.sun.com/thread.jspa?threadID=580929&messageID=2945011
        m_nodePanel.setPreferredSize(new Dimension(640, 480));
        m_nodePanel.revalidate();

        JScrollPane sp = new JScrollPane(m_nodePanel, JScrollPane.VERTICAL_SCROLLBAR_ALWAYS,
                JScrollPane.HORIZONTAL_SCROLLBAR_NEVER);
        m_controlPanel = new ControlPanel();

        m_win.getContentPane().add(sp, BorderLayout.CENTER);
        m_win.getContentPane().add(m_controlPanel, BorderLayout.SOUTH);
        m_win.setSize(640, 480);
        m_win.setVisible(true);
    }

    //This sets up the initial state of the gui
    if (m_gui) {
        blocker(true);
        m_controlPanel.m_changeEpochs.setEnabled(false);
        m_controlPanel.m_changeLearning.setEnabled(false);
        m_controlPanel.m_changeMomentum.setEnabled(false);
    }

    //For silly situations in which the network gets accepted before training
    //commenses
    if (m_numeric) {
        setEndsToLinear();
    }
    if (m_accepted) {
        m_win.dispose();
        m_controlPanel = null;
        m_nodePanel = null;
        m_instances = new Instances(m_instances, 0);
        m_currentInstance = null;
        return;
    }

    //connections done.
    double right = 0;
    double driftOff = 0;
    double lastRight = Double.POSITIVE_INFINITY;
    double bestError = Double.POSITIVE_INFINITY;
    double tempRate;
    double totalWeight = 0;
    double totalValWeight = 0;
    double origRate = m_learningRate; //only used for when reset

    //ensure that at least 1 instance is trained through.
    if (numInVal == m_instances.numInstances()) {
        numInVal--;
    }
    if (numInVal < 0) {
        numInVal = 0;
    }
    for (int noa = numInVal; noa < m_instances.numInstances(); noa++) {
        if (!m_instances.instance(noa).classIsMissing()) {
            totalWeight += m_instances.instance(noa).weight();
        }
    }
    if (m_valSize != 0) {
        for (int noa = 0; noa < valSet.numInstances(); noa++) {
            if (!valSet.instance(noa).classIsMissing()) {
                totalValWeight += valSet.instance(noa).weight();
            }
        }
    }
    m_stopped = false;

    for (int noa = 1; noa < m_numEpochs + 1; noa++) {
        right = 0;
        for (int nob = numInVal; nob < m_instances.numInstances(); nob++) {
            m_currentInstance = m_instances.instance(nob);

            if (!m_currentInstance.classIsMissing()) {

                //this is where the network updating (and training occurs, for the
                //training set
                resetNetwork();
                calculateOutputs();
                tempRate = m_learningRate * m_currentInstance.weight();
                if (m_decay) {
                    tempRate /= noa;
                }

                right += (calculateErrors() / m_instances.numClasses()) * m_currentInstance.weight();
                updateNetworkWeights(tempRate, m_momentum);

            }

        }
        right /= totalWeight;
        if (Double.isInfinite(right) || Double.isNaN(right)) {
            if (!m_reset) {
                m_instances = null;
                throw new Exception("Network cannot train. Try restarting with a" + " smaller learning rate.");
            } else {
                //reset the network if possible
                if (m_learningRate <= Utils.SMALL)
                    throw new IllegalStateException(
                            "Learning rate got too small (" + m_learningRate + " <= " + Utils.SMALL + ")!");
                m_learningRate /= 2;
                buildClassifier(i);
                m_learningRate = origRate;
                m_instances = new Instances(m_instances, 0);
                m_currentInstance = null;
                return;
            }
        }

        ////////////////////////do validation testing if applicable
        if (m_valSize != 0) {
            right = 0;
            for (int nob = 0; nob < valSet.numInstances(); nob++) {
                m_currentInstance = valSet.instance(nob);
                if (!m_currentInstance.classIsMissing()) {
                    //this is where the network updating occurs, for the validation set
                    resetNetwork();
                    calculateOutputs();
                    right += (calculateErrors() / valSet.numClasses()) * m_currentInstance.weight();
                    //note 'right' could be calculated here just using
                    //the calculate output values. This would be faster.
                    //be less modular
                }

            }

            if (right < lastRight) {

                if (right < bestError) {
                    bestError = right;
                    // save the network weights at this point
                    for (int noc = 0; noc < m_numClasses; noc++) {
                        m_outputs[noc].saveWeights();
                    }
                    driftOff = 0;
                }
            } else {
                driftOff++;
            }
            lastRight = right;
            if (driftOff > m_driftThreshold || noa + 1 >= m_numEpochs) {
                for (int noc = 0; noc < m_numClasses; noc++) {
                    m_outputs[noc].restoreWeights();
                }
                m_accepted = true;
            }
            right /= totalValWeight;
        }
        m_epoch = noa;
        m_error = right;
        //shows what the neuralnet is upto if a gui exists. 
        updateDisplay();
        //This junction controls what state the gui is in at the end of each
        //epoch, Such as if it is paused, if it is resumable etc...
        if (m_gui) {
            while ((m_stopIt || (m_epoch >= m_numEpochs && m_valSize == 0)) && !m_accepted) {
                m_stopIt = true;
                m_stopped = true;
                if (m_epoch >= m_numEpochs && m_valSize == 0) {

                    m_controlPanel.m_startStop.setEnabled(false);
                } else {
                    m_controlPanel.m_startStop.setEnabled(true);
                }
                m_controlPanel.m_startStop.setText("Start");
                m_controlPanel.m_startStop.setActionCommand("Start");
                m_controlPanel.m_changeEpochs.setEnabled(true);
                m_controlPanel.m_changeLearning.setEnabled(true);
                m_controlPanel.m_changeMomentum.setEnabled(true);

                blocker(true);
                if (m_numeric) {
                    setEndsToLinear();
                }
            }
            m_controlPanel.m_changeEpochs.setEnabled(false);
            m_controlPanel.m_changeLearning.setEnabled(false);
            m_controlPanel.m_changeMomentum.setEnabled(false);

            m_stopped = false;
            //if the network has been accepted stop the training loop
            if (m_accepted) {
                m_win.dispose();
                m_controlPanel = null;
                m_nodePanel = null;
                m_instances = new Instances(m_instances, 0);
                m_currentInstance = null;
                return;
            }
        }
        if (m_accepted) {
            m_instances = new Instances(m_instances, 0);
            m_currentInstance = null;
            return;
        }

        //TODO:
        // Customization: store the model created after this epoch
        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream("mlp/temp/" + noa + ".model"));
        oos.writeObject(this);
        oos.flush();
        oos.close();
    }

    if (m_gui) {
        m_win.dispose();
        m_controlPanel = null;
        m_nodePanel = null;
    }
    m_instances = new Instances(m_instances, 0);
    m_currentInstance = null;
}

From source file:classify.Classifier.java

/**
 * @param args the command line arguments
 *///from   ww  w  . j av  a2 s .c om
public static void main(String[] args) {
    //read in data
    try {
        DataSource input = new DataSource("no_missing_values.csv");
        Instances data = input.getDataSet();
        //Instances data = readFile("newfixed.txt");
        missingValuesRows(data);

        setAttributeValues(data);
        data.setClassIndex(data.numAttributes() - 1);

        //boosting
        AdaBoostM1 boosting = new AdaBoostM1();
        boosting.setNumIterations(25);
        boosting.setClassifier(new DecisionStump());

        //build the classifier
        boosting.buildClassifier(data);

        //evaluate using 10-fold cross validation
        Evaluation e1 = new Evaluation(data);
        e1.crossValidateModel(boosting, data, 10, new Random(1));

        DecimalFormat nf = new DecimalFormat("0.000");

        System.out.println("Results of Boosting with Decision Stumps:");
        System.out.println(boosting.toString());
        System.out.println("Results of Cross Validation:");
        System.out.println("Number of correctly classified instances: " + e1.correct() + " ("
                + nf.format(e1.pctCorrect()) + "%)");
        System.out.println("Number of incorrectly classified instances: " + e1.incorrect() + " ("
                + nf.format(e1.pctIncorrect()) + "%)");

        System.out.println("TP Rate: " + nf.format(e1.weightedTruePositiveRate() * 100) + "%");
        System.out.println("FP Rate: " + nf.format(e1.weightedFalsePositiveRate() * 100) + "%");
        System.out.println("Precision: " + nf.format(e1.weightedPrecision() * 100) + "%");
        System.out.println("Recall: " + nf.format(e1.weightedRecall() * 100) + "%");

        System.out.println();
        System.out.println("Confusion Matrix:");
        for (int i = 0; i < e1.confusionMatrix().length; i++) {
            for (int j = 0; j < e1.confusionMatrix()[0].length; j++) {
                System.out.print(e1.confusionMatrix()[i][j] + "   ");
            }
            System.out.println();
        }
        System.out.println();
        System.out.println();
        System.out.println();

        //logistic regression
        Logistic l = new Logistic();
        l.buildClassifier(data);

        e1 = new Evaluation(data);

        e1.crossValidateModel(l, data, 10, new Random(1));
        System.out.println("Results of Logistic Regression:");
        System.out.println(l.toString());
        System.out.println("Results of Cross Validation:");
        System.out.println("Number of correctly classified instances: " + e1.correct() + " ("
                + nf.format(e1.pctCorrect()) + "%)");
        System.out.println("Number of incorrectly classified instances: " + e1.incorrect() + " ("
                + nf.format(e1.pctIncorrect()) + "%)");

        System.out.println("TP Rate: " + nf.format(e1.weightedTruePositiveRate() * 100) + "%");
        System.out.println("FP Rate: " + nf.format(e1.weightedFalsePositiveRate() * 100) + "%");
        System.out.println("Precision: " + nf.format(e1.weightedPrecision() * 100) + "%");
        System.out.println("Recall: " + nf.format(e1.weightedRecall() * 100) + "%");

        System.out.println();
        System.out.println("Confusion Matrix:");
        for (int i = 0; i < e1.confusionMatrix().length; i++) {
            for (int j = 0; j < e1.confusionMatrix()[0].length; j++) {
                System.out.print(e1.confusionMatrix()[i][j] + "   ");
            }
            System.out.println();
        }

    } catch (Exception ex) {
        //data couldn't be read, so end program
        System.out.println("Exception thrown, program ending.");
    }
}

From source file:cluster.ABC.ClusterUtils.java

License:Open Source License

/** Fast version of meanOrMode - streamlined from Instances.meanOrMode for efficiency 
 *  Does not check for missing attributes, assumes numeric attributes, assumes Sparse instances
 *///  ww w  . j  ava2 s . c o m

public static double[] meanOrMode(Instances insts) {

    int numAttributes = insts.numAttributes();
    double[] value = new double[numAttributes];
    double weight = 0;

    for (int i = 0; i < numAttributes; i++) {
        value[i] = 0;
    }

    for (int j = 0; j < insts.numInstances(); j++) {
        SparseInstance inst = (SparseInstance) (insts.instance(j));
        weight += inst.weight();

        for (int i = 0; i < inst.numValues(); i++) {
            int indexOfIndex = inst.index(i);
            value[indexOfIndex] += inst.weight() * inst.valueSparse(i);
        }
    }

    if (Utils.eq(weight, 0)) {
        for (int k = 0; k < numAttributes; k++) {
            value[k] = 0;
        }
    } else {
        for (int k = 0; k < numAttributes; k++) {
            value[k] = value[k] / weight;
        }
    }

    return value;
}

From source file:clusterer.SimpleKMeansWithSilhouette.java

License:Open Source License

/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options./*from ww w  .  j  av a 2 s .  com*/
 * 
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {

    m_canopyClusters = null;

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][];
    m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = instances.variances();
    }

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false);

    m_FullMissingCounts = m_ClusterMissingCounts[0];
    m_FullNominalCounts = m_ClusterNominalCounts[0];
    double sumOfWeights = instances.sumOfWeights();
    for (int i = 0; i < instances.numAttributes(); i++) {
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]);
            }
            if (m_FullMissingCounts[i] == sumOfWeights) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common
                                                     // value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder) {
        m_Assignments = clusterAssignments;
    }

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap<DecisionTableHashKey, Integer> initC = new HashMap<DecisionTableHashKey, Integer>();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder) {
        initInstances = new Instances(instances);
    } else {
        initInstances = instances;
    }

    if (m_speedUpDistanceCompWithCanopies) {
        m_canopyClusters = new Canopy();
        m_canopyClusters.setNumClusters(m_NumClusters);
        m_canopyClusters.setSeed(getSeed());
        m_canopyClusters.setT2(getCanopyT2());
        m_canopyClusters.setT1(getCanopyT1());
        m_canopyClusters.setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
        m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
        m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
        m_canopyClusters.setDebug(getDebug());
        m_canopyClusters.buildClusterer(initInstances);
        // System.err.println(m_canopyClusters);
        m_centroidCanopyAssignments = new ArrayList<long[]>();
        m_dataPointCanopyAssignments = new ArrayList<long[]>();
    }

    if (m_initializationMethod == KMEANS_PLUS_PLUS) {
        kMeansPlusPlusInit(initInstances);

        m_initialStartPoints = new Instances(m_ClusterCentroids);
    } else if (m_initializationMethod == CANOPY) {
        canopyInit(initInstances);

        m_initialStartPoints = new Instances(m_canopyClusters.getCanopies());
    } else if (m_initializationMethod == FARTHEST_FIRST) {
        farthestFirstInit(initInstances);

        m_initialStartPoints = new Instances(m_ClusterCentroids);
    } else {
        // random
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(),
                    true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);

            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }

        m_initialStartPoints = new Instances(m_ClusterCentroids);
    }

    if (m_speedUpDistanceCompWithCanopies) {
        // assign canopies to training data
        for (int i = 0; i < instances.numInstances(); i++) {
            m_dataPointCanopyAssignments.add(m_canopyClusters.assignCanopies(instances.instance(i)));
        }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
    startExecutorPool();

    while (!converged) {
        if (m_speedUpDistanceCompWithCanopies) {
            // re-assign canopies to the current cluster centers
            m_centroidCanopyAssignments.clear();
            for (int kk = 0; kk < m_ClusterCentroids.numInstances(); kk++) {
                m_centroidCanopyAssignments
                        .add(m_canopyClusters.assignCanopies(m_ClusterCentroids.instance(kk)));
            }
        }

        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;

        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < instances.numInstances(); i++) {
                Instance toCluster = instances.instance(i);
                int newC = clusterProcessedInstance(toCluster, false, true,
                        m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments.get(i) : null);
                if (newC != clusterAssignments[i]) {
                    converged = false;
                }
                clusterAssignments[i] = newC;
            }
        } else {
            converged = launchAssignToClusters(instances, clusterAssignments);
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < m_NumClusters; i++) {
                if (tempI[i].numInstances() == 0) {
                    // empty cluster
                    emptyClusterCount++;
                } else {
                    moveCentroid(i, tempI[i], true, true);
                }
            }
        } else {
            emptyClusterCount = launchMoveCentroids(tempI);
        }

        if (m_Iterations == m_MaxIterations) {
            converged = true;
        }

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index] = tempI[k];

                        for (i = 0; i < tempI[k].numAttributes(); i++) {
                            m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
                        }
                        index++;
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
        }
    }

    // calculate errors
    if (!m_FastDistanceCalc) {
        for (i = 0; i < instances.numInstances(); i++) {
            clusterProcessedInstance(instances.instance(i), true, false, null);
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new double[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = tempI[i].variances();
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(vals2[j]);
                } else {
                    vals2[j] = Utils.missingValue();
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].sumOfWeights();
    }

    m_executorPool.shutdown();

    // save memory!
    m_DistanceFunction.clean();

    // Calculate Silhouette Coefficient
    SilCoeff = new double[instances.numInstances()];
    AvgSilCoeff = 0;
    for (int z = 0; z < instances.numInstances(); z++) {
        double[] distance = new double[m_NumClusters];
        Arrays.fill(distance, 0.0);
        //Sum
        for (int y = 0; y < instances.numInstances(); y++) {
            distance[clusterAssignments[y]] += m_DistanceFunction.distance(instances.get(z), instances.get(y));
        }
        //Average
        for (int x = 0; x < m_NumClusters; x++) {
            distance[x] = distance[x] / m_ClusterSizes[x];
        }
        double a = distance[clusterAssignments[z]];
        distance[clusterAssignments[z]] = Double.MAX_VALUE;
        Arrays.sort(distance);
        double b = distance[0];
        SilCoeff[z] = (b - a) / Math.max(a, b);
        AvgSilCoeff += SilCoeff[z];
    }
    AvgSilCoeff = AvgSilCoeff / instances.numInstances();
    //System.out.println("AvgSilCoeff: " + AvgSilCoeff);
}

From source file:clusterer.SimpleKMeansWithSilhouette.java

License:Open Source License

/**
 * Initialize using the k-means++ method
 * /*www  . ja  va 2 s.c o m*/
 * @param data the training data
 * @throws Exception if a problem occurs
 */
protected void kMeansPlusPlusInit(Instances data) throws Exception {
    Random randomO = new Random(getSeed());
    HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>();

    // choose initial center uniformly at random
    int index = randomO.nextInt(data.numInstances());
    m_ClusterCentroids.add(data.instance(index));
    DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true);
    initC.put(hk, null);

    int iteration = 0;
    int remainingInstances = data.numInstances() - 1;
    if (m_NumClusters > 1) {
        // proceed with selecting the rest

        // distances to the initial randomly chose center
        double[] distances = new double[data.numInstances()];
        double[] cumProbs = new double[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            distances[i] = m_DistanceFunction.distance(data.instance(i),
                    m_ClusterCentroids.instance(iteration));
        }

        // now choose the remaining cluster centers
        for (int i = 1; i < m_NumClusters; i++) {

            // distances converted to probabilities
            double[] weights = new double[data.numInstances()];
            System.arraycopy(distances, 0, weights, 0, distances.length);
            Utils.normalize(weights);

            double sumOfProbs = 0;
            for (int k = 0; k < data.numInstances(); k++) {
                sumOfProbs += weights[k];
                cumProbs[k] = sumOfProbs;
            }

            cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no
                                                     // rounding issues

            // choose a random instance
            double prob = randomO.nextDouble();
            for (int k = 0; k < cumProbs.length; k++) {
                if (prob < cumProbs[k]) {
                    Instance candidateCenter = data.instance(k);
                    hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true);
                    if (!initC.containsKey(hk)) {
                        initC.put(hk, null);
                        m_ClusterCentroids.add(candidateCenter);
                    } else {
                        // we shouldn't get here because any instance that is a duplicate
                        // of
                        // an already chosen cluster center should have zero distance (and
                        // hence
                        // zero probability of getting chosen) to that center.
                        System.err.println("We shouldn't get here....");
                    }
                    remainingInstances--;
                    break;
                }
            }
            iteration++;

            if (remainingInstances == 0) {
                break;
            }

            // prepare to choose the next cluster center.
            // check distances against the new cluster center to see if it is closer
            for (int k = 0; k < data.numInstances(); k++) {
                if (distances[k] > 0) {
                    double newDist = m_DistanceFunction.distance(data.instance(k),
                            m_ClusterCentroids.instance(iteration));
                    if (newDist < distances[k]) {
                        distances[k] = newDist;
                    }
                }
            }
        }
    }
}

From source file:clusterer.SimpleKMeansWithSilhouette.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid
 * coordinates based on it's members (objects assigned to the cluster of the
 * centroid) and the distance function being used.
 * /*w ww .  jav  a  2 s .  co m*/
 * @param centroidIndex index of the centroid which the coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays
 * @param addToCentroidInstances true if the method is to add the computed
 *          coordinates to the Instances holding the centroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo,
        boolean addToCentroidInstances) {

    double[] vals = new double[members.numAttributes()];
    double[][] nominalDists = new double[members.numAttributes()][];
    double[] weightMissing = new double[members.numAttributes()];
    double[] weightNonMissing = new double[members.numAttributes()];

    // Quickly calculate some relevant statistics 
    for (int j = 0; j < members.numAttributes(); j++) {
        if (members.attribute(j).isNominal()) {
            nominalDists[j] = new double[members.attribute(j).numValues()];
        }
    }
    for (Instance inst : members) {
        for (int j = 0; j < members.numAttributes(); j++) {
            if (inst.isMissing(j)) {
                weightMissing[j] += inst.weight();
            } else {
                weightNonMissing[j] += inst.weight();
                if (members.attribute(j).isNumeric()) {
                    vals[j] += inst.weight() * inst.value(j); // Will be overwritten in Manhattan case
                } else {
                    nominalDists[j][(int) inst.value(j)] += inst.weight();
                }
            }
        }
    }
    for (int j = 0; j < members.numAttributes(); j++) {
        if (members.attribute(j).isNumeric()) {
            if (weightNonMissing[j] > 0) {
                vals[j] /= weightNonMissing[j];
            } else {
                vals[j] = Utils.missingValue();
            }
        } else {
            double max = -Double.MAX_VALUE;
            double maxIndex = -1;
            for (int i = 0; i < nominalDists[j].length; i++) {
                if (nominalDists[j][i] > max) {
                    max = nominalDists[j][i];
                    maxIndex = i;
                }
                if (max < weightMissing[j]) {
                    vals[j] = Utils.missingValue();
                } else {
                    vals[j] = maxIndex;
                }
            }
        }
    }

    if (m_DistanceFunction instanceof ManhattanDistance) {

        // Need to replace means by medians
        Instances sortedMembers = null;
        int middle = (members.numInstances() - 1) / 2;
        boolean dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
        for (int j = 0; j < members.numAttributes(); j++) {
            if ((weightNonMissing[j] > 0) && members.attribute(j).isNumeric()) {
                // singleton special case
                if (members.numInstances() == 1) {
                    vals[j] = members.instance(0).value(j);
                } else {
                    vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                    if (dataIsEven) {
                        vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                    }
                }
            }
        }
    }

    if (updateClusterInfo) {
        for (int j = 0; j < members.numAttributes(); j++) {
            m_ClusterMissingCounts[centroidIndex][j] = weightMissing[j];
            m_ClusterNominalCounts[centroidIndex][j] = nominalDists[j];
        }
    }

    if (addToCentroidInstances) {
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    }

    return vals;
}

From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java

License:Open Source License

/**
 * Generates a clusterer. Has to initialize all fields of the clusterer
 * that are not being set via options./*from   w  w  w  .ja v  a  2s .c  o  m*/
 *
 * @param data set of instances serving as training data 
 * @throws Exception if the clusterer has not been 
 * generated successfully
 */
public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder)
        m_Assignments = clusterAssignments;

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder)
        initInstances = new Instances(instances);
    else
        initInstances = instances;

    if (m_initializeWithKMeansPlusPlus) {
        kMeansPlusPlusInit(initInstances);
    } else {
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(),
                    true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);

            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    //removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        for (i = 0; i < instances.numInstances(); i++) {
            Instance toCluster = instances.instance(i);
            int newC = clusterProcessedInstance(toCluster, false, true);
            if (newC != clusterAssignments[i]) {
                converged = false;
            }
            clusterAssignments[i] = newC;
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        for (i = 0; i < m_NumClusters; i++) {
            if (tempI[i].numInstances() == 0) {
                // empty cluster
                emptyClusterCount++;
            } else {
                moveCentroid(i, tempI[i], true);
            }
        }

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index++] = tempI[k];
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (m_Iterations == m_MaxIterations)
            converged = true;

        if (!converged) {
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
    }

    // calculate errors
    if (!m_FastDistanceCalc) {
        for (i = 0; i < instances.numInstances(); i++) {
            clusterProcessedInstance(instances.instance(i), true, false);
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Utils.missingValue();
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }
}

From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java

License:Open Source License

protected void kMeansPlusPlusInit(Instances data) throws Exception {
    Random randomO = new Random(getSeed());
    HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>();

    // choose initial center uniformly at random
    int index = randomO.nextInt(data.numInstances());
    m_ClusterCentroids.add(data.instance(index));
    DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index), data.numAttributes(), true);
    initC.put(hk, null);/*from  w w  w.jav a  2s  .c o m*/

    int iteration = 0;
    int remainingInstances = data.numInstances() - 1;
    if (m_NumClusters > 1) {
        // proceed with selecting the rest

        // distances to the initial randomly chose center
        double[] distances = new double[data.numInstances()];
        double[] cumProbs = new double[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            distances[i] = m_DistanceFunction.distance(data.instance(i),
                    m_ClusterCentroids.instance(iteration));
        }

        // now choose the remaining cluster centers
        for (int i = 1; i < m_NumClusters; i++) {

            // distances converted to probabilities
            double[] weights = new double[data.numInstances()];
            System.arraycopy(distances, 0, weights, 0, distances.length);
            Utils.normalize(weights);

            double sumOfProbs = 0;
            for (int k = 0; k < data.numInstances(); k++) {
                sumOfProbs += weights[k];
                cumProbs[k] = sumOfProbs;
            }

            cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no rounding issues

            // choose a random instance
            double prob = randomO.nextDouble();
            for (int k = 0; k < cumProbs.length; k++) {
                if (prob < cumProbs[k]) {
                    Instance candidateCenter = data.instance(k);
                    hk = new DecisionTableHashKey(candidateCenter, data.numAttributes(), true);
                    if (!initC.containsKey(hk)) {
                        initC.put(hk, null);
                        m_ClusterCentroids.add(candidateCenter);
                    } else {
                        // we shouldn't get here because any instance that is a duplicate of
                        // an already chosen cluster center should have zero distance (and hence
                        // zero probability of getting chosen) to that center.
                        System.err.println("We shouldn't get here....");
                    }
                    remainingInstances--;
                    break;
                }
            }
            iteration++;

            if (remainingInstances == 0) {
                break;
            }

            // prepare to choose the next cluster center.
            // check distances against the new cluster center to see if it is closer
            for (int k = 0; k < data.numInstances(); k++) {
                if (distances[k] > 0) {
                    double newDist = m_DistanceFunction.distance(data.instance(k),
                            m_ClusterCentroids.instance(iteration));
                    if (newDist < distances[k]) {
                        distances[k] = newDist;
                    }
                }
            }
        }
    }
}

From source file:cn.edu.xmu.dm.d3c.clustering.SimpleKMeans.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid coordinates based 
 * on it's  members (objects assigned to the cluster of the centroid) and the distance 
 * function being used./*from   ww w.  j a  v  a  2  s . c o m*/
 * @param centroidIndex index of the centroid which the coordinates will be computed
 * @param members the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    //used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        //in case of Euclidian distance the centroid is the mean point
        //in case of Manhattan distance the centroid is the median point
        //in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            //singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                sortedMembers.kthSmallestValue(j, middle + 1);
                vals[j] = sortedMembers.instance(middle).value(j);
                if (dataIsEven) {
                    sortedMembers.kthSmallestValue(j, middle + 2);
                    vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo)
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    return vals;
}

From source file:cn.ict.zyq.bestConf.bestConf.BestConf.java

License:Open Source License

public static Instance findBestPerf(Instances data) {
    int idx = data.numAttributes() - 1;
    double bestPerf = data.attributeStats(idx).numericStats.max;
    for (int i = 0; i < data.numInstances(); i++)
        if (data.get(i).value(idx) == bestPerf)
            return data.get(i);
    return null;//should never return NULL
}