Example usage for weka.core Instance index

List of usage examples for weka.core Instance index

Introduction

On this page you can find example usages of weka.core Instance index.

Prototype

public int index(int position);

Source Link

Document

Returns the index of the attribute stored at the given position in the sparse representation.

Usage

From source file:edu.columbia.cs.ltrie.sampling.queries.generation.ChiSquaredWithYatesCorrectionAttributeEval.java

License:Open Source License

/**
 * Initializes a chi-squared attribute evaluator.
 * Discretizes all attributes that are numeric.
 *
 * @param data set of instances serving as training data 
 * @throws Exception if the evaluator has not been 
 * generated successfully/*from   www  .  j  a  v  a2 s . c  o  m*/
 */
public void buildEvaluator(Instances data) throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Initialize counters
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // distribute missing counts if required
    if (m_missing_merge) {

        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Compute marginals
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Compute what needs to be added to each row
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Compute what needs to be added to each column
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Compute what needs to be added to each cell
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Make new contingency table
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Compute chi-squared values
    m_ChiSquareds = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != classIndex) {
            m_ChiSquareds[i] = chiVal(ContingencyTables.reduceMatrix(counts[i]));
        }
    }
}

From source file:feature.InfoGainEval.java

License:Open Source License

/**
 * Initializes an information gain attribute evaluator. Discretizes all
 * attributes that are numeric./*from   w ww .ja va  2  s  .  c o  m*/
 *
 * @param data
 *            set of instances serving as training data
 * @throws Exception
 *             if the evaluator has not been generated successfully
 */
public double computeInfoGain(Instances data, int att) throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Initialize counters
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // distribute missing counts if required
    if (m_missing_merge) {

        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Compute marginals
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Compute what needs to be added to each row
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Compute what needs to be added to each column
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Compute what needs to be added to each cell
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Make new contingency table
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Compute info gains
    m_InfoGains = new double[data.numAttributes()];
    m_InfoGains[att] = (ContingencyTables.entropyOverColumns(counts[att])
            - ContingencyTables.entropyConditionedOnRows(counts[att]));

    return m_InfoGains[att];
}

From source file:feature.InfoGainEval.java

License:Open Source License

/**
 * Builds the evaluator: fills m_InfoGains with the information gain of every
 * non-class attribute. Numeric attributes are first converted with
 * NumericToBinary when m_Binarize is set, and with Discretize otherwise.
 *
 * @param data
 *            set of instances serving as training data
 * @throws Exception
 *             if the data cannot be handled or filtered
 */
public void buildEvaluator(Instances data) throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    final int classIndex = data.classIndex();
    final int instanceCount = data.numInstances();

    if (m_Binarize) {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    } else {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    }
    final int numClasses = data.attribute(classIndex).numValues();
    final int attributeCount = data.numAttributes();

    // Per non-class attribute: a (numValues + 1) x (numClasses + 1) table.
    // The last row / last column take the weight of instances with a missing
    // attribute value / missing class value.
    double[][][] tables = new double[attributeCount][][];
    for (int a = 0; a < attributeCount; a++) {
        if (a == classIndex) {
            continue;
        }
        tables[a] = new double[data.attribute(a).numValues() + 1][numClasses + 1];
    }

    // Weight totals per class value (last slot: class missing)
    double[] classTotals = new double[numClasses + 1];
    for (int n = 0; n < instanceCount; n++) {
        Instance current = data.instance(n);
        int classSlot = current.classIsMissing() ? numClasses : (int) current.classValue();
        classTotals[classSlot] += current.weight();
    }

    // Seed row 0 of every table with the class totals
    for (int a = 0; a < tables.length; a++) {
        if (a != classIndex) {
            System.arraycopy(classTotals, 0, tables[a][0], 0, classTotals.length);
        }
    }

    // Every stored value moves its weight from row 0 to the row of its value
    for (int n = 0; n < instanceCount; n++) {
        Instance current = data.instance(n);
        double weight = current.weight();
        int classSlot = current.classIsMissing() ? numClasses : (int) current.classValue();
        for (int pos = 0; pos < current.numValues(); pos++) {
            int attIndex = current.index(pos);
            if (attIndex == classIndex) {
                continue;
            }
            int valueSlot = current.isMissingSparse(pos)
                    ? data.attribute(attIndex).numValues()
                    : (int) current.valueSparse(pos);
            tables[attIndex][valueSlot][classSlot] += weight;
            tables[attIndex][0][classSlot] -= weight;
        }
    }

    // distribute missing counts if required
    if (m_missing_merge) {
        for (int a = 0; a < attributeCount; a++) {
            if (a == classIndex) {
                continue;
            }
            double[][] table = tables[a];
            int valueCount = data.attribute(a).numValues();

            // Marginals over the non-missing part of the table
            double[] rowTotals = new double[valueCount];
            double[] columnTotals = new double[numClasses];
            double grandTotal = 0;
            for (int r = 0; r < valueCount; r++) {
                for (int c = 0; c < numClasses; c++) {
                    rowTotals[r] += table[r][c];
                    columnTotals[c] += table[r][c];
                }
                grandTotal += rowTotals[r];
            }

            if (!Utils.gr(grandTotal, 0)) {
                continue;
            }

            // Replacement table without the "missing" row and column
            double[][] merged = new double[valueCount][numClasses];
            for (int r = 0; r < valueCount; r++) {
                for (int c = 0; c < numClasses; c++) {
                    // from the missing-attribute row
                    double extra = (rowTotals[r] / grandTotal) * table[valueCount][c];
                    // from the missing-class column
                    extra += (columnTotals[c] / grandTotal) * table[r][numClasses];
                    // from the cell where both are missing
                    extra += (table[r][c] / grandTotal) * table[valueCount][numClasses];
                    merged[r][c] = table[r][c] + extra;
                }
            }
            tables[a] = merged;
        }
    }

    // Compute info gains
    m_InfoGains = new double[attributeCount];
    for (int a = 0; a < attributeCount; a++) {
        if (a == classIndex) {
            continue;
        }
        m_InfoGains[a] = (ContingencyTables.entropyOverColumns(tables[a])
                - ContingencyTables.entropyConditionedOnRows(tables[a]));
    }
}

From source file:FeatureSelection.ReliefFAttributeEval.java

License:Open Source License

/**
 * Updates the minimum and maximum values for all the attributes based on a
 * new instance./*from  ww w. j  a  v a 2 s  . c o m*/
 *
 * @param instance
 *            the new instance
 */
private void updateMinMax(Instance instance) {
    // for (int j = 0; j < m_numAttribs; j++) {
    try {
        for (int j = 0; j < instance.numValues(); j++) {
            if ((instance.attributeSparse(j).isNumeric()) && (!instance.isMissingSparse(j))) {
                if (Double.isNaN(m_minArray[instance.index(j)])) {
                    m_minArray[instance.index(j)] = instance.valueSparse(j);
                    m_maxArray[instance.index(j)] = instance.valueSparse(j);
                } else {
                    if (instance.valueSparse(j) < m_minArray[instance.index(j)]) {
                        m_minArray[instance.index(j)] = instance.valueSparse(j);
                    } else {
                        if (instance.valueSparse(j) > m_maxArray[instance.index(j)]) {
                            m_maxArray[instance.index(j)] = instance.valueSparse(j);
                        }
                    }
                }
            }
        }
    } catch (Exception ex) {
        System.err.println(ex);
        ex.printStackTrace();
    }
}

From source file:FeatureSelection.ReliefFAttributeEval.java

License:Open Source License

/**
 * Calculates the distance between two instances
 *
 * @param first/* w w w .ja va  2s  .  c o m*/
 *            the first instance
 * @param second
 *            the second instance
 * @return the distance between the two given instances, between 0 and 1
 */
private double distance(Instance first, Instance second) {

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues();) {
        if (p1 >= first.numValues()) {
            firstI = m_trainInstances.numAttributes();
        } else {
            firstI = first.index(p1);
        }
        if (p2 >= second.numValues()) {
            secondI = m_trainInstances.numAttributes();
        } else {
            secondI = second.index(p2);
        }
        if (firstI == m_trainInstances.classIndex()) {
            p1++;
            continue;
        }
        if (secondI == m_trainInstances.classIndex()) {
            p2++;
            continue;
        }
        double diff;
        if (firstI == secondI) {
            diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
            p1++;
            p2++;
        } else if (firstI > secondI) {
            diff = difference(secondI, 0, second.valueSparse(p2));
            p2++;
        } else {
            diff = difference(firstI, first.valueSparse(p1), 0);
            p1++;
        }
        // distance += diff * diff;
        distance += diff;
    }

    // return Math.sqrt(distance / m_NumAttributesUsed);
    return distance;
}

From source file:FeatureSelection.ReliefFAttributeEval.java

License:Open Source License

/**
 * update attribute weights given an instance when the class is numeric
 *
 * @param instNum/*  w  ww  . j  a v a2s  .  com*/
 *            the index of the instance to use when updating weights
 */
private void updateWeightsNumericClass(int instNum) {
    int i, j;
    double temp, temp2;
    int[] tempSorted = null;
    double[] tempDist = null;
    double distNorm = 1.0;
    int firstI, secondI;

    Instance inst = m_trainInstances.instance(instNum);

    // sort nearest neighbours and set up normalization variable
    if (m_weightByDistance) {
        tempDist = new double[m_stored[0]];

        for (j = 0, distNorm = 0; j < m_stored[0]; j++) {
            // copy the distances
            tempDist[j] = m_karray[0][j][0];
            // sum normalizer
            distNorm += m_weightsByRank[j];
        }

        tempSorted = Utils.sort(tempDist);
    }

    for (i = 0; i < m_stored[0]; i++) {
        // P diff prediction (class) given nearest instances
        if (m_weightByDistance) {
            temp = difference(m_classIndex, inst.value(m_classIndex),
                    m_trainInstances.instance((int) m_karray[0][tempSorted[i]][1]).value(m_classIndex));
            temp *= (m_weightsByRank[i] / distNorm);
        } else {
            temp = difference(m_classIndex, inst.value(m_classIndex),
                    m_trainInstances.instance((int) m_karray[0][i][1]).value(m_classIndex));
            temp *= (1.0 / (double) m_stored[0]); // equal influence
        }

        m_ndc += temp;

        Instance cmp;
        cmp = (m_weightByDistance) ? m_trainInstances.instance((int) m_karray[0][tempSorted[i]][1])
                : m_trainInstances.instance((int) m_karray[0][i][1]);

        double temp_diffP_diffA_givNearest = difference(m_classIndex, inst.value(m_classIndex),
                cmp.value(m_classIndex));
        // now the attributes
        for (int p1 = 0, p2 = 0; p1 < inst.numValues() || p2 < cmp.numValues();) {
            if (p1 >= inst.numValues()) {
                firstI = m_trainInstances.numAttributes();
            } else {
                firstI = inst.index(p1);
            }
            if (p2 >= cmp.numValues()) {
                secondI = m_trainInstances.numAttributes();
            } else {
                secondI = cmp.index(p2);
            }
            if (firstI == m_trainInstances.classIndex()) {
                p1++;
                continue;
            }
            if (secondI == m_trainInstances.classIndex()) {
                p2++;
                continue;
            }
            temp = 0.0;
            temp2 = 0.0;

            if (firstI == secondI) {
                j = firstI;
                temp = difference(j, inst.valueSparse(p1), cmp.valueSparse(p2));
                p1++;
                p2++;
            } else if (firstI > secondI) {
                j = secondI;
                temp = difference(j, 0, cmp.valueSparse(p2));
                p2++;
            } else {
                j = firstI;
                temp = difference(j, inst.valueSparse(p1), 0);
                p1++;
            }

            temp2 = temp_diffP_diffA_givNearest * temp;
            // P of different prediction and different att value given
            // nearest instances
            if (m_weightByDistance) {
                temp2 *= (m_weightsByRank[i] / distNorm);
            } else {
                temp2 *= (1.0 / (double) m_stored[0]); // equal influence
            }

            m_ndcda[j] += temp2;

            // P of different attribute val given nearest instances
            if (m_weightByDistance) {
                temp *= (m_weightsByRank[i] / distNorm);
            } else {
                temp *= (1.0 / (double) m_stored[0]); // equal influence
            }

            m_nda[j] += temp;
        }
    }
}

From source file:FeatureSelection.ReliefFAttributeEval.java

License:Open Source License

/**
 * Update attribute weights given an instance when the class is discrete.
 * Differences to the stored nearest hits (neighbours of the same class) are
 * subtracted from m_weights; differences to the stored nearest misses
 * (neighbours of every other class) are added.
 *
 * @param instNum
 *            the index of the instance to use when updating weights
 */
private void updateWeightsDiscreteClass(int instNum) {
    // store the indexes (sparse instances) of non-zero elements
    final Instance inst = m_trainInstances.instance(instNum);

    // get the class of this instance
    final int cl = (int) m_trainInstances.instance(instNum).value(m_classIndex);

    int[] hitOrder = null;
    double hitNorm = 1.0;
    int[][] missOrder = null;
    double[] missNorm = null;

    // sort nearest neighbours and set up normalization variables
    if (m_weightByDistance) {
        // do class (hits) first
        double[] hitDistances = new double[m_stored[cl]];
        hitNorm = 0;
        for (int n = 0; n < m_stored[cl]; n++) {
            // copy the distances
            hitDistances[n] = m_karray[cl][n][0];
            // sum normalizer
            hitNorm += m_weightsByRank[n];
        }
        hitOrder = Utils.sort(hitDistances);

        // do misses (other classes)
        missOrder = new int[m_numClasses][1];
        missNorm = new double[m_numClasses];
        for (int c = 0; c < m_numClasses; c++) {
            if (c == cl) {
                // already done cl
                continue;
            }
            double[] missDistances = new double[m_stored[c]];
            missNorm[c] = 0;
            for (int n = 0; n < m_stored[c]; n++) {
                // copy the distances
                missDistances[n] = m_karray[c][n][0];
                // sum normalizer
                missNorm[c] += m_weightsByRank[n];
            }
            missOrder[c] = Utils.sort(missDistances);
        }
    }

    // the amount of probability space left after removing the
    // probability of this instance's class value
    double wNorm = 1.0;
    if (m_numClasses > 2) {
        wNorm = (1.0 - m_classProbs[cl]);
    }

    // do the k nearest hits of the same class
    for (int n = 0; n < m_stored[cl]; n++) {
        int slot = m_weightByDistance ? hitOrder[n] : n;
        Instance cmp = m_trainInstances.instance((int) m_karray[cl][slot][1]);

        // walk the stored values of both instances in parallel
        int p1 = 0;
        int p2 = 0;
        while (p1 < inst.numValues() || p2 < cmp.numValues()) {
            int firstI = (p1 >= inst.numValues()) ? m_trainInstances.numAttributes() : inst.index(p1);
            int secondI = (p2 >= cmp.numValues()) ? m_trainInstances.numAttributes() : cmp.index(p2);

            if (firstI == m_trainInstances.classIndex()) {
                p1++;
                continue;
            }
            if (secondI == m_trainInstances.classIndex()) {
                p2++;
                continue;
            }

            int att;
            double diff;
            if (firstI == secondI) {
                att = firstI;
                diff = difference(att, inst.valueSparse(p1), cmp.valueSparse(p2));
                p1++;
                p2++;
            } else if (firstI > secondI) {
                att = secondI;
                diff = difference(att, 0, cmp.valueSparse(p2));
                p2++;
            } else {
                att = firstI;
                diff = difference(att, inst.valueSparse(p1), 0);
                p1++;
            }

            if (m_weightByDistance) {
                diff *= (m_weightsByRank[n] / hitNorm);
            } else if (m_stored[cl] > 0) {
                diff /= (double) m_stored[cl];
            }
            m_weights[att] -= diff;
        }
    }

    // now do k nearest misses from each of the other classes
    for (int c = 0; c < m_numClasses; c++) {
        if (c == cl) {
            // already done cl
            continue;
        }
        for (int n = 0; n < m_stored[c]; n++) {
            int slot = m_weightByDistance ? missOrder[c][n] : n;
            Instance cmp = m_trainInstances.instance((int) m_karray[c][slot][1]);

            int p1 = 0;
            int p2 = 0;
            while (p1 < inst.numValues() || p2 < cmp.numValues()) {
                int firstI = (p1 >= inst.numValues()) ? m_trainInstances.numAttributes() : inst.index(p1);
                int secondI = (p2 >= cmp.numValues()) ? m_trainInstances.numAttributes() : cmp.index(p2);

                if (firstI == m_trainInstances.classIndex()) {
                    p1++;
                    continue;
                }
                if (secondI == m_trainInstances.classIndex()) {
                    p2++;
                    continue;
                }

                int att;
                double diff;
                if (firstI == secondI) {
                    att = firstI;
                    diff = difference(att, inst.valueSparse(p1), cmp.valueSparse(p2));
                    p1++;
                    p2++;
                } else if (firstI > secondI) {
                    att = secondI;
                    diff = difference(att, 0, cmp.valueSparse(p2));
                    p2++;
                } else {
                    att = firstI;
                    diff = difference(att, inst.valueSparse(p1), 0);
                    p1++;
                }

                if (m_weightByDistance) {
                    diff *= (m_weightsByRank[n] / missNorm[c]);
                } else if (m_stored[c] > 0) {
                    diff /= (double) m_stored[c];
                }

                if (m_numClasses > 2) {
                    m_weights[att] += ((m_classProbs[c] / wNorm) * diff);
                } else {
                    m_weights[att] += diff;
                }
            }
        }
    }
}

From source file:ml.dataprocess.CorrelationAttributeEval.java

License:Open Source License

/**
 * Initializes an information gain attribute evaluator. Replaces missing
 * values with means/modes; Deletes instances with missing class values.
 * /*from  w w w . j  av  a 2 s  .c o  m*/
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
@Override
public void buildEvaluator(Instances data) throws Exception {
    data = new Instances(data);
    data.deleteWithMissingClass();

    ReplaceMissingValues rmv = new ReplaceMissingValues();
    rmv.setInputFormat(data);
    data = Filter.useFilter(data, rmv);

    int numClasses = data.classAttribute().numValues();
    int classIndex = data.classIndex();
    int numInstances = data.numInstances();
    m_correlations = new double[data.numAttributes()];
    /*
     * boolean hasNominals = false; boolean hasNumerics = false;
     */
    List<Integer> numericIndexes = new ArrayList<Integer>();
    List<Integer> nominalIndexes = new ArrayList<Integer>();
    if (m_detailedOutput) {
        m_detailedOutputBuff = new StringBuffer();
    }

    // TODO for instance weights (folded into computing weighted correlations)
    // add another dimension just before the last [2] (0 for 0/1 binary vector
    // and
    // 1 for corresponding instance weights for the 1's)
    double[][][] nomAtts = new double[data.numAttributes()][][];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (data.attribute(i).isNominal() && i != classIndex) {
            nomAtts[i] = new double[data.attribute(i).numValues()][data.numInstances()];
            Arrays.fill(nomAtts[i][0], 1.0); // set zero index for this att to all
                                             // 1's
            nominalIndexes.add(i);
        } else if (data.attribute(i).isNumeric() && i != classIndex) {
            numericIndexes.add(i);
        }
    }

    // do the nominal attributes
    if (nominalIndexes.size() > 0) {
        for (int i = 0; i < data.numInstances(); i++) {
            Instance current = data.instance(i);
            for (int j = 0; j < current.numValues(); j++) {
                if (current.attribute(current.index(j)).isNominal() && current.index(j) != classIndex) {
                    // Will need to check for zero in case this isn't a sparse
                    // instance (unless we add 1 and subtract 1)
                    nomAtts[current.index(j)][(int) current.valueSparse(j)][i] += 1;
                    nomAtts[current.index(j)][0][i] -= 1;
                }
            }
        }
    }

    if (data.classAttribute().isNumeric()) {
        double[] classVals = data.attributeToDoubleArray(classIndex);

        // do the numeric attributes
        for (Integer i : numericIndexes) {
            double[] numAttVals = data.attributeToDoubleArray(i);
            m_correlations[i] = Utils.correlation(numAttVals, classVals, numAttVals.length);

            if (m_correlations[i] == 1.0) {
                // check for zero variance (useless numeric attribute)
                if (Utils.variance(numAttVals) == 0) {
                    m_correlations[i] = 0;
                }
            }
        }

        // do the nominal attributes
        if (nominalIndexes.size() > 0) {

            // now compute the correlations for the binarized nominal attributes
            for (Integer i : nominalIndexes) {
                double sum = 0;
                double corr = 0;
                double sumCorr = 0;
                double sumForValue = 0;

                if (m_detailedOutput) {
                    m_detailedOutputBuff.append("\n\n").append(data.attribute(i).name());
                }

                for (int j = 0; j < data.attribute(i).numValues(); j++) {
                    sumForValue = Utils.sum(nomAtts[i][j]);
                    corr = Utils.correlation(nomAtts[i][j], classVals, classVals.length);

                    // useless attribute - all instances have the same value
                    if (sumForValue == numInstances || sumForValue == 0) {
                        corr = 0;
                    }
                    if (corr < 0.0) {
                        corr = -corr;
                    }
                    sumCorr += sumForValue * corr;
                    sum += sumForValue;

                    if (m_detailedOutput) {
                        m_detailedOutputBuff.append("\n\t").append(data.attribute(i).value(j)).append(": ");
                        m_detailedOutputBuff.append(Utils.doubleToString(corr, 6));
                    }
                }
                m_correlations[i] = (sum > 0) ? sumCorr / sum : 0;
            }
        }
    } else {
        // class is nominal
        // TODO extra dimension for storing instance weights too
        double[][] binarizedClasses = new double[data.classAttribute().numValues()][data.numInstances()];

        // this is equal to the number of instances for all inst weights = 1
        double[] classValCounts = new double[data.classAttribute().numValues()];

        for (int i = 0; i < data.numInstances(); i++) {
            Instance current = data.instance(i);
            binarizedClasses[(int) current.classValue()][i] = 1;
        }
        for (int i = 0; i < data.classAttribute().numValues(); i++) {
            classValCounts[i] = Utils.sum(binarizedClasses[i]);
        }

        double sumClass = Utils.sum(classValCounts);

        // do numeric attributes first
        if (numericIndexes.size() > 0) {
            for (Integer i : numericIndexes) {
                double[] numAttVals = data.attributeToDoubleArray(i);
                double corr = 0;
                double sumCorr = 0;

                for (int j = 0; j < data.classAttribute().numValues(); j++) {
                    corr = Utils.correlation(numAttVals, binarizedClasses[j], numAttVals.length);
                    if (corr < 0.0) {
                        corr = -corr;
                    }

                    if (corr == 1.0) {
                        // check for zero variance (useless numeric attribute)
                        if (Utils.variance(numAttVals) == 0) {
                            corr = 0;
                        }
                    }

                    sumCorr += classValCounts[j] * corr;
                }
                m_correlations[i] = sumCorr / sumClass;
            }
        }

        if (nominalIndexes.size() > 0) {
            for (Integer i : nominalIndexes) {
                if (m_detailedOutput) {
                    m_detailedOutputBuff.append("\n\n").append(data.attribute(i).name());
                }

                double sumForAtt = 0;
                double corrForAtt = 0;
                for (int j = 0; j < data.attribute(i).numValues(); j++) {
                    double sumForValue = Utils.sum(nomAtts[i][j]);
                    double corr = 0;
                    double sumCorr = 0;
                    double avgCorrForValue = 0;

                    sumForAtt += sumForValue;
                    for (int k = 0; k < numClasses; k++) {

                        // corr between value j and class k
                        corr = Utils.correlation(nomAtts[i][j], binarizedClasses[k],
                                binarizedClasses[k].length);

                        // useless attribute - all instances have the same value
                        if (sumForValue == numInstances || sumForValue == 0) {
                            corr = 0;
                        }
                        if (corr < 0.0) {
                            corr = -corr;
                        }
                        sumCorr += classValCounts[k] * corr;
                    }
                    avgCorrForValue = sumCorr / sumClass;
                    corrForAtt += sumForValue * avgCorrForValue;

                    if (m_detailedOutput) {
                        m_detailedOutputBuff.append("\n\t").append(data.attribute(i).value(j)).append(": ");
                        m_detailedOutputBuff.append(Utils.doubleToString(avgCorrForValue, 6));
                    }
                }

                // the weighted average corr for att i as
                // a whole (wighted by value frequencies)
                m_correlations[i] = (sumForAtt > 0) ? corrForAtt / sumForAtt : 0;
            }
        }
    }

    if (m_detailedOutputBuff != null && m_detailedOutputBuff.length() > 0) {
        m_detailedOutputBuff.append("\n");
    }
}

From source file:ml.engine.LibSVM.java

License:Open Source License

/**
 * returns an instance into a sparse libsvm array
 * /*  w w  w.j  ava2  s  .co  m*/
 * @param instance the instance to work on
 * @return the libsvm array
 * @throws Exception if setup of array fails
 */
protected Object instanceToArray(Instance instance) throws Exception {
    int index;
    int count;
    int i;
    Object result;

    // determine number of non-zero attributes
    /*
     * for (i = 0; i < instance.numAttributes(); i++) { if (i ==
     * instance.classIndex()) continue; if (instance.value(i) != 0) count++; }
     */
    count = 0;
    for (i = 0; i < instance.numValues(); i++) {
        if (instance.index(i) == instance.classIndex()) {
            continue;
        }
        if (instance.valueSparse(i) != 0) {
            count++;
        }
    }

    // fill array
    /*
     * result = Array.newInstance(Class.forName(CLASS_SVMNODE), count); index =
     * 0; for (i = 0; i < instance.numAttributes(); i++) { if (i ==
     * instance.classIndex()) continue; if (instance.value(i) == 0) continue;
     * 
     * Array.set(result, index, Class.forName(CLASS_SVMNODE).newInstance());
     * setField(Array.get(result, index), "index", new Integer(i + 1));
     * setField(Array.get(result, index), "value", new
     * Double(instance.value(i))); index++; }
     */

    result = Array.newInstance(Class.forName(CLASS_SVMNODE), count);
    index = 0;
    for (i = 0; i < instance.numValues(); i++) {

        int idx = instance.index(i);
        if (idx == instance.classIndex()) {
            continue;
        }
        if (instance.valueSparse(i) == 0) {
            continue;
        }

        Array.set(result, index, Class.forName(CLASS_SVMNODE).newInstance());
        setField(Array.get(result, index), "index", new Integer(idx + 1));
        setField(Array.get(result, index), "value", new Double(instance.valueSparse(i)));
        index++;
    }

    return result;
}

From source file:moa.classifiers.bayes.NaiveBayesMultinomial.java

License:Open Source License

/**
 * Trains the classifier with the given instance.
 * <p>
 * On the first call after a reset the per-class statistics are
 * (re)allocated and seeded with the Laplace correction. Afterwards the
 * instance's weight is added to its class's count, and every stored,
 * non-class, non-missing attribute value is added (multiplied by the
 * instance weight) to that class's per-attribute total.
 *
 * @param inst the new training instance to include in the model
 */
@Override
public void trainOnInstanceImpl(Instance inst) {
    final double laplace = this.laplaceCorrectionOption.getValue();

    if (this.reset) {
        this.m_numClasses = inst.numClasses();
        int numAttributes = inst.numAttributes();

        m_probOfClass = new double[m_numClasses];
        Arrays.fill(m_probOfClass, laplace);

        m_classTotals = new double[m_numClasses];
        Arrays.fill(m_classTotals, laplace * numAttributes);

        m_wordTotalForClass = new DoubleVector[m_numClasses];
        for (int i = 0; i < m_numClasses; i++) {
            m_wordTotalForClass[i] = new DoubleVector();
        }
        this.reset = false;
    }

    // Update classifier
    int classIndex = inst.classIndex();
    int classValue = (int) inst.value(classIndex);

    double w = inst.weight();
    m_probOfClass[classValue] += w;
    m_classTotals[classValue] += w * totalSize(inst);

    for (int i = 0; i < inst.numValues(); i++) {
        int index = inst.index(i);
        // i is a position in the sparse representation, so missingness must
        // be tested with the sparse accessor; isMissing(i) would test the
        // attribute whose index happens to equal i.
        if (index == classIndex || inst.isMissingSparse(i)) {
            continue;
        }
        // The first time an attribute is seen for this class its total is
        // still 0, so seed it with the Laplace correction.
        double laplaceCorrection =
                (m_wordTotalForClass[classValue].getValue(index) == 0) ? laplace : 0.0;
        m_wordTotalForClass[classValue].addToValue(index, w * inst.valueSparse(i) + laplaceCorrection);
    }
}