Example usage for weka.core Instance index

Introduction

On this page you can find example usages of the index method of weka.core.Instance.

Prototype

public int index(int position);

Source Link

Document

Returns the index of the attribute stored at the given position in the sparse representation.

Usage

From source file:br.com.ufu.lsi.utils.DocumentFrequencyAttributeEval.java

License:Open Source License

/**
 * Initializes an information gain attribute evaluator. Discretizes all attributes that are
 * numeric./*from  w w  w  .  j a  v a2  s  .  c  om*/
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();

    int numAttributes = data.numAttributes();
    m_DFs = new int[numAttributes];
    Enumeration e = data.enumerateInstances();
    while (e.hasMoreElements()) {
        Instance instance = (Instance) e.nextElement();
        int numValues = instance.numValues();
        for (int valueIndex = 0; valueIndex < numValues; valueIndex++) {
            int attIndex = instance.index(valueIndex);
            if (attIndex != classIndex) {
                double value = instance.valueSparse(valueIndex);
                //missingvalues werden also 0 betrachtet.
                if (m_missingAsZero) {
                    if (!Instance.isMissingValue(value) && value != 0.0) { //man knnte auch isMissingSparce(valueIndex) verwenden, oder ineffizienterweise isMissing(attIndex)
                        m_DFs[attIndex]++;
                        //m_DFs[ attIndex ]+=value ;
                    }
                } else {
                    if (value != 0.0) {
                        m_DFs[attIndex]++;
                        //m_DFs[ attIndex ]+=value ;
                    }
                }
            }
        }
    }
}

From source file:cba.ItemSet.java

License:Open Source License

/**
 * Checks if an instance contains an item set.
 * <p>
 * An entry of {@code m_items} greater than -1 is a required (integer) value
 * for the attribute at that array position; any other entry imposes no
 * condition. The instance contains the item set when every required value is
 * present, not missing, and equal to the instance's value truncated to
 * {@code int}.
 *
 * @param instance the instance to be tested
 * @return true if the given instance contains this item set
 */

public boolean containedBy(Instance instance) {

    // Sparse path: only taken when zeros are to be treated as missing, so an
    // attribute that is not stored in the sparse instance can never match.
    if (instance instanceof weka.core.SparseInstance && m_treatZeroAsMissing) {
        int numInstVals = instance.numValues();
        int numItemSetVals = m_items.length;

        // Two cursors advanced in lock-step: p1 walks the stored entries of
        // the instance, p2 walks the attribute positions of the item set.
        // NOTE(review): relies on the sparse entries being ordered by
        // increasing attribute index - confirm against SparseInstance.
        for (int p1 = 0, p2 = 0; p1 < numInstVals || p2 < numItemSetVals;) {
            // MAX_VALUE acts as a sentinel once the instance has no more stored entries.
            int instIndex = Integer.MAX_VALUE;
            if (p1 < numInstVals) {
                instIndex = instance.index(p1);
            }
            int itemIndex = p2;

            if (m_items[itemIndex] > -1) {
                // The item set requires a value for this attribute.
                if (itemIndex != instIndex) {
                    // The instance has no stored entry for the required attribute.
                    return false;
                } else {
                    if (instance.isMissingSparse(p1)) {
                        return false;
                    }
                    if (m_items[itemIndex] != (int) instance.valueSparse(p1)) {
                        return false;
                    }
                }

                p1++;
                p2++;
            } else {
                // No requirement for this attribute: just keep the cursors aligned.
                if (itemIndex < instIndex) {
                    p2++;
                } else if (itemIndex == instIndex) {
                    p2++;
                    p1++;
                }
            }
        }
    } else {
        // General path: check every attribute position directly.
        for (int i = 0; i < instance.numAttributes(); i++)
            if (m_items[i] > -1) {
                // A missing value (or a zero, when zeros count as missing) can never match.
                if (instance.isMissing(i) || (m_treatZeroAsMissing && (int) instance.value(i) == 0))
                    return false;
                if (m_items[i] != (int) instance.value(i))
                    return false;
            }
    }

    return true;
}

From source file:ChiSquare.ChiSquaredAttributeEval.java

License:Open Source License

/**
 * Initializes a chi-squared attribute evaluator.
 * <p>
 * The data is first passed through a {@code Discretize} filter (or through
 * {@code NumericToBinary} when {@code m_Binarize} is set). For every
 * non-class attribute a weighted contingency table of attribute value versus
 * class value is then built, counts for missing values are optionally
 * redistributed ({@code m_missing_merge}), and the chi-squared statistic of
 * each table is stored in {@code m_ChiSquareds}.
 *
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been
 * generated successfully
 */
public void buildEvaluator(Instances data) throws Exception {

    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    int classIndex = data.classIndex();
    int numInstances = data.numInstances();

    // Replace the data by its filtered version: discretized by default,
    // binarized when m_Binarize is set.
    if (!m_Binarize) {
        Discretize disTransform = new Discretize();
        disTransform.setUseBetterEncoding(true);
        disTransform.setInputFormat(data);
        data = Filter.useFilter(data, disTransform);
    } else {
        NumericToBinary binTransform = new NumericToBinary();
        binTransform.setInputFormat(data);
        data = Filter.useFilter(data, binTransform);
    }
    int numClasses = data.attribute(classIndex).numValues();

    // Reserve space and initialize counters
    // counts[attribute][attribute value][class value]; the extra row
    // (index numValues) collects missing attribute values and the extra
    // column (index numClasses) collects missing class values.
    double[][][] counts = new double[data.numAttributes()][][];
    for (int k = 0; k < data.numAttributes(); k++) {
        if (k != classIndex) {
            int numValues = data.attribute(k).numValues();
            counts[k] = new double[numValues + 1][numClasses + 1];
        }
    }

    // Initialize counters
    // temp holds the total instance weight per class value (last slot: class missing).
    double[] temp = new double[numClasses + 1];
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        if (inst.classIsMissing()) {
            temp[numClasses] += inst.weight();
        } else {
            temp[(int) inst.classValue()] += inst.weight();
        }
    }
    // Start every table with all weight in row 0; the loop below moves the
    // weight of each explicitly stored value from row 0 to its actual row, so
    // values that are not stored in an instance stay counted in row 0.
    for (int k = 0; k < counts.length; k++) {
        if (k != classIndex) {
            for (int i = 0; i < temp.length; i++) {
                counts[k][0][i] = temp[i];
            }
        }
    }

    // Get counts
    for (int k = 0; k < numInstances; k++) {
        Instance inst = data.instance(k);
        for (int i = 0; i < inst.numValues(); i++) {
            if (inst.index(i) != classIndex) {
                if (inst.isMissingSparse(i) || inst.classIsMissing()) {
                    if (!inst.isMissingSparse(i)) {
                        // Attribute value known, class missing.
                        counts[inst.index(i)][(int) inst.valueSparse(i)][numClasses] += inst.weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    } else if (!inst.classIsMissing()) {
                        // Attribute value missing, class known.
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][(int) inst
                                .classValue()] += inst.weight();
                        counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                    } else {
                        // Both attribute value and class are missing.
                        counts[inst.index(i)][data.attribute(inst.index(i)).numValues()][numClasses] += inst
                                .weight();
                        counts[inst.index(i)][0][numClasses] -= inst.weight();
                    }
                } else {
                    // Both attribute value and class are known.
                    counts[inst.index(i)][(int) inst.valueSparse(i)][(int) inst.classValue()] += inst.weight();
                    counts[inst.index(i)][0][(int) inst.classValue()] -= inst.weight();
                }
            }
        }
    }

    // distribute missing counts if required
    if (m_missing_merge) {

        for (int k = 0; k < data.numAttributes(); k++) {
            if (k != classIndex) {
                int numValues = data.attribute(k).numValues();

                // Compute marginals
                // (over the cells where neither attribute value nor class is missing)
                double[] rowSums = new double[numValues];
                double[] columnSums = new double[numClasses];
                double sum = 0;
                for (int i = 0; i < numValues; i++) {
                    for (int j = 0; j < numClasses; j++) {
                        rowSums[i] += counts[k][i][j];
                        columnSums[j] += counts[k][i][j];
                    }
                    sum += rowSums[i];
                }

                // Redistribution is only possible when there is some known weight
                // to take proportions from; otherwise the table keeps its extra
                // row and column.
                if (Utils.gr(sum, 0)) {
                    double[][] additions = new double[numValues][numClasses];

                    // Compute what needs to be added to each row
                    // (missing-attribute row, split proportionally to the row sums)
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            additions[i][j] = (rowSums[i] / sum) * counts[k][numValues][j];
                        }
                    }

                    // Compute what needs to be added to each column
                    // (missing-class column, split proportionally to the column sums)
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (columnSums[i] / sum) * counts[k][j][numClasses];
                        }
                    }

                    // Compute what needs to be added to each cell
                    // (weight with both parts missing, split proportionally to the cell counts)
                    for (int i = 0; i < numClasses; i++) {
                        for (int j = 0; j < numValues; j++) {
                            additions[j][i] += (counts[k][j][i] / sum) * counts[k][numValues][numClasses];
                        }
                    }

                    // Make new contingency table
                    // (without the extra row and column for missing values)
                    double[][] newTable = new double[numValues][numClasses];
                    for (int i = 0; i < numValues; i++) {
                        for (int j = 0; j < numClasses; j++) {
                            newTable[i][j] = counts[k][i][j] + additions[i][j];
                        }
                    }
                    counts[k] = newTable;
                }
            }
        }
    }

    // Compute chi-squared values
    // (the entry for the class attribute itself is left at 0)
    m_ChiSquareds = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (i != classIndex) {
            m_ChiSquareds[i] = ContingencyTables.chiVal(ContingencyTables.reduceMatrix(counts[i]), false);
        }
    }
}

From source file:classifier.CustomStringToWordVector.java

License:Open Source License

/**
 * Signals that the current batch of input is complete.
 * <p>
 * Work is only done while the first batch has not been processed yet: the
 * dictionary is determined, every buffered input instance is converted
 * (without document normalization), the average document length is computed
 * unless the filter type is {@code FILTER_NONE}, the converted instances are
 * normalized when the filter type is {@code FILTER_NORMALIZE_ALL}, and finally
 * all of them are pushed to the output queue. In every case the input is
 * flushed and the batch flags are updated.
 *
 * @return true if there are instances pending output.
 * @throws IllegalStateException
 *             if no input structure has been defined.
 */
public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
        throw new IllegalStateException("No input instance format defined");
    }

    // Later batches were already handled completely by input().
    if (!isFirstBatchDone()) {

        // The first batch (training data) defines the dictionary.
        determineDictionary();

        // Convert every buffered instance, still without normalization.
        // firstCopy keeps the value returned for the last converted instance;
        // it is the lowest attribute index that takes part in normalization.
        FastVector converted = new FastVector();
        int firstCopy = 0;
        int row = 0;
        while (row < m_NumInstances) {
            firstCopy = convertInstancewoDocNorm(getInputFormat().instance(row), converted);
            row++;
        }

        // Average Euclidean length of the document vectors, if needed.
        if (m_filterType != FILTER_NONE) {
            m_AvgDocLength = 0;
            for (int doc = 0; doc < converted.size(); doc++) {
                Instance vector = (Instance) converted.elementAt(doc);
                double squaredLength = 0;
                for (int pos = 0; pos < vector.numValues(); pos++) {
                    if (vector.index(pos) < firstCopy) {
                        continue;
                    }
                    double entry = vector.valueSparse(pos);
                    squaredLength += entry * entry;
                }
                m_AvgDocLength += Math.sqrt(squaredLength);
            }
            m_AvgDocLength /= m_NumInstances;
        }

        // Normalize all converted instances when requested.
        if (m_filterType == FILTER_NORMALIZE_ALL) {
            for (int doc = 0; doc < converted.size(); doc++) {
                Instance vector = (Instance) converted.elementAt(doc);
                normalizeInstance(vector, firstCopy);
            }
        }

        // Hand everything over to the output queue.
        for (int doc = 0; doc < converted.size(); doc++) {
            Instance vector = (Instance) converted.elementAt(doc);
            push(vector);
        }
    }

    // The buffered input is no longer needed.
    flushInput();

    m_NewBatch = true;
    m_FirstBatchDone = true;

    boolean outputPending = numPendingOutput() != 0;
    return outputPending;
}

From source file:classifier.CustomStringToWordVector.java

License:Open Source License

/**
 * Normalizes given instance to average doc length (only the newly
 * constructed attributes).
 * <p>
 * The Euclidean length of the instance is computed over all stored entries
 * whose attribute index is at least {@code firstCopy}; each of those entries
 * is then multiplied by {@code m_AvgDocLength / length}.
 *
 * @param inst
 *            the instance to normalize
 * @param firstCopy
 *            lowest attribute index that takes part in the normalization;
 *            entries with a smaller attribute index are left untouched
 * @throws Exception
 *             if avg. doc length not set
 */
private void normalizeInstance(Instance inst, int firstCopy) throws Exception {

    if (m_AvgDocLength < 0) {
        throw new Exception("Average document length not set.");
    }

    // Compute length of document vector
    double docLength = 0;
    for (int j = 0; j < inst.numValues(); j++) {
        if (inst.index(j) >= firstCopy) {
            docLength += inst.valueSparse(j) * inst.valueSparse(j);
        }
    }
    docLength = Math.sqrt(docLength);

    // Normalize document vector
    for (int j = 0; j < inst.numValues(); j++) {
        // Read the attribute index before the value is changed: once an entry
        // has been dropped, position j refers to a different attribute or lies
        // past the end of the stored entries.
        int attIndex = inst.index(j);
        if (attIndex >= firstCopy) {
            double val = inst.valueSparse(j) * m_AvgDocLength / docLength;
            int storedBefore = inst.numValues();
            inst.setValueSparse(j, val);
            if (val == 0) {
                System.err.println("setting value " + attIndex + " to zero.");
            }
            // A sparse instance is expected to drop an entry that becomes zero,
            // which shifts the following entries down by one; revisit position
            // j only when that really happened, so that an instance which keeps
            // zero entries cannot make this loop spin forever.
            if (inst.numValues() < storedBefore) {
                j--;
            }
        }
    }
}

From source file:cluster.ABC.ClusterUtils.java

License:Open Source License

/** Normalizes the values of a SparseInstance in L2 norm
 *
 * @author Sugato Basu/*from w w  w  .ja  v  a2s  .co  m*/
 * @param inst SparseInstance to be normalized
 */

public static void normalizeSparseInstance(Instance inst) throws Exception {
    double norm = 0;
    int length = inst.numValues();

    if (!(inst instanceof SparseInstance)) {
        System.err.println("Not SparseInstance, using normalizeInstance function instead");
        normalizeInstance(inst);
    }

    for (int i = 0; i < length; i++) {
        if (inst.index(i) != inst.classIndex()) { // don't normalize the class index
            norm += inst.valueSparse(i) * inst.valueSparse(i);
        }
    }
    norm = Math.sqrt(norm);
    for (int i = 0; i < length; i++) { // don't normalize the class index
        if (inst.index(i) != inst.classIndex()) {
            inst.setValueSparse(i, inst.valueSparse(i) / norm);
        }
    }
}

From source file:cluster.ABC.ClusterUtils.java

License:Open Source License

/** Finds sum of 2 instances (handles sparse and non-sparse)
 * <p>
 * Both instances must have the same number of attributes and must be of the
 * same kind (both {@code SparseInstance} or both not). The result has the
 * summed attribute values, the summed weights, is of the same kind as the
 * inputs, and is attached to the given dataset.
 *
 * @param inst1 first summand
 * @param inst2 second summand
 * @param m_Instances dataset the resulting instance is attached to
 * @return a new instance holding the attribute-wise sum
 * @throws Exception if the attribute counts differ or exactly one of the
 *         instances is sparse
 */

public static Instance sumInstances(Instance inst1, Instance inst2, Instances m_Instances) throws Exception {
    final int attributeCount = inst1.numAttributes();
    if (attributeCount != inst2.numAttributes()) {
        throw new Exception("Error!! inst1 and inst2 should have same number of attributes.");
    }

    final boolean firstSparse = inst1 instanceof SparseInstance;
    final boolean secondSparse = inst2 instanceof SparseInstance;
    if (firstSparse != secondSparse) {
        throw new Exception("Error!! inst1 and inst2 should be both of same type -- sparse or non-sparse");
    }

    final double combinedWeight = inst1.weight() + inst2.weight();
    // A freshly allocated double array is already filled with zeros.
    final double[] totals = new double[attributeCount];

    Instance sum;
    if (firstSparse) {
        // Scatter the stored values of the first instance, then add those of the second.
        for (int pos = 0; pos < inst1.numValues(); pos++) {
            totals[inst1.index(pos)] = inst1.valueSparse(pos);
        }
        for (int pos = 0; pos < inst2.numValues(); pos++) {
            totals[inst2.index(pos)] += inst2.valueSparse(pos);
        }
        sum = new SparseInstance(combinedWeight, totals);
    } else {
        for (int att = 0; att < attributeCount; att++) {
            totals[att] = inst1.value(att) + inst2.value(att);
        }
        sum = new Instance(combinedWeight, totals);
    }

    sum.setDataset(m_Instances);
    return sum;
}

From source file:cn.edu.xjtu.dbmine.StringToWordVector.java

License:Open Source License

/**
 * Normalizes given instance to average doc length (only the newly
 * constructed attributes).
 * <p>
 * The Euclidean length of the instance is computed over all stored entries
 * whose attribute index is at least {@code firstCopy}; each of those entries
 * is then multiplied by {@code m_AvgDocLength / length}.
 *
 * @param inst
 *            the instance to normalize
 * @param firstCopy
 *            lowest attribute index that takes part in the normalization;
 *            entries with a smaller attribute index are left untouched
 * @throws Exception
 *             if avg. doc length not set
 */

private void normalizeInstance(Instance inst, int firstCopy) throws Exception {

    if (m_AvgDocLength < 0) {
        throw new Exception("Average document length not set.");
    }

    // Compute length of document vector
    double docLength = 0;
    for (int j = 0; j < inst.numValues(); j++) {
        if (inst.index(j) >= firstCopy) {
            docLength += inst.valueSparse(j) * inst.valueSparse(j);
        }
    }
    docLength = Math.sqrt(docLength);

    // Normalize document vector
    for (int j = 0; j < inst.numValues(); j++) {
        // Read the attribute index before the value is changed: once an entry
        // has been dropped, position j refers to a different attribute or lies
        // past the end of the stored entries.
        int attIndex = inst.index(j);
        if (attIndex >= firstCopy) {
            double val = inst.valueSparse(j) * m_AvgDocLength / docLength;
            int storedBefore = inst.numValues();
            inst.setValueSparse(j, val);
            if (val == 0) {
                System.err.println("setting value " + attIndex + " to zero.");
            }
            // A sparse instance is expected to drop an entry that becomes zero,
            // which shifts the following entries down by one; revisit position
            // j only when that really happened, so that an instance which keeps
            // zero entries cannot make this loop spin forever.
            if (inst.numValues() < storedBefore) {
                j--;
            }
        }
    }
}

From source file:com.yahoo.labs.samoa.instances.WekaToSamoaInstanceConverter.java

License:Apache License

/**
 * Samoa instance from weka instance.
 * <p>
 * A sparse Weka instance is converted to a sparse Samoa instance that holds
 * every stored value except the one of the class attribute; any other
 * instance is converted to a dense Samoa instance from its full value array.
 * The Samoa header is created from the Weka dataset on first use and cached
 * in {@code samoaInstanceInformation}; the class value is copied last.
 *
 * @param inst the inst
 * @return the instance
 */
public Instance samoaInstance(weka.core.Instance inst) {
    Instance samoaInstance;
    if (inst instanceof weka.core.SparseInstance) {
        int classIndex = inst.classIndex();
        int numStored = inst.numValues();

        // Count the entries that are kept, so that the arrays contain no
        // unused slot for a skipped class entry (such a slot would otherwise
        // show up as a spurious value 0.0 for attribute index 0).
        int numKept = 0;
        for (int i = 0; i < numStored; i++) {
            if (inst.index(i) != classIndex) {
                numKept++;
            }
        }

        double[] attributeValues = new double[numKept];
        int[] indexValues = new int[numKept];
        int next = 0;
        for (int i = 0; i < numStored; i++) {
            int attIndex = inst.index(i);
            if (attIndex != classIndex) {
                attributeValues[next] = inst.valueSparse(i);
                indexValues[next] = attIndex;
                next++;
            }
        }
        samoaInstance = new SparseInstance(inst.weight(), attributeValues, indexValues, inst.numAttributes());
    } else {
        // The dense copy keeps the class value in its attribute slot.
        samoaInstance = new DenseInstance(inst.weight(), inst.toDoubleArray());
    }
    // Build the Samoa header lazily from the dataset of the first converted instance.
    if (this.samoaInstanceInformation == null) {
        this.samoaInstanceInformation = this.samoaInstancesInformation(inst.dataset());
    }
    samoaInstance.setDataset(samoaInstanceInformation);
    samoaInstance.setClassValue(inst.classValue());
    return samoaInstance;
}

From source file:de.uni_potsdam.hpi.bpt.promnicat.util.WeightedEuclideanDistance.java

License:Open Source License

/**
 * Calculates the distance between two instances. Offers speed up (if the 
 * distance function class in use supports it) in nearest neighbour search by 
 * taking into account the cutOff or maximum distance. Depending on the 
 * distance function class, post processing of the distances by 
 * postProcessDistances(double []) may be required if this function is used.
 *
 * @param first    the first instance/*from   w ww .  ja  va 2 s .  co m*/
 * @param second    the second instance
 * @param cutOffValue If the distance being calculated becomes larger than 
 *                    cutOffValue then the rest of the calculation is 
 *                    discarded.
 * @param stats    the performance stats object
 * @return       the distance between the two given instances or 
 *          Double.POSITIVE_INFINITY if the distance being 
 *          calculated becomes larger than cutOffValue. 
 */
public double distance(Instance first, Instance second, double cutOffValue, PerformanceStats stats) {
    double distance = 0;
    int firstI, secondI;
    int firstNumValues = first.numValues();
    int secondNumValues = second.numValues();
    int numAttributes = m_Data.numAttributes();
    int classIndex = m_Data.classIndex();
    double weights = 1;

    validate();

    for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) {
        weights += first.attribute(p1).weight();
        if (p1 >= firstNumValues)
            firstI = numAttributes;
        else
            firstI = first.index(p1);

        if (p2 >= secondNumValues)
            secondI = numAttributes;
        else
            secondI = second.index(p2);

        if (firstI == classIndex) {
            p1++;
            continue;
        }
        if ((firstI < numAttributes) && !m_ActiveIndices[firstI]) {
            p1++;
            continue;
        }

        if (secondI == classIndex) {
            p2++;
            continue;
        }
        if ((secondI < numAttributes) && !m_ActiveIndices[secondI]) {
            p2++;
            continue;
        }

        double diff;

        if (firstI == secondI) {
            diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
            p1++;
            p2++;
        } else if (firstI > secondI) {
            diff = difference(secondI, 0, second.valueSparse(p2));
            p2++;
        } else {
            diff = difference(firstI, first.valueSparse(p1), 0);
            p1++;
        }
        if (stats != null)
            stats.incrCoordCount();

        distance = updateDistance(distance, diff);
        if (distance > cutOffValue)
            return Double.POSITIVE_INFINITY;
    }

    if (weights > 1) {
        return distance / (weights - 1);
    }
    return distance / weights;
}