Example usage for weka.core Instances attribute

List of usage examples for weka.core Instances attribute

Introduction

This page lists usage examples for the attribute method of weka.core.Instances.

Prototype

public Attribute attribute(String name)

Document

Returns an attribute given its name.
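
A minimal sketch of looking an attribute up by name (the ARFF path and attribute name below are illustrative placeholders):

import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class AttributeByNameExample {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("iris.arff"); // placeholder path

        // Look the attribute up by name; attribute(String) returns null if no such attribute exists
        Attribute att = data.attribute("petalwidth");
        if (att != null) {
            System.out.println("Index: " + att.index() + ", type: " + Attribute.typeToString(att));
        }
    }
}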

Usage

From source file:ml.dataprocess.CorrelationAttributeEval.java

License:Open Source License

/**
 * Initializes a correlation attribute evaluator. Replaces missing
 * values with means/modes; deletes instances with missing class values.
 * @param data set of instances serving as training data
 * @throws Exception if the evaluator has not been generated successfully
 */
@Override
public void buildEvaluator(Instances data) throws Exception {
    data = new Instances(data);
    data.deleteWithMissingClass();

    ReplaceMissingValues rmv = new ReplaceMissingValues();
    rmv.setInputFormat(data);
    data = Filter.useFilter(data, rmv);

    int numClasses = data.classAttribute().numValues();
    int classIndex = data.classIndex();
    int numInstances = data.numInstances();
    m_correlations = new double[data.numAttributes()];
    /*
     * boolean hasNominals = false; boolean hasNumerics = false;
     */
    List<Integer> numericIndexes = new ArrayList<Integer>();
    List<Integer> nominalIndexes = new ArrayList<Integer>();
    if (m_detailedOutput) {
        m_detailedOutputBuff = new StringBuffer();
    }

    // TODO for instance weights (folded into computing weighted correlations)
    // add another dimension just before the last [2] (0 for 0/1 binary vector
    // and
    // 1 for corresponding instance weights for the 1's)
    double[][][] nomAtts = new double[data.numAttributes()][][];
    for (int i = 0; i < data.numAttributes(); i++) {
        if (data.attribute(i).isNominal() && i != classIndex) {
            nomAtts[i] = new double[data.attribute(i).numValues()][data.numInstances()];
            Arrays.fill(nomAtts[i][0], 1.0); // set zero index for this att to all 1's
            nominalIndexes.add(i);
        } else if (data.attribute(i).isNumeric() && i != classIndex) {
            numericIndexes.add(i);
        }
    }

    // do the nominal attributes
    if (nominalIndexes.size() > 0) {
        for (int i = 0; i < data.numInstances(); i++) {
            Instance current = data.instance(i);
            for (int j = 0; j < current.numValues(); j++) {
                if (current.attribute(current.index(j)).isNominal() && current.index(j) != classIndex) {
                    // Will need to check for zero in case this isn't a sparse
                    // instance (unless we add 1 and subtract 1)
                    nomAtts[current.index(j)][(int) current.valueSparse(j)][i] += 1;
                    nomAtts[current.index(j)][0][i] -= 1;
                }
            }
        }
    }

    if (data.classAttribute().isNumeric()) {
        double[] classVals = data.attributeToDoubleArray(classIndex);

        // do the numeric attributes
        for (Integer i : numericIndexes) {
            double[] numAttVals = data.attributeToDoubleArray(i);
            m_correlations[i] = Utils.correlation(numAttVals, classVals, numAttVals.length);

            if (m_correlations[i] == 1.0) {
                // check for zero variance (useless numeric attribute)
                if (Utils.variance(numAttVals) == 0) {
                    m_correlations[i] = 0;
                }
            }
        }

        // do the nominal attributes
        if (nominalIndexes.size() > 0) {

            // now compute the correlations for the binarized nominal attributes
            for (Integer i : nominalIndexes) {
                double sum = 0;
                double corr = 0;
                double sumCorr = 0;
                double sumForValue = 0;

                if (m_detailedOutput) {
                    m_detailedOutputBuff.append("\n\n").append(data.attribute(i).name());
                }

                for (int j = 0; j < data.attribute(i).numValues(); j++) {
                    sumForValue = Utils.sum(nomAtts[i][j]);
                    corr = Utils.correlation(nomAtts[i][j], classVals, classVals.length);

                    // useless attribute - all instances have the same value
                    if (sumForValue == numInstances || sumForValue == 0) {
                        corr = 0;
                    }
                    if (corr < 0.0) {
                        corr = -corr;
                    }
                    sumCorr += sumForValue * corr;
                    sum += sumForValue;

                    if (m_detailedOutput) {
                        m_detailedOutputBuff.append("\n\t").append(data.attribute(i).value(j)).append(": ");
                        m_detailedOutputBuff.append(Utils.doubleToString(corr, 6));
                    }
                }
                m_correlations[i] = (sum > 0) ? sumCorr / sum : 0;
            }
        }
    } else {
        // class is nominal
        // TODO extra dimension for storing instance weights too
        double[][] binarizedClasses = new double[data.classAttribute().numValues()][data.numInstances()];

        // this is equal to the number of instances for all inst weights = 1
        double[] classValCounts = new double[data.classAttribute().numValues()];

        for (int i = 0; i < data.numInstances(); i++) {
            Instance current = data.instance(i);
            binarizedClasses[(int) current.classValue()][i] = 1;
        }
        for (int i = 0; i < data.classAttribute().numValues(); i++) {
            classValCounts[i] = Utils.sum(binarizedClasses[i]);
        }

        double sumClass = Utils.sum(classValCounts);

        // do numeric attributes first
        if (numericIndexes.size() > 0) {
            for (Integer i : numericIndexes) {
                double[] numAttVals = data.attributeToDoubleArray(i);
                double corr = 0;
                double sumCorr = 0;

                for (int j = 0; j < data.classAttribute().numValues(); j++) {
                    corr = Utils.correlation(numAttVals, binarizedClasses[j], numAttVals.length);
                    if (corr < 0.0) {
                        corr = -corr;
                    }

                    if (corr == 1.0) {
                        // check for zero variance (useless numeric attribute)
                        if (Utils.variance(numAttVals) == 0) {
                            corr = 0;
                        }
                    }

                    sumCorr += classValCounts[j] * corr;
                }
                m_correlations[i] = sumCorr / sumClass;
            }
        }

        if (nominalIndexes.size() > 0) {
            for (Integer i : nominalIndexes) {
                if (m_detailedOutput) {
                    m_detailedOutputBuff.append("\n\n").append(data.attribute(i).name());
                }

                double sumForAtt = 0;
                double corrForAtt = 0;
                for (int j = 0; j < data.attribute(i).numValues(); j++) {
                    double sumForValue = Utils.sum(nomAtts[i][j]);
                    double corr = 0;
                    double sumCorr = 0;
                    double avgCorrForValue = 0;

                    sumForAtt += sumForValue;
                    for (int k = 0; k < numClasses; k++) {

                        // corr between value j and class k
                        corr = Utils.correlation(nomAtts[i][j], binarizedClasses[k],
                                binarizedClasses[k].length);

                        // useless attribute - all instances have the same value
                        if (sumForValue == numInstances || sumForValue == 0) {
                            corr = 0;
                        }
                        if (corr < 0.0) {
                            corr = -corr;
                        }
                        sumCorr += classValCounts[k] * corr;
                    }
                    avgCorrForValue = sumCorr / sumClass;
                    corrForAtt += sumForValue * avgCorrForValue;

                    if (m_detailedOutput) {
                        m_detailedOutputBuff.append("\n\t").append(data.attribute(i).value(j)).append(": ");
                        m_detailedOutputBuff.append(Utils.doubleToString(avgCorrForValue, 6));
                    }
                }

                // the weighted average corr for att i as
                // a whole (weighted by value frequencies)
                m_correlations[i] = (sumForAtt > 0) ? corrForAtt / sumForAtt : 0;
            }
        }
    }

    if (m_detailedOutputBuff != null && m_detailedOutputBuff.length() > 0) {
        m_detailedOutputBuff.append("\n");
    }
}
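
A minimal sketch of how an evaluator like the one above is typically driven through weka's AttributeSelection API (data is assumed to be a loaded Instances object with its class index already set):

import weka.attributeSelection.AttributeSelection;
import weka.attributeSelection.Ranker;

AttributeSelection selector = new AttributeSelection();
selector.setEvaluator(new CorrelationAttributeEval());
selector.setSearch(new Ranker());
selector.SelectAttributes(data);

// rankedAttributes() returns {attributeIndex, merit} pairs;
// Instances.attribute(int) maps each index back to an Attribute for its name
for (double[] rank : selector.rankedAttributes()) {
    System.out.println(data.attribute((int) rank[0]).name() + ": " + rank[1]);
}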

From source file:mlda.attributes.AvgAbsoluteCorrelationBetweenNumericAttributes.java

License:Open Source License

/**
 * Calculate metric value
 * 
 * @param mlData Multi-label dataset for which to calculate the metric
 * @return Value of the metric
 */
public double calculate(MultiLabelInstances mlData) {
    Instances instances = mlData.getDataSet();

    int numInstances = mlData.getNumInstances();

    double res = 0.0;
    int count = 0;

    int[] featureIndices = mlData.getFeatureIndices();

    Vector<Integer> numericFeatureIndices = new Vector<>();
    for (int fIndex : featureIndices) {
        if (instances.attribute(fIndex).isNumeric()) {
            numericFeatureIndices.add(fIndex);
        }
    }

    if (numericFeatureIndices.size() <= 0) {
        return Double.NaN;
    }

    // Cache each numeric attribute's values, indexed by its position in numericFeatureIndices
    double[][] attributesToDoubleArray = new double[numericFeatureIndices.size()][numInstances];
    for (int i = 0; i < numericFeatureIndices.size(); i++) {
        attributesToDoubleArray[i] = instances.attributeToDoubleArray(numericFeatureIndices.get(i));
    }

    // Accumulate the absolute correlation over every distinct pair of numeric attributes
    for (int i = 0; i < numericFeatureIndices.size(); i++) {
        for (int j = i + 1; j < numericFeatureIndices.size(); j++) {
            count++;
            res += Math.abs(Utils.correlation(attributesToDoubleArray[i], attributesToDoubleArray[j],
                    numInstances));
        }
    }

    if (count > 0) {
        this.value = res / count;
    } else {
        this.value = Double.NaN;
    }

    return value;
}

From source file:mlda.util.Utils.java

License:Open Source License

/**
 * Get array of ImbalancedFeature objects with the label frequencies
 *
 * @param dataset Multi-label dataset
 * @return Array of ImbalancedFeature objects with the label frequencies
 */
public static ImbalancedFeature[] getAppearancesPerLabel(MultiLabelInstances dataset) {
    int[] labelIndices = dataset.getLabelIndices();

    ImbalancedFeature[] labels = new ImbalancedFeature[labelIndices.length];

    Instances instances = dataset.getDataSet();

    int appearances = 0;
    Attribute currentAtt;

    for (int i = 0; i < labelIndices.length; i++) {
        currentAtt = instances.attribute(labelIndices[i]);
        appearances = 0;

        for (int j = 0; j < instances.size(); j++) {
            if (instances.instance(j).value(currentAtt) == 1.0) {
                appearances++;
            }
        }
        labels[i] = new ImbalancedFeature(currentAtt.name(), appearances);
    }

    return labels;
}

From source file:mlda.util.Utils.java

License:Open Source License

/**
 * Calculate IRs of the ImbalancedFeatures
 *
 * @param dataset Multi-label dataset
 * @param labels Labels of the dataset as ImbalancedFeature objects
 * @return Array of ImbalancedFeature objects with calculated IR
 */
public static ImbalancedFeature[] getImbalancedWithIR(MultiLabelInstances dataset, ImbalancedFeature[] labels) {
    int[] labelIndices = dataset.getLabelIndices();

    ImbalancedFeature[] labels_imbalanced = new ImbalancedFeature[labelIndices.length];

    Instances instances = dataset.getDataSet();

    int nOnes = 0, nZeros = 0, maxAppearance = 0;
    double IRIntraClass;
    double variance;
    double IRInterClass;
    double mean = dataset.getNumInstances() / 2.0;

    Attribute current;
    ImbalancedFeature currentLabel;

    for (int i = 0; i < labelIndices.length; i++) //for each label
    {
        nZeros = 0;
        nOnes = 0;
        current = instances.attribute(labelIndices[i]); //current label

        for (int j = 0; j < instances.size(); j++) //for each instance
        {
            if (instances.instance(j).value(current) == 1.0) {
                nOnes++;
            } else {
                nZeros++;
            }
        }

        try {
            if (nZeros == 0 || nOnes == 0) {
                IRIntraClass = 0;
            } else if (nZeros > nOnes) {
                IRIntraClass = (double) nZeros / nOnes;
            } else {
                IRIntraClass = (double) nOnes / nZeros;
            }
        } catch (Exception e1) {
            IRIntraClass = 0;
        }

        variance = (Math.pow((nZeros - mean), 2) + Math.pow((nOnes - mean), 2)) / 2;

        currentLabel = getLabelByName(current.name(), labels);

        maxAppearance = labels[0].getAppearances();

        if (currentLabel.getAppearances() <= 0) {
            IRInterClass = Double.NaN;
        } else {
            IRInterClass = (double) maxAppearance / currentLabel.getAppearances();
        }

        labels_imbalanced[i] = new ImbalancedFeature(current.name(), currentLabel.getAppearances(),
                IRInterClass, IRIntraClass, variance);
    }

    return labels_imbalanced;
}

From source file:mlflex.WekaInMemoryLearner.java

License:Open Source License

@Override
protected ArrayList<String> SelectOrRankFeatures(ArrayList<String> algorithmParameters,
        DataInstanceCollection trainData, DataInstanceCollection dependentVariableInstances) throws Exception {
    ArrayList<String> dataPointNames = Lists.SortStringList(trainData.GetDataPointNames());

    FastVector attVector = GetAttributeVector(dependentVariableInstances, dataPointNames, trainData);
    Instances instances = GetInstances(dependentVariableInstances, attVector, trainData);

    AttributeSelection attsel = new AttributeSelection();
    ASEvaluation eval = GetAttributeEvaluator(algorithmParameters);
    ASSearch search = GetSearchMethod(algorithmParameters);
    attsel.setEvaluator(eval);
    attsel.setSearch(search);

    boolean isRanker = algorithmParameters.get(2).equals(Ranker.class.getName());

    if (isRanker)
        attsel.setRanking(true);

    attsel.SelectAttributes(instances);

    ArrayList<String> features = new ArrayList<String>();

    if (isRanker) {
        for (double[] rank : attsel.rankedAttributes())
            features.add(instances.attribute((int) rank[0]).name());
    } else {
        for (int i : attsel.selectedAttributes())
            features.add(instances.attribute(i).name());
    }

    instances = null;

    return features;
}

From source file:moa.classifiers.macros.TACNB.java

License:Open Source License

public void initHeader(Instances dataset) {
    int numLabels = this.numOldLabelsOption.getValue();
    Attribute target = dataset.classAttribute();

    List<String> possibleValues = new ArrayList<String>();
    int n = target.numValues();
    for (int i = 0; i < n; i++) {
        possibleValues.add(target.value(i));
    }

    ArrayList<Attribute> attrs = new ArrayList<Attribute>(numLabels + dataset.numAttributes());
    for (int i = 0; i < numLabels; i++) {
        attrs.add(new Attribute(target.name() + "_" + i, possibleValues));
    }
    for (int i = 0; i < dataset.numAttributes(); i++) {
        attrs.add((Attribute) dataset.attribute(i).copy());
    }
    this.header = new Instances("extended_" + dataset.relationName(), attrs, 0);
    this.header.setClassIndex(numLabels + dataset.classIndex());
}

From source file:moa.classifiers.novelClass.AbstractNovelClassClassifier.java

License:Apache License

final public static Instances augmentInstances(Instances datum) {
    ArrayList<Attribute> attInfo = new ArrayList<>(datum.numAttributes());
    for (int aIdx = 0; aIdx < datum.numAttributes(); aIdx++) {
        Attribute a = datum.attribute(aIdx).copy(datum.attribute(aIdx).name());
        if ((aIdx == datum.classIndex()) && (a.indexOfValue(NOVEL_LABEL_STR) < 0)) { // only if we don't already have these
            List<String> values = new ArrayList<>(a.numValues() + 2);
            for (int i = 0; i < a.numValues(); ++i) {
                values.add(a.value(i));
            }
            values.add(OUTLIER_LABEL_STR);
            values.add(NOVEL_LABEL_STR);
            a = new Attribute(a.name(), values, a.getMetadata());
        }
        attInfo.add(a);
    }
    String relationshipName = NOVEL_CLASS_INSTANCE_RELATIONSHIP_TYPE + "-" + datum.relationName();
    Instances ret = new Instances(relationshipName, attInfo, 1);
    ret.setClassIndex(datum.classIndex());

    return ret;
}

From source file:moa.core.VectorDistances.java

License:Apache License

/**
 * Generalized Minkowski distance equation to cover the entire family of distances
 * power near 0 --&gt; Minimum distance (strictly speaking, not a Minkowski distance)
 * power = 1 --&gt; Manhattan distance
 * power = 2 --&gt; Euclidean distance
 * power = INF --&gt; Chebyshev (or maximum) distance
 * @param src first data point to compare from
 * @param dst second data point to compare to
 * @param header feature weight (strictly speaking, all weights should be 1 for pure Minkowski)
 * @param power power used to raise each component distance and 1/p for final reduction
 * @return Minkowski distance between the two points
 */
public static synchronized double distanceMinkowski(double[] src, double[] dst, Instances header,
        double power) {
    double ret = 0.0;
    int minSize = Math.min(src.length, Math.min(dst.length, header.numAttributes()));
    if (minSize < 1) {
        return Double.MAX_VALUE;
    }
    double minDist = Double.MAX_VALUE;
    double maxDist = Double.MIN_VALUE;
    for (int i = 0; i < minSize; i++) {
        double d = Math.abs(src[i] - dst[i]);
        double w = header.attribute(i).weight();
        ret += (d >= (epsilon * epsilon)) ? Math.abs(Math.pow(d, power)) * w : 0;
        if (w > 0) {
            minDist = Math.min(minDist, d);
        }
        if (w > 0) {
            maxDist = Math.max(maxDist, d);
        }
    }

    if (power >= Minkowski_Chebyshev) {
        ret = maxDist;
    } else if (power < 0.000000001) {
        ret = minDist;
    } else {
        ret = (ret >= (epsilon * epsilon)) ? Math.pow(ret, 1.0 / power) : 0;
    }

    // Safety...
    if (Double.isInfinite(ret)) {
        ret = Double.MAX_VALUE;
    } else if (Double.isNaN(ret)) {
        ret = 0.0;
    }

    return ret;
}
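
A short sketch of how the power parameter selects the member of the distance family, assuming the header is a weka.core.Instances built here purely for illustration (attribute weights default to 1):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.Instances;

double[] a = {0.0, 0.0};
double[] b = {3.0, 4.0};

ArrayList<Attribute> atts = new ArrayList<>();
atts.add(new Attribute("x"));
atts.add(new Attribute("y"));
Instances header = new Instances("pair", atts, 0);

double manhattan = VectorDistances.distanceMinkowski(a, b, header, 1.0); // 3 + 4 = 7
double euclidean = VectorDistances.distanceMinkowski(a, b, header, 2.0); // sqrt(9 + 16) = 5
double chebyshev = VectorDistances.distanceMinkowski(a, b, header, Double.POSITIVE_INFINITY); // max(3, 4) = 4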

From source file:moa.core.VectorDistances.java

License:Apache License

/**
 * Average distance, which is a modification of Euclidean distance
 * @param src first data point to compare from
 * @param dst second data point to compare to
 * @param header feature weights and meta data (strictly speaking, all weights should be 1 for pure Minkowski)
 * @return component-averaged Euclidean distance
 */
public static synchronized double distanceAverage(double[] src, double[] dst, Instances header) {
    double ret = 0.0;
    int minSize = Math.min(src.length, Math.min(dst.length, header.numAttributes()));
    if (minSize < 1) {
        return Double.MAX_VALUE;
    }
    for (int i = 0; i < minSize; i++) {
        double d = Math.abs(src[i] - dst[i]);
        ret += d * d * header.attribute(i).weight();
    }
    ret = Math.sqrt(ret / minSize);
    // Safety...
    if (Double.isInfinite(ret)) {
        ret = Double.MAX_VALUE;
    } else if (Double.isNaN(ret)) {
        ret = 0.0;
    }
    return ret;
}

From source file:moa.core.VectorDistances.java

License:Apache License

/**
 * Gower distance, which supports mixed numeric and nominal attributes
 * @param src first data point to compare from
 * @param dst second data point to compare to
 * @param header data set header used to determine attribute/feature type for mixed distance
 * @return Gower distance between the two points
 */
public static synchronized double distanceGower(double[] src, double[] dst, Instances header) {
    double ret = 0.0;
    int minSize = Math.min(src.length, Math.min(dst.length, header.numAttributes()));
    if (minSize < 1) {
        return Double.MAX_VALUE;
    }
    double wSum = 0.0;
    for (int i = 0; i < minSize; i++) {
        Attribute att = header.attribute(i);
        double d = 0.0;
        double w = header.attribute(i).weight();
        if (att == null) {
            continue;
        }
        switch (att.type()) {
        case Attribute.NUMERIC:
            w = (src[i] == 0 || dst[i] == 0) ? 0.0 : 1.0;
            double sigma = Math.abs(
                    header.attribute(i).getUpperNumericBound() - header.attribute(i).getLowerNumericBound());
            d = (Double.isFinite(sigma) && sigma > 0) ? Math.abs(src[i] - dst[i]) / sigma
                    : Math.abs(src[i] - dst[i]) / 1;//Math.max(src[i], dst[i]);
            break;
        case Attribute.NOMINAL:
        case Attribute.STRING:
            d = (src[i] == dst[i]) ? 0.0 : 1.0;
            break;
        case Attribute.DATE:
        case Attribute.RELATIONAL:
        default:
            System.err.println("Attribute type " + Attribute.typeToString(att)
                    + " is not yet supported... ignoring feature " + i);
            d = 0.0;
            w = 0;
        }
        wSum += w;
        ret += d * d * w;
    }
    ret = (wSum > 0) ? Math.sqrt(ret / wSum) : 0.0;
    // Safety...
    if (Double.isInfinite(ret)) {
        ret = Double.MAX_VALUE;
    } else if (Double.isNaN(ret)) {
        ret = 0.0;
    }
    return ret;
}