Example usage for weka.core Instances meanOrMode

List of usage examples for weka.core Instances meanOrMode

Introduction

In this page you can find the example usage for weka.core Instances meanOrMode.

Prototype

publicdouble meanOrMode(Attribute att) 

Source Link

Document

Returns the mean (mode) for a numeric (nominal) attribute as a floating-point value.

Usage

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>/*from w  ww  . jav  a  2 s.  co m*/
 * Z-Score normalization using the mean and std of the training data (N3 in Transfer Defect
 * Learning by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindata
 *            training data
 */
public static void zScoreTraining(Instances testdata, Instances traindata) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of training
    for (int j = 0; j < traindata.numAttributes(); j++) {
        if (traindata.classIndex() != j) {
            mean[j] = traindata.meanOrMode(j);
            std[j] = Math.sqrt(traindata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    applyZScore(traindata, mean, std);
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>/*from w w  w.j ava  2  s  .c om*/
 * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindata
 *            training data
 */
public static void zScoreTarget(Instances testdata, Instances traindata) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of testdata
    for (int j = 0; j < testdata.numAttributes(); j++) {
        if (testdata.classIndex() != j) {
            mean[j] = testdata.meanOrMode(j);
            std[j] = Math.sqrt(testdata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    applyZScore(traindata, mean, std);
}

From source file:de.ugoe.cs.cpdp.dataprocessing.NormalizationUtil.java

License:Apache License

/**
 * <p>/*  ww w  .j  av  a2s.c o m*/
 * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning
 * by Nam et al.).
 * </p>
 *
 * @param testdata
 *            test data of the target product
 * @param traindataSet
 *            training data
 */
public static void zScoreTarget(Instances testdata, SetUniqueList<Instances> traindataSet) {
    final double[] mean = new double[testdata.numAttributes()];
    final double[] std = new double[testdata.numAttributes()];

    // get means of testdata
    for (int j = 0; j < testdata.numAttributes(); j++) {
        if (testdata.classIndex() != j) {
            mean[j] = testdata.meanOrMode(j);
            std[j] = Math.sqrt(testdata.variance(j));
        }
    }

    applyZScore(testdata, mean, std);
    for (Instances traindata : traindataSet) {
        applyZScore(traindata, mean, std);
    }
}

From source file:de.unimannheim.dws.algorithms.CustomSimpleKMedian.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid
 * coordinates based on it's members (objects assigned to the cluster of the
 * centroid) and the distance function being used.
 * /*  w ww  . j  a  v  a 2s .  co m*/
 * @param centroidIndex index of the centroid which the coordinates will be
 *          computed
 * @param members the objects that are assigned to the cluster of this
 *          centroid
 * @param updateClusterInfo if the method is supposed to update the m_Cluster
 *          arrays
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance
            || m_DistanceFunction instanceof CustomPairWiseDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        // in case of Euclidian distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance
                || m_DistanceFunction instanceof CustomPairWiseDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                if (dataIsEven) {
                    vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Instance.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Instance.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (updateClusterInfo) {
        m_ClusterCentroids.add(new Instance(1.0, vals));
    }
    return vals;
}

From source file:dewaweebtreeclassifier.Sujeong.java

public void buildTree(Instances instances) throws java.lang.Exception {
    if (instances.numAttributes() < 1) {
        throw new Exception("Data instances need to have minimum of 1 attribute.");
    } else if (instances.numAttributes() == 1) {
        this.value = instances.meanOrMode(instances.classIndex());
    } else {/*from  w w  w.  ja  v  a  2  s .c om*/
        Enumeration attrs = instances.enumerateAttributes();
        double informationGain = 0.0;
        while (attrs.hasMoreElements()) {
            Attribute attr = (Attribute) attrs.nextElement();
            double tmpGain = computeGain(instances, attr);
            if (tmpGain > informationGain) {
                bestAttr = attr;
                informationGain = tmpGain;
            }
        }
        if (bestAttr != null) {
            double mode = instances.meanOrMode(instances.classIndex());
            Instances[] chunks = splitInstancesOnAttribute(instances, bestAttr);
            children = new Sujeong[chunks.length];
            for (int i = 0; i < chunks.length; ++i) {
                Instances chunk = chunks[i];
                Sujeong child = new Sujeong();
                children[i] = child;
                if (chunk.numInstances() > 0)
                    child.buildTree(chunk);
                else
                    child.value = mode;
            }
        } else {
            this.value = instances.meanOrMode(instances.classIndex());
        }
    }
}

From source file:fantail.algorithms.BinaryART.java

License:Open Source License

private void makeTree(Instances data, java.util.Random r, int depth) throws Exception {
    if (m_K > data.numAttributes()) {
        m_K = data.numAttributes() - 1;//w ww.ja  va 2  s. com
    }

    if (m_K < 1) {
        m_K = (int) weka.core.Utils.log2(data.numAttributes()) + 1;
    }

    int[] randAtts = new int[data.numAttributes() - 1];
    //TODO: handle class target att
    for (int i = 0; i < randAtts.length; i++) {
        randAtts[i] = i;
    }
    for (int i = 0; i < randAtts.length; i++) {
        int randomPosition = r.nextInt(randAtts.length);
        int temp = randAtts[i];
        randAtts[i] = randAtts[randomPosition];
        randAtts[randomPosition] = temp;
    }

    int bestAttIndex = -1;

    AttScorePair[] attScorePair = new AttScorePair[m_K];

    //double currentR2 = estimateAvgDistanceSpearman(data);
    for (int i = 0; i < m_K; i++) {
        int attIndex = randAtts[i];

        double splitPoint = Double.NaN;

        if (!m_UseMedian) {
            splitPoint = data.meanOrMode(attIndex);
        } else {
            splitPoint = getMedian(data, attIndex);
        }

        double r2 = estimateR2(data, attIndex, splitPoint);
        attScorePair[i] = new AttScorePair(attIndex, r2);
    }

    Arrays.sort(attScorePair);

    bestAttIndex = attScorePair[0].index;
    double maxR2 = attScorePair[0].score;
    boolean stop1 = false;

    //        for (int kk = 0; kk < attScorePair.length; kk++) {
    //            System.out.println(attScorePair[kk].score);
    //        }
    //        if (true) {
    //            throw new Exception("stop");
    //        }
    if (attScorePair[0].score <= attScorePair[m_K - 1].score) {
        stop1 = true;
    }

    if (data.numInstances() <= m_MiniLeaf || (depth >= m_MaxDepth && m_MaxDepth != 0)
    //|| maxR2 <= 0.01 // removed 10/01/2013
            || maxR2 >= 0.95 || stop1 // 11/01/13 the paper version doesn't have this
            || data.variance(bestAttIndex) <= 0) {

        m_Attribute = null;
        m_Prototype = AbstractRanker.getAvgRanking(data);
        //m_Prototype = AbstractRanker.getCenterRanking(data, m_ApproxCenterMethod);
        return;
    }

    m_Attribute = data.attribute(bestAttIndex);
    if (!m_UseMedian) {
        m_SplitPoint = data.meanOrMode(bestAttIndex);
    } else {
        m_SplitPoint = getMedian(data, bestAttIndex);
    }
    Instances[] splitData = splitData(data, bestAttIndex, m_SplitPoint);

    m_Successors = new BinaryART[2];
    for (int j = 0; j < 2; j++) {
        m_Successors[j] = new BinaryART();
        m_Successors[j].setMiniLeaf(m_MiniLeaf);
        m_Successors[j].setK(m_K);
        m_Successors[j].setUseMedian(m_UseMedian);
        m_Successors[j].setNumObjects(m_NumObjects);

        m_Successors[j].makeTree(splitData[j], r, depth + 1);
    }
}

From source file:fantail.algorithms.RankingWithBinaryPCT.java

License:Open Source License

private void makeTree(Instances data, Random r, int depth) throws Exception {

    if (data.numInstances() <= m_MiniLeaf || (depth >= m_MaxDepth && m_MaxDepth != 0)
            || computeVariance(data) <= m_MinVariancea) {
        //|| maxVarianceaReduction <= 0
        //|| data.variance(bestAttIndex) <= 0) { // || data.variance(bestAttIndex) <= 0 ) {   copied from ART, 

        m_Attribute = null;//from   w  w w .j a va2s .c  o m
        m_Prototype = AbstractRanker.getAvgRanking(data);
        return;
    }

    //
    if (m_K > data.numAttributes()) {
        m_K = data.numAttributes();
    }
    if (m_K < 1) {
        m_K = (int) weka.core.Utils.log2(data.numAttributes()) + 1;
    }

    // TODO:
    int[] attIndice = new int[data.numAttributes() - 1];
    for (int i = 0; i < attIndice.length; i++) {
        attIndice[i] = i;
    }
    for (int i = 0; i < attIndice.length; i++) {
        //int randomPosition = getRandomPosition(r, attIndice);
        int randomPosition = r.nextInt(attIndice.length);
        int temp = attIndice[i];
        attIndice[i] = attIndice[randomPosition];
        attIndice[randomPosition] = temp;
    }

    AttScorePair[] attScorePair = new AttScorePair[m_K];

    for (int i = 0; i < m_K; i++) {
        int attIndex = attIndice[i];

        double splitPoint = Double.NaN;
        if (!m_UseMedian) {
            splitPoint = data.meanOrMode(attIndex);
        } else {
            splitPoint = getMedian(data, attIndex);
        }
        double varianceReduction = computeVarianceReduction(data, attIndex, splitPoint);
        attScorePair[i] = new AttScorePair(attIndex, varianceReduction);

    }

    Arrays.sort(attScorePair);
    int randAttIndex = 0;
    int bestAttIndex = attScorePair[randAttIndex].index;

    double maxVarianceaReduction = attScorePair[randAttIndex].score;

    //        if (data.numInstances() <= 1 * m_MiniLeaf
    //                || (depth >= m_MaxDepth && m_MaxDepth != 0)
    //                || computeVariance(data) <= m_MinVariancea) {
    //                //|| maxVarianceaReduction <= 0
    //                //|| data.variance(bestAttIndex) <= 0) { // || data.variance(bestAttIndex) <= 0 ) {   copied from ART, 
    //
    //            m_Attribute = null;
    //            m_Prototype = AbstractRanker.getAvgRanking(data);
    //            return;
    //        }
    m_Attribute = data.attribute(bestAttIndex);

    if (!m_UseMedian) {
        m_SplitPoint = data.meanOrMode(bestAttIndex);
    } else {
        m_SplitPoint = getMedian(data, bestAttIndex);
    }

    //m_SplitPoint = data.meanOrMode(m_Attribute);
    Instances[] splitData = splitData(data, bestAttIndex, m_SplitPoint);

    //System.out.println(splitData[0].numInstances());
    //System.out.println(splitData[1].numInstances());
    //System.out.println();

    m_Successors = new RankingWithBinaryPCT[2];

    for (int j = 0; j < 2; j++) {
        m_Successors[j] = new RankingWithBinaryPCT();
        m_Successors[j].setMiniLeaf(m_MiniLeaf);
        m_Successors[j].setK(m_K);
        m_Successors[j].setUseMedian(m_UseMedian);
        m_Successors[j].setNumTargetLabels(m_NumTargetLabels);
        m_Successors[j].makeTree(splitData[j], r, depth + 1);
    }
}

From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java

License:Open Source License

/**
 * Move the centroid to it's new coordinates. Generate the centroid coordinates based on it's members
 * (objects assigned to the cluster of the centroid) and the distance function being used.
 * //from   w w w . j ava 2s.  c o  m
 * @param centroidIndex
 *            index of the centroid which the coordinates will be computed
 * @param members
 *            the objects that are assigned to the cluster of this centroid
 * @param updateClusterInfo
 *            if the method is supposed to update the m_Cluster arrays
 * @param addToCentroidInstances
 *            true if the method is to add the computed coordinates to the Instances holding the centroids
 * @return the centroid coordinates
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo,
        boolean addToCentroidInstances) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
        middle = (members.numInstances() - 1) / 2;
        dataIsEven = ((members.numInstances() % 2) == 0);
        if (m_PreserveOrder) {
            sortedMembers = members;
        } else {
            sortedMembers = new Instances(members);
        }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

        // in case of Euclidian distance the centroid is the mean point
        // in case of Manhattan distance the centroid is the median point
        // in both cases, if the attribute is nominal, the centroid is the mode
        if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
            vals[j] = members.meanOrMode(j);
        } else if (m_DistanceFunction instanceof ManhattanDistance) {
            // singleton special case
            if (members.numInstances() == 1) {
                vals[j] = members.instance(0).value(j);
            } else {
                vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
                if (dataIsEven) {
                    vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
                }
            }
        }

        if (updateClusterInfo) {
            m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
            m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
            if (members.attribute(j).isNominal()) {
                if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
                        .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
                    vals[j] = Utils.missingValue(); // mark mode as missing
                }
            } else {
                if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
                    vals[j] = Utils.missingValue(); // mark mean as missing
                }
            }
        }
    }
    if (addToCentroidInstances) {
        m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    }
    return vals;
}

From source file:lu.lippmann.cdb.lab.beta.util.WekaUtil2.java

License:Open Source License

/**
 * Generate the centroid coordinates based 
 * on it's  members (objects assigned to the cluster of the centroid) and the distance 
 * function being used./*from   w  ww . ja v  a  2  s.c  o m*/
 * @return the centroid
 */
public static MixedCentroid computeMixedCentroid(final boolean preserveOrder,
        final NormalizableDistance distanceFunction, final Instances numericInstances,
        final Instances originalInstances, final int clusterIndex) {
    final int numInstances = numericInstances.numInstances();
    final int numAttributes = numericInstances.numAttributes();

    final Map<TupleSI, Integer> addedAttr = new HashMap<TupleSI, Integer>();

    if (numInstances == 1) {
        Instance uniqueNumInstance = numericInstances.firstInstance();
        Instance uniqueMixInstance = originalInstances.firstInstance();
        double[] centroid = uniqueNumInstance.toDoubleArray();
        for (int i = 0; i < uniqueMixInstance.numAttributes(); i++) {
            if (!uniqueMixInstance.attribute(i).isNumeric()) {
                final String catVal = uniqueMixInstance.attribute(i).value((int) uniqueMixInstance.value(i));
                addedAttr.put(new TupleSI(catVal, i), 1);
            }
        }
        return new MixedCentroid(clusterIndex, centroid, addedAttr);
    }

    final double[] vals = new double[numAttributes];

    //used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    final boolean isManhattanDist = (distanceFunction instanceof ManhattanDistance);
    final boolean isEuclideanDist = (distanceFunction instanceof EuclideanDistance);

    if (isManhattanDist) {
        middle = (numInstances - 1) / 2;
        dataIsEven = ((numInstances % 2) == 0);
        if (preserveOrder) {
            sortedMembers = numericInstances;
        } else {
            sortedMembers = new Instances(numericInstances);
        }
    }

    for (int j = 0; j < numAttributes; j++) {
        //in case of Euclidian distance the centroid is the mean point
        //in case of Manhattan distance the centroid is the median point
        //in both cases, if the attribute is nominal, the centroid is the mode            
        if (isEuclideanDist) {
            vals[j] = numericInstances.meanOrMode(j);

            for (int i = 0; i < numInstances; i++) {
                if (!originalInstances.attribute(j).isNumeric()) {
                    final Instance instance = originalInstances.instance(i);
                    final String catVal = instance.attribute(j).value((int) instance.value(j));
                    //Initialize map
                    final TupleSI key = new TupleSI(catVal, j);
                    if (!addedAttr.containsKey(key))
                        addedAttr.put(key, 0);
                    addedAttr.put(key, addedAttr.get(key) + 1);
                }
            }
        } else if (isManhattanDist) {
            sortedMembers.kthSmallestValue(j, middle + 1);
            vals[j] = sortedMembers.instance(middle).value(j);
            if (dataIsEven) {
                sortedMembers.kthSmallestValue(j, middle + 2);
                vals[j] = (vals[j] + sortedMembers.instance(middle + 1).value(j)) / 2;
            }
        } else {
            throw new IllegalStateException("Not handled distance ...");
        }
    }

    return new MixedCentroid(clusterIndex, vals, addedAttr);
}

From source file:meansagnes.MyKMeans.java

protected Instance moveCentroid(Instances instances) {
    double[] vals = new double[instances.numAttributes()];
    for (int k = 0; k < instances.numAttributes(); k++) {
        vals[k] = instances.meanOrMode(k);
    }/*  ww w  .ja va 2  s.c o m*/
    return new Instance(1.0, vals);
}