de.uni_potsdam.hpi.bpt.promnicat.util.WeightedEuclideanDistance.java Source code

Introduction

Here is the source code for de.uni_potsdam.hpi.bpt.promnicat.util.WeightedEuclideanDistance.java
Source

/**
 * PromniCAT - Collection and Analysis of Business Process Models
 * Copyright (C) 2012 Cindy Fähnrich, Tobias Hoppe, Andrina Mascher
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.uni_potsdam.hpi.bpt.promnicat.util;

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.neighboursearch.PerformanceStats;
import de.uni_potsdam.hpi.bpt.promnicat.analysisModules.clustering.ProcessInstance;
import de.uni_potsdam.hpi.bpt.promnicat.analysisModules.clustering.ProcessInstances;

/**
 * Class derived from the EuclideanDistance class in WEKA, but also taking into account
 * attribute weights and normalized values.
 * @author Cindy Fhnrich
 *
 */
public class WeightedEuclideanDistance extends weka.core.EuclideanDistance {

    /** for serialization. */
    private static final long serialVersionUID = 1068606253458807903L;

    /** indicates whether to normalize the values or not */
    private boolean normalizeEnabled = false;
    /** contains the maximum values for the different attributes for normalization */
    private ArrayList<double[]> normFeatureValues;

    /**
     * Constructs an Euclidean Distance object, Instances must be still set.
     * Enables normalization.
     * @param maxFeatures
     *          the maximum values of the different numeric features
     */
    public WeightedEuclideanDistance(ArrayList<double[]> normFeatures) {
        super();
        normFeatureValues = normFeatures;
        normalizeEnabled = true;
    }

    public WeightedEuclideanDistance() {
        super();
    }

    /**
     * Constructs an Euclidean Distance object and automatically initializes the
     * ranges.
     * 
     * @param data    the instances the distance function should work on
     */
    public WeightedEuclideanDistance(ProcessInstances data) {
        super(data);
    }

    /**
     * Sets the maximum values for normalization
     * @param maxFeatures
     *          maximum values of the different attributes
     */
    public void setMaximumFeatureValues(ArrayList<double[]> normFeatures) {
        normFeatureValues = normFeatures;
        normalizeEnabled = true;
    }

    /**
     * Calculates the distance between two instances.
     * 
     * @param first    the first instance
     * @param second    the second instance
     * @return       the distance between the two given instances
     */
    public double distance(ProcessInstance first, ProcessInstance second) {
        return Math.sqrt(distance(first, second, Double.POSITIVE_INFINITY));
    }

    /**
     * Calculates the distance (or similarity) between two instances. Need to
     * pass this returned distance later on to postprocess method to set it on
     * correct scale. <br/>
     * P.S.: Please don't mix the use of this function with
     * distance(Instance first, Instance second), as that already does post
     * processing. Please consider passing Double.POSITIVE_INFINITY as the cutOffValue to
     * this function and then later on do the post processing on all the
     * distances.
     *
     * @param first    the first instance
     * @param second    the second instance
     * @param stats    the structure for storing performance statistics.
     * @return       the distance between the two given instances or 
     *          Double.POSITIVE_INFINITY.
     */
    public double distance(ProcessInstance first, ProcessInstance second, PerformanceStats stats) { //debug method pls remove after use
        return Math.sqrt(distance(first, second, Double.POSITIVE_INFINITY, stats));
    }

    /**
     * Computes the difference between two given attribute
     * values. Incorporates weights if enabled.
     * 
     * @param index   the attribute index
     * @param val1   the first value
     * @param val2   the second value
     * @return      the difference
     */
    protected double difference(int index, double val1, double val2) {
        double weight = m_Data.attribute(index).weight();

        //normalize features by formula: (x - minimum) / (maximum - minimum)
        if (normalizeEnabled) {
            val1 = (val1 - normFeatureValues.get(0)[index])
                    / (normFeatureValues.get(1)[index] - normFeatureValues.get(0)[index]);
            val2 /= (val2 - normFeatureValues.get(0)[index])
                    / (normFeatureValues.get(1)[index] - normFeatureValues.get(0)[index]);
        }
        switch (m_Data.attribute(index).type()) {
        case Attribute.NOMINAL:
            if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2) || ((int) val1 != (int) val2)) {
                return weight * 1;
            } else {
                return 0;
            }

        case Attribute.NUMERIC:
            if (Instance.isMissingValue(val1) || Instance.isMissingValue(val2)) {
                if (Instance.isMissingValue(val1) && Instance.isMissingValue(val2)) {
                    if (!m_DontNormalize) //We are doing normalization
                        return weight * 1;
                    else
                        return weight * (m_Ranges[index][R_MAX] - m_Ranges[index][R_MIN]);
                } else {
                    double diff;
                    if (Instance.isMissingValue(val2)) {
                        diff = (!m_DontNormalize) ? norm(val1, index) : val1;
                    } else {
                        diff = (!m_DontNormalize) ? norm(val2, index) : val2;
                    }
                    if (!m_DontNormalize && diff < 0.5) {
                        diff = 1.0 - diff;
                    } else if (m_DontNormalize) {
                        if ((m_Ranges[index][R_MAX] - diff) > (diff - m_Ranges[index][R_MIN]))
                            return weight * (m_Ranges[index][R_MAX] - diff);
                        else
                            return weight * (diff - m_Ranges[index][R_MIN]);
                    }
                    return weight * diff;
                }
            } else {
                return (!m_DontNormalize) ? weight * (norm(val1, index) - norm(val2, index))
                        : weight * (val1 - val2);
            }

        default:
            return 0;
        }
    }

    /**
     * Calculates the distance between two instances. Offers speed up (if the 
     * distance function class in use supports it) in nearest neighbour search by 
     * taking into account the cutOff or maximum distance. Depending on the 
     * distance function class, post processing of the distances by 
     * postProcessDistances(double []) may be required if this function is used.
     *
     * @param first    the first instance
     * @param second    the second instance
     * @param cutOffValue If the distance being calculated becomes larger than 
     *                    cutOffValue then the rest of the calculation is 
     *                    discarded.
     * @param stats    the performance stats object
     * @return       the distance between the two given instances or 
     *          Double.POSITIVE_INFINITY if the distance being 
     *          calculated becomes larger than cutOffValue. 
     */
    public double distance(Instance first, Instance second, double cutOffValue, PerformanceStats stats) {
        double distance = 0;
        int firstI, secondI;
        int firstNumValues = first.numValues();
        int secondNumValues = second.numValues();
        int numAttributes = m_Data.numAttributes();
        int classIndex = m_Data.classIndex();
        double weights = 1;

        validate();

        for (int p1 = 0, p2 = 0; p1 < firstNumValues || p2 < secondNumValues;) {
            weights += first.attribute(p1).weight();
            if (p1 >= firstNumValues)
                firstI = numAttributes;
            else
                firstI = first.index(p1);

            if (p2 >= secondNumValues)
                secondI = numAttributes;
            else
                secondI = second.index(p2);

            if (firstI == classIndex) {
                p1++;
                continue;
            }
            if ((firstI < numAttributes) && !m_ActiveIndices[firstI]) {
                p1++;
                continue;
            }

            if (secondI == classIndex) {
                p2++;
                continue;
            }
            if ((secondI < numAttributes) && !m_ActiveIndices[secondI]) {
                p2++;
                continue;
            }

            double diff;

            if (firstI == secondI) {
                diff = difference(firstI, first.valueSparse(p1), second.valueSparse(p2));
                p1++;
                p2++;
            } else if (firstI > secondI) {
                diff = difference(secondI, 0, second.valueSparse(p2));
                p2++;
            } else {
                diff = difference(firstI, first.valueSparse(p1), 0);
                p1++;
            }
            if (stats != null)
                stats.incrCoordCount();

            distance = updateDistance(distance, diff);
            if (distance > cutOffValue)
                return Double.POSITIVE_INFINITY;
        }

        if (weights > 1) {
            return distance / (weights - 1);
        }
        return distance / weights;
    }
}