es.ucm.fdi.ac.outlier.Hampel.java Source code

Java tutorial

Introduction

Here is the source code for es.ucm.fdi.ac.outlier.Hampel.java

Source

/**
 * AC - A source-code copy detector
 *
 *     For more information please visit:  http://github.com/manuel-freire/ac
 *
 * ****************************************************************************
 *
 * This file is part of AC, version 2.0
 *
 * AC is free software: you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation, either version 3 of the License,
 * or (at your option) any later version.
 *
 * AC is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with AC.  If not, see <http://www.gnu.org/licenses/>.
 */

package es.ucm.fdi.ac.outlier;

import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.math.stat.StatUtils;

/**
 * @author Manuel Cebrian (manuel.cebrian-at-uam.es)
 * FUNCTION THR = HAMPEL(DATA)
 * 
 * Description: this function calculates a threshold for detecting outliers
 * by means of the Hampel indentifier [1].
 * 
 * This statistic is calculated as |X-M|/(MAD/.6745), and compared to a value K,
 * i.e. if |X-M|/(MAD/.6745) > K then X is identified as an outlier. As in
 * plagiarism outliers are low distances, we have modified the Hampel
 * identified to take into account only the lowest distances. Thus, the
 * statistic is now (M-X)/(MAD/.6745), which gives high values to outliying
 * low distances, and low values (even negatives) to high distances.
 * 
 * Several values for K can be chosen 3.5 [1], 2.24 [2] or more sophisticaed
 * ones which are a function of the sample size [3].
 * In.. this implementation we have chosen the following standarization [3, p.783]
 * P(no outliers in the sample) = 1 - alpha. To do this, we mark as outliers
 * all X such that |X-M|/(MAD/.6745) > g(N,alpha). We are limited to use
 * alpha = 0.01 and alpha = 0.05, the reasons for this limitation are in
 * [3, p. 791].
 * 
 * Input: an array of distances between assignments.
 * 
 * Output: a threshold so that data &lt; thr are identified as outliers.
 * 
 * Bibliography:
 * 
 * [1] Frank R. Hampel, The Breakdown Points of the Mean Combined With Some
 * Rejection Rules. Technometrics, Vol. 25, 1985.
 * 
 * [2] Rand R. Wilcox, Appliying Contemporary Statistical Techniques,
 * Elsevier Science, 2003.
 * 
 * [3] Laurie Davies and Ursula Gather, The Identification of Multiple
 * Outliers, Journal of the American Statistical Association, 1993.
 */

public class Hampel {

    /** 
     * median/stdDistDevEstimator is a  robust estimator for the deviation of 
     * the standard distribution (see [2])
     */
    private static final double stdDistDevEstimator = 0.6745;

    /** Default alpha values */
    private static final double[] defaultAlphaValues = new double[] { 0.01, 0.05 };

    /** Default montecarlo size (number of gaussian samples) */
    private static final int nMontecarlos = 1000;

    /** A very small number; results will are guaranteed to fall between EPSILON and 1-EPSILON */
    private static final double EPSILON = 0.000001;

    /**
     * Calculate the threshold identifier for this distribution, using 
     * using default 'alpha' values
     */
    public static List<Double> hampel(double[] array) {
        ArrayList<Double> al = new ArrayList<Double>();
        for (double alpha : defaultAlphaValues) {
            al.add(hampel(analyticK(array.length, alpha), array));
        }
        return al;
    }

    /**
     * calculate the threshold identifier for a distribution, given a 
     * 'k' value. Does not return negative values, even when Hampel generally
     * could.
     */
    public static double hampel(double k, double[] array) {

        double m = median(array);
        double s = madn(array);
        double v = Math.max(EPSILON, m - k * s);

        return Math.min(1 - EPSILON, v);
    }

    /**
     * median absolute deviation from the median 
     */
    public static double madn(double[] array) {
        double aux[] = new double[array.length];
        double madn = 0;
        double m = median(array);

        for (int i = 0; i < array.length; i++)
            aux[i] = Math.abs(array[i] - m);

        madn = median(aux) / stdDistDevEstimator;

        return madn;
    }

    /**
     * median of an array
     */
    public static double median(double[] array) {
        Arrays.sort(array); // sort the elements
        boolean even = (array.length % 2) == 0;
        double median = 0;

        if (even) {
            int rightNumber = array.length / 2;
            int leftNumber = rightNumber - 1;
            // return average of two center-elements
            median = (array[rightNumber] + array[leftNumber]) / 2;
        } else {
            // choose the center one            
            median = array[array.length / 2];
        }

        return median;
    }

    /**
     * Ugly lookup of 'k' given alpha. Alpha must be either 0.01 or 0.05
     * for this to work as expected.
     */
    private static double analyticK(int n, double alpha) {
        double k;
        double logN = Math.log(n);
        double sqrtTwoLogN = Math.sqrt(2 * logN);
        double z_n_alpha = sqrtTwoLogN - Math.log(alpha / 2) / sqrtTwoLogN
                - (Math.log(logN) + Math.log(4 * Math.PI)) / (2 * sqrtTwoLogN);

        if (alpha == 0.05) {
            if ((n % 2) == 0) {//N even
                k = 1.483 * z_n_alpha + 21.61 * Math.pow(n + 1, -0.8655);
            } else {
                k = 1.483 * z_n_alpha + 14.43 * Math.pow(n - 3, -0.7939);
            }

            return k;

        } else if (alpha == 0.01) {
            if ((n % 2) == 0) {//N even
                k = 1.483 * z_n_alpha + 41.39 * Math.pow(n, -0.9143);
            } else {
                k = 1.483 * z_n_alpha + 24.48 * Math.pow(n - 5, -0.8236);
            }

            return k;
        } else {
            return Double.NaN;
            //            throw new IllegalArgumentException(
            //                    "Alpha IS RESTRICTED to be 0.01 or 0.05 ");
        }
    }

    /**
     * Montecarlo estimation of 'k' given alpha
     * @param n number of entries
     * @param alpha value of free parameter for the Hampel identifier
     * @return k
     */
    public static double montecarloK(int n, double alpha) {

        double statistic[] = new double[nMontecarlos];
        double ra[] = new double[n]; // a random array

        System.err.println("initializing cache...");
        RandomCache.init(32 * 1024 * 1024);
        System.err.println("Starting calculations...");
        for (int i = 0; i < nMontecarlos; i++) {
            RandomCache.fill(ra, 0, n);

            double med = median(ra);
            double madN = madn(ra);

            // Absolute value of sample i
            for (int k = 0; k < n; k++) {
                ra[k] = Math.abs(ra[k]);
            }

            // Hampel statistic computation
            statistic[i] = (StatUtils.max(ra) - med) / madN;
        }

        // calculate result
        double result = StatUtils.percentile(statistic, 100 * (1 - alpha));

        return result;
    }

    /**
     * To regenerate cache from command-line use
     * java -cp classes:lib/commons-math-1.1.jar eps.outlier.Hampel
     */
    public static void main(String args[]) throws Exception {

        System.err.println("init...");
        Hampel o = new Hampel();
        System.err.println("init OK");

        double[] enes = { 1, 10, 20, 40, 80, 160, 320, 640, 1280, 5120, 10240 }; // 11 rows
        double[] alfas = { 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.2, 0.25, 0.5, 0.75 }; // 10 cols
        double[] values = new double[enes.length * alfas.length];

        long startTime = System.currentTimeMillis();
        for (int i = 0, k = 0; i < enes.length; i++) {
            for (int j = 0; j < alfas.length; j++) {
                long partialStart = System.currentTimeMillis();
                values[k++] = o.montecarloK((int) enes[i], alfas[j]);
                long partialTime = System.currentTimeMillis() - partialStart;
                System.err.println(
                        "" + k + " down (" + (partialTime / 1000.0f) + " s), " + (values.length - k) + " to go...");
            }
        }
        long endTime = System.currentTimeMillis();
        float seconds = (endTime - startTime) / 1000.0f;

        ArrayList<double[]> al = new ArrayList<double[]>();
        al.add(enes);
        al.add(alfas);
        Interpolator ip = new Interpolator(2, al, values);
        ip.saveGrid(new FileWriter("/tmp/saved_grid.txt"));

        System.err.println("Finished after " + seconds + " s");
    }
}