StatFunctions.java Source code

Java tutorial

Introduction

Here is the source code for StatFunctions.java

Source

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/** 
 @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

//package cc.mallet.util;

public final class StatFunctions {
    public static double cov(Univariate x, Univariate y) {
        double sumxy = 0;
        int i, n = (x.size() >= y.size() ? x.size() : y.size());
        try {
            for (i = 0; i < x.size(); i++)
                sumxy += (x.elementAt(i) - x.mean()) * (y.elementAt(i) - y.mean());
        } catch (ArrayIndexOutOfBoundsException e) {
            System.out.println("size of x != size of y");
            e.printStackTrace();
        }
        return (sumxy / (n - 1));
    }

    public static double corr(Univariate x, Univariate y) {
        double cov = cov(x, y);
        return (cov / (x.stdev() * y.stdev()));
    }

    public static double[] ols(Univariate x, Univariate y) {
        double[] coef = new double[2];
        int i, n = (x.size() <= y.size() ? x.size() : y.size());
        double sxy = 0.0, sxx = 0.0;
        double xbar = x.mean(), ybar = y.mean(), xi, yi;
        for (i = 0; i < n; i++) {
            xi = x.elementAt(i);
            yi = y.elementAt(i);
            sxy += (xi - xbar) * (yi - ybar);
            sxx += (xi - xbar) * (xi - xbar);
        }
        coef[0] = sxy / sxx;
        coef[1] = ybar - coef[0] * xbar;
        return (coef);
    }

    public static double qnorm(double p, boolean upper) {
        /*
         * Reference: J. D. Beasley and S. G. Springer Algorithm AS 111:
         * "The Percentage Points of the Normal Distribution" Applied Statistics
         */
        if (p < 0 || p > 1)
            throw new IllegalArgumentException("Illegal argument " + p + " for qnorm(p).");
        double split = 0.42, a0 = 2.50662823884, a1 = -18.61500062529, a2 = 41.39119773534, a3 = -25.44106049637,
                b1 = -8.47351093090, b2 = 23.08336743743, b3 = -21.06224101826, b4 = 3.13082909833,
                c0 = -2.78718931138, c1 = -2.29796479134, c2 = 4.85014127135, c3 = 2.32121276858,
                d1 = 3.54388924762, d2 = 1.63706781897, q = p - 0.5;
        double r, ppnd;
        if (Math.abs(q) <= split) {
            r = q * q;
            ppnd = q * (((a3 * r + a2) * r + a1) * r + a0) / ((((b4 * r + b3) * r + b2) * r + b1) * r + 1);
        } else {
            r = p;
            if (q > 0)
                r = 1 - p;
            if (r > 0) {
                r = Math.sqrt(-Math.log(r));
                ppnd = (((c3 * r + c2) * r + c1) * r + c0) / ((d2 * r + d1) * r + 1);
                if (q < 0)
                    ppnd = -ppnd;
            } else {
                ppnd = 0;
            }
        }
        if (upper)
            ppnd = 1 - ppnd;
        return (ppnd);
    }

    public static double qnorm(double p, boolean upper, double mu, double sigma2) {
        return (qnorm(p, upper) * Math.sqrt(sigma2) + mu);
    }

    public static double pnorm(double z, boolean upper) {
        /*
         * Reference: I. D. Hill Algorithm AS 66: "The Normal Integral" Applied
         * Statistics
         */
        double ltone = 7.0, utzero = 18.66, con = 1.28, a1 = 0.398942280444, a2 = 0.399903438504,
                a3 = 5.75885480458, a4 = 29.8213557808, a5 = 2.62433121679, a6 = 48.6959930692, a7 = 5.92885724438,
                b1 = 0.398942280385, b2 = 3.8052e-8, b3 = 1.00000615302, b4 = 3.98064794e-4, b5 = 1.986153813664,
                b6 = 0.151679116635, b7 = 5.29330324926, b8 = 4.8385912808, b9 = 15.1508972451,
                b10 = 0.742380924027, b11 = 30.789933034, b12 = 3.99019417011;
        double y, alnorm;

        if (z < 0) {
            upper = !upper;
            z = -z;
        }
        if (z <= ltone || upper && z <= utzero) {
            y = 0.5 * z * z;
            if (z > con) {
                alnorm = b1 * Math.exp(-y) / (z - b2
                        + b3 / (z + b4 + b5 / (z - b6 + b7 / (z + b8 - b9 / (z + b10 + b11 / (z + b12))))));
            } else {
                alnorm = 0.5 - z * (a1 - a2 * y / (y + a3 - a4 / (y + a5 + a6 / (y + a7))));
            }
        } else {
            alnorm = 0;
        }
        if (!upper)
            alnorm = 1 - alnorm;
        return (alnorm);
    }

    public static double pnorm(double x, boolean upper, double mu, double sigma2) {
        return (pnorm((x - mu) / Math.sqrt(sigma2), upper));
    }

    public static double qt(double p, double ndf, boolean lower_tail) {
        // Algorithm 396: Student's t-quantiles by
        // G.W. Hill CACM 13(10), 619-620, October 1970
        if (p <= 0 || p >= 1 || ndf < 1)
            throw new IllegalArgumentException("Invalid p or df in call to qt(double,double,boolean).");
        double eps = 1e-12;
        double M_PI_2 = 1.570796326794896619231321691640; // pi/2
        boolean neg;
        double P, q, prob, a, b, c, d, y, x;
        if ((lower_tail && p > 0.5) || (!lower_tail && p < 0.5)) {
            neg = false;
            P = 2 * (lower_tail ? (1 - p) : p);
        } else {
            neg = true;
            P = 2 * (lower_tail ? p : (1 - p));
        }

        if (Math.abs(ndf - 2) < eps) { /* df ~= 2 */
            q = Math.sqrt(2 / (P * (2 - P)) - 2);
        } else if (ndf < 1 + eps) { /* df ~= 1 */
            prob = P * M_PI_2;
            q = Math.cos(prob) / Math.sin(prob);
        } else { /*-- usual case;  including, e.g.,  df = 1.1 */
            a = 1 / (ndf - 0.5);
            b = 48 / (a * a);
            c = ((20700 * a / b - 98) * a - 16) * a + 96.36;
            d = ((94.5 / (b + c) - 3) / b + 1) * Math.sqrt(a * M_PI_2) * ndf;
            y = Math.pow(d * P, 2 / ndf);
            if (y > 0.05 + a) {
                /* Asymptotic inverse expansion about normal */
                x = qnorm(0.5 * P, false);
                y = x * x;
                if (ndf < 5)
                    c += 0.3 * (ndf - 4.5) * (x + 0.6);
                c = (((0.05 * d * x - 5) * x - 7) * x - 2) * x + b + c;
                y = (((((0.4 * y + 6.3) * y + 36) * y + 94.5) / c - y - 3) / b + 1) * x;
                y = a * y * y;
                if (y > 0.002)/* FIXME: This cutoff is machine-precision dependent */
                    y = Math.exp(y) - 1;
                else { /* Taylor of e^y -1 : */
                    y = (0.5 * y + 1) * y;
                }
            } else {
                y = ((1 / (((ndf + 6) / (ndf * y) - 0.089 * d - 0.822) * (ndf + 2) * 3) + 0.5 / (ndf + 4)) * y - 1)
                        * (ndf + 1) / (ndf + 2) + 1 / y;
            }
            q = Math.sqrt(ndf * y);
        }
        if (neg)
            q = -q;
        return q;
    }

    public static double pt(double t, double df) {
        // ALGORITHM AS 3 APPL. STATIST. (1968) VOL.17, P.189
        // Computes P(T<t)
        double a, b, idf, im2, ioe, s, c, ks, fk, k;
        double g1 = 0.3183098862;// =1/pi;
        if (df < 1)
            throw new IllegalArgumentException("Illegal argument df for pt(t,df).");
        idf = df;
        a = t / Math.sqrt(idf);
        b = idf / (idf + t * t);
        im2 = df - 2;
        ioe = idf % 2;
        s = 1;
        c = 1;
        idf = 1;
        ks = 2 + ioe;
        fk = ks;
        if (im2 >= 2) {
            for (k = ks; k <= im2; k += 2) {
                c = c * b * (fk - 1) / fk;
                s += c;
                if (s != idf) {
                    idf = s;
                    fk += 2;
                }
            }
        }
        if (ioe != 1)
            return 0.5 + 0.5 * a * Math.sqrt(b) * s;
        if (df == 1)
            s = 0;
        return 0.5 + (a * b * s + Math.atan(a)) * g1;
    }

    public double pchisq(double q, double df) {
        // Posten, H. (1989) American Statistician 43 p. 261-265
        double df2 = df * .5;
        double q2 = q * .5;
        int n = 5, k;
        double tk, CFL, CFU, prob;
        if (q <= 0 || df <= 0)
            throw new IllegalArgumentException("Illegal argument " + q + " or " + df + " for qnorm(p).");
        if (q < df) {
            tk = q2 * (1 - n - df2) / (df2 + 2 * n - 1 + n * q2 / (df2 + 2 * n));
            for (k = n - 1; k > 1; k--)
                tk = q2 * (1 - k - df2) / (df2 + 2 * k - 1 + k * q2 / (df2 + 2 * k + tk));
            CFL = 1 - q2 / (df2 + 1 + q2 / (df2 + 2 + tk));
            prob = Math.exp(df2 * Math.log(q2) - q2 - logGamma(df2 + 1) - Math.log(CFL));
        } else {
            tk = (n - df2) / (q2 + n);
            for (k = n - 1; k > 1; k--)
                tk = (k - df2) / (q2 + k / (1 + tk));
            CFU = 1 + (1 - df2) / (q2 + 1 / (1 + tk));
            prob = 1 - Math.exp((df2 - 1) * Math.log(q2) - q2 - logGamma(df2) - Math.log(CFU));
        }
        return prob;
    }

    public static double logBeta(double a, double b) {
        return logGamma(a) + logGamma(b) - logGamma(a + b);
    }

    public static double betainv(double x, double p, double q) {
        // ALGORITHM AS 63 APPL. STATIST. VOL.32, NO.1
        // Computes P(Beta>x)
        double beta = logBeta(p, q), acu = 1E-14;
        double cx, psq, pp, qq, x2, term, ai, betain, ns, rx, temp;
        boolean indx;
        if (p <= 0 || q <= 0)
            return (-1.0);
        if (x <= 0 || x >= 1)
            return (-1.0);
        psq = p + q;
        cx = 1 - x;
        if (p < psq * x) {
            x2 = cx;
            cx = x;
            pp = q;
            qq = p;
            indx = true;
        } else {
            x2 = x;
            pp = p;
            qq = q;
            indx = false;
        }
        term = 1;
        ai = 1;
        betain = 1;
        ns = qq + cx * psq;
        rx = x2 / cx;
        temp = qq - ai;
        if (ns == 0)
            rx = x2;
        while (temp > acu && temp > acu * betain) {
            term = term * temp * rx / (pp + ai);
            betain = betain + term;
            temp = Math.abs(term);
            if (temp > acu && temp > acu * betain) {
                ai++;
                ns--;
                if (ns >= 0) {
                    temp = qq - ai;
                    if (ns == 0)
                        rx = x2;
                } else {
                    temp = psq;
                    psq += 1;
                }
            }
        }
        betain *= Math.exp(pp * Math.log(x2) + (qq - 1) * Math.log(cx) - beta) / pp;
        if (indx)
            betain = 1 - betain;
        return (betain);
    }

    public static double pf(double x, double df1, double df2) {
        // ALGORITHM AS 63 APPL. STATIST. VOL.32, NO.1
        // Computes P(F>x)
        return (betainv(df1 * x / (df1 * x + df2), 0.5 * df1, 0.5 * df2));
    }

    // From libbow, dirichlet.c
    // Written by Tom Minka <minka@stat.cmu.edu>
    public static final double logGamma(double x) {
        double result, y, xnum, xden;
        int i;
        final double d1 = -5.772156649015328605195174e-1;
        final double p1[] = { 4.945235359296727046734888e0, 2.018112620856775083915565e2,
                2.290838373831346393026739e3, 1.131967205903380828685045e4, 2.855724635671635335736389e4,
                3.848496228443793359990269e4, 2.637748787624195437963534e4, 7.225813979700288197698961e3 };
        final double q1[] = { 6.748212550303777196073036e1, 1.113332393857199323513008e3,
                7.738757056935398733233834e3, 2.763987074403340708898585e4, 5.499310206226157329794414e4,
                6.161122180066002127833352e4, 3.635127591501940507276287e4, 8.785536302431013170870835e3 };
        final double d2 = 4.227843350984671393993777e-1;
        final double p2[] = { 4.974607845568932035012064e0, 5.424138599891070494101986e2,
                1.550693864978364947665077e4, 1.847932904445632425417223e5, 1.088204769468828767498470e6,
                3.338152967987029735917223e6, 5.106661678927352456275255e6, 3.074109054850539556250927e6 };
        final double q2[] = { 1.830328399370592604055942e2, 7.765049321445005871323047e3,
                1.331903827966074194402448e5, 1.136705821321969608938755e6, 5.267964117437946917577538e6,
                1.346701454311101692290052e7, 1.782736530353274213975932e7, 9.533095591844353613395747e6 };
        final double d4 = 1.791759469228055000094023e0;
        final double p4[] = { 1.474502166059939948905062e4, 2.426813369486704502836312e6,
                1.214755574045093227939592e8, 2.663432449630976949898078e9, 2.940378956634553899906876e10,
                1.702665737765398868392998e11, 4.926125793377430887588120e11, 5.606251856223951465078242e11 };
        final double q4[] = { 2.690530175870899333379843e3, 6.393885654300092398984238e5,
                4.135599930241388052042842e7, 1.120872109616147941376570e9, 1.488613728678813811542398e10,
                1.016803586272438228077304e11, 3.417476345507377132798597e11, 4.463158187419713286462081e11 };
        final double c[] = { -1.910444077728e-03, 8.4171387781295e-04, -5.952379913043012e-04,
                7.93650793500350248e-04, -2.777777777777681622553e-03, 8.333333333333333331554247e-02,
                5.7083835261e-03 };
        final double a = 0.6796875;

        if ((x <= 0.5) || ((x > a) && (x <= 1.5))) {
            if (x <= 0.5) {
                result = -Math.log(x);
                /* Test whether X < machine epsilon. */
                if (x + 1 == 1) {
                    return result;
                }
            } else {
                result = 0;
                x = (x - 0.5) - 0.5;
            }
            xnum = 0;
            xden = 1;
            for (i = 0; i < 8; i++) {
                xnum = xnum * x + p1[i];
                xden = xden * x + q1[i];
            }
            result += x * (d1 + x * (xnum / xden));
        } else if ((x <= a) || ((x > 1.5) && (x <= 4))) {
            if (x <= a) {
                result = -Math.log(x);
                x = (x - 0.5) - 0.5;
            } else {
                result = 0;
                x -= 2;
            }
            xnum = 0;
            xden = 1;
            for (i = 0; i < 8; i++) {
                xnum = xnum * x + p2[i];
                xden = xden * x + q2[i];
            }
            result += x * (d2 + x * (xnum / xden));
        } else if (x <= 12) {
            x -= 4;
            xnum = 0;
            xden = -1;
            for (i = 0; i < 8; i++) {
                xnum = xnum * x + p4[i];
                xden = xden * x + q4[i];
            }
            result = d4 + x * (xnum / xden);
        }
        /* X > 12 */
        else {
            y = Math.log(x);
            result = x * (y - 1) - y * 0.5 + .9189385332046727417803297;
            x = 1 / x;
            y = x * x;
            xnum = c[6];
            for (i = 0; i < 6; i++) {
                xnum = xnum * y + c[i];
            }
            xnum *= x;
            result += xnum;
        }
        return result;
    }

}

/**
 * * @(#)Univariate.java * * DAMAGE (c) 2000 by Sundar Dorai-Raj * @author
 * Sundar Dorai-Raj * Email: sdoraira@vt.edu * This program is free software;
 * you can redistribute it and/or * modify it under the terms of the GNU General
 * Public License * as published by the Free Software Foundation; either version
 * 2 * of the License, or (at your option) any later version, * provided that
 * any use properly credits the author. * This program is distributed in the
 * hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the
 * implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the * GNU General Public License for more details at http://www.gnu.org *
 * *
 */

class Univariate {
    private double[] x, sortx;
    private double[] summary = new double[6];
    private boolean isSorted = false;
    public double[] five = new double[5];
    private int n;
    private double mean, variance, stdev;
    private double median, min, Q1, Q3, max;

    public Univariate(double[] data) {
        x = (double[]) data.clone();
        n = x.length;
        createSummaryStats();
    }

    private void createSummaryStats() {
        int i;
        mean = 0;
        for (i = 0; i < n; i++)
            mean += x[i];
        mean /= n;
        variance = variance();
        stdev = stdev();

        double sumxx = 0;
        variance = 0;
        for (i = 0; i < n; i++)
            sumxx += x[i] * x[i];
        if (n > 1)
            variance = (sumxx - n * mean * mean) / (n - 1);
        stdev = Math.sqrt(variance);
    }

    public double[] summary() {
        summary[0] = n;
        summary[1] = mean;
        summary[2] = variance;
        summary[3] = stdev;
        summary[4] = Math.sqrt(variance / n);
        summary[5] = mean / summary[4];
        return (summary);
    }

    public double mean() {
        return (mean);
    }

    public double variance() {
        return (variance);
    }

    public double stdev() {
        return (stdev);
    }

    public double SE() {
        return (Math.sqrt(variance / n));
    }

    public double max() {
        if (!isSorted)
            sortx = sort();
        return (sortx[n - 1]);
    }

    public double min() {
        if (!isSorted)
            sortx = sort();
        return (sortx[0]);
    }

    public double median() {
        return (quant(0.50));
    }

    public double quant(double q) {
        if (!isSorted)
            sortx = sort();
        if (q > 1 || q < 0)
            return (0);
        else {
            double index = (n + 1) * q;
            if (index - (int) index == 0)
                return sortx[(int) index - 1];
            else
                return q * sortx[(int) Math.floor(index) - 1] + (1 - q) * sortx[(int) Math.ceil(index) - 1];
        }
    }

    public double[] sort() {
        sortx = (double[]) x.clone();
        int incr = (int) (n * .5);
        while (incr >= 1) {
            for (int i = incr; i < n; i++) {
                double temp = sortx[i];
                int j = i;
                while (j >= incr && temp < sortx[j - incr]) {
                    sortx[j] = sortx[j - incr];
                    j -= incr;
                }
                sortx[j] = temp;
            }
            incr /= 2;
        }
        isSorted = true;
        return (sortx);
    }

    public double[] getData() {
        return (x);
    }

    public int size() {
        return (n);
    }

    public double elementAt(int index) {
        double element = 0;
        try {
            element = x[index];
        } catch (ArrayIndexOutOfBoundsException e) {
            System.out.println("Index " + index + " does not exist in data.");
        }
        return (element);
    }

    public double[] subset(int[] indices) {
        int k = indices.length, i = 0;
        double elements[] = new double[k];
        try {
            for (i = 0; i < k; i++)
                elements[i] = x[k];
        } catch (ArrayIndexOutOfBoundsException e) {
            System.out.println("Index " + i + " does not exist in data.");
        }
        return (elements);
    }

    public int compare(double t) {
        int index = n - 1;
        int i;
        boolean found = false;
        for (i = 0; i < n && !found; i++)
            if (sortx[i] > t) {
                index = i;
                found = true;
            }
        return (index);
    }

    public int[] between(double t1, double t2) {
        int[] indices = new int[2];
        indices[0] = compare(t1);
        indices[1] = compare(t2);
        return (indices);
    }

    public int indexOf(double element) {
        int index = -1;
        for (int i = 0; i < n; i++)
            if (Math.abs(x[i] - element) < 1e-6)
                index = i;
        return (index);
    }
}