Statistics Functions
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
//package cc.mallet.util;
public final class StatFunctions {
public static double cov(Univariate x, Univariate y) {
double sumxy = 0;
int i, n = (x.size() >= y.size() ? x.size() : y.size());
try {
for (i = 0; i < x.size(); i++)
sumxy += (x.elementAt(i) - x.mean())
* (y.elementAt(i) - y.mean());
} catch (ArrayIndexOutOfBoundsException e) {
System.out.println("size of x != size of y");
e.printStackTrace();
}
return (sumxy / (n - 1));
}
public static double corr(Univariate x, Univariate y) {
double cov = cov(x, y);
return (cov / (x.stdev() * y.stdev()));
}
public static double[] ols(Univariate x, Univariate y) {
double[] coef = new double[2];
int i, n = (x.size() <= y.size() ? x.size() : y.size());
double sxy = 0.0, sxx = 0.0;
double xbar = x.mean(), ybar = y.mean(), xi, yi;
for (i = 0; i < n; i++) {
xi = x.elementAt(i);
yi = y.elementAt(i);
sxy += (xi - xbar) * (yi - ybar);
sxx += (xi - xbar) * (xi - xbar);
}
coef[0] = sxy / sxx;
coef[1] = ybar - coef[0] * xbar;
return (coef);
}
public static double qnorm(double p, boolean upper) {
/*
* Reference: J. D. Beasley and S. G. Springer Algorithm AS 111:
* "The Percentage Points of the Normal Distribution" Applied Statistics
*/
if (p < 0 || p > 1)
throw new IllegalArgumentException("Illegal argument " + p
+ " for qnorm(p).");
double split = 0.42, a0 = 2.50662823884, a1 = -18.61500062529, a2 = 41.39119773534, a3 = -25.44106049637, b1 = -8.47351093090, b2 = 23.08336743743, b3 = -21.06224101826, b4 = 3.13082909833, c0 = -2.78718931138, c1 = -2.29796479134, c2 = 4.85014127135, c3 = 2.32121276858, d1 = 3.54388924762, d2 = 1.63706781897, q = p - 0.5;
double r, ppnd;
if (Math.abs(q) <= split) {
r = q * q;
ppnd = q * (((a3 * r + a2) * r + a1) * r + a0)
/ ((((b4 * r + b3) * r + b2) * r + b1) * r + 1);
} else {
r = p;
if (q > 0)
r = 1 - p;
if (r > 0) {
r = Math.sqrt(-Math.log(r));
ppnd = (((c3 * r + c2) * r + c1) * r + c0)
/ ((d2 * r + d1) * r + 1);
if (q < 0)
ppnd = -ppnd;
} else {
ppnd = 0;
}
}
if (upper)
ppnd = 1 - ppnd;
return (ppnd);
}
public static double qnorm(double p, boolean upper, double mu, double sigma2) {
return (qnorm(p, upper) * Math.sqrt(sigma2) + mu);
}
public static double pnorm(double z, boolean upper) {
/*
* Reference: I. D. Hill Algorithm AS 66: "The Normal Integral" Applied
* Statistics
*/
double ltone = 7.0, utzero = 18.66, con = 1.28, a1 = 0.398942280444, a2 = 0.399903438504, a3 = 5.75885480458, a4 = 29.8213557808, a5 = 2.62433121679, a6 = 48.6959930692, a7 = 5.92885724438, b1 = 0.398942280385, b2 = 3.8052e-8, b3 = 1.00000615302, b4 = 3.98064794e-4, b5 = 1.986153813664, b6 = 0.151679116635, b7 = 5.29330324926, b8 = 4.8385912808, b9 = 15.1508972451, b10 = 0.742380924027, b11 = 30.789933034, b12 = 3.99019417011;
double y, alnorm;
if (z < 0) {
upper = !upper;
z = -z;
}
if (z <= ltone || upper && z <= utzero) {
y = 0.5 * z * z;
if (z > con) {
alnorm = b1
* Math.exp(-y)
/ (z - b2 + b3
/ (z + b4 + b5
/ (z - b6 + b7
/ (z + b8 - b9
/ (z + b10 + b11
/ (z + b12))))));
} else {
alnorm = 0.5
- z
* (a1 - a2 * y
/ (y + a3 - a4 / (y + a5 + a6 / (y + a7))));
}
} else {
alnorm = 0;
}
if (!upper)
alnorm = 1 - alnorm;
return (alnorm);
}
public static double pnorm(double x, boolean upper, double mu, double sigma2) {
return (pnorm((x - mu) / Math.sqrt(sigma2), upper));
}
public static double qt(double p, double ndf, boolean lower_tail) {
// Algorithm 396: Student's t-quantiles by
// G.W. Hill CACM 13(10), 619-620, October 1970
if (p <= 0 || p >= 1 || ndf < 1)
throw new IllegalArgumentException(
"Invalid p or df in call to qt(double,double,boolean).");
double eps = 1e-12;
double M_PI_2 = 1.570796326794896619231321691640; // pi/2
boolean neg;
double P, q, prob, a, b, c, d, y, x;
if ((lower_tail && p > 0.5) || (!lower_tail && p < 0.5)) {
neg = false;
P = 2 * (lower_tail ? (1 - p) : p);
} else {
neg = true;
P = 2 * (lower_tail ? p : (1 - p));
}
if (Math.abs(ndf - 2) < eps) { /* df ~= 2 */
q = Math.sqrt(2 / (P * (2 - P)) - 2);
} else if (ndf < 1 + eps) { /* df ~= 1 */
prob = P * M_PI_2;
q = Math.cos(prob) / Math.sin(prob);
} else { /*-- usual case; including, e.g., df = 1.1 */
a = 1 / (ndf - 0.5);
b = 48 / (a * a);
c = ((20700 * a / b - 98) * a - 16) * a + 96.36;
d = ((94.5 / (b + c) - 3) / b + 1) * Math.sqrt(a * M_PI_2) * ndf;
y = Math.pow(d * P, 2 / ndf);
if (y > 0.05 + a) {
/* Asymptotic inverse expansion about normal */
x = qnorm(0.5 * P, false);
y = x * x;
if (ndf < 5)
c += 0.3 * (ndf - 4.5) * (x + 0.6);
c = (((0.05 * d * x - 5) * x - 7) * x - 2) * x + b + c;
y = (((((0.4 * y + 6.3) * y + 36) * y + 94.5) / c - y - 3) / b + 1)
* x;
y = a * y * y;
if (y > 0.002)/* FIXME: This cutoff is machine-precision dependent */
y = Math.exp(y) - 1;
else { /* Taylor of e^y -1 : */
y = (0.5 * y + 1) * y;
}
} else {
y = ((1 / (((ndf + 6) / (ndf * y) - 0.089 * d - 0.822)
* (ndf + 2) * 3) + 0.5 / (ndf + 4))
* y - 1)
* (ndf + 1) / (ndf + 2) + 1 / y;
}
q = Math.sqrt(ndf * y);
}
if (neg)
q = -q;
return q;
}
public static double pt(double t, double df) {
// ALGORITHM AS 3 APPL. STATIST. (1968) VOL.17, P.189
// Computes P(T<t)
double a, b, idf, im2, ioe, s, c, ks, fk, k;
double g1 = 0.3183098862;// =1/pi;
if (df < 1)
throw new IllegalArgumentException(
"Illegal argument df for pt(t,df).");
idf = df;
a = t / Math.sqrt(idf);
b = idf / (idf + t * t);
im2 = df - 2;
ioe = idf % 2;
s = 1;
c = 1;
idf = 1;
ks = 2 + ioe;
fk = ks;
if (im2 >= 2) {
for (k = ks; k <= im2; k += 2) {
c = c * b * (fk - 1) / fk;
s += c;
if (s != idf) {
idf = s;
fk += 2;
}
}
}
if (ioe != 1)
return 0.5 + 0.5 * a * Math.sqrt(b) * s;
if (df == 1)
s = 0;
return 0.5 + (a * b * s + Math.atan(a)) * g1;
}
public double pchisq(double q, double df) {
// Posten, H. (1989) American Statistician 43 p. 261-265
double df2 = df * .5;
double q2 = q * .5;
int n = 5, k;
double tk, CFL, CFU, prob;
if (q <= 0 || df <= 0)
throw new IllegalArgumentException("Illegal argument " + q + " or "
+ df + " for qnorm(p).");
if (q < df) {
tk = q2 * (1 - n - df2)
/ (df2 + 2 * n - 1 + n * q2 / (df2 + 2 * n));
for (k = n - 1; k > 1; k--)
tk = q2 * (1 - k - df2)
/ (df2 + 2 * k - 1 + k * q2 / (df2 + 2 * k + tk));
CFL = 1 - q2 / (df2 + 1 + q2 / (df2 + 2 + tk));
prob = Math.exp(df2 * Math.log(q2) - q2 - logGamma(df2 + 1)
- Math.log(CFL));
} else {
tk = (n - df2) / (q2 + n);
for (k = n - 1; k > 1; k--)
tk = (k - df2) / (q2 + k / (1 + tk));
CFU = 1 + (1 - df2) / (q2 + 1 / (1 + tk));
prob = 1 - Math.exp((df2 - 1) * Math.log(q2) - q2 - logGamma(df2)
- Math.log(CFU));
}
return prob;
}
public static double logBeta(double a, double b) {
return logGamma(a) + logGamma(b) - logGamma(a + b);
}
public static double betainv(double x, double p, double q) {
// ALGORITHM AS 63 APPL. STATIST. VOL.32, NO.1
// Computes P(Beta>x)
double beta = logBeta(p, q), acu = 1E-14;
double cx, psq, pp, qq, x2, term, ai, betain, ns, rx, temp;
boolean indx;
if (p <= 0 || q <= 0)
return (-1.0);
if (x <= 0 || x >= 1)
return (-1.0);
psq = p + q;
cx = 1 - x;
if (p < psq * x) {
x2 = cx;
cx = x;
pp = q;
qq = p;
indx = true;
} else {
x2 = x;
pp = p;
qq = q;
indx = false;
}
term = 1;
ai = 1;
betain = 1;
ns = qq + cx * psq;
rx = x2 / cx;
temp = qq - ai;
if (ns == 0)
rx = x2;
while (temp > acu && temp > acu * betain) {
term = term * temp * rx / (pp + ai);
betain = betain + term;
temp = Math.abs(term);
if (temp > acu && temp > acu * betain) {
ai++;
ns--;
if (ns >= 0) {
temp = qq - ai;
if (ns == 0)
rx = x2;
} else {
temp = psq;
psq += 1;
}
}
}
betain *= Math.exp(pp * Math.log(x2) + (qq - 1) * Math.log(cx) - beta)
/ pp;
if (indx)
betain = 1 - betain;
return (betain);
}
public static double pf(double x, double df1, double df2) {
// ALGORITHM AS 63 APPL. STATIST. VOL.32, NO.1
// Computes P(F>x)
return (betainv(df1 * x / (df1 * x + df2), 0.5 * df1, 0.5 * df2));
}
// From libbow, dirichlet.c
// Written by Tom Minka <minka@stat.cmu.edu>
public static final double logGamma(double x) {
double result, y, xnum, xden;
int i;
final double d1 = -5.772156649015328605195174e-1;
final double p1[] = { 4.945235359296727046734888e0,
2.018112620856775083915565e2, 2.290838373831346393026739e3,
1.131967205903380828685045e4, 2.855724635671635335736389e4,
3.848496228443793359990269e4, 2.637748787624195437963534e4,
7.225813979700288197698961e3 };
final double q1[] = { 6.748212550303777196073036e1,
1.113332393857199323513008e3, 7.738757056935398733233834e3,
2.763987074403340708898585e4, 5.499310206226157329794414e4,
6.161122180066002127833352e4, 3.635127591501940507276287e4,
8.785536302431013170870835e3 };
final double d2 = 4.227843350984671393993777e-1;
final double p2[] = { 4.974607845568932035012064e0,
5.424138599891070494101986e2, 1.550693864978364947665077e4,
1.847932904445632425417223e5, 1.088204769468828767498470e6,
3.338152967987029735917223e6, 5.106661678927352456275255e6,
3.074109054850539556250927e6 };
final double q2[] = { 1.830328399370592604055942e2,
7.765049321445005871323047e3, 1.331903827966074194402448e5,
1.136705821321969608938755e6, 5.267964117437946917577538e6,
1.346701454311101692290052e7, 1.782736530353274213975932e7,
9.533095591844353613395747e6 };
final double d4 = 1.791759469228055000094023e0;
final double p4[] = { 1.474502166059939948905062e4,
2.426813369486704502836312e6, 1.214755574045093227939592e8,
2.663432449630976949898078e9, 2.940378956634553899906876e10,
1.702665737765398868392998e11, 4.926125793377430887588120e11,
5.606251856223951465078242e11 };
final double q4[] = { 2.690530175870899333379843e3,
6.393885654300092398984238e5, 4.135599930241388052042842e7,
1.120872109616147941376570e9, 1.488613728678813811542398e10,
1.016803586272438228077304e11, 3.417476345507377132798597e11,
4.463158187419713286462081e11 };
final double c[] = { -1.910444077728e-03, 8.4171387781295e-04,
-5.952379913043012e-04, 7.93650793500350248e-04,
-2.777777777777681622553e-03, 8.333333333333333331554247e-02,
5.7083835261e-03 };
final double a = 0.6796875;
if ((x <= 0.5) || ((x > a) && (x <= 1.5))) {
if (x <= 0.5) {
result = -Math.log(x);
/* Test whether X < machine epsilon. */
if (x + 1 == 1) {
return result;
}
} else {
result = 0;
x = (x - 0.5) - 0.5;
}
xnum = 0;
xden = 1;
for (i = 0; i < 8; i++) {
xnum = xnum * x + p1[i];
xden = xden * x + q1[i];
}
result += x * (d1 + x * (xnum / xden));
} else if ((x <= a) || ((x > 1.5) && (x <= 4))) {
if (x <= a) {
result = -Math.log(x);
x = (x - 0.5) - 0.5;
} else {
result = 0;
x -= 2;
}
xnum = 0;
xden = 1;
for (i = 0; i < 8; i++) {
xnum = xnum * x + p2[i];
xden = xden * x + q2[i];
}
result += x * (d2 + x * (xnum / xden));
} else if (x <= 12) {
x -= 4;
xnum = 0;
xden = -1;
for (i = 0; i < 8; i++) {
xnum = xnum * x + p4[i];
xden = xden * x + q4[i];
}
result = d4 + x * (xnum / xden);
}
/* X > 12 */
else {
y = Math.log(x);
result = x * (y - 1) - y * 0.5 + .9189385332046727417803297;
x = 1 / x;
y = x * x;
xnum = c[6];
for (i = 0; i < 6; i++) {
xnum = xnum * y + c[i];
}
xnum *= x;
result += xnum;
}
return result;
}
}
/**
* * @(#)Univariate.java * * DAMAGE (c) 2000 by Sundar Dorai-Raj * @author
* Sundar Dorai-Raj * Email: sdoraira@vt.edu * This program is free software;
* you can redistribute it and/or * modify it under the terms of the GNU General
* Public License * as published by the Free Software Foundation; either version
* 2 * of the License, or (at your option) any later version, * provided that
* any use properly credits the author. * This program is distributed in the
* hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the
* implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the * GNU General Public License for more details at http://www.gnu.org *
* *
*/
class Univariate {
private double[] x, sortx;
private double[] summary = new double[6];
private boolean isSorted = false;
public double[] five = new double[5];
private int n;
private double mean, variance, stdev;
private double median, min, Q1, Q3, max;
public Univariate(double[] data) {
x = (double[]) data.clone();
n = x.length;
createSummaryStats();
}
private void createSummaryStats() {
int i;
mean = 0;
for (i = 0; i < n; i++)
mean += x[i];
mean /= n;
variance = variance();
stdev = stdev();
double sumxx = 0;
variance = 0;
for (i = 0; i < n; i++)
sumxx += x[i] * x[i];
if (n > 1)
variance = (sumxx - n * mean * mean) / (n - 1);
stdev = Math.sqrt(variance);
}
public double[] summary() {
summary[0] = n;
summary[1] = mean;
summary[2] = variance;
summary[3] = stdev;
summary[4] = Math.sqrt(variance / n);
summary[5] = mean / summary[4];
return (summary);
}
public double mean() {
return (mean);
}
public double variance() {
return (variance);
}
public double stdev() {
return (stdev);
}
public double SE() {
return (Math.sqrt(variance / n));
}
public double max() {
if (!isSorted)
sortx = sort();
return (sortx[n - 1]);
}
public double min() {
if (!isSorted)
sortx = sort();
return (sortx[0]);
}
public double median() {
return (quant(0.50));
}
public double quant(double q) {
if (!isSorted)
sortx = sort();
if (q > 1 || q < 0)
return (0);
else {
double index = (n + 1) * q;
if (index - (int) index == 0)
return sortx[(int) index - 1];
else
return q * sortx[(int) Math.floor(index) - 1] + (1 - q)
* sortx[(int) Math.ceil(index) - 1];
}
}
public double[] sort() {
sortx = (double[]) x.clone();
int incr = (int) (n * .5);
while (incr >= 1) {
for (int i = incr; i < n; i++) {
double temp = sortx[i];
int j = i;
while (j >= incr && temp < sortx[j - incr]) {
sortx[j] = sortx[j - incr];
j -= incr;
}
sortx[j] = temp;
}
incr /= 2;
}
isSorted = true;
return (sortx);
}
public double[] getData() {
return (x);
}
public int size() {
return (n);
}
public double elementAt(int index) {
double element = 0;
try {
element = x[index];
} catch (ArrayIndexOutOfBoundsException e) {
System.out.println("Index " + index + " does not exist in data.");
}
return (element);
}
public double[] subset(int[] indices) {
int k = indices.length, i = 0;
double elements[] = new double[k];
try {
for (i = 0; i < k; i++)
elements[i] = x[k];
} catch (ArrayIndexOutOfBoundsException e) {
System.out.println("Index " + i + " does not exist in data.");
}
return (elements);
}
public int compare(double t) {
int index = n - 1;
int i;
boolean found = false;
for (i = 0; i < n && !found; i++)
if (sortx[i] > t) {
index = i;
found = true;
}
return (index);
}
public int[] between(double t1, double t2) {
int[] indices = new int[2];
indices[0] = compare(t1);
indices[1] = compare(t2);
return (indices);
}
public int indexOf(double element) {
int index = -1;
for (int i = 0; i < n; i++)
if (Math.abs(x[i] - element) < 1e-6)
index = i;
return (index);
}
}
Related examples in the same category