Java tutorial
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a> */ //package cc.mallet.util; public final class StatFunctions { public static double cov(Univariate x, Univariate y) { double sumxy = 0; int i, n = (x.size() >= y.size() ? x.size() : y.size()); try { for (i = 0; i < x.size(); i++) sumxy += (x.elementAt(i) - x.mean()) * (y.elementAt(i) - y.mean()); } catch (ArrayIndexOutOfBoundsException e) { System.out.println("size of x != size of y"); e.printStackTrace(); } return (sumxy / (n - 1)); } public static double corr(Univariate x, Univariate y) { double cov = cov(x, y); return (cov / (x.stdev() * y.stdev())); } public static double[] ols(Univariate x, Univariate y) { double[] coef = new double[2]; int i, n = (x.size() <= y.size() ? x.size() : y.size()); double sxy = 0.0, sxx = 0.0; double xbar = x.mean(), ybar = y.mean(), xi, yi; for (i = 0; i < n; i++) { xi = x.elementAt(i); yi = y.elementAt(i); sxy += (xi - xbar) * (yi - ybar); sxx += (xi - xbar) * (xi - xbar); } coef[0] = sxy / sxx; coef[1] = ybar - coef[0] * xbar; return (coef); } public static double qnorm(double p, boolean upper) { /* * Reference: J. D. Beasley and S. G. Springer Algorithm AS 111: * "The Percentage Points of the Normal Distribution" Applied Statistics */ if (p < 0 || p > 1) throw new IllegalArgumentException("Illegal argument " + p + " for qnorm(p)."); double split = 0.42, a0 = 2.50662823884, a1 = -18.61500062529, a2 = 41.39119773534, a3 = -25.44106049637, b1 = -8.47351093090, b2 = 23.08336743743, b3 = -21.06224101826, b4 = 3.13082909833, c0 = -2.78718931138, c1 = -2.29796479134, c2 = 4.85014127135, c3 = 2.32121276858, d1 = 3.54388924762, d2 = 1.63706781897, q = p - 0.5; double r, ppnd; if (Math.abs(q) <= split) { r = q * q; ppnd = q * (((a3 * r + a2) * r + a1) * r + a0) / ((((b4 * r + b3) * r + b2) * r + b1) * r + 1); } else { r = p; if (q > 0) r = 1 - p; if (r > 0) { r = Math.sqrt(-Math.log(r)); ppnd = (((c3 * r + c2) * r + c1) * r + c0) / ((d2 * r + d1) * r + 1); if (q < 0) ppnd = -ppnd; } else { ppnd = 0; } } if (upper) ppnd = 1 - ppnd; return (ppnd); } public static double qnorm(double p, boolean upper, double mu, double sigma2) { return (qnorm(p, upper) * Math.sqrt(sigma2) + mu); } public static double pnorm(double z, boolean upper) { /* * Reference: I. D. Hill Algorithm AS 66: "The Normal Integral" Applied * Statistics */ double ltone = 7.0, utzero = 18.66, con = 1.28, a1 = 0.398942280444, a2 = 0.399903438504, a3 = 5.75885480458, a4 = 29.8213557808, a5 = 2.62433121679, a6 = 48.6959930692, a7 = 5.92885724438, b1 = 0.398942280385, b2 = 3.8052e-8, b3 = 1.00000615302, b4 = 3.98064794e-4, b5 = 1.986153813664, b6 = 0.151679116635, b7 = 5.29330324926, b8 = 4.8385912808, b9 = 15.1508972451, b10 = 0.742380924027, b11 = 30.789933034, b12 = 3.99019417011; double y, alnorm; if (z < 0) { upper = !upper; z = -z; } if (z <= ltone || upper && z <= utzero) { y = 0.5 * z * z; if (z > con) { alnorm = b1 * Math.exp(-y) / (z - b2 + b3 / (z + b4 + b5 / (z - b6 + b7 / (z + b8 - b9 / (z + b10 + b11 / (z + b12)))))); } else { alnorm = 0.5 - z * (a1 - a2 * y / (y + a3 - a4 / (y + a5 + a6 / (y + a7)))); } } else { alnorm = 0; } if (!upper) alnorm = 1 - alnorm; return (alnorm); } public static double pnorm(double x, boolean upper, double mu, double sigma2) { return (pnorm((x - mu) / Math.sqrt(sigma2), upper)); } public static double qt(double p, double ndf, boolean lower_tail) { // Algorithm 396: Student's t-quantiles by // G.W. Hill CACM 13(10), 619-620, October 1970 if (p <= 0 || p >= 1 || ndf < 1) throw new IllegalArgumentException("Invalid p or df in call to qt(double,double,boolean)."); double eps = 1e-12; double M_PI_2 = 1.570796326794896619231321691640; // pi/2 boolean neg; double P, q, prob, a, b, c, d, y, x; if ((lower_tail && p > 0.5) || (!lower_tail && p < 0.5)) { neg = false; P = 2 * (lower_tail ? (1 - p) : p); } else { neg = true; P = 2 * (lower_tail ? p : (1 - p)); } if (Math.abs(ndf - 2) < eps) { /* df ~= 2 */ q = Math.sqrt(2 / (P * (2 - P)) - 2); } else if (ndf < 1 + eps) { /* df ~= 1 */ prob = P * M_PI_2; q = Math.cos(prob) / Math.sin(prob); } else { /*-- usual case; including, e.g., df = 1.1 */ a = 1 / (ndf - 0.5); b = 48 / (a * a); c = ((20700 * a / b - 98) * a - 16) * a + 96.36; d = ((94.5 / (b + c) - 3) / b + 1) * Math.sqrt(a * M_PI_2) * ndf; y = Math.pow(d * P, 2 / ndf); if (y > 0.05 + a) { /* Asymptotic inverse expansion about normal */ x = qnorm(0.5 * P, false); y = x * x; if (ndf < 5) c += 0.3 * (ndf - 4.5) * (x + 0.6); c = (((0.05 * d * x - 5) * x - 7) * x - 2) * x + b + c; y = (((((0.4 * y + 6.3) * y + 36) * y + 94.5) / c - y - 3) / b + 1) * x; y = a * y * y; if (y > 0.002)/* FIXME: This cutoff is machine-precision dependent */ y = Math.exp(y) - 1; else { /* Taylor of e^y -1 : */ y = (0.5 * y + 1) * y; } } else { y = ((1 / (((ndf + 6) / (ndf * y) - 0.089 * d - 0.822) * (ndf + 2) * 3) + 0.5 / (ndf + 4)) * y - 1) * (ndf + 1) / (ndf + 2) + 1 / y; } q = Math.sqrt(ndf * y); } if (neg) q = -q; return q; } public static double pt(double t, double df) { // ALGORITHM AS 3 APPL. STATIST. (1968) VOL.17, P.189 // Computes P(T<t) double a, b, idf, im2, ioe, s, c, ks, fk, k; double g1 = 0.3183098862;// =1/pi; if (df < 1) throw new IllegalArgumentException("Illegal argument df for pt(t,df)."); idf = df; a = t / Math.sqrt(idf); b = idf / (idf + t * t); im2 = df - 2; ioe = idf % 2; s = 1; c = 1; idf = 1; ks = 2 + ioe; fk = ks; if (im2 >= 2) { for (k = ks; k <= im2; k += 2) { c = c * b * (fk - 1) / fk; s += c; if (s != idf) { idf = s; fk += 2; } } } if (ioe != 1) return 0.5 + 0.5 * a * Math.sqrt(b) * s; if (df == 1) s = 0; return 0.5 + (a * b * s + Math.atan(a)) * g1; } public double pchisq(double q, double df) { // Posten, H. (1989) American Statistician 43 p. 261-265 double df2 = df * .5; double q2 = q * .5; int n = 5, k; double tk, CFL, CFU, prob; if (q <= 0 || df <= 0) throw new IllegalArgumentException("Illegal argument " + q + " or " + df + " for qnorm(p)."); if (q < df) { tk = q2 * (1 - n - df2) / (df2 + 2 * n - 1 + n * q2 / (df2 + 2 * n)); for (k = n - 1; k > 1; k--) tk = q2 * (1 - k - df2) / (df2 + 2 * k - 1 + k * q2 / (df2 + 2 * k + tk)); CFL = 1 - q2 / (df2 + 1 + q2 / (df2 + 2 + tk)); prob = Math.exp(df2 * Math.log(q2) - q2 - logGamma(df2 + 1) - Math.log(CFL)); } else { tk = (n - df2) / (q2 + n); for (k = n - 1; k > 1; k--) tk = (k - df2) / (q2 + k / (1 + tk)); CFU = 1 + (1 - df2) / (q2 + 1 / (1 + tk)); prob = 1 - Math.exp((df2 - 1) * Math.log(q2) - q2 - logGamma(df2) - Math.log(CFU)); } return prob; } public static double logBeta(double a, double b) { return logGamma(a) + logGamma(b) - logGamma(a + b); } public static double betainv(double x, double p, double q) { // ALGORITHM AS 63 APPL. STATIST. VOL.32, NO.1 // Computes P(Beta>x) double beta = logBeta(p, q), acu = 1E-14; double cx, psq, pp, qq, x2, term, ai, betain, ns, rx, temp; boolean indx; if (p <= 0 || q <= 0) return (-1.0); if (x <= 0 || x >= 1) return (-1.0); psq = p + q; cx = 1 - x; if (p < psq * x) { x2 = cx; cx = x; pp = q; qq = p; indx = true; } else { x2 = x; pp = p; qq = q; indx = false; } term = 1; ai = 1; betain = 1; ns = qq + cx * psq; rx = x2 / cx; temp = qq - ai; if (ns == 0) rx = x2; while (temp > acu && temp > acu * betain) { term = term * temp * rx / (pp + ai); betain = betain + term; temp = Math.abs(term); if (temp > acu && temp > acu * betain) { ai++; ns--; if (ns >= 0) { temp = qq - ai; if (ns == 0) rx = x2; } else { temp = psq; psq += 1; } } } betain *= Math.exp(pp * Math.log(x2) + (qq - 1) * Math.log(cx) - beta) / pp; if (indx) betain = 1 - betain; return (betain); } public static double pf(double x, double df1, double df2) { // ALGORITHM AS 63 APPL. STATIST. VOL.32, NO.1 // Computes P(F>x) return (betainv(df1 * x / (df1 * x + df2), 0.5 * df1, 0.5 * df2)); } // From libbow, dirichlet.c // Written by Tom Minka <minka@stat.cmu.edu> public static final double logGamma(double x) { double result, y, xnum, xden; int i; final double d1 = -5.772156649015328605195174e-1; final double p1[] = { 4.945235359296727046734888e0, 2.018112620856775083915565e2, 2.290838373831346393026739e3, 1.131967205903380828685045e4, 2.855724635671635335736389e4, 3.848496228443793359990269e4, 2.637748787624195437963534e4, 7.225813979700288197698961e3 }; final double q1[] = { 6.748212550303777196073036e1, 1.113332393857199323513008e3, 7.738757056935398733233834e3, 2.763987074403340708898585e4, 5.499310206226157329794414e4, 6.161122180066002127833352e4, 3.635127591501940507276287e4, 8.785536302431013170870835e3 }; final double d2 = 4.227843350984671393993777e-1; final double p2[] = { 4.974607845568932035012064e0, 5.424138599891070494101986e2, 1.550693864978364947665077e4, 1.847932904445632425417223e5, 1.088204769468828767498470e6, 3.338152967987029735917223e6, 5.106661678927352456275255e6, 3.074109054850539556250927e6 }; final double q2[] = { 1.830328399370592604055942e2, 7.765049321445005871323047e3, 1.331903827966074194402448e5, 1.136705821321969608938755e6, 5.267964117437946917577538e6, 1.346701454311101692290052e7, 1.782736530353274213975932e7, 9.533095591844353613395747e6 }; final double d4 = 1.791759469228055000094023e0; final double p4[] = { 1.474502166059939948905062e4, 2.426813369486704502836312e6, 1.214755574045093227939592e8, 2.663432449630976949898078e9, 2.940378956634553899906876e10, 1.702665737765398868392998e11, 4.926125793377430887588120e11, 5.606251856223951465078242e11 }; final double q4[] = { 2.690530175870899333379843e3, 6.393885654300092398984238e5, 4.135599930241388052042842e7, 1.120872109616147941376570e9, 1.488613728678813811542398e10, 1.016803586272438228077304e11, 3.417476345507377132798597e11, 4.463158187419713286462081e11 }; final double c[] = { -1.910444077728e-03, 8.4171387781295e-04, -5.952379913043012e-04, 7.93650793500350248e-04, -2.777777777777681622553e-03, 8.333333333333333331554247e-02, 5.7083835261e-03 }; final double a = 0.6796875; if ((x <= 0.5) || ((x > a) && (x <= 1.5))) { if (x <= 0.5) { result = -Math.log(x); /* Test whether X < machine epsilon. */ if (x + 1 == 1) { return result; } } else { result = 0; x = (x - 0.5) - 0.5; } xnum = 0; xden = 1; for (i = 0; i < 8; i++) { xnum = xnum * x + p1[i]; xden = xden * x + q1[i]; } result += x * (d1 + x * (xnum / xden)); } else if ((x <= a) || ((x > 1.5) && (x <= 4))) { if (x <= a) { result = -Math.log(x); x = (x - 0.5) - 0.5; } else { result = 0; x -= 2; } xnum = 0; xden = 1; for (i = 0; i < 8; i++) { xnum = xnum * x + p2[i]; xden = xden * x + q2[i]; } result += x * (d2 + x * (xnum / xden)); } else if (x <= 12) { x -= 4; xnum = 0; xden = -1; for (i = 0; i < 8; i++) { xnum = xnum * x + p4[i]; xden = xden * x + q4[i]; } result = d4 + x * (xnum / xden); } /* X > 12 */ else { y = Math.log(x); result = x * (y - 1) - y * 0.5 + .9189385332046727417803297; x = 1 / x; y = x * x; xnum = c[6]; for (i = 0; i < 6; i++) { xnum = xnum * y + c[i]; } xnum *= x; result += xnum; } return result; } } /** * * @(#)Univariate.java * * DAMAGE (c) 2000 by Sundar Dorai-Raj * @author * Sundar Dorai-Raj * Email: sdoraira@vt.edu * This program is free software; * you can redistribute it and/or * modify it under the terms of the GNU General * Public License * as published by the Free Software Foundation; either version * 2 * of the License, or (at your option) any later version, * provided that * any use properly credits the author. * This program is distributed in the * hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the * implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the * GNU General Public License for more details at http://www.gnu.org * * * */ class Univariate { private double[] x, sortx; private double[] summary = new double[6]; private boolean isSorted = false; public double[] five = new double[5]; private int n; private double mean, variance, stdev; private double median, min, Q1, Q3, max; public Univariate(double[] data) { x = (double[]) data.clone(); n = x.length; createSummaryStats(); } private void createSummaryStats() { int i; mean = 0; for (i = 0; i < n; i++) mean += x[i]; mean /= n; variance = variance(); stdev = stdev(); double sumxx = 0; variance = 0; for (i = 0; i < n; i++) sumxx += x[i] * x[i]; if (n > 1) variance = (sumxx - n * mean * mean) / (n - 1); stdev = Math.sqrt(variance); } public double[] summary() { summary[0] = n; summary[1] = mean; summary[2] = variance; summary[3] = stdev; summary[4] = Math.sqrt(variance / n); summary[5] = mean / summary[4]; return (summary); } public double mean() { return (mean); } public double variance() { return (variance); } public double stdev() { return (stdev); } public double SE() { return (Math.sqrt(variance / n)); } public double max() { if (!isSorted) sortx = sort(); return (sortx[n - 1]); } public double min() { if (!isSorted) sortx = sort(); return (sortx[0]); } public double median() { return (quant(0.50)); } public double quant(double q) { if (!isSorted) sortx = sort(); if (q > 1 || q < 0) return (0); else { double index = (n + 1) * q; if (index - (int) index == 0) return sortx[(int) index - 1]; else return q * sortx[(int) Math.floor(index) - 1] + (1 - q) * sortx[(int) Math.ceil(index) - 1]; } } public double[] sort() { sortx = (double[]) x.clone(); int incr = (int) (n * .5); while (incr >= 1) { for (int i = incr; i < n; i++) { double temp = sortx[i]; int j = i; while (j >= incr && temp < sortx[j - incr]) { sortx[j] = sortx[j - incr]; j -= incr; } sortx[j] = temp; } incr /= 2; } isSorted = true; return (sortx); } public double[] getData() { return (x); } public int size() { return (n); } public double elementAt(int index) { double element = 0; try { element = x[index]; } catch (ArrayIndexOutOfBoundsException e) { System.out.println("Index " + index + " does not exist in data."); } return (element); } public double[] subset(int[] indices) { int k = indices.length, i = 0; double elements[] = new double[k]; try { for (i = 0; i < k; i++) elements[i] = x[k]; } catch (ArrayIndexOutOfBoundsException e) { System.out.println("Index " + i + " does not exist in data."); } return (elements); } public int compare(double t) { int index = n - 1; int i; boolean found = false; for (i = 0; i < n && !found; i++) if (sortx[i] > t) { index = i; found = true; } return (index); } public int[] between(double t1, double t2) { int[] indices = new int[2]; indices[0] = compare(t1); indices[1] = compare(t2); return (indices); } public int indexOf(double element) { int index = -1; for (int i = 0; i < n; i++) if (Math.abs(x[i] - element) < 1e-6) index = i; return (index); } }