com.datumbox.framework.core.statistics.distributions.ContinuousDistributions.java Source code

Java tutorial

Introduction

Here is the source code for com.datumbox.framework.core.statistics.distributions.ContinuousDistributions.java

Source

/**
 * Copyright (C) 2013-2017 Vasilis Vryniotis <bbriniotis@datumbox.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datumbox.framework.core.statistics.distributions;

import com.datumbox.framework.common.utilities.RandomGenerator;
import org.apache.commons.math3.distribution.MultivariateNormalDistribution;

/**
 * This class provides static methods for the CDFs, inverse CDFs and PDFs of
 * the most common continuous distributions.
 *
 * <p>All methods are stateless. Most of them are numeric approximations that
 * were ported from other languages; their accuracy is limited by the
 * tolerances hard-coded in each routine.</p>
 *
 * @author Vasilis Vryniotis <bbriniotis@datumbox.com>
 */
public class ContinuousDistributions {

    /**
     * Returns the probability from 0 to x of a specific chisquare score and degrees of freedom.
     * The value is computed as the regularized Gamma CDF at x/2 with shape df/2.
     *
     * @param x     The chisquare score; must not be negative.
     * @param df    The degrees of freedom; must be positive.
     * @return      The cumulative probability P(X &lt;= x).
     * @throws IllegalArgumentException if df is not positive or x is negative.
     */
    public static double chisquareCdf(double x, int df) {
        if (df <= 0) {
            throw new IllegalArgumentException("The degrees of freedom need to be positive.");
        }

        return ContinuousDistributions.gammaCdf(x / 2.0, df / 2.0);
    }

    /**
     * Returns the cumulative probability (area under the standard Normal curve
     * from -INF to z) of a specific z score.
     * Ported from C# code posted at http://jamesmccaffrey.wordpress.com/2010/11/05/programmatically-computing-the-area-under-the-normal-curve/
     *
     * @param z     The z score (-inf to +inf).
     * @return      The cumulative probability; 0.5 for z = 0, and exactly 0 or 1 once |z| &gt;= 6.
     */
    public static double gaussCdf(double z) {
        // input = z-value (-inf to +inf)
        // output = p under Normal curve from -inf to z
        // e.g., if z = 0.0, function returns 0.5000
        // ACM Algorithm #209
        double y; // 209 scratch variable
        double p; // result. called z in 209
        double w; // 209 scratch variable

        if (z == 0.0) {
            p = 0.0;
        } else {
            y = Math.abs(z) / 2.0;
            if (y >= 3.0) {
                // beyond |z| = 6 the tail is treated as empty
                p = 1.0;
            } else if (y < 1.0) {
                w = y * y;
                p = ((((((((0.000124818987 * w - 0.001075204047) * w + 0.005198775019) * w - 0.019198292004) * w
                        + 0.059054035642) * w - 0.151968751364) * w + 0.319152932694) * w - 0.531923007300) * w
                        + 0.797884560593) * y * 2.0;
            } else {
                y = y - 2.0;
                p = (((((((((((((-0.000045255659 * y + 0.000152529290) * y - 0.000019538132) * y - 0.000676904986)
                        * y + 0.001390604284) * y - 0.000794620820) * y - 0.002034254874) * y + 0.006549791214) * y
                        - 0.010557625006) * y + 0.011630447319) * y - 0.009279453341) * y + 0.005353579108) * y
                        - 0.002141268741) * y + 0.000535310849) * y + 0.999936657524;
            }
        }

        // p is the probability mass inside (-|z|, |z|); fold it onto the correct side.
        if (z > 0.0) {
            return (p + 1.0) / 2.0;
        }

        return (1.0 - p) / 2.0;
    }

    /**
     * It estimates a numeric approximation of gamma function by exponentiating
     * {@link #logGamma(double)}. The result overflows to infinity for large x.
     *
     * @param x     The input x
     * @return      The value of gamma(x)
     */
    public static double gamma(double x) {
        return Math.exp(logGamma(x));
    }

    /**
     * Log Gamma Function. Uses a six-term series approximation (the form and
     * coefficients look like the classic Lanczos approximation). No input
     * validation is performed.
     *
     * @param Z     The input value.
     * @return      The approximate value of ln(gamma(Z)).
     */
    public static double logGamma(double Z) {
        double S = 1.0 + 76.18009173 / Z - 86.50532033 / (Z + 1.0) + 24.01409822 / (Z + 2.0)
                - 1.231739516 / (Z + 3.0) + 0.00120858003 / (Z + 4.0) - 0.00000536382 / (Z + 5.0);
        double LG = (Z - 0.5) * Math.log(Z + 4.5) - (Z + 4.5) + Math.log(S * 2.50662827465);

        return LG;
    }

    /**
     * Internal function used by studentsCdf and betaCdf. Evaluates a continued
     * fraction iteratively until two successive approximants differ by less
     * than 1e-5 (relative), then divides the result by A. Callers multiply the
     * returned value by their own prefactor.
     *
     * @param x     The evaluation point.
     * @param A     The first shape parameter.
     * @param B     The second shape parameter.
     * @return      The continued fraction value divided by A.
     */
    protected static double betinc(double x, double A, double B) {
        double A0 = 0.0;
        double B0 = 1.0;
        double A1 = 1.0;
        double B1 = 1.0;
        double M9 = 0.0;
        double A2 = 0.0;
        while (Math.abs((A1 - A2) / A1) > 0.00001) {
            A2 = A1;
            double C9 = -(A + M9) * (A + B + M9) * x / (A + 2.0 * M9) / (A + 2.0 * M9 + 1.0);
            A0 = A1 + C9 * A0;
            B0 = B1 + C9 * B0;
            M9 = M9 + 1;
            C9 = M9 * (B - M9) * x / (A + 2.0 * M9 - 1.0) / (A + 2.0 * M9);
            A1 = A0 + C9 * A1;
            B1 = B0 + C9 * B1;
            // renormalise so that B1 stays at 1 and the values do not blow up
            A0 = A0 / B1;
            B0 = B0 / B1;
            A1 = A1 / B1;
            B1 = 1.0;
        }
        return A1 / A;
    }

    /**
     * Calculates the probability from -INF to X under Student's Distribution
     * Ported from the Javascript implementation found at http://www.math.ucla.edu/~tom/distributions/tDist.html
     *
     * @param x     The t score.
     * @param df    The degrees of freedom; must be positive.
     * @return      The cumulative probability P(T &lt;= x).
     * @throws IllegalArgumentException if df is not positive.
     */
    public static double studentsCdf(double x, int df) {
        if (df <= 0) {
            throw new IllegalArgumentException("The degrees of freedom need to be positive.");
        }

        double A = df / 2.0;
        double S = A + 0.5;
        double Z = df / (df + x * x);
        double BT = Math.exp(logGamma(S) - logGamma(0.5) - logGamma(A) + A * Math.log(Z) + 0.5 * Math.log(1.0 - Z));
        double betacdf;
        // pick the side on which the continued fraction is evaluated
        if (Z < (A + 1.0) / (S + 2.0)) {
            betacdf = BT * betinc(Z, A, 0.5);
        } else {
            betacdf = 1 - BT * betinc(1.0 - Z, 0.5, A);
        }

        // betacdf is the two-sided tail mass; split it according to the sign of x
        double tcdf;
        if (x < 0) {
            tcdf = betacdf / 2.0;
        } else {
            tcdf = 1.0 - betacdf / 2.0;
        }

        return tcdf;
    }

    /**
     * Calculates the probability from 0 to X under Exponential Distribution
     *
     * @param x         The point of evaluation; must not be negative.
     * @param lamda     The rate parameter; must be positive.
     * @return          The cumulative probability 1 - exp(-lamda * x).
     * @throws IllegalArgumentException if x is negative or lamda is not positive.
     */
    public static double exponentialCdf(double x, double lamda) {
        if (x < 0 || lamda <= 0) {
            throw new IllegalArgumentException("All the parameters must be positive.");
        }

        double probability = 1.0 - Math.exp(-lamda * x);

        return probability;
    }

    /**
     * Calculates the probability from 0 to X under Beta Distribution
     *
     * @param x     The point of evaluation; must not be negative. Values &gt;= 1 yield 1.
     * @param a     The first shape parameter; must be positive.
     * @param b     The second shape parameter; must be positive.
     * @return      The cumulative probability P(X &lt;= x).
     * @throws IllegalArgumentException if x is negative or a shape parameter is not positive.
     */
    public static double betaCdf(double x, double a, double b) {
        if (x < 0 || a <= 0 || b <= 0) {
            throw new IllegalArgumentException("All the parameters must be positive.");
        }

        double Bcdf = 0.0;

        // the support is [0, 1]; handle the boundaries without evaluating logarithms
        if (x == 0) {
            return Bcdf;
        } else if (x >= 1) {
            Bcdf = 1.0;
            return Bcdf;
        }

        double S = a + b;

        double BT = Math.exp(logGamma(S) - logGamma(b) - logGamma(a) + a * Math.log(x) + b * Math.log(1 - x));
        if (x < (a + 1.0) / (S + 2.0)) {
            Bcdf = BT * betinc(x, a, b);
        } else {
            Bcdf = 1.0 - BT * betinc(1.0 - x, b, a);
        }

        return Bcdf;
    }

    /**
     * Calculates the probability from 0 to X under F Distribution. The score is
     * mapped to x / (x + f2/f1) and evaluated with the Beta CDF with shapes
     * f1/2 and f2/2.
     *
     * @param x     The F score; must not be negative.
     * @param f1    The numerator degrees of freedom; must be positive.
     * @param f2    The denominator degrees of freedom; must be positive.
     * @return      The cumulative probability P(F &lt;= x).
     * @throws IllegalArgumentException if x is negative or a degrees-of-freedom value is not positive.
     */
    public static double fCdf(double x, int f1, int f2) {
        if (x < 0 || f1 <= 0 || f2 <= 0) {
            throw new IllegalArgumentException("All the parameters must be positive.");
        }

        double Z = x / (x + (double) f2 / f1);
        double FCdf = betaCdf(Z, f1 / 2.0, f2 / 2.0);

        return FCdf;
    }

    /**
     * Internal function used by gammaCdf. Continued fraction evaluation,
     * iterated until the relative change drops below 1e-5.
     *
     * @param x     The point of evaluation.
     * @param A     The shape parameter.
     * @return      The cumulative probability.
     */
    private static double gCf(double x, double A) {
        // Good for X>A+1
        double A0 = 0;
        double B0 = 1;
        double A1 = 1;
        double B1 = x;
        double AOLD = 0;
        double N = 0;
        while (Math.abs((A1 - AOLD) / A1) > .00001) {
            AOLD = A1;
            N = N + 1;
            A0 = A1 + (N - A) * A0;
            B0 = B1 + (N - A) * B0;
            A1 = x * A0 + N * A1;
            B1 = x * B0 + N * B1;
            // renormalise so that B1 stays at 1
            A0 = A0 / B1;
            B0 = B0 / B1;
            A1 = A1 / B1;
            B1 = 1;
        }
        // Prob is the upper tail; the CDF is its complement
        double Prob = Math.exp(A * Math.log(x) - x - logGamma(A)) * A1;

        return 1.0 - Prob;
    }

    /**
     * Internal function used by gammaCdf. Series evaluation, summed until the
     * last term is below 1e-5 of the running sum.
     *
     * @param x     The point of evaluation.
     * @param A     The shape parameter.
     * @return      The cumulative probability.
     */
    private static double gSer(double x, double A) {
        // Good for X<A+1.
        double T9 = 1 / A;
        double G = T9;
        double I = 1;
        while (T9 > G * 0.00001) {
            T9 = T9 * x / (A + I);
            G = G + T9;
            ++I;
        }
        G = G * Math.exp(A * Math.log(x) - x - logGamma(A));

        return G;
    }

    /**
     * Internal function used by gammaCdf and chisquareCdf: CDF of the Gamma
     * distribution with shape a and unit scale.
     *
     * <p>For a &gt; 200 a Normal approximation with two Edgeworth correction
     * terms is used; otherwise the series (x &lt; a + 1) or the continued
     * fraction is evaluated.</p>
     *
     * @param x     The point of evaluation; must not be negative.
     * @param a     The shape parameter.
     * @return      The cumulative probability P(X &lt;= x).
     * @throws IllegalArgumentException if x is negative.
     */
    protected static double gammaCdf(double x, double a) {
        if (x < 0) {
            throw new IllegalArgumentException("The x parameter must be positive.");
        }

        double GI;
        if (a > 200) {
            // standardise: a Gamma(a, 1) variable has mean a and variance a
            double z = (x - a) / Math.sqrt(a);
            double y = gaussCdf(z);
            double b1 = 2 / Math.sqrt(a); // skewness
            double phiz = 0.39894228 * Math.exp(-z * z / 2); // standard Normal density at z
            double w = y - b1 * (z * z - 1) * phiz / 6; //Edgeworth1
            double b2 = 6 / a; // excess kurtosis
            // The second-order term needs z to the FOURTH POWER. The code this was
            // ported from wrote "z^4", which is a bitwise XOR in both Javascript and Java.
            double z2 = z * z;
            double u = 3 * b2 * (z2 - 3) + b1 * b1 * (z2 * z2 - 10 * z2 + 15);
            GI = w - phiz * z * u / 72; //Edgeworth2
        } else if (x < a + 1) {
            GI = gSer(x, a);
        } else {
            GI = gCf(x, a);
        }

        return GI;
    }

    /**
     * Calculates the probability from 0 to X under Gamma Distribution
     *
     * @param x     The point of evaluation; must not be negative.
     * @param a     The shape parameter; must be positive.
     * @param b     The scale parameter (x is divided by it); must be positive.
     * @return      The cumulative probability P(X &lt;= x).
     * @throws IllegalArgumentException if a or b is not positive or x is negative.
     */
    public static double gammaCdf(double x, double a, double b) {
        if (a <= 0 || b <= 0) {
            throw new IllegalArgumentException("All the parameters must be positive.");
        }

        double GammaCdf = ContinuousDistributions.gammaCdf(x / b, a);

        return GammaCdf;
    }

    /**
     * Returns the cumulative probability of Uniform
     *
     * @param x     The point of evaluation.
     * @param a     The lower bound of the support.
     * @param b     The upper bound of the support; must be greater than a.
     * @return      0 below a, 1 at or above b, and the linear interpolation in between.
     * @throws IllegalArgumentException if a is not smaller than b.
     */
    public static double uniformCdf(double x, double a, double b) {
        if (a >= b) {
            throw new IllegalArgumentException("The a must be smaller than b.");
        }

        double probabilitySum;
        if (x < a) {
            probabilitySum = 0.0;
        } else if (x < b) {
            probabilitySum = (x - a) / (b - a);
        } else {
            probabilitySum = 1;
        }

        return probabilitySum;
    }

    /**
     * Returns the cumulative probability of kolmogorov
     *
     * @param z     The value of the statistic.
     * @return      The cumulative probability; exactly 0 for z &lt; 0.27 and exactly 1 for z &gt; 3.2.
     */
    public static double kolmogorov(double z) {
        //Kolmogorov distribution. Error<.0000001
        if (z < 0.27) {
            return 0.0;
        } else if (z > 3.2) {
            // a cumulative probability can never exceed 1
            return 1.0;
        }

        // Evaluates the alternating series exp(y) - exp(4y) + exp(9y) - ... in
        // nested form; the odd numbers 1, 3, 5, ... add up to the squares.
        double ks = 0;
        double y = -2 * z * z;
        for (int i = 27; i >= 1; i = i - 2) {
            ks = Math.exp(i * y) * (1 - ks);
        }
        return 1.0 - 2.0 * ks;
    }

    /*
    Inverse Cdf functions
    */

    /**
     * Returns the z score of a specific cumulative probability for Gaussian
     * Partially ported from http://home.online.no/~pjacklam/notes/invnorm/impl/karimov/StatUtil.java
     * Other implementations http://home.online.no/~pjacklam/notes/invnorm/index.html#Java
     *
     * @param p     The cumulative probability.
     * @return      The z score; -INF for p = 0, +INF for p = 1 and NaN when p is NaN or outside [0, 1].
     */
    public static double gaussInverseCdf(double p) {
        // Break-points between the tail regions and the central region.
        final double P_LOW = 0.02425D;
        final double P_HIGH = 1.0D - P_LOW;
        final double ICDF_A[] = { -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02,
                1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00 };
        final double ICDF_B[] = { -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02,
                6.680131188771972e+01, -1.328068155288572e+01 };
        final double ICDF_C[] = { -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00,
                -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00 };
        final double ICDF_D[] = { 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00,
                3.754408661907416e+00 };

        // variable for result
        double z;

        if (p == 0) {
            z = Double.NEGATIVE_INFINITY;
        } else if (p == 1) {
            z = Double.POSITIVE_INFINITY;
        } else if (Double.isNaN(p) || p < 0 || p > 1) {
            z = Double.NaN;
        } else if (p < P_LOW) { // Rational approximation for lower region:
            double q = Math.sqrt(-2 * Math.log(p));
            z = (((((ICDF_C[0] * q + ICDF_C[1]) * q + ICDF_C[2]) * q + ICDF_C[3]) * q + ICDF_C[4]) * q + ICDF_C[5])
                    / ((((ICDF_D[0] * q + ICDF_D[1]) * q + ICDF_D[2]) * q + ICDF_D[3]) * q + 1);
        } else if (P_HIGH < p) { // Rational approximation for upper region:
            double q = Math.sqrt(-2 * Math.log(1 - p));
            z = -(((((ICDF_C[0] * q + ICDF_C[1]) * q + ICDF_C[2]) * q + ICDF_C[3]) * q + ICDF_C[4]) * q + ICDF_C[5])
                    / ((((ICDF_D[0] * q + ICDF_D[1]) * q + ICDF_D[2]) * q + ICDF_D[3]) * q + 1);
        } else { // Rational approximation for central region:
            double q = p - 0.5D;
            double r = q * q;
            z = (((((ICDF_A[0] * r + ICDF_A[1]) * r + ICDF_A[2]) * r + ICDF_A[3]) * r + ICDF_A[4]) * r + ICDF_A[5])
                    * q
                    / (((((ICDF_B[0] * r + ICDF_B[1]) * r + ICDF_B[2]) * r + ICDF_B[3]) * r + ICDF_B[4]) * r + 1);
        }

        return z;
    }

    /**
     * Returns the x score of a specific pvalue and degrees of freedom for Chisquare.
     * We just do a bisection search for a value within CHI_EPSILON, relying on
     * the monotonicity of chisquareCdf().
     * Ported from Javascript code posted at http://www.fourmilab.ch/rpkp/experiments/analysis/chiCalc.js
     *
     * <p>Note that p is compared against {@code 1 - chisquareCdf(x, df)}, i.e.
     * it is treated as an upper-tail probability (p-value), not as a
     * cumulative probability.</p>
     *
     * @param p     The upper-tail probability (p-value).
     * @param df    The degrees of freedom; must be positive when 0 &lt; p &lt; 1.
     * @return      The chisquare score whose upper-tail probability is p; 99999 for p &lt;= 0 and 0 for p &gt;= 1.
     * @throws IllegalArgumentException if 0 &lt; p &lt; 1 and df is not positive.
     */
    public static double chisquareInverseCdf(double p, int df) {
        final double CHI_EPSILON = 0.000001; /* Accuracy of critchi approximation */
        final double CHI_MAX = 99999.0; /* Maximum chi-square value */
        double minchisq = 0.0;
        double maxchisq = CHI_MAX;

        if (p <= 0.0) {
            return CHI_MAX;
        } else if (p >= 1.0) {
            return 0.0;
        }

        double chisqval = df / Math.sqrt(p); /* fair first value */
        while ((maxchisq - minchisq) > CHI_EPSILON) {
            // the upper tail shrinks as the score grows, so a tail below p means we overshot
            if (1 - chisquareCdf(chisqval, df) < p) {
                maxchisq = chisqval;
            } else {
                minchisq = chisqval;
            }
            chisqval = (maxchisq + minchisq) * 0.5;
        }

        return chisqval;
    }

    /*
    Other functions
    */

    /**
     * Compute the quantile function for the normal distribution. Three rational
     * approximations are used, selected by how close p is to the boundaries of
     * the [0, 1] interval.
     * Used by ShapiroWilk Test
     * Ported from the Javascript implementation found at https://raw.github.com/rniwa/js-shapiro-wilk/master/shapiro-wilk.js
     * Originally ported from http://svn.r-project.org/R/trunk/src/nmath/qnorm.c
     *
     * @param p         The cumulative probability.
     * @param mu        The mean of the distribution.
     * @param sigma     The standard deviation; must not be negative.
     * @return          The quantile; mu when sigma is 0, -INF for p = 0, +INF for p = 1
     *                  and NaN when p is outside [0, 1].
     * @throws IllegalArgumentException if sigma is negative.
     */
    public static double normalQuantile(double p, double mu, double sigma) {
        // The inverse of cdf.
        if (sigma < 0) {
            throw new IllegalArgumentException("The sigma parameter must be positive.");
        } else if (sigma == 0) {
            return mu;
        }

        // Boundaries: the tail approximation below would evaluate inf/inf = NaN,
        // so return the limits directly (consistent with gaussInverseCdf).
        if (p == 0.0) {
            return Double.NEGATIVE_INFINITY;
        } else if (p == 1.0) {
            return Double.POSITIVE_INFINITY;
        }

        double r;
        double val;

        double q = p - 0.5;

        if (0.075 <= p && p <= 0.925) {
            r = 0.180625 - q * q;
            val = q * (((((((r * 2509.0809287301226727 + 33430.575583588128105) * r + 67265.770927008700853) * r
                    + 45921.953931549871457) * r + 13731.693765509461125) * r + 1971.5909503065514427) * r
                    + 133.14166789178437745) * r + 3.387132872796366608)
                    / (((((((r * 5226.495278852854561 + 28729.085735721942674) * r + 39307.89580009271061) * r
                            + 21213.794301586595867) * r + 5394.1960214247511077) * r + 687.1870074920579083) * r
                            + 42.313330701600911252) * r + 1);
        } else { /* closer than 0.075 from {0,1} boundary */
            /* r = min(p, 1-p) < 0.075 */
            if (q > 0) {
                r = 1 - p;
            } else {
                r = p;/* = R_DT_Iv(p) ^=  p */
            }

            r = Math.sqrt(-Math.log(r)); /* r = sqrt(-log(r))  <==>  min(p, 1-p) = exp( - r^2 ) */

            if (r <= 5.0) { /* <==> min(p,1-p) >= exp(-25) ~= 1.3888e-11 */
                r += -1.6;
                val = (((((((r * 7.7454501427834140764e-4 + 0.0227238449892691845833) * r + 0.24178072517745061177)
                        * r + 1.27045825245236838258) * r + 3.64784832476320460504) * r + 5.7694972214606914055) * r
                        + 4.6303378461565452959) * r + 1.42343711074968357734)
                        / (((((((r * 1.05075007164441684324e-9 + 5.475938084995344946e-4) * r
                                + 0.0151986665636164571966) * r + 0.14810397642748007459) * r
                                + 0.68976733498510000455) * r + 1.6763848301838038494) * r + 2.05319162663775882187)
                                * r + 1.0);
            } else { /* very close to  0 or 1 */
                r += -5.0;
                val = (((((((r * 2.01033439929228813265e-7 + 2.71155556874348757815e-5) * r
                        + 0.0012426609473880784386) * r + 0.026532189526576123093) * r + 0.29656057182850489123) * r
                        + 1.7848265399172913358) * r + 5.4637849111641143699) * r + 6.6579046435011037772)
                        / (((((((r * 2.04426310338993978564e-15 + 1.4215117583164458887e-7) * r
                                + 1.8463183175100546818e-5) * r + 7.868691311456132591e-4) * r
                                + 0.0148753612908506148525) * r + 0.13692988092273580531) * r
                                + 0.59983220655588793769) * r + 1.0);
            }

            // the tail approximations yield the magnitude; restore the sign for the lower tail
            if (q < 0.0) {
                val = -val;
            }
        }
        return mu + sigma * val;
    }

    /**
     * Calculates the density of pi under the dirichlet distribution with pseudocounts ai.
     *
     * <p>The normalising constant gamma(sum(ai)) / prod(gamma(ai)) is evaluated
     * in log-space because gamma() itself overflows once its argument exceeds
     * roughly 171, even when the ratio is perfectly representable.</p>
     *
     * @param pi    The vector with probabilities.
     * @param ai    The vector with pseudocounts; must have the same length as pi.
     * @return      The probability density.
     * @throws IllegalArgumentException if the two vectors differ in length.
     */
    public static double dirichletPdf(double[] pi, double[] ai) {
        if (pi.length != ai.length) {
            throw new IllegalArgumentException("The pi and ai vectors must have the same length.");
        }

        double probability = 1.0;
        double sumAi = 0.0;
        double sumLogGammaAi = 0.0;

        int piLength = pi.length;
        for (int i = 0; i < piLength; ++i) {
            double alpha = ai[i];
            sumAi += alpha;
            sumLogGammaAi += logGamma(alpha);
            probability *= Math.pow(pi[i], alpha - 1);
        }

        probability *= Math.exp(logGamma(sumAi) - sumLogGammaAi);

        return probability;
    }

    /**
     * Implementation for single alpha value. As in the vector version, the
     * normalising constant is evaluated in log-space to avoid overflow of the
     * gamma function.
     *
     * @param pi    The vector with probabilities.
     * @param a     The alpha parameter for all pseudocounts.
     * @return      The probability density.
     */
    public static double dirichletPdf(double[] pi, double a) {
        double probability = 1.0;

        int piLength = pi.length;

        for (int i = 0; i < piLength; ++i) {
            probability *= Math.pow(pi[i], a - 1);
        }

        double sumAi = piLength * a;
        probability *= Math.exp(logGamma(sumAi) - piLength * logGamma(a));

        return probability;
    }

    /**
     * Samples from a multivariate Normal Distribution (the method name says
     * "multinomial", but a MultivariateNormalDistribution is what is sampled).
     * A new distribution object is built and reseeded on every call with a
     * value drawn from RandomGenerator.getThreadLocalRandom().
     *
     * @param mean          The mean vector.
     * @param covariance    The covariance matrix.
     * @return              A sample drawn from the distribution.
     */
    public static double[] multinomialGaussianSample(double[] mean, double[][] covariance) {
        MultivariateNormalDistribution gaussian = new MultivariateNormalDistribution(mean, covariance);
        gaussian.reseedRandomGenerator(RandomGenerator.getThreadLocalRandom().nextLong());
        return gaussian.sample();
    }

    /**
     * Calculates the PDF of a multivariate Normal Distribution for a particular x
     * (the method name says "multinomial", but a MultivariateNormalDistribution
     * is what is evaluated).
     *
     * @param mean          The mean vector.
     * @param covariance    The covariance matrix.
     * @param x             The x record
     * @return              The density of the distribution at x.
     */
    public static double multinomialGaussianPdf(double[] mean, double[][] covariance, double[] x) {
        MultivariateNormalDistribution gaussian = new MultivariateNormalDistribution(mean, covariance);
        return gaussian.density(x);
    }
}