org.apache.mahout.clustering.dirichlet.UncommonDistributions.java Source code

Introduction

Here is the source code for org.apache.mahout.clustering.dirichlet.UncommonDistributions.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.clustering.dirichlet;

import org.apache.commons.math3.distribution.NormalDistribution;
import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.RandomWrapper;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public final class UncommonDistributions {

    public static final double SQRT2PI = Math.sqrt(2.0 * Math.PI);

    private static final RandomWrapper RANDOM = RandomUtils.getRandom();

    private UncommonDistributions() {
    }

    // =============== start of BSD licensed code. See LICENSE.txt
    /**
     * Returns a double sampled according to this distribution. Uniformly fast for all k > 0. (Reference:
     * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Uses
     * Cheng's rejection algorithm (GB) for k>=1, rejection from Weibull distribution for 0 < k < 1.
     */
    public static double rGamma(double k, double lambda) {
        boolean accept = false;
        if (k >= 1.0) {
            // Cheng's algorithm
            double b = k - Math.log(4.0);
            double c = k + Math.sqrt(2.0 * k - 1.0);
            double lam = Math.sqrt(2.0 * k - 1.0);
            double cheng = 1.0 + Math.log(4.5);
            double x;
            do {
                double u = RANDOM.nextDouble();
                double v = RANDOM.nextDouble();
                double y = 1.0 / lam * Math.log(v / (1.0 - v));
                x = k * Math.exp(y);
                double z = u * v * v;
                double r = b + c * y - x;
                if (r >= 4.5 * z - cheng || r >= Math.log(z)) {
                    accept = true;
                }
            } while (!accept);
            return x / lambda;
        } else {
            // Weibull algorithm
            double c = 1.0 / k;
            double d = (1.0 - k) * Math.pow(k, k / (1.0 - k));
            double x;
            do {
                double u = RANDOM.nextDouble();
                double v = RANDOM.nextDouble();
                double z = -Math.log(u);
                double e = -Math.log(v);
                x = Math.pow(z, c);
                if (z + e >= d + x) {
                    accept = true;
                }
            } while (!accept);
            return x / lambda;
        }
    }

    // ============= end of BSD licensed code

    /**
     * Returns a random sample from a beta distribution with the given shapes
     * 
     * @param shape1
     *          a double representing shape1
     * @param shape2
     *          a double representing shape2
     * @return a Vector of samples
     */
    public static double rBeta(double shape1, double shape2) {
        double gam1 = rGamma(shape1, 1.0);
        double gam2 = rGamma(shape2, 1.0);
        return gam1 / (gam1 + gam2);

    }

    /**
     * Returns a vector of random samples from a beta distribution with the given shapes
     * 
     * @param k
     *          the number of samples to return
     * @param shape1
     *          a double representing shape1
     * @param shape2
     *          a double representing shape2
     * @return a Vector of samples
     */
    public static Vector rBeta(int k, double shape1, double shape2) {
        // List<Double> params = new ArrayList<Double>(2);
        // params.add(shape1);
        // params.add(Math.max(0, shape2));
        Vector result = new DenseVector(k);
        for (int i = 0; i < k; i++) {
            result.set(i, rBeta(shape1, shape2));
        }
        return result;
    }

    /**
     * Return a random sample from the chi-squared (chi^2) distribution with df degrees of freedom.
     * 
     * @return a double sample
     */
    public static double rChisq(double df) {
        double result = 0.0;
        for (int i = 0; i < df; i++) {
            double sample = rNorm(0.0, 1.0);
            result += sample * sample;
        }
        return result;
    }

    /**
     * Return a random value from a normal distribution with the given mean and standard deviation
     * 
     * @param mean
     *          a double mean value
     * @param sd
     *          a double standard deviation
     * @return a double sample
     */
    public static double rNorm(double mean, double sd) {
        RealDistribution dist = new NormalDistribution(RANDOM.getRandomGenerator(), mean, sd,
                NormalDistribution.DEFAULT_INVERSE_ABSOLUTE_ACCURACY);
        return dist.sample();
    }

    /**
     * Return the normal density function value for the sample x
     * 
     * pdf = 1/[sqrt(2*p)*s] * e^{-1/2*[(x-m)/s]^2}
     * 
     * @param x
     *          a double sample value
     * @param m
     *          a double mean value
     * @param s
     *          a double standard deviation
     * @return a double probability value
     */
    public static double dNorm(double x, double m, double s) {
        double xms = (x - m) / s;
        double ex = xms * xms / 2.0;
        double exp = Math.exp(-ex);
        return exp / (SQRT2PI * s);
    }

    /** Returns one sample from a multinomial. */
    public static int rMultinom(Vector probabilities) {
        // our probability argument are not normalized.
        double total = probabilities.zSum();
        double nextDouble = RANDOM.nextDouble();
        double p = nextDouble * total;
        for (int i = 0; i < probabilities.size(); i++) {
            double pi = probabilities.get(i);
            if (p < pi) {
                return i;
            } else {
                p -= pi;
            }
        }
        // can't happen except for round-off error so we don't care what we return here
        return 0;
    }

    /**
     * Returns a multinomial vector sampled from the given probabilities
     * 
     * rmultinom should be implemented as successive binomial sampling.
     * 
     * Keep a normalizing amount that starts with 1 (I call it total).
     * 
     * For each i k[i] = rbinom(p[i] / total, size); total -= p[i]; size -= k[i];
     * 
     * @param size
     *          the size parameter of the binomial distribution
     * @param probabilities
     *          a Vector of probabilities
     * @return a multinomial distribution Vector
     */
    public static Vector rMultinom(int size, Vector probabilities) {
        // our probability argument may not be normalized.
        double total = probabilities.zSum();
        int cardinality = probabilities.size();
        Vector result = new DenseVector(cardinality);
        for (int i = 0; total > 0 && i < cardinality; i++) {
            double p = probabilities.get(i);
            int ki = rBinomial(size, p / total);
            total -= p;
            size -= ki;
            result.set(i, ki);
        }
        return result;
    }

    /**
     * Returns an integer sampled according to this distribution. Takes time proportional to np + 1. (Reference:
     * Non-Uniform Random Variate Generation, Devroye http://cgm.cs.mcgill.ca/~luc/rnbookindex.html) Second
     * time-waiting algorithm.
     */
    public static int rBinomial(int n, double p) {
        if (p >= 1.0) {
            return n; // needed to avoid infinite loops and negative results
        }
        double q = -Math.log1p(-p);
        double sum = 0.0;
        int x = 0;
        while (sum <= q) {
            double u = RANDOM.nextDouble();
            double e = -Math.log(u);
            sum += e / (n - x);
            x++;
        }
        if (x == 0) {
            return 0;
        }
        return x - 1;
    }

    /**
     * Sample from a Dirichlet distribution, returning a vector of probabilities using a stick-breaking
     * algorithm
     * 
     * @param totalCounts
     *          an unnormalized count Vector
     * @param alpha0
     *          a double
     * @return a Vector of probabilities
     */
    public static Vector rDirichlet(Vector totalCounts, double alpha0) {
        Vector pi = totalCounts.like();
        double total = totalCounts.zSum();
        double remainder = 1.0;
        for (int k = 0; k < pi.size(); k++) {
            double countK = totalCounts.get(k);
            total -= countK;
            double betaK = rBeta(1.0 + countK, Math.max(0.0, alpha0 + total));
            double piK = betaK * remainder;
            pi.set(k, piK);
            remainder -= piK;
        }
        return pi;
    }

}