edu.byu.nlp.stats.RandomGeneratorsTest.java Source code


Introduction

Here is the source code for edu.byu.nlp.stats.RandomGeneratorsTest.java

Source

/**
 * Copyright 2013 Brigham Young University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.byu.nlp.stats;

import static org.fest.assertions.Assertions.assertThat;

import org.apache.commons.math3.random.MersenneTwister;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.special.Gamma;
import org.junit.Test;

import edu.byu.nlp.util.DoubleArrays;
import edu.byu.nlp.util.IntArrays;

/**
 * @author rah67
 *
 */
public class RandomGeneratorsTest {

    /**
     * Dirichlet Compound Multinomial where \alpha_k = 1 for all k, i.e.,
     * p(x) = \Gamma(K) / \Gamma(N + K) * \prod_k \Gamma(x_k + 1) / \Gamma(1),
     * where K is the number of categories and N = \sum_k x_k.
     * 
     * NOTE: this DCM is based on iid samples from a Categorical distribution and hence lacks the
     * multinomial coefficient.
     */
    // TODO(rhaertel): Implement a DCM class and just call that
    private double dcmLogDensityWithUniformPrior(int[] observations) {
        double sumAlpha = observations.length;
        double logDensity = Gamma.logGamma(sumAlpha) - Gamma.logGamma(IntArrays.sum(observations) + sumAlpha);
        for (int i = 0; i < observations.length; i++) {
            logDensity += Gamma.logGamma(observations[i] + 1.0) - Gamma.logGamma(1.0);
        }
        return logDensity;
    }
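
    /**
     * Hypothetical spot check (not part of the original suite): for K = 2 categories and
     * counts {2, 1}, the density above reduces to
     * \Gamma(2) / \Gamma(5) * \Gamma(3) * \Gamma(2) = (1/24) * 2 = 1/12,
     * so the helper should return log(1/12) up to floating-point error.
     */
    @Test
    public void testDcmLogDensityWithUniformPriorOnToyCounts() {
        double logDensity = dcmLogDensityWithUniformPrior(new int[] { 2, 1 });
        assertThat(Math.abs(logDensity - Math.log(1.0 / 12.0))).isLessThan(1e-10);
    }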

    /**
     * Tests the property that, with a sufficient number of samples, the empirical distribution
     * of the samples approaches that specified by the parameters.
     * 
     * The methodology used is Bayesian hypothesis testing. A utility function could have been
     * used here instead (e.g., log-loss, which is equivalent to K-L divergence), but then we
     * would have had to set some arbitrary threshold as an acceptable metric. Instead, we can
     * use the scale for Bayes factors given by Jeffreys.
     * 
     * The hypotheses are designed as follows. H1 states that the observed data was generated using the
     * provided parameters \hat{\theta}, i.e., p(\theta | H1) = 1 iff \theta = \hat{\theta}. This is
     * to be interpreted as meaning that the code correctly samples from the provided distribution.
     * H2 is that all parameters are equally likely a priori, i.e., \theta | H2 ~ uniform Dirichlet.
     * Should this hypothesis be more likely than H1, there is a bug.
     * 
     * Assuming a uniform prior over H1 and H2, the posterior probabilities of the respective
     * hypotheses after observing x (a count vector) are p(H1 | x) \propto p(x | H1) and
     * p(H2 | x) \propto p(x | H2). Thus, we can use the Bayes factor, p(x | H1) / p(x | H2), to
     * compare the hypotheses. According to Jeffreys' scale, a ratio of 100:1 gives decisive
     * support to the conclusion that there are no bugs.
     * 
     * Note that there is a relationship between the number of samples we draw, the dimensionality
     * of the parameter vector, and the strength of the factor.
     * 
     * With enough samples, the Bayes factor will either be much greater than 100 (no bugs) or
     * much less than 1/100 (bugs).
     * 
     * See http://idiom.ucsd.edu/~rlevy/lign251/fall2007/lecture_9.pdf.
     */
    public double logBayesFactor(int[] counts, double[] thetaHat) {
        assertThat(counts.length).isEqualTo(thetaHat.length);

        // p(x | H1) = \int p(\theta | H1) \prod_i p(x_i | \theta) d\theta
        //           = \prod_i p(x_i | \hat{\theta})
        double logPOfXGivenH1 = 0.0;
        for (int i = 0; i < counts.length; i++) {
            logPOfXGivenH1 += counts[i] * Math.log(thetaHat[i]);
        }

        // p(x | H2) = \int p(\theta | H2) \prod_i p(x_i | \theta) d\theta
        // x | H2 ~ UniformDCM
        double logPOfXGivenH2 = dcmLogDensityWithUniformPrior(counts);
        return logPOfXGivenH1 - logPOfXGivenH2;
    }
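
    /**
     * Hypothetical sanity check (not part of the original suite): when the observed counts are
     * grossly inconsistent with \hat{\theta}, the log Bayes factor should fall well below
     * log(1/100), giving decisive support to H2, i.e., a bug would be detected. The counts
     * below are illustrative values chosen by hand, not drawn from RandomGenerators.
     */
    @Test
    public void testLogBayesFactorDetectsMismatchedDistribution() {
        // Counts that put nearly all mass on the first event, contradicting a uniform thetaHat.
        int[] counts = new int[] { 9800, 100, 100 };
        double[] thetaHat = new double[] { 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0 };
        assertThat(logBayesFactor(counts, thetaHat)).isLessThanOrEqualTo(Math.log(1.0 / 100.0));
    }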

    /**
     * Test method for {@link edu.byu.nlp.stats.RandomGenerators#nextIntUnnormalizedProbs(org.apache.commons.math3.random.RandomGenerator, double[])}.
     */
    @Test
    public void testNextIntUnnormalizedProbsWithBinarySearch() {
        final int numSamples = 10000;
        final double[] theta = new double[] { 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1 };

        // Ensure binary search will kick in
        assertThat(theta.length).isGreaterThan(RandomGenerators.BINARY_SEARCH_THRESHOLD);

        // Convert the parameters to unnormalized probabilities.
        double[] unnormalizedProbs = theta.clone();
        DoubleArrays.multiplyToSelf(unnormalizedProbs, 1234);

        RandomGenerator rnd = new MersenneTwister();

        // Sample and count how often each event occurred.
        int[] counts = new int[theta.length];
        for (int i = 0; i < numSamples; i++) {
            ++counts[RandomGenerators.nextIntUnnormalizedProbs(rnd, unnormalizedProbs)];
        }

        assertThat(logBayesFactor(counts, theta)).isGreaterThanOrEqualTo(Math.log(100.0));
    }

    /**
     * Test method for {@link edu.byu.nlp.stats.RandomGenerators#nextIntUnnormalizedProbs(org.apache.commons.math3.random.RandomGenerator, double[])}.
     */
    @Test
    public void testNextIntUnnormalizedProbsWithLinearSearch() {
        final int numSamples = 10000;
        final double[] theta = new double[] { 0.3, 0.3, 0.4 };

        // Ensure linear search will be used.
        assertThat(theta.length).isLessThan(RandomGenerators.BINARY_SEARCH_THRESHOLD);

        // Convert the parameters to unnormalized probabilities.
        double[] unnormalizedProbs = theta.clone();
        DoubleArrays.multiplyToSelf(unnormalizedProbs, 1234);

        RandomGenerator rnd = new MersenneTwister();

        // Sample and count how often each event occurred.
        int[] counts = new int[theta.length];
        for (int i = 0; i < numSamples; i++) {
            ++counts[RandomGenerators.nextIntUnnormalizedProbs(rnd, unnormalizedProbs)];
        }
        assertThat(logBayesFactor(counts, theta)).isGreaterThanOrEqualTo(Math.log(100.0));
    }

    /**
     * Test method for {@link edu.byu.nlp.stats.RandomGenerators#nextIntUnnormalizedLogProbs(org.apache.commons.math3.random.RandomGenerator, double[])}.
     */
    @Test
    public void testNextIntUnnormalizedLogProbsWithBinarySearch() {
        final int numSamples = 10000;
        final double[] theta = new double[] { 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1 };

        // Ensure binary search will kick in
        assertThat(theta.length).isGreaterThan(RandomGenerators.BINARY_SEARCH_THRESHOLD);

        // Convert the parameters to an unnormalized log prob.
        double[] unnormalizedLogProbs = DoubleArrays.log(theta);
        DoubleArrays.addToSelf(unnormalizedLogProbs, -1234);

        RandomGenerator rnd = new MersenneTwister();

        // Sample and count how often each event occurred.
        int[] counts = new int[theta.length];
        for (int i = 0; i < numSamples; i++) {
            ++counts[RandomGenerators.nextIntUnnormalizedLogProbs(rnd, unnormalizedLogProbs)];
        }

        assertThat(logBayesFactor(counts, theta)).isGreaterThanOrEqualTo(Math.log(100.0));
    }

    /**
     * Test method for {@link edu.byu.nlp.stats.RandomGenerators#nextIntUnnormalizedLogProbs(org.apache.commons.math3.random.RandomGenerator, double[])}.
     */
    @Test
    public void testNextIntUnnormalizedLogProbsWithLinearSearch() {
        final int numSamples = 10000;
        final double[] theta = new double[] { 0.3, 0.3, 0.4 };

        // Ensure linear search will be used.
        assertThat(theta.length).isLessThan(RandomGenerators.BINARY_SEARCH_THRESHOLD);

        // Convert the parameters to an unnormalized log prob.
        double[] unnormalizedLogProbs = DoubleArrays.log(theta);
        DoubleArrays.addToSelf(unnormalizedLogProbs, -1234);

        RandomGenerator rnd = new MersenneTwister();

        // Sample and count how often each event occurred.
        int[] counts = new int[theta.length];
        for (int i = 0; i < numSamples; i++) {
            ++counts[RandomGenerators.nextIntUnnormalizedLogProbs(rnd, unnormalizedLogProbs)];
        }

        assertThat(logBayesFactor(counts, theta)).isGreaterThanOrEqualTo(Math.log(100.0));
    }
}