datafu.pig.stats.WilsonBinConf.java Source code

Java tutorial

Introduction

Here is the source code for datafu.pig.stats.WilsonBinConf.java

Source

/*
 * Copyright 2010 LinkedIn, Inc
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package datafu.pig.stats;

import java.io.IOException;

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.NormalDistribution;
import org.apache.commons.math.distribution.NormalDistributionImpl;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;

import com.google.common.collect.ImmutableList;

import datafu.pig.util.SimpleEvalFunc;

/**
 * Computes the {@link <a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval" target="_blank">Wilsonian binomial proportion confidence interval</a>}
 * <p>
 * Constructor requires the confidence interval (alpha) parameter, and the
 * parameters are the number of positive (success) outcomes and the total
 * number of observations. The UDF returns the (lower,upper) confidence
 * interval. 
 * <p>
 * Example:
 * <pre>
 * {@code
 * -- the Wilsonian binomial proportion confidence interval for scoring
 * %declare WILSON_ALPHA 0.10
 *
 * define WilsonBinConf      datafu.pig.stats.WilsonBinConf('$WILSON_ALPHA'); 
 *
 * bar = FOREACH foo GENERATE WilsonBinConf(successes, totals).lower as score;
 * quux = ORDER bar BY score DESC;
 * top = LIMIT quux 10;
 * }
 * </pre></p>
 */
public class WilsonBinConf extends SimpleEvalFunc<Tuple> {
    private static TupleFactory tupleFactory = TupleFactory.getInstance();
    private final double alpha;

    public WilsonBinConf(double alpha) {
        this.alpha = alpha;
    }

    public WilsonBinConf(String alpha) {
        this(Double.parseDouble(alpha));
    }

    public Tuple call(Number x, Number n) throws IOException {
        if (x == null || n == null)
            return null;
        return binconf(x.longValue(), n.longValue());
    }

    /**
     * @param x The number of positive (success) outcomes
     * @param n The number of observations
     * @return The (lower,upper) confidence interval
     */
    public Tuple binconf(Long x, Long n) throws IOException {
        NormalDistribution normalDist = new NormalDistributionImpl();

        if (x == null || n == null)
            return null;
        if (x < 0 || n < 0)
            throw new IllegalArgumentException("non-negative values expected");
        if (x > n)
            throw new IllegalArgumentException("invariant violation: number of successes > number of obs");
        if (n == 0)
            return tupleFactory.newTuple(ImmutableList.of(Double.valueOf(0), Double.valueOf(0)));

        try {
            double zcrit = -1.0 * normalDist.inverseCumulativeProbability(alpha / 2);
            double z2 = zcrit * zcrit;
            double p = x / (double) n;

            double a = p + z2 / 2 / n;
            double b = zcrit * Math.sqrt((p * (1 - p) + z2 / 4 / n) / n);
            double c = (1 + z2 / n);

            double lower = (a - b) / c;
            double upper = (a + b) / c;

            // Add corrections for when x is very close to n.  This improves the estimates.
            // For more info on wilson binomial confidence interval, see paper:
            // L.D. Brown, T.T. Cai and A. DasGupta, Interval estimation for a binomial proportion (with discussion), 
            //   _Statistical Science,_*16*:101-133, 2001. 
            // http://www-stat.wharton.upenn.edu/~tcai/paper/Binomial-StatSci.pdf

            if (x == 1)
                lower = -Math.log(1 - alpha) / n;
            if (x == (n - 1))
                upper = 1 + Math.log(1 - alpha) / n;

            return tupleFactory.newTuple(ImmutableList.of(lower, upper));
        } catch (MathException e) {
            throw new IOException("math error", e);
        }
    }

    @Override
    public Schema outputSchema(Schema input) {
        return new Schema(ImmutableList.of(new Schema.FieldSchema("lower", DataType.DOUBLE),
                new Schema.FieldSchema("upper", DataType.DOUBLE)));
    }
}