Here you can find the source of sumError(long samples, long count, double m2, double mean)
Derivation:
Var(S) = Var(sum(1 / p * X * Bern(p)))
       = sum(Var(1 / p * X * Bern(p)))   [Bienayme formula]
       = n * Var(1 / p * X * Bern(p))   [X * Bern(p) are iid]
       = n * 1 / p^2 * Var(X * Bern(p))   [1 / p is constant]
       = n * 1 / p^2 * (Var(X) * Var(Bern(p)) + E(X)^2 * Var(Bern(p)) + Var(X) * E(Bern(p))^2)   [Product of independent variables]
       = n * 1 / p^2 * (Var(X) * p(1 - p) + E(X)^2 * p(1 - p) + Var(X) * p^2)   [Variance of a Bernoulli distribution]
       = n * 1 / p * (Var(X) + E(X)^2 * (1 - p))
       = samples / p^2 * (Var(X) + E(X)^2 * (1 - p))   [samples = n * p, since it's only the observed rows]
Therefore, Stddev(S) = 1 / p * sqrt(samples * (variance + mean^2 * (1 - p)))
public static double sumError(long samples, long count, double m2, double mean)
//package com.java2s;
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class Main {
    /**
     * Estimates the standard deviation of the scaled-up sum
     * S = sum(1 / p * X * Bern(p)), where p is the sampling ratio
     * {@code samples / count}.
     * <br /><br />
     * Derivation:
     * <pre>
     * Var(S) = Var(sum(1 / p * X * Bern(p)))
     *        = sum(Var(1 / p * X * Bern(p)))    [Bienayme formula]
     *        = n * Var(1 / p * X * Bern(p))     [X * Bern(p) are iid]
     *        = n * 1 / p^2 * Var(X * Bern(p))   [1 / p is constant]
     *        = n * 1 / p^2 * (Var(X) * Var(Bern(p)) + E(X)^2 * Var(Bern(p)) + Var(X) * E(Bern(p))^2)
     *                                           [Product of independent variables]
     *        = n * 1 / p^2 * (Var(X) * p(1 - p) + E(X)^2 * p(1 - p) + Var(X) * p^2)
     *                                           [Variance of a Bernoulli distribution]
     *        = n * 1 / p * (Var(X) + E(X)^2 * (1 - p))
     *        = samples / p^2 * (Var(X) + E(X)^2 * (1 - p))
     *                                           [samples = n * p, since it's only the observed rows]
     * </pre>
     * Hence Stddev(S) = 1 / p * sqrt(samples * (variance + mean^2 * (1 - p))).
     *
     * @param samples number of observed (sampled) rows
     * @param count   denominator of the sampling ratio (n in the derivation above)
     * @param m2      value divided by {@code samples} to obtain the variance of X
     * @param mean    mean of X, used as E(X)
     * @return the estimated standard deviation, or {@link Double#POSITIVE_INFINITY} when
     *         {@code count} is zero or the sample is judged too small to be trusted
     */
    public static double sumError(long samples, long count, double m2, double mean) {
        // Without a total there is no sampling ratio to scale by.
        if (count == 0) {
            return Double.POSITIVE_INFINITY;
        }

        final double samplingRatio = samples / (double) count;
        final double varianceOfX = m2 / samples;

        // Var(X) + E(X)^2 * (1 - p): the per-row contribution from the derivation.
        final double perRowTerm = varianceOfX + mean * mean * (1 - samplingRatio);
        final double spread = Math.sqrt(samples * perRowTerm);
        final double estimate = 1 / samplingRatio * spread;

        return conservativeError(estimate, samplingRatio, samples);
    }

    /**
     * Replaces the estimate with positive infinity when the sample is considered too small.
     *
     * @param error   the computed error estimate
     * @param p       the sampling ratio
     * @param samples number of observed rows
     * @return {@code error}, or {@link Double#POSITIVE_INFINITY} when both {@code p < 0.01}
     *         and {@code samples < 100}
     */
    private static double conservativeError(double error, double p, double samples) {
        // Heuristic: a tiny ratio combined with few rows means the estimate is not reliable.
        final boolean sampleTooSmall = p < 0.01 && samples < 100;
        return sampleTooSmall ? Double.POSITIVE_INFINITY : error;
    }
}