org.apache.sysml.runtime.compress.estim.CompressedSizeEstimatorSample.java Source code

Introduction

Here is the source code for org.apache.sysml.runtime.compress.estim.CompressedSizeEstimatorSample.java
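CompressedSizeEstimatorSample estimates the compressed size of column groups in SystemML's compressed matrix blocks from a uniform row sample: it extrapolates the number of distinct values with the Haas and Stokes (1998) hybrid estimator, estimates the number of runs via Ising-Stevens expected values, and derives RLE and OLE size estimates from these statistics. A brief usage sketch, under stated assumptions, follows the listing.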

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.compress.estim;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.commons.math3.distribution.ChiSquaredDistribution;
import org.apache.commons.math3.random.RandomDataGenerator;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.compress.BitmapEncoder;
import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
import org.apache.sysml.runtime.compress.ReaderColumnSelection;
import org.apache.sysml.runtime.compress.ReaderColumnSelectionDense;
import org.apache.sysml.runtime.compress.ReaderColumnSelectionDenseSample;
import org.apache.sysml.runtime.compress.ReaderColumnSelectionSparse;
import org.apache.sysml.runtime.compress.UncompressedBitmap;
import org.apache.sysml.runtime.compress.utils.DblArray;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;

public class CompressedSizeEstimatorSample extends CompressedSizeEstimator {
    private static final boolean CORRECT_NONZERO_ESTIMATE = false; //TODO enable for production
    private static final double SHLOSSER_JACKKNIFE_ALPHA = 0.975;
    public static final float HAAS_AND_STOKES_ALPHA1 = 0.9F; //0.9 recommended in paper
    public static final float HAAS_AND_STOKES_ALPHA2 = 30F; //30 recommended in paper
    public static final float HAAS_AND_STOKES_UJ2A_C = 50; //50 recommended in paper

    private int[] _sampleRows = null;
    private RandomDataGenerator _rng = null;
    private int _numRows = -1;

    public CompressedSizeEstimatorSample(MatrixBlock data, int[] sampleRows) {
        super(data);
        _sampleRows = sampleRows;
        _rng = new RandomDataGenerator();
        _numRows = CompressedMatrixBlock.TRANSPOSE_INPUT ? _data.getNumColumns() : _data.getNumRows();
    }

    public CompressedSizeEstimatorSample(MatrixBlock mb, int sampleSize) {
        this(mb, null);
        _sampleRows = getSortedUniformSample(_numRows, sampleSize);
    }

    /**
     * set the sample rows (assumed to be sorted)
     * 
     * @param sampleRows sample rows, assumed to be sorted
     */
    public void setSampleRows(int[] sampleRows) {
        _sampleRows = sampleRows;
    }

    @Override
    public CompressedSizeInfo estimateCompressedColGroupSize(int[] colIndexes) {
        //extract statistics from sample
        UncompressedBitmap ubm = BitmapEncoder.extractBitmapFromSample(colIndexes, _data, _sampleRows);
        SizeEstimationFactors fact = computeSizeEstimationFactors(ubm, false);

        //estimate number of distinct values 
        int totalCardinality = getNumDistinctValues(colIndexes);
        //bound the estimate to fix anomalies w/ large sample fractions
        totalCardinality = Math.max(totalCardinality, fact.numVals); //at least the observed cardinality
        totalCardinality = Math.min(totalCardinality, _numRows); //at most the number of rows

        //estimate unseen values
        // each unseen is assumed to occur only once (it did not show up in the sample because it is rare)
        int unseen = Math.max(0, totalCardinality - fact.numVals);
        int sampleSize = _sampleRows.length;

        //estimate number of offsets
        double sparsity = OptimizerUtils.getSparsity(_data.getNumRows(), _data.getNumColumns(),
                _data.getNonZeros());

        // expected value given that we don't store the zero values
        float totalNumOffs = (float) (_numRows * (1 - Math.pow(1 - sparsity, colIndexes.length)));
        if (CORRECT_NONZERO_ESTIMATE) {
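            // extrapolate the zeros observed in the sample to the full matrix;
            // the factor C dampens the correction for rare (singleton) values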
            long numZeros = sampleSize - fact.numOffs;
            float C = Math.max(1 - (float) fact.numSingle / sampleSize, (float) sampleSize / _numRows);
            totalNumOffs = _numRows - ((numZeros > 0) ? (float) _numRows / sampleSize * C * numZeros : 0);
        }

        // For a single offset, the number of blocks depends on the value of
        // that offset. Small offsets (first group of rows in the matrix)
        // require a small number of blocks and large offsets (last group of
        // rows) require a large number of blocks. The unseen offsets are
        // distributed over the entire offset range. A reasonable and fast
        // estimate for the number of blocks is to use the arithmetic mean of
        // the number of blocks used for the first index (=1) and that of the
        // last index.
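        // e.g., the first index needs 1 block and the last roughly
        // _numRows/BITMAP_BLOCK_SZ + 1 blocks, so the mean used below is
        // (2*BITMAP_BLOCK_SZ + _numRows) / (2*BITMAP_BLOCK_SZ) blocks per unseen value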
        int numUnseenSeg = Math.round(
                unseen * (2.0f * BitmapEncoder.BITMAP_BLOCK_SZ + _numRows) / 2 / BitmapEncoder.BITMAP_BLOCK_SZ);
        int totalNumSeg = fact.numSegs + numUnseenSeg;
        int totalNumRuns = getNumRuns(ubm, sampleSize, _numRows) + unseen;

        //construct new size info summary
        return new CompressedSizeInfo(totalCardinality,
                getRLESize(totalCardinality, totalNumRuns, colIndexes.length),
                getOLESize(totalCardinality, totalNumOffs, totalNumSeg, colIndexes.length));
    }

    @Override
    public CompressedSizeInfo estimateCompressedColGroupSize(UncompressedBitmap ubm) {
        //compute size estimation factors
        SizeEstimationFactors fact = computeSizeEstimationFactors(ubm, true);

        //construct new size info summary
        return new CompressedSizeInfo(fact.numVals, getRLESize(fact.numVals, fact.numRuns, ubm.getNumColumns()),
                getOLESize(fact.numVals, fact.numOffs, fact.numSegs, ubm.getNumColumns()));
    }

    private int getNumDistinctValues(int[] colIndexes) {
        return haasAndStokes(colIndexes);
    }

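    /**
     * Estimates the total number of runs over all values in the full matrix
     * from the sampled rows: gaps between sampled rows are filled in with the
     * expected values of multivariate hypergeometric and Ising-Stevens
     * distributions, with boundary corrections at the sampled positions.
     * 
     * @param sampleUncompressedBitmap bitmap extracted from the sample
     * @param sampleSize sample size
     * @param totalNumRows number of rows in the full matrix
     * @return estimated number of runs
     */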
    private int getNumRuns(UncompressedBitmap sampleUncompressedBitmap, int sampleSize, int totalNumRows) {
        int numVals = sampleUncompressedBitmap.getNumValues();
        // all values in the sample are zeros
        if (numVals == 0)
            return 0;
        float numRuns = 0;
        for (int vi = 0; vi < numVals; vi++) {
            int[] offsets = sampleUncompressedBitmap.getOffsetsList(vi);
            float offsetsRatio = ((float) offsets.length) / sampleSize;
            float avgAdditionalOffsets = offsetsRatio * totalNumRows / sampleSize;
            if (avgAdditionalOffsets < 1) {
                // Ising-Stevens does not hold
                // fall-back to using the expected number of offsets as an upper
                // bound on the number of runs
                numRuns += ((float) offsets.length) * totalNumRows / sampleSize;
                continue;
            }
            int intervalEnd, intervalSize;
            float additionalOffsets;
            // probability of an index being non-offset in current and previous
            // interval respectively
            float nonOffsetProb, prevNonOffsetProb = 1;
            boolean reachedSampleEnd = false;
            // handling the first interval separately for simplicity
            int intervalStart = -1;
            if (_sampleRows[0] == 0) {
                // empty interval
                intervalStart = 0;
            } else {
                intervalEnd = _sampleRows[0];
                intervalSize = intervalEnd - intervalStart - 1;
                // expected value of a multivariate hypergeometric distribution
                additionalOffsets = offsetsRatio * intervalSize;
                // expected value of an Ising-Stevens distribution
                numRuns += (intervalSize - additionalOffsets) * additionalOffsets / intervalSize;
                intervalStart = intervalEnd;
                prevNonOffsetProb = (intervalSize - additionalOffsets) / intervalSize;
            }
            // for handling separators

            int withinSepRun = 0;
            boolean seenNonOffset = false, startedWithOffset = false, endedWithOffset = false;
            int offsetsPtrs = 0;
            for (int ix = 1; ix < sampleSize; ix++) {
                // start of a new separator
                // intervalStart will always be pointing at the current value
                // in the separator block

                if (offsetsPtrs < offsets.length && offsets[offsetsPtrs] == intervalStart) {
                    startedWithOffset = true;
                    offsetsPtrs++;
                    endedWithOffset = true;
                } else {
                    seenNonOffset = true;
                    endedWithOffset = false;
                }
                while (intervalStart + 1 == _sampleRows[ix]) {
                    intervalStart = _sampleRows[ix];
                    if (seenNonOffset) {
                        if (offsetsPtrs < offsets.length && offsets[offsetsPtrs] == intervalStart) {
                            withinSepRun = 1;
                            offsetsPtrs++;
                            endedWithOffset = true;
                        } else {
                            numRuns += withinSepRun;
                            withinSepRun = 0;
                            endedWithOffset = false;
                        }
                    } else if (offsetsPtrs < offsets.length && offsets[offsetsPtrs] == intervalStart) {
                        offsetsPtrs++;
                        endedWithOffset = true;
                    } else {
                        seenNonOffset = true;
                        endedWithOffset = false;
                    }
                    ix++;
                    if (ix == sampleSize) {
                        // end of sample while searching for a start
                        reachedSampleEnd = true;
                        break;
                    }
                }

                // runs within an interval of unknowns
                if (reachedSampleEnd)
                    break;
                intervalEnd = _sampleRows[ix];
                intervalSize = intervalEnd - intervalStart - 1;
                // expected value of a multivariate hypergeometric distribution
                additionalOffsets = offsetsRatio * intervalSize;
                // expected value of an Ising-Stevens distribution
                numRuns += (intervalSize - additionalOffsets) * additionalOffsets / intervalSize;
                nonOffsetProb = (intervalSize - additionalOffsets) / intervalSize;

                // additional runs resulting from x's on the boundaries of the
                // separators
                if (seenNonOffset) {
                    if (startedWithOffset) {
                        // add p(y in the previous interval)
                        numRuns += prevNonOffsetProb;
                    }
                    if (endedWithOffset) {
                        // add p(y in the current interval)
                        numRuns += nonOffsetProb;
                    }
                } else {
                    // add p(y in the previous interval and y in the current
                    // interval)
                    numRuns += prevNonOffsetProb * nonOffsetProb;
                }
                prevNonOffsetProb = nonOffsetProb;
                intervalStart = intervalEnd;
                // resetting separator variables
                seenNonOffset = startedWithOffset = endedWithOffset = false;
                withinSepRun = 0;

            }
            // last possible interval
            if (intervalStart != totalNumRows - 1) {
                intervalEnd = totalNumRows;
                intervalSize = intervalEnd - intervalStart - 1;
                // expected value of a multivariate hypergeometric distribution
                additionalOffsets = offsetsRatio * intervalSize;
                // expected value of an Ising-Stevens distribution
                numRuns += (intervalSize - additionalOffsets) * additionalOffsets / intervalSize;
                nonOffsetProb = (intervalSize - additionalOffsets) / intervalSize;
            } else {
                nonOffsetProb = 1;
            }
            // additional runs resulting from x's on the boundaries of the
            // separators
            endedWithOffset = intervalStart == offsets[offsets.length - 1];
            if (seenNonOffset) {
                if (startedWithOffset) {
                    numRuns += prevNonOffsetProb;
                }
                if (endedWithOffset) {
                    // add p(y in the current interval)
                    numRuns += nonOffsetProb;
                }
            } else {
                if (endedWithOffset)
                    // add p(y in the previous interval and y in the current
                    // interval)
                    numRuns += prevNonOffsetProb * nonOffsetProb;
            }
        }
        return Math.round(numRuns);
    }

    private int haasAndStokes(int[] colIndexes) {
        ReaderColumnSelection reader = new ReaderColumnSelectionDenseSample(_data, colIndexes, _sampleRows,
                !CompressedMatrixBlock.MATERIALIZE_ZEROS);
        return haasAndStokes(_numRows, _sampleRows.length, reader);
    }

    /**
     * TODO remove, just for local debugging.
     * 
     * @param colIndexes column indexes
     * @return exact number of distinct values
     */
    @SuppressWarnings("unused")
    private int getExactNumDistinctValues(int[] colIndexes) {
        HashSet<DblArray> distinctVals = new HashSet<DblArray>();
        ReaderColumnSelection reader = (_data.isInSparseFormat() && CompressedMatrixBlock.TRANSPOSE_INPUT)
                ? new ReaderColumnSelectionSparse(_data, colIndexes, !CompressedMatrixBlock.MATERIALIZE_ZEROS)
                : new ReaderColumnSelectionDense(_data, colIndexes, !CompressedMatrixBlock.MATERIALIZE_ZEROS);
        DblArray val = null;
        while (null != (val = reader.nextRow()))
            distinctVals.add(val);
        return distinctVals.size();
    }

    /**
     * Returns a sorted array of n integers, drawn uniformly from the range [0,range).
     * 
     * @param range the range
     * @param smplSize sample size
     * @return sorted array of integers
     */
    private int[] getSortedUniformSample(int range, int smplSize) {
        if (smplSize == 0)
            return new int[] {};
        int[] sample = _rng.nextPermutation(range, smplSize);
        Arrays.sort(sample);
        return sample;
    }

    /////////////////////////////////////////////////////
    // Sample Cardinality Estimator library
    /////////////////////////////////////////////////////

    /**
     * M. Charikar, S. Chaudhuri, R. Motwani, and V. R. Narasayya, Towards
     * estimation error guarantees for distinct values, PODS'00.
     * 
     * @param nRows number of rows
     * @param sampleSize sample size
     * @param sampleRowsReader a reader for the sampled rows
     * @return estimated number of distinct values
     */
    @SuppressWarnings("unused")
    private static int guaranteedErrorEstimator(int nRows, int sampleSize, ReaderColumnSelection sampleRowsReader) {
        HashMap<DblArray, Integer> valsCount = getValCounts(sampleRowsReader);
        // number of values that occur only once
        int singletonValsCount = 0;
        int otherValsCount = 0;
        for (Integer c : valsCount.values()) {
            if (c == 1)
                singletonValsCount++;
            else
                otherValsCount++;
        }
        return (int) Math.round(otherValsCount + singletonValsCount * Math.sqrt(((double) nRows) / sampleSize));
    }

    /**
     * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. 
     * Sampling-Based Estimation of the Number of Distinct Values of an
     * Attribute. VLDB'95, Section 3.2.
     * 
     * @param nRows number of rows
     * @param sampleSize sample size
     * @param sampleRowsReader reader
     * @return estimated number of distinct values
     */
    @SuppressWarnings("unused")
    private static int shlosserEstimator(int nRows, int sampleSize, ReaderColumnSelection sampleRowsReader) {
        return shlosserEstimator(nRows, sampleSize, sampleRowsReader, getValCounts(sampleRowsReader));
    }

    private static int shlosserEstimator(int nRows, int sampleSize, ReaderColumnSelection sampleRowsReader,
            HashMap<DblArray, Integer> valsCount) {
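        // Shlosser: estimate = d + f1 * sum_i (1-q)^i * f_i / sum_i i*q*(1-q)^(i-1) * f_i,
        // where d = #distinct values in the sample and f_i = #values occurring exactly i times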
        double q = ((double) sampleSize) / nRows;
        double oneMinusQ = 1 - q;

        int[] freqCounts = getFreqCounts(valsCount);

        double numerSum = 0, denomSum = 0;
        int iPlusOne = 1;
        for (int i = 0; i < freqCounts.length; i++, iPlusOne++) {
            numerSum += Math.pow(oneMinusQ, iPlusOne) * freqCounts[i];
            denomSum += iPlusOne * q * Math.pow(oneMinusQ, i) * freqCounts[i];
        }
        int estimate = (int) Math.round(valsCount.size() + freqCounts[0] * numerSum / denomSum);
        return estimate < 1 ? 1 : estimate;
    }

    /**
     * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes.
     * Sampling-Based Estimation of the Number of Distinct Values of an
     * Attribute. VLDB'95, Section 4.3.
     * 
     * @param nRows number of rows
     * @param sampleSize sample size
     * @param sampleRowsReader row reader
     * @return estimated number of distinct values
     */
    @SuppressWarnings("unused")
    private static int smoothedJackknifeEstimator(int nRows, int sampleSize,
            ReaderColumnSelection sampleRowsReader) {
        return smoothedJackknifeEstimator(nRows, sampleSize, sampleRowsReader, getValCounts(sampleRowsReader));
    }

    private static int smoothedJackknifeEstimator(int nRows, int sampleSize, ReaderColumnSelection sampleRowsReader,
            HashMap<DblArray, Integer> valsCount) {
        int[] freqCounts = getFreqCounts(valsCount);
        // all values in the sample are zeros
        if (freqCounts.length == 0)
            return 0;
        // nRows is N and sampleSize is n

        int d = valsCount.size();
        double f1 = freqCounts[0];
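        // f1 = number of values occurring exactly once in the sample;
        // D0 = initial estimate of the distinct-value count, used to derive NTilde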
        long Nn = (long) nRows * sampleSize; //use long to avoid int overflow for large matrices
        double D0 = (d - f1 / sampleSize) / (1 - (nRows - sampleSize + 1) * f1 / Nn);
        double NTilde = nRows / D0;
        /*-
         *
         * h (as defined in eq. 5 in the paper) can be implemented as:
         * 
         *     Gamma(nRows - NTilde + 1) x Gamma(nRows - sampleSize + 1)
         * h = ----------------------------------------------------------
         *     Gamma(nRows - sampleSize - NTilde + 1) x Gamma(nRows + 1)
         * 
         * However, for large values of nRows, Gamma.gamma returns NaN
         * (factorial of a very large number).
         * 
         * The following implementation avoids this problem by leveraging the
         * cancellations that show up when expanding the factorials in the
         * numerator and the denominator:
         * 
         *       min(A,D-1) x [min(A,D-1) - 1] x .... x B
         * h = -------------------------------------------
         *            C x [C-1] x .... x max(A+1,D)
         * 
         * where A = N - NTilde
         *       B = N - NTilde - n + 1
         *       C = N
         *       D = N - n + 1
         */
        double A = nRows - NTilde;
        double B = A - sampleSize + 1;
        double C = nRows;
        double D = nRows - sampleSize + 1;
        A = Math.min(A, D - 1);
        D = Math.max(A + 1, D);
        double h = 1;

        for (; A >= B || C >= D; A--, C--) {
            if (A >= B)
                h *= A;
            if (C >= D)
                h /= C;
        }
        // end of h computation

        double g = 0, gamma = 0;
        // k here corresponds to k+1 in the paper (the +1 comes from replacing n
        // with n-1)
        for (int k = 2; k <= sampleSize + 1; k++) {
            g += 1.0 / (nRows - NTilde - sampleSize + k);
        }
        for (int i = 1; i <= freqCounts.length; i++) {
            gamma += i * (i - 1) * freqCounts[i - 1];
        }
        gamma *= (nRows - 1) * D0 / Nn / (sampleSize - 1);
        gamma += D0 / nRows - 1;

        double estimate = (d + nRows * h * g * gamma) / (1 - (nRows - NTilde - sampleSize + 1) * f1 / Nn);
        return estimate < 1 ? 1 : (int) Math.round(estimate);
    }

    /**
     * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. 1995.
     * Sampling-Based Estimation of the Number of Distinct Values of an
     * Attribute. VLDB'95, Section 5.2, recommended estimator by the authors
     * 
     * @param nRows number of rows
     * @param sampleSize sample size
     * @param sampleRowsReader row reader
     * @return estimated number of distinct values
     */
    @SuppressWarnings("unused")
    private static int shlosserJackknifeEstimator(int nRows, int sampleSize,
            ReaderColumnSelection sampleRowsReader) {
        HashMap<DblArray, Integer> valsCount = getValCounts(sampleRowsReader);
        // guard against division by zero: all values in the sample are zeros
        if (valsCount.isEmpty())
            return 0;

        // uniformity chi-square test
        double nBar = ((double) sampleSize) / valsCount.size();
        // test-statistic
        double u = 0;
        for (int cnt : valsCount.values()) {
            u += Math.pow(cnt - nBar, 2);
        }
        u /= nBar;
        if (sampleSize != usedSampleSize)
            computeCriticalValue(sampleSize);
        if (u < uniformityCriticalValue) {
            // uniform
            return smoothedJackknifeEstimator(nRows, sampleSize, sampleRowsReader, valsCount);
        } else {
            return shlosserEstimator(nRows, sampleSize, sampleRowsReader, valsCount);
        }
    }

    /*
     * In shlosserJackknifeEstimator, as long as the sample size does not
     * change, we will have the same critical value each time the estimator
     * is used (given that alpha is the same). We cache the critical value to
     * avoid recomputing it in each call.
     */
    private static double uniformityCriticalValue;
    private static int usedSampleSize;

    private static void computeCriticalValue(int sampleSize) {
        ChiSquaredDistribution chiSqr = new ChiSquaredDistribution(sampleSize - 1);
        uniformityCriticalValue = chiSqr.inverseCumulativeProbability(SHLOSSER_JACKKNIFE_ALPHA);
        usedSampleSize = sampleSize;
    }

    /**
     * Haas, Peter J., and Lynne Stokes.
     * "Estimating the number of classes in a finite population." Journal of the
     * American Statistical Association 93.444 (1998): 1475-1487.
     * 
     * The hybrid estimator given by Eq. 33 in Section 6
     * 
     * @param nRows number of rows
     * @param sampleSize sample size
     * @param sampleRowsReader row reader
     * @return estimator
     */
    private static int haasAndStokes(int nRows, int sampleSize, ReaderColumnSelection sampleRowsReader) {
        HashMap<DblArray, Integer> valsCount = getValCounts(sampleRowsReader);
        // all values in the sample are zeros.
        if (valsCount.size() == 0)
            return 1;
        int[] freqCounts = getFreqCounts(valsCount);
        float q = ((float) sampleSize) / nRows;
        float _1MinusQ = 1 - q;
        // Eq. 11
        float duj1Fraction = ((float) sampleSize) / (sampleSize - _1MinusQ * freqCounts[0]);
        float duj1 = duj1Fraction * valsCount.size();
        // Eq. 16
        float gamma = 0;
        for (int i = 1; i <= freqCounts.length; i++) {
            gamma += i * (i - 1) * freqCounts[i - 1];
        }
        gamma *= duj1 / sampleSize / sampleSize;
        gamma += duj1 / nRows - 1;
        gamma = Math.max(gamma, 0);
        int estimate;

        if (gamma < HAAS_AND_STOKES_ALPHA1) {
            // UJ2 - beginning of page 1479
            estimate = (int) (duj1Fraction
                    * (valsCount.size() - freqCounts[0] * _1MinusQ * Math.log(_1MinusQ) * gamma / q));
        } else if (gamma < HAAS_AND_STOKES_ALPHA2) {
            // UJ2a - end of page 1479
            int numRemovedClasses = 0;
            float updatedNumRows = nRows;
            int updatedSampleSize = sampleSize;

            for (Integer cnt : valsCount.values()) {
                if (cnt > HAAS_AND_STOKES_UJ2A_C) {
                    numRemovedClasses++;
                    freqCounts[cnt - 1]--;
                    updatedSampleSize -= cnt;
                    /*
                     * To avoid solving Eq. 20 numerically for the class size in
                     * the full population (N_j), the current implementation
                     * just scales cnt (n_j) by the sampling ratio (q).
                     * Intuitively, the scaling should be fine since cnt is
                     * large enough. Also, N_j in Eq. 20 is lower-bounded by cnt
                     * which is already large enough to make the denominator in
                     * Eq. 20 very close to 1.
                     */
                    updatedNumRows -= ((float) cnt) / q;
                }
            }
            if (updatedSampleSize == 0) {
                // all frequent classes removed; fall back to the plain UJ2 estimate
                estimate = (int) (duj1Fraction
                        * (valsCount.size() - freqCounts[0] * (_1MinusQ) * Math.log(_1MinusQ) * gamma / q));
            } else {
                float updatedQ = ((float) updatedSampleSize) / updatedNumRows;
                int updatedSampleCardinality = valsCount.size() - numRemovedClasses;
                float updatedDuj1Fraction = ((float) updatedSampleSize)
                        / (updatedSampleSize - (1 - updatedQ) * freqCounts[0]);
                float updatedDuj1 = updatedDuj1Fraction * updatedSampleCardinality;
                float updatedGamma = 0;
                for (int i = 1; i <= freqCounts.length; i++) {
                    updatedGamma += i * (i - 1) * freqCounts[i - 1];
                }
                updatedGamma *= updatedDuj1 / updatedSampleSize / updatedSampleSize;
                updatedGamma += updatedDuj1 / updatedNumRows - 1;
                updatedGamma = Math.max(updatedGamma, 0);

                estimate = (int) (updatedDuj1Fraction * (updatedSampleCardinality
                        - freqCounts[0] * (1 - updatedQ) * Math.log(1 - updatedQ) * updatedGamma / updatedQ))
                        + numRemovedClasses;
            }

        } else {
            // Sh3 - end of section 3
            float fraq1Numer = 0;
            float fraq1Denom = 0;
            float fraq2Numer = 0;
            float fraq2Denom = 0;
            for (int i = 1; i <= freqCounts.length; i++) {
                fraq1Numer += i * q * q * Math.pow(1 - q * q, i - 1) * freqCounts[i - 1];
                fraq1Denom += Math.pow(_1MinusQ, i) * (Math.pow(1 + q, i) - 1) * freqCounts[i - 1];
                fraq2Numer += Math.pow(_1MinusQ, i) * freqCounts[i - 1];
                fraq2Denom += i * q * Math.pow(_1MinusQ, i - 1) * freqCounts[i - 1];
            }
            estimate = (int) (valsCount.size()
                    + freqCounts[0] * fraq1Numer / fraq1Denom * fraq2Numer * fraq2Numer / fraq2Denom / fraq2Denom);
        }
        return estimate < 1 ? 1 : estimate;
    }

    private static HashMap<DblArray, Integer> getValCounts(ReaderColumnSelection sampleRowsReader) {
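        // counts the occurrences of each distinct (multi-column) row in the sample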
        HashMap<DblArray, Integer> valsCount = new HashMap<DblArray, Integer>();
        DblArray val = null;
        Integer cnt;
        while (null != (val = sampleRowsReader.nextRow())) {
            cnt = valsCount.get(val);
            if (cnt == null)
                cnt = 0;
            cnt++;
            valsCount.put(val, cnt);
        }
        return valsCount;
    }

    private static int[] getFreqCounts(HashMap<DblArray, Integer> valsCount) {
        int maxCount = 0;
        for (Integer c : valsCount.values()) {
            if (c > maxCount)
                maxCount = c;
        }

        /*
         * freqCounts[i-1] = how many values occurred with frequency i
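         * e.g., value counts {a:1, b:3, c:1} yield freqCounts = [2, 0, 1]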
         */
        int[] freqCounts = new int[maxCount];
        for (Integer c : valsCount.values()) {
            freqCounts[c - 1]++;
        }
        return freqCounts;
    }
}
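
Example

A minimal usage sketch (not part of the original file): it builds a small dense MatrixBlock and estimates the compressed size of a two-column group from a 100-row sample. The constructor and estimateCompressedColGroupSize are APIs of the class above; the exact accessors of CompressedSizeInfo vary across SystemML versions, so check that class before relying on them. Note that if CompressedMatrixBlock.TRANSPOSE_INPUT is enabled, the estimator expects its input pre-transposed; a square matrix is used here to sidestep the orientation question.

import org.apache.sysml.runtime.compress.estim.CompressedSizeEstimatorSample;
import org.apache.sysml.runtime.compress.estim.CompressedSizeInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;

public class CompressedSizeEstimatorSampleExample {
    public static void main(String[] args) {
        // build a dense 1000 x 1000 matrix with only 7 distinct values
        MatrixBlock mb = new MatrixBlock(1000, 1000, false);
        for (int i = 0; i < 1000; i++)
            for (int j = 0; j < 1000; j++)
                mb.quickSetValue(i, j, (i + j) % 7);
        mb.recomputeNonZeros();

        // estimator over a uniform sample of 100 rows
        CompressedSizeEstimatorSample est = new CompressedSizeEstimatorSample(mb, 100);

        // estimated size info for a column group over columns 0 and 1;
        // CompressedSizeInfo carries the estimated cardinality and RLE/OLE sizes
        CompressedSizeInfo info = est.estimateCompressedColGroupSize(new int[] { 0, 1 });
        System.out.println(info);
    }
}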