org.apache.solr.handler.component.StatsField.java Source code

Introduction

Here is the source code for org.apache.solr.handler.component.StatsField.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.legacy.LegacyNumericType;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.FieldCacheSource;
import org.apache.lucene.queries.function.valuesource.QueryValueSource;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.StatsParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.DocValuesStats;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.util.hll.HLL;
import org.apache.solr.util.hll.HLLType;

import com.google.common.hash.Hashing;
import com.google.common.hash.HashFunction;

/**
 * Models all of the information associated with a single {@link StatsParams#STATS_FIELD}
 * instance.
 *
 * @see StatsComponent
 */
public class StatsField {

    /**
     * An enumeration representing the super set of all possible stat values that can be computed.
     * Each of these enum values can be specified as a local param in a <code>stats.field</code>
     * (eg: <code>stats.field={!min=true mean=true}my_field_name</code>) but not all enum values
     * are valid for all field types (eg: <code>mean</code> is meaningless for String fields)
     *
     * @lucene.internal
     * @lucene.experimental
     */
    public static enum Stat {
        min(true),
        max(true),
        missing(true),
        sum(true),
        count(true),
        mean(false, sum, count),
        sumOfSquares(true),
        stddev(false, sum, count, sumOfSquares),
        distinctValues(true),
        countDistinct(false, distinctValues),
        percentiles(true) {
            /**
             * Special handling for percentiles: the local param value is a comma separated
             * list of numbers rather than a boolean.  When at least one number is given they
             * are appended to the field's percentile list and the optional
             * <code>tdigestCompression</code> local param is read.
             *
             * @return true if at least one percentile was requested, else false
             */
            @Override
            boolean parseParams(StatsField sf) {
                final String raw = sf.localParams.get(name());
                if (null == raw) {
                    return false;
                }
                try {
                    final List<Double> parsed = new ArrayList<>();
                    for (String token : StrUtils.splitSmart(raw, ',')) {
                        parsed.add(Double.parseDouble(token));
                    }
                    if (parsed.isEmpty()) {
                        return false;
                    }
                    sf.percentilesList.addAll(parsed);
                    sf.tdigestCompression = sf.localParams.getDouble("tdigestCompression",
                            sf.tdigestCompression);
                    return true;
                } catch (NumberFormatException e) {
                    throw new SolrException(ErrorCode.BAD_REQUEST,
                            "Unable to parse " + StatsParams.STATS_FIELD + " local params: "
                                    + sf.localParams + " due to: " + e.getMessage(),
                            e);
                }
            }
        },
        cardinality(true) {
            /**
             * Special handling for cardinality: the local params are handed to
             * {@link HllOptions#parseHllOptions} and the result is stored on the field.
             *
             * @return true if HLL options were produced (ie: cardinality was requested)
             */
            @Override
            boolean parseParams(StatsField sf) {
                final HllOptions parsedOpts;
                try {
                    parsedOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
                } catch (Exception e) {
                    throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse " + StatsParams.STATS_FIELD
                            + " local params: " + sf.localParams + " due to: " + e.getMessage(), e);
                }
                sf.hllOpts = parsedOpts;
                return null != parsedOpts;
            }
        };

        // NOTE(review): presumably a List rather than an EnumSet because an EnumSet of Stat
        // cannot be built while the Stat constants themselves are still being constructed
        // -- confirm before changing the type.
        private final List<Stat> distribDeps;

        /**
         * Sole constructor for Stat enum values
         * @param selfDep indicates that when computing this stat across a distributed result
         *        set, each shard must compute this stat <i>in addition to</i> any other
         *        distributed dependencies.
         * @param deps the set of stat values, other than this one, which are a distributed
         *        dependency and must be computed and returned by each individual shard in
         *        order to compute <i>this</i> stat over the entire distributed result set.
         * @see #getDistribDeps
         */
        Stat(boolean selfDep, Stat... deps) {
            final List<Stat> all = new ArrayList<>(deps.length + 1);
            Collections.addAll(all, deps);
            if (selfDep) {
                all.add(this);
            }
            distribDeps = all;
        }

        /**
         * Given a String, returns the corresponding Stat enum value if any, otherwise returns null.
         */
        public static Stat forName(String paramKey) {
            Stat match;
            try {
                match = Stat.valueOf(paramKey);
            } catch (IllegalArgumentException e) {
                // not the name of any stat
                match = null;
            }
            return match;
        }

        /**
         * The stats that must be computed and returned by each shard involved in a distributed
         * request in order to compute the overall value for this stat across the entire distributed
         * result set.  A Stat instance may include itself in the <code>getDistribDeps()</code> result,
         * but that is not always the case.  A fresh set is returned on every call.
         */
        public EnumSet<Stat> getDistribDeps() {
            return EnumSet.copyOf(distribDeps);
        }

        /**
         * Called when the name of a stat is found as a local param on this {@link StatsField}.
         * The default implementation reads the local param as a boolean.
         * @return true if the user is requesting this stat, else false
         */
        boolean parseParams(StatsField sf) {
            return sf.localParams.getBool(name(), false);
        }

    }

    /**
     * The equivalent stats if "calcdistinct" is specified: the pseudo-stat stands in for
     * each of these, unless an individual stat local param explicitly overrides it.
     * @see Stat#countDistinct
     * @see Stat#distinctValues
     */
    private static final EnumSet<Stat> CALCDISTINCT_PSUEDO_STAT = EnumSet.of(Stat.countDistinct,
            Stat.distinctValues);

    /**
     * The (unmodifiable) set of stats computed by default when no localparams are used to
     * specify explicit stats
     */
    public final static Set<Stat> DEFAULT_STATS = Collections.<Stat>unmodifiableSet(EnumSet.of(Stat.min, Stat.max,
            Stat.missing, Stat.sum, Stat.count, Stat.mean, Stat.sumOfSquares, Stat.stddev));

    // searcher taken from the request (rb.req.getSearcher()) by the constructor
    private final SolrIndexSearcher searcher;
    // the request/response this stats field belongs to
    private final ResponseBuilder rb;
    private final String originalParam; // for error messages
    // local params parsed from originalParam, or a synthetic set holding only the bare value
    private final SolrParams localParams;
    private final ValueSource valueSource; // may be null if simple field stats
    private final SchemaField schemaField; // may be null if function/query stats
    // key under which these stats are reported (see getOutputKey)
    private final String key;
    // request level "calcdistinct" default, used when no local param overrides it
    private final boolean topLevelCalcDistinct;
    // stats.facet field names requested for this key; never null, possibly empty
    private final String[] facets;
    // values of the "tag" local param; never null, possibly empty
    private final List<String> tagList;
    // values of the "ex" local param (tags of filters to exclude); never null, possibly empty
    private final List<String> excludeTagList;
    // every stat that must actually be computed: the distrib deps of everything in statsInResponse
    private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class);
    // the stats the user asked for (explicitly, or via the defaults / calcdistinct)
    private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class);
    // percentile values requested via the "percentiles" local param
    private final List<Double> percentilesList = new ArrayList<Double>();
    // value of the "isShard" request param (false when absent)
    private final boolean isShard;

    // default compression; replaced by the "tdigestCompression" local param when percentiles are parsed
    private double tdigestCompression = 100.0D;
    // set while parsing the "cardinality" local param; stays null when that stat is not requested
    private HllOptions hllOpts;

    /**
     * Parses a single {@link StatsParams#STATS_FIELD} value: resolves it to either a
     * {@link SchemaField} or a {@link ValueSource}, then works out the response key, the
     * stats to compute, the facet fields and the tag / exclude lists.
     *
     * @param rb the current request/response
     * @param statsParam the raw {@link StatsParams#STATS_FIELD} string
     * @throws SolrException (BAD_REQUEST) if the param can not be parsed
     */
    public StatsField(ResponseBuilder rb, String statsParam) {
        this.rb = rb;
        this.searcher = rb.req.getSearcher();
        this.originalParam = statsParam;

        final SolrParams params = rb.req.getParams();
        this.isShard = params.getBool("isShard", false);

        SchemaField field = null;
        ValueSource source = null;
        try {
            SolrParams parsedLocalParams = QueryParsing.getLocalParams(originalParam, params);
            if (null == parsedLocalParams) {
                // simplest possible input: bare string (field name)
                final ModifiableSolrParams bareValue = new ModifiableSolrParams();
                bareValue.add(QueryParsing.V, originalParam);
                parsedLocalParams = bareValue;
            }
            this.localParams = parsedLocalParams;

            final String parserName = parsedLocalParams.get(QueryParsing.TYPE);
            if (StringUtils.isBlank(parserName)) {
                // no parser named: plain request for stats over a field
                field = searcher.getSchema().getField(parsedLocalParams.get(QueryParsing.V));
            } else {
                // stats over a query (or function)

                // NOTE we could use QParser.getParser(...) here, but that would redundantly
                // reparse everything.  ( TODO: refactor a common method in QParser ?)
                final QParserPlugin plugin = rb.req.getCore().getQueryPlugin(parserName);
                final QParser parser = plugin.createParser(parsedLocalParams.get(QueryParsing.V),
                        parsedLocalParams, params, rb.req);

                // get the most direct ValueSource for whatever kind of query was parsed
                source = extractValueSource(parser.parse());

                // a ValueSource that maps straight onto a SchemaField is treated as if
                // stats were requested on that field directly
                // ie:  "stats.field={!func key=foo}field(foo)" == "stats.field=foo"
                field = extractSchemaField(source, searcher.getSchema());
                if (null != field) {
                    source = null;
                }
            }

            assert ((null == source) ^ (null == field)) : "exactly one of vs & sf must be null";

        } catch (SyntaxError e) {
            throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse " + StatsParams.STATS_FIELD + ": "
                    + originalParam + " due to: " + e.getMessage(), e);
        }
        this.schemaField = field;
        this.valueSource = source;

        // response key: explicit CommonParams.OUTPUT_KEY local param, else the main param
        // value, else the entire original param string
        final String valueOrOriginal = localParams.get(CommonParams.VALUE, originalParam);
        this.key = localParams.get(CommonParams.OUTPUT_KEY, valueOrOriginal);

        if (null == schemaField) {
            this.topLevelCalcDistinct = params.getBool(StatsParams.STATS_CALC_DISTINCT, false);
        } else {
            this.topLevelCalcDistinct = params.getFieldBool(schemaField.getName(),
                    StatsParams.STATS_CALC_DISTINCT, false);
        }

        populateStatsSets();

        final String[] requestedFacets = params.getFieldParams(key, StatsParams.STATS_FACET);
        if (null != requestedFacets) {
            this.facets = requestedFacets;
        } else {
            this.facets = new String[0];
        }

        final String tagStr = localParams.get(CommonParams.TAG);
        if (null == tagStr) {
            this.tagList = Collections.<String>emptyList();
        } else {
            this.tagList = StrUtils.splitSmart(tagStr, ',');
        }

        // figure out if we need a special base DocSet
        final String excludeStr = localParams.get(CommonParams.EXCLUDE);
        if (null == excludeStr) {
            this.excludeTagList = Collections.<String>emptyList();
        } else {
            this.excludeTagList = StrUtils.splitSmart(excludeStr, ',');
        }

        assert ((null == this.valueSource)
                ^ (null == this.schemaField)) : "exactly one of valueSource & schemaField must be null";
    }

    /**
     * Inspects a {@link Query} to see if it directly maps to a {@link ValueSource},
     * and if so returns it -- otherwise wraps it as needed.
     *
     * @param q Query whose scores we have been asked to compute stats of
     * @return a ValueSource to use for computing the stats
     */
    private static ValueSource extractValueSource(Query q) {
        if (q instanceof FunctionQuery) {
            // Common case: we're wrapping a func, so we can directly pull out ValueSource
            final FunctionQuery funcQuery = (FunctionQuery) q;
            return funcQuery.getValueSource();
        }
        // asked to compute stats over a query, wrap it up as a ValueSource
        return new QueryValueSource(q, 0.0F);
    }

    /**
     * Inspects a {@link ValueSource} to see if it directly maps to a {@link SchemaField},
     * and if so returns it.
     *
     * @param vs ValueSource we've been asked to compute stats of
     * @param schema The Schema to use
     * @return Corresponding {@link SchemaField} or null if the ValueSource is more complex
     * @see FieldCacheSource
     */
    private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) {
        if (!(vs instanceof FieldCacheSource)) {
            // anything other than a plain field source has no single backing field
            return null;
        }
        final FieldCacheSource fieldSource = (FieldCacheSource) vs;
        return schema.getField(fieldSource.getField());
    }

    /**
     * The key to be used when referring to this {@link StatsField} instance in the
     * response to clients.
     */
    public String getOutputKey() {
        return this.key;
    }

    /**
     * Computes a base {@link DocSet} for the current request to be used
     * when computing global stats for the local index.
     *
     * This is typically the same as the main DocSet for the {@link ResponseBuilder}
     * unless {@link CommonParams#TAG tag}ged filter queries have been excluded using
     * the {@link CommonParams#EXCLUDE ex} local param, in which case the main query and
     * the remaining (non excluded) filters are re-executed against the searcher.
     *
     * @return the DocSet that local stats should be computed over
     * @throws SolrException (BAD_REQUEST) if an excluded query can not be parsed
     */
    public DocSet computeBaseDocSet() throws IOException {

        final DocSet mainDocs = rb.getResults().docSet;
        final Map<?, ?> taggedParsers = (Map<?, ?>) rb.req.getContext().get("tags");

        if (null == taggedParsers || excludeTagList.isEmpty()) {
            // nothing was tagged, or nothing is being excluded: the main DocSet is the base
            return mainDocs;
        }

        // queries are matched by identity, not equals()
        final Set<Query> excluded = Collections.newSetFromMap(new IdentityHashMap<Query, Boolean>());
        for (String excludeTag : excludeTagList) {
            final Object tagged = taggedParsers.get(excludeTag);
            // tagMap has entries of List<String,List<QParser>>, but subject to change in the future
            if (tagged instanceof Collection) {
                for (Object candidate : (Collection<?>) tagged) {
                    if (candidate instanceof QParser) {
                        final QParser parser = (QParser) candidate;
                        try {
                            excluded.add(parser.getQuery());
                        } catch (SyntaxError e) {
                            // this shouldn't be possible since the request should have already
                            // failed when attempting to execute the query, but just in case...
                            throw new SolrException(ErrorCode.BAD_REQUEST,
                                    "Excluded query can't be parsed: " + originalParam + " due to: " + e.getMessage(),
                                    e);
                        }
                    }
                }
            }
        }
        if (excluded.isEmpty()) {
            return mainDocs;
        }

        final List<Query> remaining = new ArrayList<Query>();

        // keep the base query unless it was itself excluded
        final Query mainQuery = rb.getQuery();
        if (!excluded.contains(mainQuery)) {
            remaining.add(rb.getQuery());
        }

        // keep every filter that was not excluded
        final List<Query> filters = rb.getFilters();
        if (null != filters) {
            for (Query filter : filters) {
                if (!excluded.contains(filter)) {
                    remaining.add(filter);
                }
            }
        }

        // get the new base docset for this facet
        return searcher.getDocSet(remaining);
    }

    /**
     * Computes the {@link StatsValues} for this {@link StatsField} relative to the
     * specified {@link DocSet}
     *
     * @param base the documents to compute stats over
     * @return the computed stats; an empty StatsValues when no stat needs calculating
     * @see #computeBaseDocSet
     */
    public StatsValues computeLocalStatsValues(DocSet base) throws IOException {

        if (statsToCalculate.isEmpty()) {
            // perf optimization for the case where we compute nothing
            // ie: stats.field={!min=$domin}myfield&domin=false
            return StatsValuesFactory.createStatsValues(this);
        }

        final boolean multiValuedField = null != schemaField
                && (schemaField.multiValued() || schemaField.getType().multiValuedFieldCache());
        if (multiValuedField) {
            // TODO: should this also be used for single-valued string fields? (should work fine)
            return DocValuesStats.getCounts(searcher, this, base, facets);
        }

        // either a single valued field we pull from FieldCache, or an explicit
        // function ValueSource
        return computeLocalValueSourceStats(base);
    }

    /**
     * Walks every document in <code>base</code>, feeding its segment relative id to the
     * overall {@link StatsValues} and to one {@link FieldFacetStats} per requested facet
     * field, then attaches the per facet results to the overall stats.
     *
     * @param base the documents to compute stats over
     * @return the accumulated stats, including any facet stats
     * @throws SolrException (BAD_REQUEST) if a requested facet field is multi-valued
     */
    private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {

        final IndexSchema schema = searcher.getSchema();
        final StatsValues overall = StatsValuesFactory.createStatsValues(this);

        final List<FieldFacetStats> perFacet = new ArrayList<>(facets.length);
        for (String facetField : facets) {
            final SchemaField facetSchemaField = schema.getField(facetField);
            if (facetSchemaField.multiValued()) {
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                        "Stats can only facet on single-valued fields, not: " + facetField);
            }
            perFacet.add(new FieldFacetStats(searcher, facetSchemaField, this));
        }

        // NOTE(review): the leaf iterator only ever moves forward, so this relies on the
        // DocIterator handing out doc ids in increasing order -- confirm.
        final Iterator<LeafReaderContext> leaves = searcher.getIndexReader().leaves().iterator();
        LeafReaderContext leaf = null;
        final DocIterator docs = base.iterator();
        while (docs.hasNext()) {
            final int globalDoc = docs.nextDoc();
            if (null == leaf || globalDoc >= leaf.docBase + leaf.reader().maxDoc()) {
                // doc lies beyond the current segment: advance to the segment containing it
                do {
                    leaf = leaves.next();
                } while (null == leaf || globalDoc >= leaf.docBase + leaf.reader().maxDoc());
                assert globalDoc >= leaf.docBase;

                // propagate the context among accumulators.
                overall.setNextReader(leaf);
                for (FieldFacetStats facet : perFacet) {
                    facet.setNextReader(leaf);
                }
            }

            // accumulate, using the id relative to the current segment
            final int segmentDoc = globalDoc - leaf.docBase;
            overall.accumulate(segmentDoc);
            for (FieldFacetStats facet : perFacet) {
                facet.facet(segmentDoc);
            }
        }

        for (FieldFacetStats facet : perFacet) {
            overall.addFacet(facet.name, facet.facetStatsValues);
        }
        return overall;
    }

    /**
     * The searcher that should be used for processing local stats; it is the one
     * obtained from the request when this instance was constructed.
     * @see SolrQueryRequest#getSearcher
     */
    public SolrIndexSearcher getSearcher() {
        // see AbstractStatsValues.setNextReader
        return this.searcher;
    }

    /**
     * The {@link SchemaField} whose results these stats are computed over, may be null
     * if the stats are computed over the results of a function or query
     *
     * @see #getValueSource
     */
    public SchemaField getSchemaField() {
        return this.schemaField;
    }

    /**
     * The {@link ValueSource} of a function or query whose results these stats are computed
     * over, may be null if the stats are directly over a {@link SchemaField}
     *
     * @see #getSchemaField
     */
    public ValueSource getValueSource() {
        return this.valueSource;
    }

    /**
     * The values of the {@link CommonParams#TAG tag} local param for this stats field.
     *
     * @return the tags; an empty list (never null) when the local param was not specified
     */
    public List<String> getTagList() {
        return this.tagList;
    }

    /**
     * Returns a short description of this instance, built from the raw
     * {@link StatsParams#STATS_FIELD} string it was constructed from.
     */
    @Override
    public String toString() {
        return "StatsField<" + originalParam + ">";
    }

    /**
     * A helper method which inspects the {@link #localParams} associated with this StatsField,
     * and uses them to populate the {@link #statsInResponse} and {@link #statsToCalculate} data
     * structures
     */
    private void populateStatsSets() {
        // individual stats named as local params
        boolean anyStatNamed = false;
        for (Iterator<String> names = localParams.getParameterNamesIterator(); names.hasNext();) {
            final Stat named = Stat.forName(names.next());
            if (null == named) {
                continue; // local param that is not the name of a stat
            }
            anyStatNamed = true;
            if (named.parseParams(this)) {
                statsInResponse.add(named);
            }
        }

        // fall back to the default set only if no individual stat was named, and
        // calcdistinct (a pseudo-stat when used as a local param) was not enabled
        if (!anyStatNamed && !localParams.getBool("calcdistinct", false)) {
            statsInResponse.addAll(DEFAULT_STATS);
        }

        // calcDistinct is a pseudo-stat with optional top level param default behavior
        // if not overridden by the specific individual stats
        final boolean calcDistinct = localParams.getBool("calcdistinct", topLevelCalcDistinct);
        if (calcDistinct) {
            for (Stat pseudo : CALCDISTINCT_PSUEDO_STAT) {
                // assume true, but don't include if specific stat overrides
                final boolean wanted = localParams.getBool(pseudo.name(), true);
                if (wanted) {
                    statsInResponse.add(pseudo);
                }
            }
        }

        // everything in the response, plus whatever it depends on, must be calculated
        for (Stat requested : statsInResponse) {
            statsToCalculate.addAll(requested.getDistribDeps());
        }
    }

    /**
     * Indicates whether the specified stat has to be computed for this field, either
     * because it was requested or because a requested stat depends on it.
     *
     * @param stat the stat to check
     * @return true if the stat is in the set of stats to calculate
     */
    public boolean calculateStats(Stat stat) {
        final boolean needed = statsToCalculate.contains(stat);
        return needed;
    }

    /**
     * Indicates whether the specified stat belongs in the response.  For a shard request
     * that is every stat that gets calculated; otherwise only the stats that were requested.
     *
     * @param stat the stat to check
     * @return true if the stat should be included in the response
     */
    public boolean includeInResponse(Stat stat) {
        final EnumSet<Stat> relevant = isShard ? statsToCalculate : statsInResponse;
        return relevant.contains(stat);
    }

    /**
     * The percentile values parsed from the <code>percentiles</code> local param.
     *
     * @return the internal list (not a copy); empty if no percentiles were parsed
     */
    public List<Double> getPercentilesList() {
        return this.percentilesList;
    }

    /**
     * The value of the <code>isShard</code> request param read when this instance was
     * constructed (false when the param is absent).
     */
    public boolean getIsShard() {
        return this.isShard;
    }

    /**
     * The t-digest compression setting: 100.0 unless replaced by the
     * <code>tdigestCompression</code> local param while parsing percentiles.
     */
    public double getTdigestCompression() {
        return this.tdigestCompression;
    }

    /**
     * The {@link HllOptions} produced while parsing the <code>cardinality</code> local param.
     *
     * @return the options, or null if none were produced
     */
    public HllOptions getHllOptions() {
        return this.hllOpts;
    }

    /**
     * Helper Struct for parsing and encapsulating all of the options related to building a {@link HLL}
     *
     * @see Stat#cardinality
     * @lucene.internal
     */
    public static final class HllOptions {
        /** hash function applied to values; null when the user declared the values pre-hashed */
        final HashFunction hasher;

        // NOTE: this explanation linked to from the java-hll jdocs...
        // https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning
        // ..if i'm understanding the regwidth chart correctly, a value of 6 should be enough
        // to support any max cardinality given that we're always dealing with hashes and 
        // the cardinality of the set of all long values is 2**64 == 1.9e19
        //
        // But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect 
        // and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values 
        // might fall in the same register (ie: bucket) and having a wider register to count more of 
        // them may be useful

        final int log2m;
        final int regwidth;

        final static String ERR = "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)";

        private HllOptions(int log2m, int regwidth, HashFunction hasher) {
            this.log2m = log2m;
            this.regwidth = regwidth;
            this.hasher = hasher;
        }

        /** 
         * Creates an HllOptions based on the (local) params specified (if appropriate).
         *
         * <p>The <code>cardinality</code> local param is either a boolean, or a number between
         * 0 and 1 (inclusive) that scales <code>log2m</code> and <code>regwidth</code> between
         * their minimum and maximum values.  Explicit <code>hllLog2m</code> and
         * <code>hllRegwidth</code> local params override either choice, and
         * <code>hllPreHashed</code> disables hashing for Long based fields.</p>
         *
         * @param localParams the LocalParams for this {@link StatsField}
         * @param field the field corresponding to this {@link StatsField}, may be null if these stats are over a value source
         * @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should be computed
         * @throws SolrException if there are invalid options
         */
        public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field) throws SolrException {

            final String cardinalityOpt = localParams.get(Stat.cardinality.name());
            if (StringUtils.isBlank(cardinalityOpt)) {
                return null;
            }

            final LegacyNumericType hashableNumType = getHashableNumericType(field);

            // some sane defaults
            int log2m = 13; // roughly equivalent to "cardinality='0.33'"
            int regwidth = 6; // with decent hash, this is plenty for all valid long hashes

            if (LegacyNumericType.FLOAT.equals(hashableNumType) || LegacyNumericType.INT.equals(hashableNumType)) {
                // for 32bit values, we can adjust our default regwidth down a bit
                regwidth--;

                // NOTE: EnumField uses LegacyNumericType.INT, and in theory we could be super conservative
                // with it, but there's no point - just let the EXPLICIT HLL handle it
            }

            // TODO: we could attempt additional reductions in the default regwidth based on index
            // statistics -- but that doesn't seem worth the effort.  for tiny indexes, the 
            // EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't 
            // want to be too aggressive about lowering regwidth or we could get really poor results if 
            // log2m is also low and there is heavy hashkey collision

            try {
                // NFE will short out here if it's not a number
                final double accuracyOpt = Double.parseDouble(cardinalityOpt);

                // if a float between 0 and 1 is specified, treat it as a preference of accuracy
                // - 0 means accuracy is not a concern, save RAM
                // - 1 means be as accurate as possible, using as much RAM as needed.

                // written as a negated in-range test so that NaN (for which every comparison
                // is false, and which parseDouble accepts) is rejected instead of being
                // silently treated as an accuracy of 0
                if (!(0D <= accuracyOpt && accuracyOpt <= 1.0D)) {
                    throw new SolrException(ErrorCode.BAD_REQUEST, ERR);
                }

                // use accuracyOpt as a scaling factor between min & max legal log2m values
                log2m = HLL.MINIMUM_LOG2M_PARAM
                        + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM));

                // use accuracyOpt as a scaling factor for regwidth as well, BUT...
                // be more conservative -- HLL.MIN_REGWIDTH_PARAM is too absurdly low to be useful
                // use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling
                final int MIN_HUERISTIC_REGWIDTH = regwidth - 1;
                regwidth = MIN_HUERISTIC_REGWIDTH
                        + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - MIN_HUERISTIC_REGWIDTH));

            } catch (NumberFormatException nfe) {
                // param value isn't a number -- let's check for simple true/false
                if (!localParams.getBool(Stat.cardinality.name(), false)) {
                    return null;
                }
            }

            // let explicit params override both the default and/or any accuracy specification
            log2m = localParams.getInt("hllLog2m", log2m);
            regwidth = localParams.getInt("hllRegwidth", regwidth);

            // validate legal values (both messages report the offending value)
            if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) {
                throw new SolrException(ErrorCode.BAD_REQUEST, "hllLog2m must be at least "
                        + HLL.MINIMUM_LOG2M_PARAM + " and at most " + HLL.MAXIMUM_LOG2M_PARAM + " (" + log2m + ")");
            }
            if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) {
                throw new SolrException(ErrorCode.BAD_REQUEST, "hllRegwidth must be at least "
                        + HLL.MINIMUM_REGWIDTH_PARAM + " and at most " + HLL.MAXIMUM_REGWIDTH_PARAM
                        + " (" + regwidth + ")");
            }

            final HashFunction hasher = localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128();

            if (null == hasher) {
                // if this is a function, or a non Long field, pre-hashed is invalid
                // NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings
                if (null == field || !LegacyNumericType.LONG.equals(field.getType().getNumericType())) {
                    throw new SolrException(ErrorCode.BAD_REQUEST,
                            "hllPreHashed is only supported with Long based fields");
                }
            }

            // if we're still here, then we need an HLL...
            return new HllOptions(log2m, regwidth, hasher);
        }

        /** @see HLL */
        public int getLog2m() {
            return log2m;
        }

        /** @see HLL */
        public int getRegwidth() {
            return regwidth;
        }

        /** May be null if user has indicated that field values are pre-hashed */
        public HashFunction getHasher() {
            return hasher;
        }

        /**
         * Creates a new, empty {@link HLL} configured with this instance's
         * <code>log2m</code> and <code>regwidth</code>.
         */
        public HLL newHLL() {
            // Although it (in theory) saves memory for "medium" size sets, the SPARSE type seems to have
            // some nasty impacts on response time as it gets larger - particularly in distrib requests.
            // Merging large SPARSE HLLs is much much slower than merging FULL HLLs with the same num docs
            //
            // TODO: add more tuning options for this.
            return new HLL(getLog2m(), getRegwidth(), -1 /* auto explicit threshold */,
                    false /* no sparse representation */, HLLType.EMPTY);

        }
    }

    /**
     * Returns the effective {@link LegacyNumericType} for the field for the purposes of hash values.
     * ie: If the field has an explicit LegacyNumericType that is returned; If the field has no explicit
     * LegacyNumericType then {@link LegacyNumericType#LONG} is returned;  If field is null, then
     * {@link LegacyNumericType#FLOAT} is assumed for ValueSource.
     *
     * @param field the field being hashed, may be null
     * @return the numeric type to assume when hashing, never null
     */
    private static LegacyNumericType getHashableNumericType(SchemaField field) {
        if (null == field) {
            // no field means stats over a ValueSource
            return LegacyNumericType.FLOAT;
        }
        final LegacyNumericType explicitType = field.getType().getNumericType();
        if (null != explicitType) {
            return explicitType;
        }
        return LegacyNumericType.LONG;
    }
}