tufts.vue.ds.Field.java Source code

Java tutorial

Introduction

Here is the source code for tufts.vue.ds.Field.java

Source

/*
* Copyright 2003-2010 Tufts University  Licensed under the
 * Educational Community License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 * 
 * http://www.osedu.org/licenses/ECL-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS IS"
 * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package tufts.vue.ds;

import tufts.Util;

import java.util.*;

import java.text.DateFormat;
import java.text.NumberFormat;
import java.text.DecimalFormat;
import tufts.vue.LWComponent;
import tufts.vue.DEBUG;

import com.google.common.collect.*;
import org.apache.commons.lang.StringEscapeUtils;

/**
 * Represents a column in a data-set, or pseudo-column from an XML mapped data-set.
 *
 * Besides simply recording the name of the column, this class mainly provides
 * data-analysis of all the values found in the column, discovering enumerated
 * types and doing some data-type analysis.  It also includes the ability to
 * associate a LWComponent node style with specially marked values.
 * 
 * @version $Revision: 1.25 $ / $Date: 2010-02-03 19:13:16 $ / $Author: mike $
 * @author Scott Fraize
 */

public class Field implements tufts.vue.XMLUnmarshalListener {
    private static final org.apache.log4j.Logger Log = org.apache.log4j.Logger.getLogger(Field.class);

    public static final String EMPTY_VALUE = "";

    public static final String TYPE_TEXT = "TEXT";
    public static final String TYPE_INTEGER = "INTEGER";
    public static final String TYPE_DECIMAL = "DECIMAL";
    public static final String TYPE_DATE = "DATE";
    public static final String TYPE_QUANTILE = "QUANTILE";

    private static final int MAX_ENUM_VALUE_LENGTH = 192;
    private static final int MAX_DATE_VALUE_LENGTH = 40;
    private static final DateFormat DateParser = DateFormat.getDateTimeInstance();

    private Schema schema; // should be final, but not due to castor persistance
    private String name;

    /** the number of actual (non-empty) values that have been inspected for analysis */
    private int mValuesSeen;
    /** the string length of the longest value seen */
    private int mMaxValueLen;
    /** if true, all values found were unique -- there were no repeated values */
    private boolean mAllValuesUnique;
    /** if true, the values were too long to meaningfully track and enumerate */
    private boolean mValueTrackDisabled;

    /** map of all possible unique values for enumeration tracking */
    private final Multiset<String> mValues = LinkedHashMultiset.create();

    private String mType = TYPE_INTEGER; // starts most specific as default, is cleared upon finding anything else
    private boolean mTypeDetermined = false;

    private final Collection<String> mDataComments = new ArrayList();

    /** map of values currently present in a given context (e.g., a VUE map) */
    private Multiset<String> mContextValues;

    private LWComponent mNodeStyle;

    //========================================================================================
    // These variables are only relevant to Fields numeric type:

    private static final int QUANTILE_BUCKETS = 4; // # of quantile ranges to create (4=quartiles, 5=quintiles, etc)

    private double mMinValue = Double.MAX_VALUE;
    // Seed for a running maximum.  Double.MIN_VALUE is the smallest *positive* double,
    // so it would never be replaced by zero or negative data; use the most negative
    // finite double instead.
    private double mMaxValue = -Double.MAX_VALUE;
    private double mValuesTotal;
    private double mMeanValue;
    private double mMedianValue;
    private double mStandardDeviation;
    private double[] mQuantiles;
    private boolean mAllValuesAreIntegers = true; // defaults true: won't be valid until final analysis
    //========================================================================================

    private transient boolean mXMLRestoreUnderway;

    /**
     * A persistent reference to a Field for storing associations in maps via castor.
     * Note variable names in this class don't have more than one cap letter to best
     * work with castor auto-mappings.  Changing the variable names here will break
     * persistence for previously stored associations under the old names.
     */
    public static final class PersistRef {
        // Public, castor-mapped fields.  Names are part of the persisted format: do not rename.
        public String fieldName;
        public String schemaName;
        public String schemaId;
        public String schemaGuid;
        public String schemaDsguid;

        /** No-argument constructor, needed by castor. */
        public PersistRef() {
        }

        /** Records the name of the given field along with the name and ids of its schema. */
        PersistRef(Field field) {
            final Schema owner = field.getSchema();
            this.schemaId = owner.getMapLocalID();
            this.schemaGuid = owner.getGUID();
            this.schemaDsguid = owner.getDSGUID();
            this.schemaName = owner.getName();
            this.fieldName = field.getName();
        }

        /** @return a short debug description; note that schemaDsguid is not included */
        @Override
        public String toString() {
            return "FieldRef[" + schemaName + "." + fieldName + " " + schemaId + "/" + schemaGuid + "]";
        }
    }

    /** for castor persistence */
    public Field() {
        // placeholder name; presumably replaced through setName during a castor restore
        name = "<empty>";
    }

    private transient Collection<PersistRef> mRelatedFields;

    /**
     * @return the related-field references held by this Field.  The collection is only
     * assigned by the XML_initialized / XML_completed callbacks, so this is null if
     * neither has run.
     */
    Collection<PersistRef> getRelatedFieldRefs() {
        final Collection<PersistRef> refs = this.mRelatedFields;
        return refs;
    }

    /**
     * Creates a named field attached to the given schema, with all analysis
     * statistics set to their initial defaults.
     *
     * @param n the field (column) name
     * @param schema the schema this field belongs to
     */
    Field(String n, Schema schema) {
        name = n;
        this.setSchema(schema);
        // init=true: reset the statistics without logging a "flushing" message
        this.flushStats(true);
        if (!DEBUG.SCHEMA) {
            return;
        }
        Log.debug("instanced " + Util.tags(this));
    }

    //     /** for castor persistance */
    //     public final String getMapLocalID() {
    //         return String.format("%s.%s", schema.getMapLocalID(), name);
    //     }

    /** must be called by parent Schema after de-serialization (needed for persistence) */
    void setSchema(Schema s) {
        // plain assignment: no validation, and any previous schema reference is replaced
        schema = s;
    }

    /** for persistence of associations */
    public Collection<PersistRef> getRelatedFields() {
        // While an XML restore is underway, hand back the live collection created in
        // XML_initialized (presumably so the unmarshaller can add to it).
        if (mXMLRestoreUnderway) {
            Log.debug("RETURNING RELATED FIELDS for " + this);
            return mRelatedFields;
        }

        // Otherwise build a fresh list of references, one per field currently paired
        // with this one according to Association.
        final Collection<PersistRef> refs = new ArrayList<PersistRef>();
        for (Field paired : Association.getPairedFields(this)) {
            refs.add(new PersistRef(paired));
        }
        if (DEBUG.SCHEMA && !refs.isEmpty()) {
            Log.debug(this + ": GOT RELATED FIELDS: " + Util.tags(refs));
        }
        return refs;
    }

    /** interface {@link XMLUnmarshalListener} -- init */
    public void XML_initialized(Object context) {
        // fresh, empty holder for references collected during the restore
        mRelatedFields = new HashSet<PersistRef>();
        // makes getRelatedFields() return the holder above instead of computing a list
        mXMLRestoreUnderway = true;
    }

    /** interface {@link XMLUnmarshalListener} -- track us */
    public void XML_completed(Object context) {
        mXMLRestoreUnderway = false;

        // Nothing restored.  The null check covers the case where XML_initialized was
        // never invoked on this instance; normalize both cases to a shared empty list.
        if (mRelatedFields == null || mRelatedFields.isEmpty()) {
            mRelatedFields = Collections.<PersistRef>emptyList();
            return;
        }

        if (DEBUG.Enabled) {
            Log.debug("GOT RELATED FIELDS for " + this);
            Util.dump(mRelatedFields);
        }
        // todo: later, process to re-construct associations
    }

    /** Wrapper for display of special values: e.g., EMPTY_VALUE ("") to "(no value)" */
    public static String valueText(Object value) {
        if (value == null) {
            return null;
        }
        // Compare by content, not identity: an empty String that is not the very same
        // instance as EMPTY_VALUE (e.g. one produced by substring or a parser) must
        // also be displayed as "(no value)".
        if (EMPTY_VALUE.equals(value)) {
            return "(no value)";
        }
        return value.toString();
    }

    /**
     * @return the HTML-escaped display text for the given value (see valueText), prefixed
     * with "name: " when this field is a quantile field
     */
    public String valueDisplay(Object value) {
        final String escaped = StringEscapeUtils.escapeHtml(valueText(value));
        return isQuantile() ? getName() + ": " + escaped : escaped;
    }

    /** @return the number of times the given value has been recorded in the tracked value set */
    public int countValues(String value) {
        final int occurrences = mValues.count(value);
        return occurrences;
    }

    /**
     * Rebuilds the context-value multiset: for every given node and every tracked
     * unique value of this field, the value is added once when the node reports
     * having that value under this field's name.
     *
     * @param nodes the components to check
     */
    void annotateIncludedValues(final Collection<LWComponent> nodes) {
        final boolean nothingTracked = (mValues == null) || count(mValues) < 1;

        // Any previous context is discarded in all cases.
        if (mContextValues != null) {
            mContextValues.clear();
        }
        if (nothingTracked) {
            return;
        }
        if (mContextValues == null) {
            mContextValues = HashMultiset.create();
        }

        if (DEBUG.META) {
            Log.debug("MARKING INCLUDED VALUES AGAINST " + nodes.size() + " NODES for " + this);
        }

        // Note: data-value nodes are deliberately included (SMF, changed 2009-10-04), and
        // the node's schema is not checked.  An earlier variant that stopped as soon as
        // every value had been found once is no longer used.
        final Set<String> candidates = mValues.elementSet();
        for (LWComponent node : nodes) {
            for (String candidate : candidates) {
                if (!node.hasDataValue(this.name, candidate)) {
                    continue;
                }
                mContextValues.add(candidate);
            }
        }
    }

    /** @return true if the given value was found in the most recently annotated context */
    public boolean hasContextValue(String value) {
        if (mContextValues == null) {
            return false;
        }
        return mContextValues.contains(value);
    }

    /** @return how many times the given value was recorded in the context, or 0 if there is no context */
    public int countContextValue(String value) {
        if (mContextValues == null) {
            return 0;
        }
        return mContextValues.count(value);
    }

    /**
     * @return the total size of the context multiset (repeats included, per
     * Multiset.size()), or 0 if there is no context
     */
    public int getContextValueCount() {
        if (mContextValues == null) {
            return 0;
        }
        return mContextValues.size();
    }

    /** Resets all analysis statistics to their defaults, logging the flush. */
    protected void flushStats() {
        final boolean init = false; // not the constructor call: the flush is logged
        flushStats(init);
    }

    /**
     * Resets every value-tracking, type-inference and numeric statistic to its initial
     * default.  The node style (and the context values) are left untouched.
     *
     * @param init true when called from the constructor, which suppresses the debug log
     */
    private void flushStats(boolean init) {
        if (!init)
            Log.debug("flushing " + this);

        // value tracking
        mValues.clear();
        mValuesSeen = 0;
        mValueTrackDisabled = false;
        mAllValuesUnique = true;
        mAllValuesAreIntegers = true;
        mMaxValueLen = 0;

        // type inference: start at the most specific type again
        mType = TYPE_INTEGER;
        mTypeDetermined = false;
        mDataComments.clear();

        // numeric statistics
        mMinValue = Double.MAX_VALUE;
        // Double.MIN_VALUE is the smallest *positive* double, so it cannot seed a running
        // maximum for zero or negative data; use the most negative finite double instead.
        mMaxValue = -Double.MAX_VALUE;
        mValuesTotal = 0;
        mMeanValue = 0;
        mMedianValue = 0;
        mStandardDeviation = 0;
        mQuantiles = null;

        // we keep the nodeStyle, which is the whole reason we use a flush instead of
        // just creating new Schema+Field objects when reloading.  Tho at this point,
        // may be easier to re-create all & just carry over the styles.
    }

    /** for persistence */
    public void setStyleNode(LWComponent style) {
        if (DEBUG.SCHEMA) {
            final String message = String.format("setStyleNode %-22s%s", this, style);
            Log.debug(message);
        }
        // an existing style is replaced silently
        this.mNodeStyle = style;
    }

    /** @return true if a style node has been set on this field */
    public boolean hasStyleNode() {
        final boolean styled = (mNodeStyle != null);
        return styled;
    }

    /** @return the style node set on this field, or null if none has been set */
    public LWComponent getStyleNode() {
        return this.mNodeStyle;
    }

    /** @return the name of this field */
    public String getName() {
        return this.name;
    }

    /** for castor persistence only */
    public void setName(String s) {
        this.name = s;
    }

    /** @return the schema this field is attached to; null if none has been set */
    public Schema getSchema() {
        return this.schema;
    }

    /** @return the current type name of this field (one of the TYPE_* constants when set by this class) */
    public String getType() {
        return this.mType;
    }

    /** @return true if the current type is TYPE_DECIMAL or TYPE_INTEGER */
    public boolean isNumeric() {
        final String type = getType();
        // Compare by content: identity (==) only works while the type String happens to
        // be the very same instance as the constant, which cannot be relied upon for a
        // subclass override of getType() or a value that did not originate from a constant.
        return TYPE_DECIMAL.equals(type) || TYPE_INTEGER.equals(type);
    }

    private static final String NoCause = "(explicit-type-set)";

    /**
     * Records the given type as the current guess, without marking the type as determined.
     *
     * @param type the new type name
     * @param cause the value (or note) that triggered the change; used only for debug logging
     */
    private void takeType(String type, String cause) {
        if (DEBUG.Enabled) {
            final String message = toTerm() + " type=>" + type + " on " + Util.tags(cause);
            Log.debug(message);
        }
        this.mType = type;
    }

    /** Sets the type and marks it determined, reporting NoCause as the cause. */
    private void setType(String type) {
        final String cause = NoCause;
        setType(type, cause);
    }

    /**
     * Sets the type and marks it determined, so trackValue stops running type inference.
     *
     * @param type the new type name
     * @param cause the value (or note) that triggered the change; used only for debug logging
     */
    private void setType(String type, String cause) {
        this.takeType(type, cause);
        this.mTypeDetermined = true;
    }

    /** @return true if the current type is TYPE_QUANTILE */
    public boolean isQuantile() {
        // content comparison: String identity (==) is not a reliable equality test
        return TYPE_QUANTILE.equals(mType);
    }

    /** @return "schemaName.fieldName", with "&lt;?&gt;" in place of the schema name when no schema is set */
    @Override
    public String toString() {
        final String owner = (schema == null) ? "<?>" : schema.getName();
        return String.format("%s.%s", owner, getName());
    }

    /** @return the result of Relation.quoteKey for this field */
    public String toTerm() {
        final String term = Relation.quoteKey(this);
        return term;
    }

    //     @Override
    //     public String toString() {
    //         //if (isNumeric) type=TYPE_DECIMAL; // HACK: NEED ANALYSIS PHASE
    //         //return getName();

    //         final String numeric = isNumeric ? "/NUMERIC" : "";

    //         //final String name = schema.getName() + "." + getName();
    //         final String name = getName();

    //         if (mValuesSeen() == 1)
    //             //return String.format("<html><code>%s</code>:<br>\"%s\"", getName(), getValues().toArray()[0]);
    //             return String.format("%-14s=\"%s\"", name, getValues().toArray()[0]);
    //         else if (mAllValuesUnique)
    //             return String.format("%-14s (%d)/%s%s", name, mValuesSeen(), type, numeric);
    //         else
    //             return String.format("%-14s [%d]/%s%s", name, uniqueValueCount(), type, numeric);
    //     }

    /**
     * @return true if this field could serve as a unique key: values were tracked, every
     * value seen was unique, a value was seen for every row of the schema, and the field
     * is not a date field.  Returns false if no schema is attached.
     */
    public boolean isPossibleKeyField() {
        if (mValueTrackDisabled || !mAllValuesUnique) {
            return false;
        }
        // content comparison: String identity (==) is not a reliable equality test
        if (TYPE_DATE.equals(mType)) {
            return false;
        }
        // schema is not final (castor); without one there is no row count to compare against
        if (schema == null) {
            return false;
        }
        final int seen = valueCount();
        return uniqueValueCount() == seen && seen == schema.getRowCount();
    }

    /** @return true if this is the schema's unique key field */
    public boolean isKeyField() {
        // The schema reference is not final (castor creates Fields before attaching a
        // schema, and toString() already allows for a null schema), so guard against it:
        // a field without a schema is not the key field of any schema.
        return schema != null && schema.getKeyField() == this;
    }

    /** @return true if value tracking has been disabled for this field (see trackValue) */
    public boolean isUntrackedValue() {
        return this.mValueTrackDisabled;
    }

    /** @return true if all the values for this Field have been fully tracked and recorded, and more than one
     * unique value was found */
    public boolean isEnumerated() {
        if (mValueTrackDisabled) {
            return false;
        }
        return uniqueValueCount() > 1;
    }

    /** @return true if this field appeared a single time in the entire data set.
     * This can generally only be true for fields from an XML data-set, in which a single-value
     * "column" is in effect created by an XML key that only appears once, such as keys
     * that apply to the entire feed.
     */
    public boolean isSingleton() {
        if (!mAllValuesUnique || mValues == null) {
            return false;
        }
        // fewer than two distinct tracked values
        return count(mValues) < 2;
    }

    /** @return true if every value found for this field has the same value.
     * Will always be true if isSingleton() is true
     */
    public boolean isSingleValue() {
        final int distinct = uniqueValueCount();
        return distinct == 1;
    }

    /** @return the instance value count: the number of a times any value appeared for this field (includes repeats) */
    protected int valueCount() {
        return this.mValuesSeen;
    }

    /** @return the number of unique values if this field is enumerated, otherwise -1 */
    public int getEnumValuesSeen() {
        if (!isEnumerated()) {
            return -1;
        }
        return uniqueValueCount();
    }

    /**
     * @return the number of distinct values tracked for this field.  If there is no
     * value set at all, falls back to 0 when no value text was ever seen, or else to
     * the total value count.
     */
    protected int uniqueValueCount() {
        if (mValues != null) {
            return count(mValues);
        }
        return (mMaxValueLen == 0) ? 0 : valueCount();
    }

    /** @return the count of all unique values in the Multiset */
    private static int count(Multiset m) {
        // Multiset.size() follows the java.util.Collection contract and so reports the
        // *virtual* number of items (repeats included), not the number of unique items
        // as a counting HashMap would.  The unique count is therefore taken from the
        // entrySet.  The implementation appears to cache the entrySet (and elementSet),
        // so no new set is built per call; in the google implementation in use when this
        // was written, entrySet did slightly less delegation than elementSet.
        if (m == null) {
            return 0;
        }
        return m.entrySet().size();
    }

    /** @return the string length of the longest value passed to trackValue so far */
    public int getMaxValueLength() {
        return this.mMaxValueLen;
    }

    /**
     * @return the set of all unique values this Field has been seen to take amongst all rows in
     * the data-set.  Note that the returned set is modifiable (it is a live view that can
     * change the backing value set), and should NOT be modified.
     */
    public Set<String> getValues() {
        // elementSet() is a view onto the backing Multiset: changes made through the
        // returned set would alter this field's tracked values.
        final Set<String> uniqueValues = mValues.elementSet();
        return uniqueValues;
    }

    private static final Multiset EMPTY_MULTISET = Multisets.unmodifiableMultiset(HashMultiset.create(0));

    /** @return an unmodifiable view of the tracked values with their counts; never null */
    public Multiset<String> getValueSet() {
        if (mValues == null) {
            return EMPTY_MULTISET;
        }
        return Multisets.unmodifiableMultiset(mValues);
    }
    //     public Map<String,Integer> getValueMap() {
    //         return mValues == null ? Collections.EMPTY_MAP : mValues;
    //     }

    // todo: may want to move this to a separate analysis code set
    void trackValue(String value) {

        // null is ignored entirely: it affects no statistic
        if (value == null) {
            return;
        }

        final int length = value.length();

        // the longest length is recorded even once tracking has been disabled
        mMaxValueLen = Math.max(mMaxValueLen, length);

        if (mValueTrackDisabled) {
            return;
        }

        if (length == 0) {
            // any empty string is replaced by the canonical EMPTY_VALUE instance,
            // and does not count as a seen value
            value = EMPTY_VALUE;
        } else {
            mValuesSeen++;
        }

        // An over-long value (other than the very first one) turns off enumeration
        // tracking and fixes the type as text.
        if (mValuesSeen > 1 && length > MAX_ENUM_VALUE_LENGTH) {
            mValueTrackDisabled = true;
            setType(TYPE_TEXT, value);
            return;
        }

        final boolean repeated = mValues.contains(value);
        mValues.add(value);
        if (repeated) {
            mAllValuesUnique = false;
        }

        // empty values take no part in type inference
        if (length != 0 && !mTypeDetermined) {
            trackForTypeInference(value);
        }
    }

    // the inferencing depends on not passing this method null or empty values
    private void trackForTypeInference(final String text) {
        if (text.indexOf(':') > 0) {
            if (isDateValue(text)) {
                // THIS IS A MAJOR GUESS: we guess it's a date field if we see a single valid date
                setType(TYPE_DATE, text);
            } else {
                // having seen a ':' but not being a date, we infer that this is text (e.g., not numeric)
                setType(TYPE_TEXT, text);
            }
            return;
        }

        final double number = getNumericValue(text, true);
        if (Double.isNaN(number)) {
            // the first non-numeric we see, mark us as text
            setType(TYPE_TEXT, text);
            return;
        }

        // Min and max are checked independently.  With an "else if", a value that lowers
        // the minimum never updates the maximum, so the first value seen (and every value
        // of a descending sequence) would leave the maximum at its initial sentinel.
        if (number < mMinValue)
            mMinValue = number;
        if (number > mMaxValue)
            mMaxValue = number;

        mValuesTotal += number;

        if (mAllValuesAreIntegers && number != (long) number) {
            mAllValuesAreIntegers = false;
            takeType(TYPE_DECIMAL, text); // do NOT use setType -- this is still a guess, value is not determined yet
        }
    }

    /** @return the numeric value of the text with currency-prefix handling enabled, or Double.NaN */
    private double getNumericValue(final String text) {
        final boolean tryCurrency = true;
        return getNumericValue(text, tryCurrency);
    }

    // DecimalFormat's are not synchronized, thus these cannot be static.
    private final NumberFormat LocalNumberFormat = NumberFormat.getInstance();
    //private final NumberFormat LocalCurrencyFormat = NumberFormat.getCurrencyInstance();

    /** @return double value if one found, Double.NaN otherwise */
    private double getNumericValue(final String text, final boolean tryCurrency) {

        if (text == null)
            return Double.NaN;

        try {
            // Double.parseDouble handles plain decimals and scientific notation.  (It does
            // NOT accept integer hex such as "0x2F" -- only hexadecimal floating-point
            // literals with a binary exponent, e.g. "0x1.8p1".)
            return Double.parseDouble(text);
        } catch (NumberFormatException notPlainDouble) {
            // expected for anything that is not a plain number: try the lenient parser below
        }

        Number value = null;

        try {
            // This handles values of the form "1,234,567". It will also extract any
            // number that can be found at the head of a string: e.g. "7foo" will return
            // 7, or "70%" will return 70 (*not* 0.70).  The instance of LocalNumberFormat will
            // generally be a DecimalFormat
            value = LocalNumberFormat.parse(text);
        } catch (java.text.ParseException notLocaleNumber) {
            // expected when the text does not start with a number: may still be currency
        }

        // Note that if we use a NumberFormat.getCurrencyInstance() here to handle
        // currency, it will only allow the local currency symbol.

        if (value == null && tryCurrency) {
            final int first = text.codePointAt(0); // safe: parseDouble rejects "" only via NFE, so guard below
            // Skip the whole code point: a currency symbol outside the BMP occupies two
            // chars, and substring(1) would leave a dangling low surrogate behind.
            final int symbolLen = Character.charCount(first);
            if (text.length() > symbolLen && isCurrencySymbol(first)) {
                value = getNumericValue(text.substring(symbolLen), false); // NOTE RECURSION
            }
        }

        // could allow for percent parsers that return value/100

        if (DEBUG.SCHEMA || DEBUG.DATA)
            Log.debug(Util.tags(text) + " = " + Util.tags(value));

        return value == null ? Double.NaN : value.doubleValue();
    }

    /**
     * @param c a Unicode code point
     * @return true if the code point is '$' or is in the Unicode currency-symbol category
     */
    private static boolean isCurrencySymbol(int c) {
        // the explicit '$' test should be redundant with the category test
        if (c == '$') {
            return true;
        }
        return Character.getType(c) == Character.CURRENCY_SYMBOL;
    }

    /**
     * @return true if the java.util.Date(String) constructor accepts the given text.
     * Any Throwable raised while trying is taken to mean "not a date".
     */
    private static boolean isDateValue(String value) {
        boolean parsed = false;

        try {
            final Date date = new Date(value);
            parsed = true;
            if (DEBUG.Enabled) {
                Log.debug("PARSED DATE: " + Util.tags(date) + " from " + value);
            }
        } catch (Throwable t) {
            if (DEBUG.DATA) {
                Log.debug("Failed to parse [" + value + "] as date: " + t);
            }
        }

        // Parsing via the DateParser (DateFormat) instance was tried previously and is
        // not currently used.

        return parsed;
    }

    //     private static boolean isNumericValue(String value) {
    //        try {
    //             Double.parseDouble(value);
    //        } catch (Throwable t) {
    //             //if (DEBUG.SCHEMA) Log.info(t);
    //             return false;
    //        }
    //         return true;
    //     }

    /** compute quantiles via median values and return the absolute median */
    private static double computeQuantiles(final double[] quantiles, final double[] values) {

        // Quantile boundaries depend on how the cut index is rounded; there does not seem
        // to be one agreed-upon convention.  This implementation floors the index.
        // Note: the values array is sorted IN PLACE.

        Arrays.sort(values);

        final int sampleCount = values.length;
        final int cutCount = quantiles.length;
        final int regionCount = cutCount + 1;
        final float samplesPerRegion = (float) sampleCount / (float) regionCount;

        if (DEBUG.Enabled) {
            Log.debug("count of all possible values: " + sampleCount);
            Log.debug("each of " + regionCount + " quantile regions has an approx sample size of: "
                    + samplesPerRegion + " samples");
        }

        // TODO: the median handling below (averaging for even-sized sets) should really
        // be applied to each region boundary, not only the absolute median.

        int cut = 0;
        while (cut < cutCount) {
            final float rawIndex = (cut + 1) * samplesPerRegion;
            // floor exactly aligns the middle index in odd-sized value sets
            final int index = (int) Math.floor(rawIndex);
            final double picked = values[index];
            quantiles[cut] = picked;
            if (DEBUG.Enabled) {
                Log.debug(String.format("quantile %d index %3.2f (%d) value = " + picked, cut, rawIndex, index));
            }
            cut++;
        }

        // Absolute median of the sorted samples.
        final int halfIndex = sampleCount / 2;
        final double median;

        if (sampleCount % 2 != 0) {
            // odd # of samples: the exact middle value
            median = values[halfIndex];
            if (DEBUG.Enabled) {
                Log.debug(String.format("PICKED MEDIAN: %g from exact middle index=%d", median, halfIndex));
            }
        } else {
            // even # of samples: average the two values around the middle
            final double belowMedian = values[halfIndex - 1];
            final double aboveMedian = values[halfIndex];
            median = (belowMedian + aboveMedian) / 2.0;
            if (DEBUG.Enabled) {
                Log.debug(String.format("AVERAGED MEDIAN: %g from %g+%g halfIndex=%d", median, belowMedian,
                        aboveMedian, halfIndex));
            }
        }

        // With an odd number of cut points (an even number of regions) the middle cut
        // point is the median, so force it to the value computed above.
        final boolean middleCutIsMedian = (cutCount % 2 != 0);
        if (middleCutIsMedian) {
            final int middle = cutCount / 2;
            if (quantiles[middle] != median) {
                if (DEBUG.Enabled) {
                    Log.info(String.format("PATCHING MIDDLE QUANTILE TO ABSOLUTE MEDIAN; %g -> %g",
                            quantiles[middle], median));
                }
                quantiles[middle] = median;
            }
        }

        return median;
    }

    /**
     * Fills the given array with evenly spaced cut points that divide the value range
     * [minValue, maxValue] into (quantiles.length + 1) equal-width regions.
     *
     * @param quantiles output array, one entry per cut point
     * @param minValue low end of the value range
     * @param maxValue high end of the value range
     */
    private static void computeValueRangeQuantiles(final double[] quantiles, final double minValue,
            final double maxValue) {
        final double span = maxValue - minValue;
        final double step = span / (quantiles.length + 1);

        if (DEBUG.Enabled) {
            Log.debug(String.format("computing value-based quantiles for values (%g-%g) range=%g, quantileRange=%g",
                    minValue, maxValue, span, step));
        }

        int i = 0;
        while (i < quantiles.length) {
            quantiles[i] = minValue + (step * (i + 1));
            if (DEBUG.Enabled) {
                Log.debug(String.format("quantile %d value = %g", i, quantiles[i]));
            }
            i++;
        }
    }

    private static final boolean USE_VALUE_RANGE_QUANTILES = true; // original Anoop method
    private static final boolean USE_STANDARD_QUANTILES = false; // standard statistical method (resource intensive: duplicates & sorts entire sample set)
    //private static final boolean USE_COMPRESSED_SAMPLE_QUANTILES = !USE_STANDARD_QUANTILES; // ignore repeated values in sample set

    /** compute and record standard method quantile values as well as the median value */
    private void computeQuantiles(final double[] allValues) {
        // NOTE: in data-sets with many repeated values, several quantiles may end up
        // covering exactly the same values.  A separate analysis for that case (or a
        // "modified quantile" that forces distinct ranges) would be useful.

        // N buckets need N-1 cut points: e.g. QUANTILE_BUCKETS=4 needs 3 quantile values.
        mQuantiles = new double[QUANTILE_BUCKETS - 1];

        if (USE_STANDARD_QUANTILES) {
            // standard method over the full sample set; fills mQuantiles
            mMedianValue = computeQuantiles(mQuantiles, allValues);
            return;
        }

        if (USE_VALUE_RANGE_QUANTILES) {
            // Equal-width value ranges.  These can be more meaningful semantically, but
            // outliers ruin them: one high outlier can leave nearly everything in the
            // first bucket, nothing in the middle ones, and itself alone in the top one.
            computeValueRangeQuantiles(mQuantiles, mMinValue, mMaxValue);
            mMedianValue = Double.NaN; // uncomputed
            return;
        }

        // Standard method over "compressed" samples: only the unique values are
        // analyzed, and the empty value is left out.
        final Set<String> distinct = mValues.elementSet();
        final int validCount = mValues.contains(EMPTY_VALUE) ? distinct.size() - 1 : distinct.size();
        final double[] uniqueValues = new double[validCount];

        int next = 0;
        for (String s : distinct) {
            if (s == EMPTY_VALUE) {
                continue;
            }
            uniqueValues[next++] = getNumericValue(s, true);
        }

        mMedianValue = computeQuantiles(mQuantiles, uniqueValues);
    }

    private static final boolean SKEW_QUANTILES_LOW = false; // anecdotally "more balanced" when skewing high
    private static final boolean SKEW_QUANTILES_HIGH = !SKEW_QUANTILES_LOW;

    /** @return the quantile the given value is determined to lie in.  Will return values from 0 - (QUANTILE_BUCKETS-1) */
    private int getQuantile(final double value) {

        // Boundary rule:
        //   "value <= cut" puts boundary values in the lower quantile (skews low)
        //   "value <  cut" puts boundary values in the higher quantile (skews high)

        for (int i = 0; i < mQuantiles.length; i++) {
            final double cut = mQuantiles[i];
            final boolean belowCut = SKEW_QUANTILES_LOW ? (value <= cut) : (value < cut);
            if (belowCut) {
                return i;
            }
        }
        // not below any cut point: top quantile
        return mQuantiles.length;
    }

    /**
     * @param i zero-based quantile index, 0 through mQuantiles.length
     * @return a label of the form "Qn: min-max" for the given quantile, n being i + 1
     */
    private String getQuantileName(int i) {

        // For all-integer data (when skewing high) the top of each range is shown one
        // lower than the next cut point, so adjacent labels do not overlap.  An
        // adjustment of 1.0 only works for integer ranges, and depends entirely on the
        // skew direction used in getQuantile.  For non-integer data the labels are simply
        // allowed to overlap (the max of one range equals the min of the next).
        final boolean integral = mAllValuesAreIntegers;
        final double topRangeAdjustment = (integral && SKEW_QUANTILES_HIGH) ? 1.0 : 0.0;

        final double min = (i == 0) ? mMinValue : mQuantiles[i - 1];
        final double max = (i == mQuantiles.length) ? mMaxValue : mQuantiles[i] - topRangeAdjustment;

        final String pattern = integral ? "Q%d: %,.0f-%,.0f" : "Q%d: %,g-%,g";
        return String.format(pattern, i + 1, min, max);
    }

    //private static final String[] QUANTILE_NAMES = { "Lowest", "Low", "Medium", "High", "Highest" };

    /**
     * Completes the analysis of this field once its values have been recorded.
     *
     * Always marks the type as determined.  Then, only for numeric, non-key fields with more
     * than {@code QUANTILE_BUCKETS * 3} unique values, it:
     * <ol>
     * <li>computes the mean and (population) standard deviation from the schema's rows,</li>
     * <li>computes the quantile boundaries via {@code computeQuantiles},</li>
     * <li>adds a new TYPE_QUANTILE field to the schema immediately before this one, styled
     *     from a duplicate of this field's style node,</li>
     * <li>adds the matching quantile name as that new field's value on every row that has
     *     a numeric value for this field, and</li>
     * <li>records mean / median / std-dev summary comments on the new field.</li>
     * </ol>
     *
     * If the number of numeric values found in the rows differs from mValuesSeen, a warning is
     * logged and no quantile field is created.
     */
    void performFinalAnalysis() {

        mTypeDetermined = true;

        if (!isNumeric() || uniqueValueCount() <= (QUANTILE_BUCKETS * 3))
            return;

        if (isKeyField())
            return;

        //-----------------------------------------------------------------------------
        // Compute common summary statistics & quantiles
        //-----------------------------------------------------------------------------

        mMeanValue = mValuesTotal / mValuesSeen;

        // TODO: we could compute the quantile values in much less memory by using a
        // sorted-by-value version of the existing mValues Multiset, and iterating through it by
        // increasing "count" to find the appropriate median values.

        // performance: if all values are integers/longs, we could optimize the following codepaths to
        // use integer types & parsing code

        final double[] allValues;

        if (USE_STANDARD_QUANTILES)
            allValues = new double[mValuesSeen];
        else
            allValues = null;

        double totalSquaredDeviations = 0;
        int count = 0;

        for (DataRow row : schema.getRows()) {
            final String text = row.getValue(this);

            if (text == null) {
                // this should only happen in XML data-sets with fields that don't have
                // values in all rows
                continue;
            }

            final double value = getNumericValue(text);

            if (Double.isNaN(value))
                continue;

            // Bounds-guard the store: allValues holds exactly mValuesSeen entries, so if the
            // rows yield more numeric values than that we must not write past the end.  We
            // keep counting regardless so the mismatch is reported (and analysis aborted)
            // by the check below rather than surfacing as an ArrayIndexOutOfBoundsException.
            if (USE_STANDARD_QUANTILES && count < allValues.length)
                allValues[count] = value;
            count++;

            final double meanDeviation = value - mMeanValue;

            totalSquaredDeviations += (meanDeviation * meanDeviation);
        }

        if (count != mValuesSeen) {
            Log.warn(this + Util.TERM_RED + ": COUNT != mValuesSeen; " + count + " != " + mValuesSeen
                    + Util.TERM_CLEAR);
            return;
        }

        final double variance = totalSquaredDeviations / mValuesSeen;

        mStandardDeviation = Math.sqrt(variance);

        //-----------------------------------------------------------------------------
        // Create quantiles
        //-----------------------------------------------------------------------------

        computeQuantiles(allValues);

        //-----------------------------------------------------------------------------
        // Explicitly create quantile value records (we do this first only so they are ordered)
        //-----------------------------------------------------------------------------

        final double range = mMaxValue - mMinValue;

        //final String[] quantileNames = QUANTILE_NAMES.clone();
        final String[] quantileNames = new String[QUANTILE_BUCKETS];

        //final Field quantileField = this;
        final Field quantileField = schema.addFieldBefore(this,
                String.format("%s [Q%d]", getName(), QUANTILE_BUCKETS));

        quantileField.setType(TYPE_QUANTILE);

        //quantileField.setStyleNode(getStyleNode()); // TODO: WON'T WORK: style-node not yet set
        // duplicate vs. create new via data-action so we don't use up color schemes
        quantileField.setStyleNode(DataAction.initNewStyleNode(getStyleNode().duplicate()));

        //Util.printStackTrace("SETTING LABEL ON " + Util.tags(quantileField.getStyleNode() + " for " + this));
        quantileField.getStyleNode()
                .setLabelTemplate(String.format("%s Range\n${%s}", getName(), quantileField.getName()));

        for (int i = 0; i < QUANTILE_BUCKETS; i++) {
            quantileNames[i] = getQuantileName(i);
            // We add the possible values now only to enforce the order in mValues for the DataTree
            quantileField.mValues.add(quantileNames[i]);
            //quantileField.trackValue(quantileNames[i]);
        }

        //-----------------------------------------------------------------------------
        // Assign quantile values to all rows:
        //-----------------------------------------------------------------------------

        for (DataRow row : schema.getRows()) {
            final String text = row.getValue(this);

            if (text == null) {
                // this should only happen in XML data-sets with fields that don't have
                // values in all rows
                continue;
            }

            final double value = getNumericValue(text, true);

            // Don't bother to add quantile values for empty (non-numeric) values
            if (Double.isNaN(value))
                continue;

            row.addValue(quantileField, quantileNames[getQuantile(value)]);
        }

        //-----------------------------------------------------------------------------

        if (DEBUG.Enabled) {
            //final double deviationQ = range / QUANTILE_BUCKETS;
            //quantileField.trackValue(String.format("(DeviationQ: %.1f)", deviationQ));
            //             quantileField.mValues.add(String.format("(Std Dev: %.1f)", mStandardDeviation));
            //             quantileField.mValues.add(String.format("(Segments: %.1f)", range / mStandardDeviation));
        }

        final double deviationsToCoverAllValues = range / mStandardDeviation; // # of std-dev's needed to cover all values

        // mMedianValue is left as NaN when it was not computed, in which case no
        // median comment is added.
        if (mAllValuesAreIntegers) {
            quantileField.addDataComment(String.format("Mean: %.1f", mMeanValue));
            if (!Double.isNaN(mMedianValue))
                quantileField.addDataComment(String.format("Median: %.1f", mMedianValue));
            quantileField.addDataComment(String.format("Std Dev: %d x %.1f", (int) Math.round(mStandardDeviation),
                    deviationsToCoverAllValues
            //(int) Math.round(deviationsToCoverAllValues)
            ));
        } else {
            quantileField.addDataComment(String.format("Mean: %g", mMeanValue));
            if (!Double.isNaN(mMedianValue))
                quantileField.addDataComment(String.format("Median: %g", mMedianValue));
            quantileField.addDataComment(
                    String.format("Std Dev: %g x %.1f", mStandardDeviation, deviationsToCoverAllValues));
        }
    }

    /**
     * @return the collection of data comments recorded for this field.  This is the field's
     * own backing collection, not a copy.
     */
    public Collection<String> getDataComments() {
        return this.mDataComments;
    }

    /** Appends the given comment text to this field's data comments. */
    private void addDataComment(final String s) {
        this.mDataComments.add(s);
    }

    // The commented-out code below appears to compute a quantile from the linear position (as a
    // fraction) of the value within the total min-max range of values.  Quantiles are now computed
    // using the standard, median-based definition of quantile / quartile.

    //     private int getQuantile(final double value) {
    //         return getQuantile(mMinValue, mMaxValue, value, QUANTILE_BUCKETS);
    //     }

    //     private static int getQuantile
    //         (final double min,
    //          final double max,
    //          final double value,
    //          final int N)
    //     {
    //         final double ratio = (value-min) / (max-min);
    //         final int quantile = (int) Math.ceil(ratio*N);

    //         if (quantile <= 0) {
    //             Log.warn("quantile="+quantile + " for value " + value);
    //             return 1;
    //         } else
    //             return quantile;
    //     }

    //     private static String getQuantileRange
    //         (final double min,
    //          final double max,
    //          final int quantile,
    //          final int N)
    //     {
    //         final double lowVal = min + (max-min)*(quantile-1)/N;
    //         final double highVal = min + (max-min)*(quantile)/N;

    //         return String.format("%.1f-%.1f", lowVal, highVal);
    //     }

    /**
     * Produces a short, human-readable sample of the tracked values for debug output.
     *
     * When {@code count(mValues)} is 20 or fewer, everything is printed: just the element set
     * if {@code unique} is true, otherwise the full multiset.  Above that, only the first three
     * elements of the element set (in its iteration order) are shown, each in double quotes;
     * {@code unique} has no effect in that case.
     *
     * @param unique whether the short form should list only the distinct elements
     * @return the sample string
     */
    private String sampleValues(boolean unique) {

        if (count(mValues) <= 20) {
            if (unique)
                return mValues.elementSet().toString();
            else
                return mValues.toString();
        }

        final StringBuilder examples = new StringBuilder("[examples: ");
        final Iterator<String> elements = mValues.elementSet().iterator();

        int shown = 0;
        while (elements.hasNext()) {
            examples.append('"').append(elements.next()).append('"');
            shown++;
            if (shown >= 3)
                break;
            examples.append(", ");
        }

        return examples.append("]").toString();
    }

    /**
     * Describes the values seen for this field, for debugging.  The cases are tested in order:
     * <ul>
     * <li>values not tracked (mValues is null): "(empty)" if none were seen, otherwise the
     *     number seen and the maximum value length;</li>
     * <li>singleton: the single-element set;</li>
     * <li>not all values unique: total value count, unique count, and a sample;</li>
     * <li>all values unique: the count and a sample, or "&lt;empty&gt;?" when the count is
     *     not greater than one.</li>
     * </ul>
     *
     * @return the debug description
     */
    public String valuesDebug() {

        if (mValues == null) {
            return (mValuesSeen == 0)
                ? "(empty)"
                : String.format("%5d values (un-tracked; max-len%6d)", mValuesSeen, mMaxValueLen);
        }

        if (isSingleton())
            return "singleton" + mValues.elementSet();

        if (!mAllValuesUnique)
            return String.format("%5d values, %4d unique: %s", valueCount(), count(mValues), sampleValues(false));

        if (count(mValues) <= 1)
            return "<empty>?";

        return String.format("%5d unique, single-instance values; %s", count(mValues), sampleValues(true));
    }

    /** Callback from interface {@link XMLUnmarshalListener}; this class takes no action. */
    public void XML_fieldAdded(final Object context, final String name, final Object child) {
        // intentionally empty
    }

    /** Callback from interface {@link XMLUnmarshalListener}; this class takes no action. */
    public void XML_addNotify(final Object context, final String name, final Object parent) {
        // intentionally empty
    }

}

// abstract class AbstractValue implements CharSequence {
//     final String value;
//     AbstractValue(String s) { value = s; }
//     public int length() { return value.length(); }
//     public char charAt(int index) { return value.charAt(index); }
//     public CharSequence subSequence(int start, int end) { return value.subSequence(start, end); }
//     public int compareTo(String anotherString) { return value.compareTo(anotherString); }
// }
// final class QValue extends AbstractValue {
//     public final int quantile;
//     QValue(String s, int qv) { super(s); quantile = qv; }
// }